基本思路:通过 newLISP 定时下载 JobTracker 页面,用正则表达式解析 HTML 中的 table 元素,从而获得最新的 MapReduce 状态。
每次获得状态数据后,将其存入 MySQL 数据库,然后用 Tableau 将 MapReduce 集群状态以报表的形式呈现。
这是jobtracker站点的数据

这是Tableau绘制的报表

这样就可以用数据可视化的方式展示Hadoop集群计算的压力状态。
下面是 newLISP 代码,主要就是用正则表达式解析 HTML,再用 mysql 模块把结果写入数据库。
#!/usr/bin/newlisp
(load "mysql.lsp")
;; Validate the command-line argument count before anything else runs.
;;
;; Side effects:
;;   - prints the raw argument list returned by (main-args)
;;   - stores its length in the global `args-length`
;;   - terminates the process when the count is not 3 or 4
(define (check-args)
  (print "args: ")
  (println (main-args))
  (set 'args-length (length (main-args)))
  ;; The usage message promises "3 or 4", so surplus arguments are rejected
  ;; as well; previously only the lower bound was enforced.
  (if (or (< args-length 3) (> args-length 4))
    (begin
      (println "the number of args must be 3 or 4, e.g. ./job.lsp jobtracker.bigdata.cn 8080")
      ;; Non-zero status lets a shell or scheduler detect the usage error.
      (exit 1))))
;; Build the job tracker URL from the command-line arguments.
;;
;; Reads the global `args-length` and (main-args); sets the globals
;; `domain`, `port` and `url`, then prints the resulting URL.
;; The port is taken from (main-args 3) when it is present and falls back
;; to "80" otherwise.
(define (parse-args)
  (set 'domain (main-args 2))
  ;; Always assign `port`: the former pair of exact-match tests (= 4 / = 3)
  ;; left it unset for any other argument count, which put "nil" into the URL.
  (set 'port (if (> args-length 3) (main-args 3) "80"))
  (set 'url (string "http://" domain ":" port "/jobtracker.jsp"))
  (println (string "job tracker site is located at " url)))
;; Download the page at the global `url`, keep the response in the global
;; `page-content`, and pass it on to extract-tables.
;; Returns whatever extract-tables returns.
(define (access-job-tracker-site)
  (extract-tables
    (setq page-content (get-url url))))
;; Filter for the summary table: yields `table` unchanged when its text
;; contains "Running Map Tasks", and nil otherwise.
(define (extract-summary-table table)
  (when (regex "Running Map Tasks" table)
    table))
;; Pull the value out of a single table-cell string `td` and return it.
;; The intermediate result is kept in the global `r`.
;; NOTE(review): as written this definition has one closing parenthesis too
;; many on its last line, and both patterns look truncated (presumably HTML
;; tags were stripped from the string literals) -- confirm against the
;; original script before relying on it.
(define (get-number td)
;; Element 3 of the list returned by regex is the text of capture group 1.
(set 'r ((regex "(.*) " td) 3))
;; NOTE(review): this branch only evaluates the constant 3 and discards it,
;; so it has no effect on `r`; presumably a second extraction step was
;; intended here -- TODO confirm.
(if (find "(.*)" r) 3)
r))
;; Apply get-number to every cell string in `tds`, in order.
;; The collected values are stored in the global `result` and returned.
(define (remove-td tds)
  (setq result (map get-number tds)))
;; Split `table` into cell strings and convert them to values.
;; Sets the globals `all-tds` (the raw find-all matches) and
;; `all-summary-values` (the output of remove-td), and returns the latter.
;; NOTE(review): the pattern literally matches any run of characters that
;; ends in a space; presumably the surrounding cell tags were lost from the
;; string literal -- TODO confirm against the original script.
(define (parse-summary-table table)
  (setq all-summary-values
    (remove-td
      (setq all-tds (find-all "[\\s\\S]* " table)))))
(define (extract-tables html-content)
(set 'all-tables (find-all "