該文章有更新,請移步 http://pickerel./admin/blogs/267912
說到用JavaScript解析html,大家肯定會想到dom或者正則表達式,但這兩個都不是我今天要說的。dom很不錯,不過效率不高,而且必須將要解析的html插入到當前頁面或者建立一個iframe才能進行,而用正則表達式,又有太過繁瑣和難以維護的問題。
有人要說了,ruby、php、python有了那么多開源的優秀的html解析的類庫,什么Beautiful Soup,什么Mechanize,什么Hpricot,什么ScRUBYt,你為什么非要自討苦吃用javascript來干這活呢?
答案是:如果只允許你用javascript和html開發程序呢,比如開發adobe air的程序,比如下一步我要做的基于內嵌webkit組件的Android應用快速開發框架,有時候,輪子還是得自己造的。
我的這個解析實現只是雛形,它以Erik Arvidsson開發的SimpleHtmlParser
作為html的分析器。SimpleHtmlParser是一個基于Sax模型實現的html分析器,它能分析非標準的xml格式的html,并把它轉換作為一個標準的xml處理。有了這個解析器做基礎,我寫了個簡單的html_extractor,用來分析html并獲取指定標記內的內容。
html_extractor的使用
new html_extractor(html): 指定html字符串創建一個html_extractor對象
方法:
tag(tagName):設定一個待匹配的元素名,返回結果為當前的html_extractor對象
attr(attrName, attrValue):設定匹配的屬性條件,attr必須在tag后調用,返回結果為當前的html_extractor對象
match(innerOrNot):執行匹配,返回結果為符合條件的字符串數組。innerOrNot默認為true,只返回元素內部的內容;傳入false時,結果還包含元素自身的起始標記和結束標記。
示例:
// Usage examples for html_extractor.
// `html` is declared explicitly so the snippet does not create an implicit global.
var html = "<div>div1</div>";
// Extract the content inside the div element; ret is ["div1"].
var ret = new html_extractor(html).tag("div").match();

html = "<div id=\"head\">head</div><div id=\"content\"><p><ul><li>item1</li><li>item2</li></ul></div>";
// Extract the content of every li under the div whose id is "content";
// the result is ["item1", "item2"].
ret = new html_extractor(html).tag("div").attr("id", "content").tag("li").match();

// Extract Baidu search results.
// NOTE(review): html is presumed to hold the fetched results page here — the
// snippet does not show how it is obtained.
ret = new html_extractor(html).tag("td").attr("class", "f").match();
// Extract Google search results (same assumption about html as above).
ret = new html_extractor(html).tag("li").attr("class", "g").match();
源代碼(當前代碼還非常原始,僅供參考,請慎重使用)
/**
 * Extracts the content of elements from an HTML string.
 *
 * A selector is built by chaining tag() and attr() calls; each tag() adds one
 * nesting level, so tag("div").tag("li") selects li elements that are
 * descendants of a div. match() then runs the selector over the HTML using
 * the SAX-style events reported by SimpleHtmlParser.
 *
 * @param {string} html - The HTML text to search.
 */
var html_extractor = function(html)
{
    this.parser = new SimpleHtmlParser;
    this.html = html;
    this.tags = [];     // one lower-cased element name per selector level
    this.attrs = [];    // attrs[level] = [{name, value}, ...] conditions for that level
};

/**
 * Adds a selector level that matches elements named `tag`.
 *
 * @param {string} tag - Element name; compared case-insensitively.
 * @returns {html_extractor} This object, for chaining.
 */
html_extractor.prototype.tag = function(tag)
{
    this.tags.push(tag.toLowerCase());

    return this;
};

/**
 * Adds an attribute condition to the selector level created by the most
 * recent tag() call.
 *
 * @param {string} name - Attribute name; compared case-insensitively.
 * @param {*} value - Required attribute value (compared with loose equality).
 * @returns {html_extractor} This object, for chaining.
 * @throws {Error} If no tag() call precedes it; previously the condition was
 *     stored under index -1 and silently ignored.
 */
html_extractor.prototype.attr = function(name, value)
{
    var level = this.tags.length - 1;
    if (level < 0)
    {
        throw new Error("html_extractor.attr() must be called after tag()");
    }
    if (this.attrs[level] == undefined) this.attrs[level] = [];
    this.attrs[level].push({name: name.toLowerCase(), value: value});
    return this;
};

/**
 * Runs the selector built with tag()/attr() and returns the matched content.
 *
 * Open elements are tracked on a stack so that a selector level stops being
 * matched exactly when the element that matched it is closed. An end tag
 * closes every element opened after its own start tag, which copes with
 * elements that are never explicitly closed. The selector is cleared
 * afterwards, so the object can be reused for another query.
 *
 * @param {boolean} [inner=true] - true: return only what is inside each
 *     matched element; false: also include its start and end tags.
 * @returns {string[]} One string per matched element, in document order.
 */
html_extractor.prototype.match = function(inner)
{
    var tags = this.tags;
    var conditions = this.attrs;
    var lastLevel = tags.length - 1;
    var innerOnly = (inner == null) ? true : inner;

    // Elements that never have an end tag; they must not stay on the stack.
    var voidElements = {
        area: true, base: true, br: true, col: true, embed: true, hr: true,
        img: true, input: true, link: true, meta: true, param: true,
        source: true, track: true, wbr: true
    };

    var results = [];
    var openElements = [];  // frames {tag, level}; level is -1 when the element matched nothing
    var matchedLevels = 0;  // selector levels 0..matchedLevels-1 have an open matching element
    var capturing = false;  // true while inside an element matching the last level
    var buffer = "";

    // Does an element with this name and attributes satisfy selector `level`?
    function satisfies(level, tag, attrs)
    {
        if (tags[level] != tag) return false;
        var wanted = conditions[level];
        if (wanted == undefined) return true;
        for (var n = 0; n < wanted.length; n++)
        {
            // Loose comparison on purpose: attr("width", 100) matches width="100".
            if (attrs[wanted[n].name] != wanted[n].value) return false;
        }
        return true;
    }

    // Stores the captured text as one result and leaves capture mode.
    function finishCapture()
    {
        results.push(buffer);
        buffer = "";
        capturing = false;
    }

    var handler = {
        result: results,

        startElement: function (tag, attrs)
        {
            tag = tag.toLowerCase();
            var isVoid = Object.prototype.hasOwnProperty.call(voidElements, tag);

            if (capturing)
            {
                // Everything inside a matched element is copied verbatim.
                buffer += get_start_tag(tag, attrs);
                if (!isVoid) openElements.push({tag: tag, level: -1});
                return;
            }

            // Only the first not-yet-matched selector level is a candidate.
            var level = -1;
            if (matchedLevels <= lastLevel && satisfies(matchedLevels, tag, attrs))
            {
                level = matchedLevels;
            }

            if (isVoid)
            {
                // A void element has no content and no descendants: it can
                // only be a complete match on its own.
                if (level != -1 && level == lastLevel)
                {
                    results.push(innerOnly ? "" : get_start_tag(tag, attrs));
                }
                return;
            }

            openElements.push({tag: tag, level: level});
            if (level == -1) return;

            matchedLevels = level + 1;
            if (level == lastLevel)
            {
                capturing = true;
                if (!innerOnly) buffer += get_start_tag(tag, attrs);
            }
        },

        endElement: function (tag)
        {
            tag = tag.toLowerCase();

            // Find the innermost open element this end tag belongs to.
            var target = openElements.length - 1;
            while (target >= 0 && openElements[target].tag != tag) target--;

            if (target < 0)
            {
                // Stray end tag: keep it in captured text, otherwise ignore it.
                if (capturing) buffer += "</" + tag + ">";
                return;
            }

            // Close the target and every element left open above it.
            for (var j = openElements.length - 1; j >= target; j--)
            {
                var frame = openElements[j];
                if (frame.level == -1)
                {
                    // Only the explicitly closed element has an end tag in the source.
                    if (capturing && j == target) buffer += "</" + tag + ">";
                    continue;
                }
                if (capturing)
                {
                    // While capturing, the first matched frame met from the
                    // top is the element that started the capture.
                    if (!innerOnly) buffer += "</" + frame.tag + ">";
                    finishCapture();
                }
                matchedLevels = frame.level;
            }
            openElements.length = target;
        },

        characters: function (s) { if (capturing) buffer += s; },

        comment: function (s) {}
    };

    this.parser.contentHandler = handler;
    this.parser.parse(this.html);

    if (capturing)
    {
        // Input ended inside a matched element: treat end of input as its end tag
        // instead of silently dropping the captured text.
        if (!innerOnly) buffer += "</" + tags[lastLevel] + ">";
        finishCapture();
    }

    //reset
    this.tags = [];
    this.attrs = [];
    return results;
};
/**
 * Serializes an element's start tag from its name and attribute map.
 *
 * @param {string} tag - Element name, written out as given.
 * @param {Object} attrs - Map of attribute name to value. A null or undefined
 *     value denotes an attribute without a value and is written as the bare
 *     name (it used to be written as name="null").
 * @returns {string} The start tag, e.g. <a href="x" disabled>.
 */
function get_start_tag(tag, attrs)
{
    var ret = "<" + tag;
    for (var key in attrs)
    {
        if (!Object.prototype.hasOwnProperty.call(attrs, key)) continue;

        // Declared locally; the original assigned to an implicit global.
        var value = attrs[key];
        if (value === null || value === undefined)
        {
            ret += " " + key;
        }
        else
        {
            // The value is wrapped in double quotes, so a literal double quote
            // inside it (possible for single-quoted source attributes) must be escaped.
            ret += " " + key + "=\"" + String(value).replace(/"/g, "&quot;") + "\"";
        }
    }
    ret += ">";
    return ret;
}
-
/** SimpleHtmlParser
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik./simplehtmlparser/simplehtmlparser.js
 *
 * A small SAX-style HTML tokenizer: parse() walks the input from left to
 * right and reports start tags, end tags, text and comments to a content
 * handler. It does not build a tree and does not check nesting.
 */

/*
Shape of the content handler that parse() reports to:

var handler ={
    startElement: function (sTagName, oAttrs) {},
    endElement: function (sTagName) {},
    characters: function (s) {},
    comment: function (s) {}
};
*/

function SimpleHtmlParser()
{
}

SimpleHtmlParser.prototype = {

    handler: null,

    // The handler that receives the parse events; set directly or via parse().
    contentHandler: null,

    // regexps
    // The tag patterns are anchored to the start of the remaining input. They
    // deliberately carry no "m" flag: with it, "^" also matched at later line
    // starts and the text before such a match was silently discarded.

    startTagRe: /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/,
    endTagRe:   /^<\/([^>\s]+)[^>]*>/,
    attrRe:     /([^=\s]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/g,

    /**
     * Tokenizes `s` and reports each token to the content handler.
     *
     * @param {string} s - HTML text to parse.
     * @param {Object} [oHandler] - Content handler; when given it replaces
     *     this.contentHandler.
     */
    parse: function (s, oHandler)
    {
        if (oHandler)
            this.contentHandler = oHandler;

        var match, index, consumed;
        while (s.length > 0)
        {
            consumed = false;

            // Comment
            if (s.substring(0, 4) == "<!--")
            {
                // Searching from 2 lets "<!-->" and "<!--->" count as empty comments.
                index = s.indexOf("-->", 2);
                if (index != -1)
                {
                    this.contentHandler.comment(index < 4 ? "" : s.substring(4, index));
                    s = s.substring(index + 3);
                    consumed = true;
                }
            }

            // end tag
            else if (s.substring(0, 2) == "</")
            {
                match = this.endTagRe.exec(s);
                if (match && match.index == 0)
                {
                    this.parseEndTag(match[0], match[1]);
                    s = s.substring(match[0].length);
                    consumed = true;
                }
            }

            // start tag
            else if (s.charAt(0) == "<")
            {
                match = this.startTagRe.exec(s);
                if (match && match.index == 0)
                {
                    this.parseStartTag(match[0], match[1], match[2]);
                    s = s.substring(match[0].length);
                    consumed = true;
                }
            }

            if (!consumed)
            {
                // Text runs up to the next "<". When the input itself starts
                // with a "<" that is not valid markup, that character is
                // literal text, so the search starts after it; searching from
                // 0 would find it again and never make progress.
                index = s.indexOf("<", s.charAt(0) == "<" ? 1 : 0);
                if (index == -1)
                {
                    this.contentHandler.characters(s);
                    s = "";
                }
                else
                {
                    this.contentHandler.characters(s.substring(0, index));
                    s = s.substring(index);
                }
            }
        }
    },

    /**
     * Reports a start tag to the content handler.
     *
     * @param {string} sTag - The complete tag text.
     * @param {string} sTagName - The element name.
     * @param {string} sRest - The attribute part of the tag.
     */
    parseStartTag: function (sTag, sTagName, sRest)
    {
        var attrs = this.parseAttributes(sTagName, sRest);
        this.contentHandler.startElement(sTagName, attrs);
    },

    /**
     * Reports an end tag to the content handler.
     *
     * @param {string} sTag - The complete tag text.
     * @param {string} sTagName - The element name.
     */
    parseEndTag: function (sTag, sTagName)
    {
        this.contentHandler.endElement(sTagName);
    },

    /**
     * Parses the attribute part of a start tag.
     *
     * @param {string} sTagName - The element name.
     * @param {string} s - The attribute text of the tag.
     * @returns {Object} Map of lower-cased attribute name to value; the value
     *     is null for an attribute written without "=".
     */
    parseAttributes: function (sTagName, s)
    {
        var oThis = this;
        var attrs = {};
        // replace() is used only to visit every match of the global regexp.
        // a7 (the inside of a single-quoted value) must be forwarded too;
        // without it single-quoted values came out undefined.
        s.replace(this.attrRe, function (a0, a1, a2, a3, a4, a5, a6, a7)
        {
            var attr = oThis.parseAttribute(sTagName, a0, a1, a2, a3, a4, a5, a6, a7);
            attrs[attr.name] = attr.value;
            return a0;
        });
        return attrs;
    },

    /**
     * Builds one {name, value} pair from the capture groups of attrRe.
     *
     * Beyond the named parameters the groups arrive positionally:
     * arguments[3] is the "=value" part, arguments[4] the raw value,
     * arguments[5]/[6] the double-quoted value with/without quotes and
     * arguments[7]/[8] the single-quoted value with/without quotes.
     *
     * @param {string} sTagName - The element name.
     * @param {string} sAttribute - The complete attribute text.
     * @param {string} sName - The attribute name.
     * @returns {{name: string, value: ?string}} Lower-cased name and value;
     *     value is null when the attribute has no "=" part.
     */
    parseAttribute: function (sTagName, sAttribute, sName)
    {
        var value = "";
        if (arguments[7])
            // Callers that do not pass the ninth argument still get the value,
            // taken from the quoted form with its quotes stripped.
            value = arguments[8] !== undefined ? arguments[8] : arguments[7].slice(1, -1);
        else if (arguments[5])
            value = arguments[6];
        else if (arguments[3])
            value = arguments[4];

        var empty = !value && !arguments[3];
        return {name: sName.toLowerCase(), value: empty ? null : value};
    }
};
|