用Javascript解析html

逗咳嗽 2015-12-13

展開全文

該文章有更新，請移步 http://pickerel./admin/blogs/267912

說到用Javascript解析html，大家肯定會想到dom或者正則表達式，但這兩個都不是我今天要說的。dom很不錯，不過效率不高，而且必須將要解析的html插入到當前頁面或者建立一個iframe才能進行，而用正則表達式，又有太過繁瑣和難以維護的問題。

有人要說了，ruby、php、python有了那麼多開源的優秀的html解析的類庫，什麼Beautiful Soup，什麼Mechanize，什麼Hpricot，什麼ScRUBYt，你為什麼非要自討苦吃用javascript來幹這活呢？

答案是：如果只允許你用javascript和html開發程序呢，比如開發adobe air的程序，比如下一步我要做的基於內嵌webkit組件的Android應用快速開發框架，有時候，輪子還是得自己造的。

我的這個解析實現只是雛形，它以Erik Arvidsson開發的SimpleHtmlParser作為html的分析器。SimpleHtmlParser是一個基於Sax模型實現的html分析器，它能分析非標準xml格式的html，並把它當作標準的xml來處理。有了這個解析器做基礎，我寫了個簡單的html_extractor，用來分析html並獲取指定標記內的內容。

html_extractor的使用

new html_extractor(html): 指定html字符串創建一個html_extractor對象

方法：

tag(tagName): 設定一個待匹配的元素名，返回結果為當前的html_extractor對象

attr(attrName, attrValue): 設定匹配的屬性條件，attr必須在tag之後調用，返回結果為當前的html_extractor對象

match(innerOrNot): 執行匹配，返回結果為符合條件的字符串數組。innerOrNot默認為true，只返回匹配元素內部的內容；傳入false時，結果還包含匹配元素自身的起止標記。

示例：

Js代碼

// Usage examples for html_extractor. Variables are declared explicitly so the
// snippet also runs in strict mode / ES modules.

// Extract the content inside the div element; ret is expected to be ["div1"].
var html = "<div>div1</div>";
var ret = new html_extractor(html).tag("div").match();
// Extract the content of every li below the div whose id attribute is "content";
// ret is expected to be ["item1", "item2"].
html = "<div id=\"head\">head</div><div id=\"content\"><p><ul><li>item1</li><li>item2</li></ul></div>";
ret = new html_extractor(html).tag("div").attr("id", "content").tag("li").match();
// Extract Baidu search results.
// NOTE(review): presumably html should hold the fetched results page here — confirm.
ret = new html_extractor(html).tag("td").attr("class", "f").match();
// Extract Google search results (same assumption about html as above).
ret = new html_extractor(html).tag("li").attr("class", "g").match();

源代碼（當前代碼還非常原始，僅供參考，請慎重使用）

Js代碼

/**
 * Creates an extractor bound to a single HTML string.
 *
 * The instance starts with an empty selector chain; tag() and attr() fill
 * `tags` / `attrs`, and match() feeds `html` through `parser`.
 *
 * @param {string} html - markup to search.
 */
var html_extractor = function (html) {
    // SAX-style tokenizer that match() drives.
    this.parser = new SimpleHtmlParser();
    // Markup handed to the parser on every match() call.
    this.html = html;
    // Selector chain: tags[i] is the i-th element name, attrs[i] its
    // optional list of {name, value} conditions.
    this.tags = [];
    this.attrs = [];
};
/**
 * Appends an element name to the selector chain.
 *
 * @param {string} tag - element name; stored lower-cased.
 * @returns {html_extractor} this instance, for chaining.
 */
html_extractor.prototype.tag = function (tag) {
    var lowered = tag.toLowerCase();
    this.tags.push(lowered);
    return this;
};
/**
 * Adds an attribute condition to the most recently added tag.
 *
 * The condition is stored under the index of the last entry in `tags`, so
 * this must be called after tag().
 *
 * @param {string} name - attribute name; stored lower-cased.
 * @param {*} value - value the attribute is compared against in match().
 * @returns {html_extractor} this instance, for chaining.
 */
html_extractor.prototype.attr = function (name, value) {
    var slot = this.tags.length - 1;
    var conditions = this.attrs[slot];
    if (conditions == undefined) {
        // First condition for this tag: create its list lazily.
        conditions = [];
        this.attrs[slot] = conditions;
    }
    conditions.push({name: name.toLowerCase(), value: value});
    return this;
};
/**
 * Runs the selector chain built with tag()/attr() against the stored HTML.
 *
 * Step i of the chain only matches inside an element that matched step i-1.
 * Once the last step matches, everything up to that element's end is
 * captured as one result string. The chain is cleared afterwards, so the
 * extractor can be reused for another query.
 *
 * Open elements are tracked on a stack instead of a depth counter. An end
 * tag closes the nearest open element with the same name and implicitly
 * closes anything opened above it, so void or unclosed elements such as
 * <br> or <p> do not desynchronise the matcher.
 *
 * @param {boolean} [inner] - when omitted, null or truthy, only the content
 *     inside the matched element is returned; when falsy the element's own
 *     start and end tags are included.
 * @returns {string[]} one captured string per matched element, in document order.
 */
html_extractor.prototype.match = function(inner)
{
    // Snapshot the chain: the instance fields are reset before returning.
    var steps = this.tags;
    var conditions = this.attrs;
    var lastStep = steps.length - 1;

    var handler = function () {
        // Stack of open elements: {tag, step}, where step is the chain index
        // this element satisfied, or -1 if it satisfied none.
        this._open = [];
        // _matched[i] is true while an element satisfying step i is open.
        this._matched = [];
        for (var i = 0; i < steps.length; i++) this._matched[i] = false;
        // True while inside an element that satisfied the last step.
        this._all_matched = false;
        this._result = "";
        this.result = [];
        this.inner = true;
        if (inner != undefined && inner != null)
        {
            this.inner = inner;
        }
    };
    handler.prototype = {
        startElement: function (tag, attrs) {
            tag = tag.toLowerCase();
            var entry = {tag: tag, step: -1};
            this._open.push(entry);
            if (this._all_matched)
            {
                // Inside a capture: nested markup is copied verbatim.
                this._result += get_start_tag(tag, attrs);
                return;
            }
            // Only the first step that is not yet satisfied can match.
            var step = 0;
            while (step < steps.length && this._matched[step]) step++;
            if (step === steps.length || steps[step] !== tag) return;
            var wanted = conditions[step];
            if (wanted != undefined)
            {
                for (var n = 0; n < wanted.length; n++)
                {
                    var cond = wanted[n];
                    // Loose comparison on purpose: a valueless attribute is
                    // parsed as null and must equal an omitted (undefined) value.
                    if (cond != undefined && attrs[cond.name] != cond.value) return;
                }
            }
            this._matched[step] = true;
            entry.step = step;
            if (step === lastStep)
            {
                this._all_matched = true;
                if (!this.inner) this._result += get_start_tag(tag, attrs);
            }
        },
        endElement: function (tag) {
            tag = tag.toLowerCase();
            // Find the nearest open element this end tag can close.
            var pos = this._open.length - 1;
            while (pos >= 0 && this._open[pos].tag !== tag) pos--;
            if (pos < 0)
            {
                // Stray end tag: nothing to close, but keep it in captured text.
                if (this._all_matched) this._result += "</" + tag + ">";
                return;
            }
            while (this._open.length > pos)
            {
                var entry = this._open.pop();
                if (entry.step === -1)
                {
                    // Emit an end tag only for the element that was explicitly
                    // closed; implicitly closed ones (e.g. <br>) had none.
                    if (this._all_matched && this._open.length === pos)
                    {
                        this._result += "</" + tag + ">";
                    }
                    continue;
                }
                this._matched[entry.step] = false;
                if (this._all_matched && entry.step === lastStep)
                {
                    if (!this.inner) this._result += "</" + entry.tag + ">";
                    this.result.push(this._result);
                    this._result = "";
                    this._all_matched = false;
                }
            }
        },
        characters: function (s) { if (this._all_matched) this._result += s; },
        comment: function (s) {}
    };
    var contentHandler = new handler();
    this.parser.contentHandler = contentHandler;
    try
    {
        this.parser.parse(this.html);
    }
    finally
    {
        // Reset the chain even if parsing throws, so the instance stays reusable.
        this.tags = [];
        this.attrs = [];
    }
    return contentHandler.result;
};
/**
 * Serializes an element name and its attribute map back into a start tag.
 *
 * @param {string} tag - element name.
 * @param {Object} attrs - attribute name -> value map; a null or undefined
 *     value stands for an attribute written without a value.
 * @returns {string} the start tag, e.g. <a href="x">.
 */
function get_start_tag(tag, attrs)
{
    var ret = "<" + tag;
    for (var key in attrs)
    {
        // Skip anything inherited through the prototype chain.
        if (!Object.prototype.hasOwnProperty.call(attrs, key)) continue;
        // Declared locally; the bare assignment used before created a global
        // and throws in strict mode.
        var value = attrs[key];
        if (value == null)
        {
            // Valueless attribute (e.g. "disabled"): emit the bare name
            // instead of ="null".
            ret += " " + key;
            continue;
        }
        // Escape double quotes so the value cannot end the attribute early.
        ret += " " + key + "=\"" + String(value).replace(/"/g, "&quot;") + "\"";
    }
    ret += ">";
    return ret;
}
/** SimpleHtmlParser
* Original code by Erik Arvidsson, Mozilla Public License
* http://erik./simplehtmlparser/simplehtmlparser.js
*/
/*
var handler ={
startElement: function (sTagName, oAttrs) {},
endElement: function (sTagName) {},
characters: function (s) {},
comment: function (s) {}
};
*/
/**
 * Minimal SAX-style HTML tokenizer.
 *
 * parse() walks the input from left to right and reports start tags, end
 * tags, comments and character data to `contentHandler`, an object with
 * startElement(name, attrs), endElement(name), characters(text) and
 * comment(text) methods.
 */
function SimpleHtmlParser()
{
}
SimpleHtmlParser.prototype = {
    // Kept from the original prototype; the methods below only use contentHandler.
    handler: null,
    // regexps, applied to the start of the remaining input. They carry no "m"
    // flag: with it, "^" also matched after a newline, so a tag on a later
    // line could match and the text in front of it was silently dropped.
    startTagRe: /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/,
    endTagRe: /^<\/([^>\s]+)[^>]*>/,
    attrRe: /([^=\s]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/gm,
    /**
     * Tokenizes `s` and reports every token to the content handler.
     *
     * @param {string} s - markup to tokenize.
     * @param {Object} [oHandler] - handler to install as this.contentHandler;
     *     when omitted the previously assigned contentHandler is used.
     */
    parse: function (s, oHandler)
    {
        if (oHandler)
            this.contentHandler = oHandler;
        var match, index;
        while (s.length > 0)
        {
            if (s.charAt(0) == "<")
            {
                if (s.substring(0, 4) == "<!--")
                {
                    // Comment. Searching from 2 also accepts the degenerate
                    // forms "<!-->" and "<!--->" as empty comments.
                    index = s.indexOf("-->", 2);
                    if (index != -1)
                    {
                        this.contentHandler.comment(index >= 4 ? s.substring(4, index) : "");
                        s = s.substring(index + 3);
                        continue;
                    }
                }
                else if (s.charAt(1) == "/")
                {
                    // end tag
                    match = this.endTagRe.exec(s);
                    if (match)
                    {
                        this.parseEndTag(match[0], match[1]);
                        s = s.substring(match[0].length);
                        continue;
                    }
                }
                else
                {
                    // start tag
                    match = this.startTagRe.exec(s);
                    if (match)
                    {
                        this.parseStartTag(match[0], match[1], match[2]);
                        s = s.substring(match[0].length);
                        continue;
                    }
                }
            }
            // Character data up to the next "<". The search starts at 1 so a
            // leading "<" that did not parse as markup is consumed as text;
            // searching from 0 found that same "<" again and never advanced.
            index = s.indexOf("<", 1);
            if (index == -1)
            {
                this.contentHandler.characters(s);
                s = "";
            }
            else
            {
                this.contentHandler.characters(s.substring(0, index));
                s = s.substring(index);
            }
        }
    },
    /**
     * Reports a start tag.
     *
     * @param {string} sTag - the full tag text (unused).
     * @param {string} sTagName - element name as written in the markup.
     * @param {string} sRest - raw attribute text following the name.
     */
    parseStartTag: function (sTag, sTagName, sRest)
    {
        var attrs = this.parseAttributes(sTagName, sRest);
        this.contentHandler.startElement(sTagName, attrs);
    },
    /**
     * Reports an end tag.
     *
     * @param {string} sTag - the full tag text (unused).
     * @param {string} sTagName - element name as written in the markup.
     */
    parseEndTag: function (sTag, sTagName)
    {
        this.contentHandler.endElement(sTagName);
    },
    /**
     * Parses the attribute part of a start tag.
     *
     * @param {string} sTagName - name of the element the attributes belong to.
     * @param {string} s - raw attribute text.
     * @returns {Object} map of lower-cased attribute name to value; valueless
     *     attributes map to null. A repeated name keeps its last value.
     */
    parseAttributes: function (sTagName, s)
    {
        var oThis = this;
        var attrs = {};
        // replace() is used only to visit every match of attrRe. All seven
        // capture groups are forwarded; the seventh holds the text of a
        // single-quoted value.
        s.replace(this.attrRe, function (a0, a1, a2, a3, a4, a5, a6, a7)
        {
            var attr = oThis.parseAttribute(sTagName, a0, a1, a2, a3, a4, a5, a6, a7);
            attrs[attr.name] = attr.value;
        });
        return attrs;
    },
    /**
     * Builds a {name, value} pair from one attrRe match.
     *
     * Beyond the named parameters, the capture groups of attrRe are read
     * positionally: arguments[3] is the "=value" part, arguments[4] the raw
     * value, arguments[5]/[6] the double-quoted value with/without quotes and
     * arguments[7]/[8] the single-quoted value with/without quotes.
     *
     * @param {string} sTagName - element name (unused).
     * @param {string} sAttribute - full attribute text (unused).
     * @param {string} sName - attribute name.
     * @returns {{name: string, value: ?string}} value is null when the
     *     attribute was written without "=value".
     */
    parseAttribute: function (sTagName, sAttribute, sName)
    {
        var value = "";
        if (arguments[7])
            // Older callers pass only the quoted form; strip its quotes then.
            value = arguments[8] !== undefined ? arguments[8] : arguments[7].slice(1, -1);
        else if (arguments[5])
            value = arguments[6];
        else if (arguments[3])
            value = arguments[4];
        var empty = !value && !arguments[3];
        return {name: sName.toLowerCase(), value: empty ? null : value};
    }
};

本站是提供個人知識管理的網絡存儲空間，所有內容均由用戶發布，不代表本站觀點。請注意甄別內容中的聯繫方式、誘導購買等信息，謹防詐騙。如發現有害或侵權內容，請點擊一鍵舉報。

轉藏 分享

QQ空間 QQ好友新浪微博微信

獻花（0） +1

來自：逗咳嗽 > 《編程》

舉報/認領