// Parser.js
- const Tokenizer = require("./Tokenizer.js");
- const DomHandler = require("./DomHandler.js");
/**
 * Whitelist of attribute names that are kept on parsed elements.
 *
 * `Parser.prototype.onattribend` copies an attribute onto the element only
 * when its name is a key of this table; every other attribute is dropped.
 *
 * The table is created without a prototype so that names inherited from
 * `Object.prototype` (`constructor`, `toString`, ...) are not mistaken for
 * trusted attributes by a plain `trustAttrs[name]` lookup.
 *
 * NOTE(review): the `// #ifdef` / `// #endif` comments look like build-time
 * conditional-compilation markers (uni-app style); keep each on its own line.
 */
const trustAttrs = Object.assign(Object.create(null), {
  align: true,
  alt: true,
  // #ifdef MP-BAIDU
  appid: true,
  apid: true,
  // #endif
  author: true,
  autoplay: true,
  border: true,
  cellpadding: true,
  cellspacing: true,
  class: true,
  color: true,
  colspan: true,
  controls: true,
  "data-src": true,
  dir: true,
  face: true,
  height: true,
  href: true,
  id: true,
  ignore: true,
  loop: true,
  muted: true,
  name: true,
  poster: true,
  rowspan: true,
  size: true,
  span: true,
  src: true,
  start: true,
  style: true,
  type: true,
  // #ifdef MP-WEIXIN || MP-QQ
  "unit-id": true,
  // #endif
  width: true,
});
- // #ifdef MP-BAIDU || MP-TOUTIAO || H5
/**
 * Names of inline text-level tags (`b`, `em`, `span`, ...).
 *
 * `_traverse` looks element names up in this table: in the H5 branch such
 * elements are flagged with `continue` and only their children are visited;
 * in the MP-BAIDU / MP-TOUTIAO branch they default to `display:inline`.
 *
 * The table has no prototype, so a tag literally named `constructor` or
 * `toString` is not misclassified through `Object.prototype`.
 */
const textTag = Object.assign(Object.create(null), {
  abbr: true,
  b: true,
  big: true,
  code: true,
  del: true,
  em: true,
  font: true,
  i: true,
  ins: true,
  label: true,
  mark: true,
  q: true,
  s: true,
  small: true,
  span: true,
  strong: true,
  sub: true,
  sup: true,
  u: true,
});
/**
 * Walks a list of parsed nodes and, for every element that is not flagged
 * with `continue`, splits its inline style in two:
 *
 *  - `attrs.containStyle` receives the layout declarations found in the
 *    style (float, every `margin*`, display, flex, every `*width*`), with a
 *    default `display` when none (or `display:flex`) was given;
 *  - `attrs.style` keeps the rest: all `margin*` declarations are removed and
 *    replaced by a zero margin of the first matching kind, and `;width:100%`
 *    is appended when the style matched the width test below.
 *
 * Text nodes are skipped. Elements flagged with `continue` are not rewritten;
 * only their children are visited. Nodes are mutated in place.
 *
 * A missing `attrs.style` is treated as an empty style, and a missing
 * `children` list is treated as empty, instead of throwing.
 *
 * @param {Array<Object>} nodes - Nodes with `type`, `name`, `attrs`,
 *   `children` and an optional `continue` flag.
 */
const _traverse = function(nodes) {
  // A flagged element may come without a children list; nothing to walk then.
  if (!nodes) return;
  for (const element of nodes) {
    if (element.type == "text")
      continue;
    if (element.continue) {
      _traverse(element.children);
      continue;
    }
    // #ifdef H5
    if (textTag[element.name]) {
      element.continue = true;
      _traverse(element.children);
      continue;
    }
    // #endif
    const attrs = element.attrs;
    // Normalise once so the regexes never see the text "undefined".
    const style = attrs.style || "";
    let containStyle = "";

    const floatDecl = /float\s*:\s*[^;]*/i.exec(style);
    if (floatDecl) containStyle += floatDecl[0];

    // With the g flag, match() yields every full match, in order.
    const marginDecls = style.match(/margin[^;]*/gi) || [];
    for (const decl of marginDecls) containStyle += ";" + decl;

    // An explicit display other than "flex" is kept; otherwise a default is used.
    const displayDecl = /display\s*:\s*([^;]*)/i.exec(style);
    if (displayDecl && displayDecl[1] != "flex") containStyle += ";" + displayDecl[0];
    // #ifdef MP-BAIDU || MP-TOUTIAO
    else if (textTag[element.name]) containStyle += ";display:inline";
    // #endif
    else containStyle += ";display:" + (element.name == "img" ? "inline-block" : "block");

    const flexDecl = /flex\s*:[^;]*/i.exec(style);
    if (flexDecl) containStyle += ";" + flexDecl[0];

    // Picks up width, min-width, max-width, border-width, ...
    const widthDecls = style.match(/[^;\s]*width[^;]*/gi) || [];
    for (const decl of widthDecls) containStyle += ";" + decl;

    attrs.containStyle = containStyle;

    let ownStyle = style;
    // A "width" not preceded by "-" (so not min-/max-/border-width): the
    // declared width went to containStyle, so the element gets width:100%.
    if (/[^-]width[^pev;]+/.test(";" + style)) ownStyle += ";width:100%";

    // Margins went to containStyle; zero the first matching kind here.
    let addMargin = "";
    if (/margin\s*:/.test(style)) addMargin = ";margin:0";
    else if (/margin-top/.test(style)) addMargin = ";margin-top:0";
    else if (/margin-bottom/.test(style)) addMargin = ";margin-bottom:0";

    attrs.style = ownStyle.replace(/margin[^;]*/gi, "") + addMargin;
  }
};
- // #endif
/**
 * Tag names the parser treats as void (self-closing): HTML void elements plus
 * a few childless SVG shapes.
 *
 * `Parser` does not push these on its open-tag stack and reports their close
 * immediately after their open (see `onopentagname` / `onopentagend`).
 *
 * The table has no prototype, so a tag literally named `constructor` or
 * `toString` is not treated as void through `Object.prototype`.
 */
const voidTag = Object.assign(Object.create(null), {
  area: true,
  base: true,
  basefont: true,
  br: true,
  col: true,
  circle: true,
  command: true,
  ellipse: true,
  embed: true,
  frame: true,
  hr: true,
  img: true,
  input: true,
  isindex: true,
  keygen: true,
  line: true,
  link: true,
  meta: true,
  param: true,
  path: true,
  polygon: true,
  polyline: true,
  rect: true,
  source: true,
  stop: true,
  track: true,
  use: true,
  wbr: true,
});
/**
 * Event-driven parser front end.
 *
 * Creates a Tokenizer bound to this instance and turns tag / attribute / text
 * events into `ontext`, `onopentag` and `onclosetag` calls on `cbs`. A stack
 * of open non-void tag names is kept so that unclosed tags can be closed
 * implicitly.
 *
 * NOTE(review): the `on*` methods are presumably invoked by the Tokenizer;
 * confirm against Tokenizer.js.
 *
 * @param {Object} cbs - Handler exposing `ontext`, `onopentag`, `onclosetag`
 *   and the result fields `nodes`, `title`, `imgList`.
 * @param {Function} callback - Called from `onend` with
 *   `{ nodes, title, imgList }` read from `cbs`.
 */
function Parser(cbs, callback) {
  this._cbs = cbs;
  this._callback = callback;
  this._tagname = "";
  // NOTE(review): this file only ever resets these two fields; presumably the
  // Tokenizer fills them in before calling onattribend — confirm.
  this._attribname = "";
  this._attribvalue = "";
  this._attribs = null;
  this._stack = [];
  this._tokenizer = new Tokenizer(this);
}

/**
 * Forwards a text run to the handler unchanged.
 * @param {string} data - Text content.
 */
Parser.prototype.ontext = function(data) {
  this._cbs.ontext(data);
};

/**
 * Starts a new open tag: records its lower-cased name, resets the attribute
 * map (every element starts with an empty `style`) and pushes non-void tags
 * on the open-tag stack.
 * @param {string} name - Tag name as written in the markup.
 */
Parser.prototype.onopentagname = function(name) {
  const tag = name.toLowerCase();
  this._tagname = tag;
  this._attribs = {
    style: "",
  };
  if (!voidTag[tag]) this._stack.push(tag);
};

/**
 * Finishes the current open tag: reports it to the handler with the collected
 * attributes and, for void tags, reports the matching close right away.
 */
Parser.prototype.onopentagend = function() {
  if (this._attribs) {
    this._cbs.onopentag(this._tagname, this._attribs);
    this._attribs = null;
  }
  if (voidTag[this._tagname]) this._cbs.onclosetag(this._tagname);
  this._tagname = "";
};

/**
 * Handles a close tag.
 *
 * If the tag is open, it and everything opened after it are closed. A stray
 * `</p>` is emitted as an empty `<p></p>`; a stray `</br>` or `</hr>` (or any
 * of the three when nothing is open) is emitted as the corresponding element.
 * Other unmatched close tags are ignored.
 * @param {string} name - Tag name as written in the markup.
 */
Parser.prototype.onclosetag = function(name) {
  const tag = name.toLowerCase();
  if (this._stack.length && !voidTag[tag]) {
    const pos = this._stack.lastIndexOf(tag);
    if (pos !== -1) {
      // Close the tag itself plus every tag opened after it.
      let remaining = this._stack.length - pos;
      while (remaining--) this._cbs.onclosetag(this._stack.pop());
    } else if (tag === "p") {
      this.onopentagname(tag);
      this._closeCurrentTag();
    }
  } else if (tag === "br" || tag === "hr" || tag === "p") {
    this.onopentagname(tag);
    this._closeCurrentTag();
  }
};

/**
 * Completes the tag started by `onopentagname` and closes it immediately if
 * it is the one on top of the stack (void tags were already closed by
 * `onopentagend` and were never pushed).
 */
Parser.prototype._closeCurrentTag = function() {
  const name = this._tagname;
  this.onopentagend();
  if (this._stack[this._stack.length - 1] === name) {
    this._cbs.onclosetag(name);
    this._stack.pop();
  }
};

/**
 * Commits the attribute currently held in `_attribname` / `_attribvalue` to
 * the open tag when its name is whitelisted in `trustAttrs`, then resets both
 * fields.
 *
 * `&quot;` in the value is decoded to a double quote. The whitelist is checked
 * with an own-property test so that names inherited from `Object.prototype`
 * (e.g. `constructor`) are never accepted.
 */
Parser.prototype.onattribend = function() {
  const name = this._attribname;
  const value = this._attribvalue.replace(/&quot;/g, '"');
  const trusted =
    Object.prototype.hasOwnProperty.call(trustAttrs, name) && trustAttrs[name];
  if (this._attribs && trusted) {
    this._attribs[name] = value;
  }
  this._attribname = "";
  this._attribvalue = "";
};

/**
 * Ends parsing: reports a close for every tag still open, innermost first
 * (the stack itself is left untouched), then hands the handler's results to
 * the callback.
 */
Parser.prototype.onend = function() {
  for (let i = this._stack.length - 1; i >= 0; i--) {
    this._cbs.onclosetag(this._stack[i]);
  }
  this._callback({
    nodes: this._cbs.nodes,
    title: this._cbs.title,
    imgList: this._cbs.imgList,
  });
};

/**
 * Feeds markup to the tokenizer.
 * @param {string} chunk - Markup to parse.
 */
Parser.prototype.write = function(chunk) {
  this._tokenizer.parse(chunk);
};
/**
 * Parses an HTML string into a node tree.
 *
 * All `<style>...</style>` blocks are first cut out of the markup and their
 * contents concatenated; that CSS text and `options` are passed to a new
 * `DomHandler`, which a `Parser` then drives over the remaining markup.
 *
 * @param {string} data - HTML source.
 * @param {*} options - Passed through unchanged to `DomHandler`.
 * @returns {Promise<Object>} Resolves with the `{ nodes, title, imgList }`
 *   object produced by `Parser.prototype.onend`; rejects with any error thrown
 *   synchronously while preparing or parsing (e.g. `data` is not a string).
 */
function html2nodes(data, options) {
  return new Promise(function(resolve, reject) {
    try {
      let style = "";
      // `[^>]*` (unlike `.*?`) lets the opening tag span several lines, and
      // `\b` keeps tags that merely start with "style" from matching.
      const html = data.replace(
        /<style\b[^>]*>([\s\S]*?)<\/style\s*>/gi,
        function(match, css) {
          style += css;
          return "";
        }
      );
      const handler = new DomHandler(style, options);
      new Parser(handler, (res) => {
        // #ifdef MP-BAIDU || MP-TOUTIAO || H5
        _traverse(res.nodes);
        // #endif
        resolve(res);
      }).write(html);
    } catch (err) {
      reject(err);
    }
  });
}
- module.exports = html2nodes;
|