  // Parser.js (5.4 KB)
  2. const Tokenizer = require("./Tokenizer.js");
  3. const DomHandler = require("./DomHandler.js");
  /**
   * Whitelist of attribute names that `Parser.prototype.onattribend` is
   * allowed to copy onto a node's attribute map; anything else is dropped.
   *
   * The table is built on a null-prototype object so that names inherited
   * from `Object.prototype` (`constructor`, `toString`, `__proto__`, ...)
   * can never be mistaken for trusted attributes by a `trustAttrs[name]` lookup.
   *
   * NOTE(review): the `// #ifdef` / `// #endif` comment pairs look like
   * uni-app conditional-compilation markers; keep each one on its own line
   * around exactly the entries it guards.
   */
  const trustAttrs = Object.assign(Object.create(null), {
    align: true,
    alt: true,
    // #ifdef MP-BAIDU
    appid: true,
    apid: true,
    // #endif
    author: true,
    autoplay: true,
    border: true,
    cellpadding: true,
    cellspacing: true,
    class: true,
    color: true,
    colspan: true,
    controls: true,
    "data-src": true,
    dir: true,
    face: true,
    height: true,
    href: true,
    id: true,
    ignore: true,
    loop: true,
    muted: true,
    name: true,
    poster: true,
    rowspan: true,
    size: true,
    span: true,
    src: true,
    start: true,
    style: true,
    type: true,
    // #ifdef MP-WEIXIN || MP-QQ
    "unit-id": true,
    // #endif
    width: true,
  });
  // #ifdef MP-BAIDU || MP-TOUTIAO || H5
  /**
   * Tag names that `_traverse` treats as text-level markup.
   *
   * Stored on a null-prototype object so that a tag whose name collides
   * with an inherited `Object.prototype` key (for example `constructor`)
   * is not reported as a text tag by a `textTag[name]` lookup.
   */
  const textTag = Object.assign(Object.create(null), {
    abbr: true,
    b: true,
    big: true,
    code: true,
    del: true,
    em: true,
    font: true,
    i: true,
    ins: true,
    label: true,
    mark: true,
    q: true,
    s: true,
    small: true,
    span: true,
    strong: true,
    sub: true,
    sup: true,
    u: true
  });
  /**
   * Walks a node list and splits each element's inline style into a
   * container part and an inner part.
   *
   * For every non-text element that is not flagged `continue`:
   *  - `attrs.containStyle` receives the float, margin*, display, flex and
   *    *width* declarations found in `attrs.style` (a default `display`
   *    is added when none is declared or when it is `flex`);
   *  - `attrs.style` gets `;width:100%` appended when it declares a width,
   *    has every margin declaration removed, and gets a zero-margin reset
   *    appended for the first margin form that was present.
   * Elements flagged `continue` are not restyled; their children are
   * visited instead.
   *
   * @param {Array<Object>} nodes - nodes to process in place; a missing
   *   list (e.g. an element without `children`) is treated as empty.
   */
  const _traverse = function(nodes) {
    for (const element of nodes || []) {
      if (element.type === "text")
        continue;
      if (!element.continue) {
        // #ifdef H5
        if (textTag[element.name]) {
          element.continue = true;
          _traverse(element.children);
          continue;
        }
        // #endif
        // A missing style is handled like an empty one instead of throwing.
        const style = element.attrs.style || "";
        let res = "";
        const floatDecl = /float\s*:\s*[^;]*/i.exec(style);
        if (floatDecl) res += floatDecl[0];
        const marginReg = /margin[^;]*/gi;
        for (let m = marginReg.exec(style); m; m = marginReg.exec(style)) {
          res += (';' + m[0]);
        }
        const displayDecl = /display\s*:\s*([^;]*)/i.exec(style);
        if (displayDecl && displayDecl[1] !== "flex") res += (';' + displayDecl[0]);
        // #ifdef MP-BAIDU || MP-TOUTIAO
        else if (textTag[element.name]) res += ";display:inline";
        // #endif
        else res += (";display:" + (element.name === 'img' ? 'inline-block' : 'block'));
        const flexDecl = /flex\s*:[^;]*/i.exec(style);
        if (flexDecl) res += (';' + flexDecl[0]);
        const widthReg = /[^;\s]*width[^;]*/ig;
        for (let w = widthReg.exec(style); w; w = widthReg.exec(style)) {
          res += (';' + w[0]);
        }
        element.attrs.containStyle = res;
        let inner = style;
        // The leading ';' lets a `width` at the very start of the style match.
        if (/[^-]width[^pev;]+/.test(";" + style))
          inner += ";width:100%";
        // Detection is case-insensitive so it agrees with the case-insensitive
        // stripping below: any margin moved to the container is also reset here.
        let addMargin = "";
        if (/margin\s*:/i.test(style)) addMargin = ';margin:0';
        else if (/margin-top/i.test(style)) addMargin = ';margin-top:0';
        else if (/margin-bottom/i.test(style)) addMargin = ';margin-bottom:0';
        element.attrs.style = inner.replace(/margin[^;]*/gi, "") + addMargin;
      } else _traverse(element.children);
    }
  };
  // #endif
  /**
   * Tag names the parser treats as void: they are never pushed on the
   * open-tag stack and are closed as soon as their opening tag ends.
   *
   * Stored on a null-prototype object so that a tag named after an
   * inherited `Object.prototype` key (for example `<constructor>`) is not
   * mistaken for a void tag by a `voidTag[name]` lookup.
   */
  const voidTag = Object.assign(Object.create(null), {
    area: true,
    base: true,
    basefont: true,
    br: true,
    col: true,
    circle: true,
    command: true,
    ellipse: true,
    embed: true,
    frame: true,
    hr: true,
    img: true,
    input: true,
    isindex: true,
    keygen: true,
    line: true,
    link: true,
    meta: true,
    param: true,
    path: true,
    polygon: true,
    polyline: true,
    rect: true,
    source: true,
    stop: true,
    track: true,
    use: true,
    wbr: true
  });
  /**
   * Tag-level parser that sits between a Tokenizer and a handler object.
   *
   * @param {Object} cbs - handler receiving `ontext`, `onopentag` and
   *   `onclosetag` calls; its `nodes`, `title` and `imgList` properties
   *   are read when parsing ends.
   * @param {Function} callback - invoked once from `onend` with the result.
   */
  function Parser(cbs, callback) {
    Object.assign(this, {
      _cbs: cbs,
      _callback: callback,
      // State of the tag / attribute currently being read.
      _tagname: "",
      _attribname: "",
      _attribvalue: "",
      _attribs: null,
      // Names of the non-void tags that are currently open.
      _stack: [],
    });
    // Created after the fields above exist, since the tokenizer receives this parser.
    this._tokenizer = new Tokenizer(this);
  }
  /**
   * Passes a run of text straight through to the handler.
   *
   * @param {string} data - text content reported by the tokenizer.
   */
  Parser.prototype.ontext = function(data) {
    const handler = this._cbs;
    handler.ontext(data);
  };
  /**
   * Starts a new tag: records its lower-cased name, resets the attribute
   * map to one holding only an empty `style`, and pushes the name on the
   * open-tag stack unless the tag is void.
   *
   * @param {string} name - tag name as written in the markup.
   */
  Parser.prototype.onopentagname = function(name) {
    const tag = name.toLowerCase();
    this._tagname = tag;
    this._attribs = { style: '' };
    if (voidTag[tag]) {
      return;
    }
    this._stack.push(tag);
  };
  /**
   * Finishes the opening tag being read: reports it to the handler with
   * the collected attributes (once), immediately reports the close of a
   * void tag, and clears the current tag name.
   */
  Parser.prototype.onopentagend = function() {
    const attrs = this._attribs;
    if (attrs) {
      this._cbs.onopentag(this._tagname, attrs);
      this._attribs = null;
    }
    const isVoid = voidTag[this._tagname];
    if (isVoid) {
      this._cbs.onclosetag(this._tagname);
    }
    this._tagname = "";
  };
  /**
   * Handles a closing tag.
   *
   * With open tags on the stack and a non-void name: every tag from the
   * top of the stack down to the most recent matching one is closed; an
   * unmatched `</p>` is turned into an empty `p` element.
   * Otherwise (empty stack or void name): `</br>`, `</hr>` and `</p>` are
   * turned into an element of that name; anything else is ignored.
   *
   * @param {string} name - tag name as written in the markup.
   */
  Parser.prototype.onclosetag = function(name) {
    const tag = name.toLowerCase();
    const stack = this._stack;

    if (stack.length === 0 || voidTag[tag]) {
      const synthesize = tag === "br" || tag === "hr" || tag === "p";
      if (synthesize) {
        this.onopentagname(tag);
        this._closeCurrentTag();
      }
      return;
    }

    const index = stack.lastIndexOf(tag);
    if (index === -1) {
      if (tag === "p") {
        this.onopentagname(tag);
        this._closeCurrentTag();
      }
      return;
    }

    // Close the matching tag and everything opened after it.
    for (let remaining = stack.length - index; remaining > 0; remaining--) {
      this._cbs.onclosetag(stack.pop());
    }
  };
  /**
   * Ends the opening tag currently being read and, when that tag is the
   * one on top of the open-tag stack, reports its close and pops it.
   */
  Parser.prototype._closeCurrentTag = function() {
    // Captured first: onopentagend() resets this._tagname.
    const tag = this._tagname;
    this.onopentagend();
    const stack = this._stack;
    const top = stack[stack.length - 1];
    if (top !== tag) {
      return;
    }
    this._cbs.onclosetag(tag);
    stack.pop();
  };
  /**
   * Finishes the attribute being read: decodes `&quot;` entities in its
   * value, stores it on the current attribute map when the name is an own
   * key of `trustAttrs` with a truthy value, then clears the pending
   * name and value.
   *
   * The own-property check keeps names inherited from `Object.prototype`
   * (`constructor`, `toString`, ...) from passing the whitelist.
   */
  Parser.prototype.onattribend = function() {
    const name = this._attribname;
    const value = this._attribvalue.replace(/&quot;/g, '"');
    const trusted = Object.prototype.hasOwnProperty.call(trustAttrs, name) && trustAttrs[name];
    if (this._attribs && trusted) {
      this._attribs[name] = value;
    }
    this._attribname = "";
    this._attribvalue = "";
  };
  /**
   * Ends parsing: reports a close for every tag still on the open-tag
   * stack, from the top down (the stack itself is left untouched), then
   * hands the handler's `nodes`, `title` and `imgList` to the callback.
   */
  Parser.prototype.onend = function() {
    let index = this._stack.length;
    while (index > 0) {
      index -= 1;
      this._cbs.onclosetag(this._stack[index]);
    }
    const handler = this._cbs;
    this._callback({
      'nodes': handler.nodes,
      'title': handler.title,
      'imgList': handler.imgList
    });
  };
  /**
   * Feeds a chunk of markup to the tokenizer.
   *
   * @param {string} chunk - markup to parse.
   */
  Parser.prototype.write = function(chunk) {
    const tokenizer = this._tokenizer;
    tokenizer.parse(chunk);
  };
  /**
   * Parses an HTML string into the handler's node structure.
   *
   * The contents of every `<style>` element are removed from the markup,
   * concatenated, and passed to the DomHandler together with `options`;
   * the remaining markup is then run through a Parser.
   *
   * @param {string} data - HTML source.
   * @param {*} options - forwarded unchanged to the DomHandler.
   * @returns {Promise<Object>} resolves with the object the parser hands to
   *   its callback (`nodes`, `title`, `imgList`); rejects with any error
   *   thrown while setting up or writing to the parser.
   */
  function html2nodes(data, options) {
    return new Promise(function(resolve, reject) {
      try {
        const sheets = [];
        const markup = data.replace(/<style.*?>([\s\S]*?)<\/style>/gi, function(match, css) {
          sheets.push(css);
          return '';
        });
        const handler = new DomHandler(sheets.join(''), options);
        const parser = new Parser(handler, (res) => {
          // #ifdef MP-BAIDU || MP-TOUTIAO || H5
          _traverse(res.nodes);
          // #endif
          return resolve(res);
        });
        parser.write(markup);
      } catch (err) {
        return reject(err);
      }
    })
  }
  238. module.exports = html2nodes;