// Tokenizer.js
  /**
   * Character-driven state machine that tokenizes HTML/XML-like markup.
   *
   * @param {Object} cbs - Callback sink. The state handlers in this file call
   *   its ontext, onopentagname, onopentagend, onclosetag, onattribend and
   *   onend methods and write its _attribname / _attribvalue properties.
   */
  function Tokenizer(cbs) {
    // Name of the prototype method that handles the next character.
    this._state = "TEXT";
    // Everything passed to parse() so far.
    this._buffer = "";
    // Start offset (in _buffer) of the section currently being collected.
    this._sectionStart = 0;
    // Offset of the character currently being examined.
    this._index = 0;
    this._cbs = cbs;
  }
  /**
   * TEXT state: jump straight to the next "<" instead of walking the text
   * one character at a time.
   *
   * When a "<" is found, the text collected before it is reported through
   * cbs.ontext (skipped when that text is empty, e.g. between two adjacent
   * tags) and the machine moves to BeforeTag with the section starting at
   * the "<". When none is found, _index is moved to the end of the buffer.
   *
   * @param {string} c - Character at this._index (unused; the buffer is
   *   searched directly).
   */
  Tokenizer.prototype.TEXT = function(c) {
    const tagStart = this._buffer.indexOf("<", this._index);
    if (tagStart === -1) {
      this._index = this._buffer.length;
      return;
    }
    this._index = tagStart;
    // Do not report an empty text section.
    if (tagStart > this._sectionStart) {
      this._cbs.ontext(this._getSection());
    }
    this._state = "BeforeTag";
    this._sectionStart = tagStart;
  };
  /**
   * BeforeTag state: decides what kind of construct follows a "<".
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeTag = function(c) {
    switch (c) {
      case "/":
        this._state = "BeforeCloseTag";
        break;
      case "!":
        this._state = "BeforeDeclaration";
        break;
      case "?": {
        // "<? ... >" is skipped entirely; nothing is reported for it.
        const end = this._buffer.indexOf(">", this._index);
        if (end !== -1) {
          this._index = end;
          this._sectionStart = end + 1;
        } else {
          this._sectionStart = this._index = this._buffer.length;
        }
        this._state = "TEXT";
        break;
      }
      case ">":
        // _sectionStart is left untouched, so whatever it already covers
        // remains part of the pending text section.
        this._state = "TEXT";
        break;
      case "<":
        // Report what was collected so far and restart the section here.
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
        break;
      default:
        if (/\s/.test(c)) {
          this._state = "TEXT";
        } else {
          this._state = "InTag";
          this._sectionStart = this._index;
        }
    }
  };
  /**
   * InTag state: collects an opening tag's name. The name ends at "/", ">"
   * or whitespace, at which point it is reported through cbs.onopentagname.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InTag = function(c) {
    const endsName = c === "/" || c === ">" || /\s/.test(c);
    if (!endsName) {
      return;
    }
    this._cbs.onopentagname(this._getSection());
    this._state = "BeforeAttrsName";
    // Step back so the terminating character is handled again in the new state.
    this._index--;
  };
  /**
   * BeforeAttrsName state: inside an opening tag, between attributes.
   * ">" ends the tag (cbs.onopentagend), "/" moves to InSelfCloseTag,
   * whitespace is skipped, and anything else starts an attribute name.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeAttrsName = function(c) {
    if (c === ">") {
      this._cbs.onopentagend();
      this._state = "TEXT";
      this._sectionStart = this._index + 1;
      return;
    }
    if (c === "/") {
      this._state = "InSelfCloseTag";
      return;
    }
    if (/\s/.test(c)) {
      return;
    }
    this._state = "InAttrsName";
    this._sectionStart = this._index;
  };
  /**
   * InAttrsName state: collects an attribute name. The name ends at "=",
   * "/", ">" or whitespace; it is lower-cased and stored on
   * cbs._attribname.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InAttrsName = function(c) {
    const endsName = c === "=" || c === "/" || c === ">" || /\s/.test(c);
    if (!endsName) {
      return;
    }
    this._cbs._attribname = this._getSection().toLowerCase();
    this._sectionStart = -1;
    this._state = "AfterAttrsName";
    // Step back so the terminating character is handled again in the new state.
    this._index--;
  };
  /**
   * AfterAttrsName state: an attribute name has just ended.
   * "=" introduces a value; "/" or ">" ends a value-less attribute and is
   * handed back to BeforeAttrsName; whitespace is skipped; any other
   * character ends the value-less attribute and starts the next name.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.AfterAttrsName = function(c) {
    if (c === "=") {
      this._state = "BeforeAttrsValue";
      return;
    }
    if (c === "/" || c === ">") {
      this._cbs.onattribend();
      this._state = "BeforeAttrsName";
      // Step back so this character is handled again in the new state.
      this._index--;
      return;
    }
    if (/\s/.test(c)) {
      return;
    }
    this._cbs.onattribend();
    this._state = "InAttrsName";
    this._sectionStart = this._index;
  };
  /**
   * BeforeAttrsValue state: just after "=". Picks the double-quoted,
   * single-quoted or unquoted value state; whitespace is skipped.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeAttrsValue = function(c) {
    if (c === '"') {
      this._state = "InAttrsValueDQ";
      this._sectionStart = this._index + 1; // value starts after the quote
      return;
    }
    if (c === "'") {
      this._state = "InAttrsValueSQ";
      this._sectionStart = this._index + 1; // value starts after the quote
      return;
    }
    if (/\s/.test(c)) {
      return;
    }
    this._state = "InAttrsValueNQ";
    this._sectionStart = this._index;
    // Step back so the first value character is also seen by InAttrsValueNQ.
    this._index--;
  };
  /**
   * InAttrsValueDQ state: inside a double-quoted attribute value. On the
   * closing quote the collected section is appended to cbs._attribvalue and
   * cbs.onattribend is called.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InAttrsValueDQ = function(c) {
    if (c !== '"') {
      return;
    }
    this._cbs._attribvalue += this._getSection();
    this._cbs.onattribend();
    this._state = "BeforeAttrsName";
  };
  /**
   * InAttrsValueSQ state: inside a single-quoted attribute value. On the
   * closing quote the collected section is appended to cbs._attribvalue and
   * cbs.onattribend is called.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InAttrsValueSQ = function(c) {
    if (c !== "'") {
      return;
    }
    this._cbs._attribvalue += this._getSection();
    this._cbs.onattribend();
    this._state = "BeforeAttrsName";
  };
  /**
   * InAttrsValueNQ state: inside an unquoted attribute value. Whitespace or
   * ">" ends the value, which is appended to cbs._attribvalue before
   * cbs.onattribend is called.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InAttrsValueNQ = function(c) {
    const endsValue = c === ">" || /\s/.test(c);
    if (!endsValue) {
      return;
    }
    this._cbs._attribvalue += this._getSection();
    this._cbs.onattribend();
    this._state = "BeforeAttrsName";
    // Step back so the terminating character is handled again in the new state.
    this._index--;
  };
  /**
   * BeforeCloseTag state: just after "</". Whitespace is skipped, ">"
   * returns to TEXT without reporting a tag, and any other character starts
   * the closing tag's name.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeCloseTag = function(c) {
    if (/\s/.test(c)) {
      return;
    }
    if (c === ">") {
      this._state = "TEXT";
      return;
    }
    this._state = "InCloseTag";
    this._sectionStart = this._index;
  };
  /**
   * InCloseTag state: collects a closing tag's name. The name ends at ">"
   * or whitespace and is reported through cbs.onclosetag.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InCloseTag = function(c) {
    const endsName = c === ">" || /\s/.test(c);
    if (!endsName) {
      return;
    }
    this._cbs.onclosetag(this._getSection());
    this._state = "AfterCloseTag";
    // Step back so the terminating character is handled again in the new state.
    this._index--;
  };
  /**
   * InSelfCloseTag state: a "/" was seen inside an opening tag. ">" ends
   * the tag (cbs.onopentagend), whitespace is skipped, and any other
   * character is handed back to BeforeAttrsName.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InSelfCloseTag = function(c) {
    if (c === ">") {
      this._cbs.onopentagend();
      this._state = "TEXT";
      this._sectionStart = this._index + 1;
      return;
    }
    if (/\s/.test(c)) {
      return;
    }
    this._state = "BeforeAttrsName";
    // Step back so this character is handled again in the new state.
    this._index--;
  };
  /**
   * AfterCloseTag state: ignores everything up to the ">" that ends a
   * closing tag, then resumes TEXT with the section starting after it.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.AfterCloseTag = function(c) {
    if (c !== ">") {
      return;
    }
    this._state = "TEXT";
    this._sectionStart = this._index + 1;
  };
  /**
   * BeforeDeclaration state: just after "<!". "-" leads to InComment, "["
   * starts the CDATA prefix check, and anything else is a declaration.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeDeclaration = function(c) {
    if (c === "-") {
      this._state = "InComment";
    } else if (c === "[") {
      this._state = "BeforeCDATA1";
    } else {
      this._state = "InDeclaration";
    }
  };
  /**
   * InDeclaration state: skips a "<!...>" declaration. Jumps to the next
   * ">" (or to the end of the buffer when there is none) and resumes TEXT;
   * nothing is reported for the declaration itself.
   *
   * @param {string} c - Character at this._index (unused; the buffer is
   *   searched directly).
   */
  Tokenizer.prototype.InDeclaration = function(c) {
    const end = this._buffer.indexOf(">", this._index);
    if (end === -1) {
      this._sectionStart = this._index = this._buffer.length;
    } else {
      this._index = end;
      this._sectionStart = end + 1;
    }
    this._state = "TEXT";
  };
  /**
   * InComment state: skips a comment. Entered on the character after "<!-".
   *
   * When that character is "-" (a real "<!--" opener) the comment runs to
   * the next "-->"; otherwise it runs to the next ">". With no terminator,
   * the rest of the buffer is skipped. Nothing is reported for the comment.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InComment = function(c) {
    const isDashDash = c === "-";
    const terminator = isDashDash ? "-->" : ">";
    // For "<!--" start one character earlier, so the opener's two dashes can
    // double as the terminator's dashes and "<!-->" closes as an empty
    // comment instead of swallowing everything after it.
    const from = isDashDash ? this._index - 1 : this._index;
    const found = this._buffer.indexOf(terminator, from);
    if (found === -1) {
      this._sectionStart = this._index = this._buffer.length;
    } else {
      // Land on the terminator's final character.
      this._index = found + terminator.length - 1;
      this._sectionStart = this._index + 1;
    }
    this._state = "TEXT";
  };
  /**
   * BeforeCDATA1 state: after "<![", expects "C" (of "CDATA"); any other
   * character makes the construct a plain declaration.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeCDATA1 = function(c) {
    this._state = c === "C" ? "BeforeCDATA2" : "InDeclaration";
  };
  /**
   * BeforeCDATA2 state: after "<![C", expects "D"; any other character
   * makes the construct a plain declaration.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeCDATA2 = function(c) {
    this._state = c === "D" ? "BeforeCDATA3" : "InDeclaration";
  };
  /**
   * BeforeCDATA3 state: after "<![CD", expects "A"; any other character
   * makes the construct a plain declaration.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeCDATA3 = function(c) {
    this._state = c === "A" ? "BeforeCDATA4" : "InDeclaration";
  };
  /**
   * BeforeCDATA4 state: after "<![CDA", expects "T"; any other character
   * makes the construct a plain declaration.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeCDATA4 = function(c) {
    this._state = c === "T" ? "BeforeCDATA5" : "InDeclaration";
  };
  /**
   * BeforeCDATA5 state: after "<![CDAT", expects the final "A" and then
   * enters InCDATA; any other character makes the construct a plain
   * declaration.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.BeforeCDATA5 = function(c) {
    this._state = c === "A" ? "InCDATA" : "InDeclaration";
  };
  /**
   * InCDATA state: skips a CDATA section. Entered on the character after
   * "<![CDATA".
   *
   * When that character is "[" the section runs to the next "]]>";
   * otherwise it runs to the next ">". With no terminator, the rest of the
   * buffer is skipped. Nothing is reported for the skipped content.
   *
   * @param {string} c - Character at this._index.
   */
  Tokenizer.prototype.InCDATA = function(c) {
    const terminator = c === "[" ? "]]>" : ">";
    const found = this._buffer.indexOf(terminator, this._index);
    if (found === -1) {
      this._sectionStart = this._index = this._buffer.length;
    } else {
      // Land on the terminator's final character.
      this._index = found + terminator.length - 1;
      this._sectionStart = this._index + 1;
    }
    this._state = "TEXT";
  };
  /**
   * Appends `chunk` to the buffer and runs the state machine over every
   * character not yet consumed, dispatching each one to the prototype
   * method named by this._state. Afterwards, any non-empty text still
   * pending is reported through cbs.ontext, and cbs.onend is called.
   *
   * @param {string} chunk - Markup to tokenize.
   */
  Tokenizer.prototype.parse = function(chunk) {
    this._buffer += chunk;
    // Handlers may move _index themselves (jump ahead, or step back by one
    // to have a character handled again), so it is re-read every iteration.
    for (; this._index < this._buffer.length; this._index++) {
      this[this._state](this._buffer[this._index]);
    }
    // Compare against the buffer length rather than _index: handlers that
    // skip to the end of the buffer leave _index one past the length, which
    // previously produced a spurious empty text event.
    if (this._state === "TEXT" && this._sectionStart < this._buffer.length) {
      this._cbs.ontext(this._buffer.slice(this._sectionStart));
    }
    this._cbs.onend();
  };
  /**
   * Returns the slice of the buffer between the current section start
   * (inclusive) and the current index (exclusive).
   *
   * @returns {string} The section collected so far.
   */
  Tokenizer.prototype._getSection = function() {
    return this._buffer.substring(this._sectionStart, this._index);
  };
  // CommonJS export: the constructor is the module's only export.
  module.exports = Tokenizer;