// Tokenizer.js
/**
 * Returns true when `c` is a whitespace character (as matched by `\s`).
 *
 * @param {string} c - Single character taken from the input buffer.
 * @returns {boolean}
 */
function isWhitespace(c) {
  return /\s/.test(c);
}

/**
 * Builds the handler for one step of the `<![CDATA` prefix match.
 *
 * @param {string} expected - Character that must appear at this position.
 * @param {string} nextState - State to enter when `expected` is seen.
 * @returns {function(string): void} State handler to install on the prototype.
 */
function cdataPrefixState(expected, nextState) {
  return function (c) {
    if (c === expected) {
      this._state = nextState;
    } else {
      this._enterDeclaration();
    }
  };
}

/**
 * Character-driven state machine that splits HTML/XML-like markup into
 * events delivered to a callback object.
 *
 * Each state is a prototype method named after the state; `parse` dispatches
 * the current character to `this[this._state]`.
 *
 * Callbacks invoked on `cbs`: `ontext(text)`, `onopentagname(name)`,
 * `onopentagend()`, `onattribend()`, `onclosetag(name)` and `onend()`.
 * The tokenizer also assigns `cbs._attribname` (lower-cased attribute name)
 * and appends to `cbs._attribvalue` before calling `onattribend()`, so
 * `_attribvalue` is expected to be a string owned and reset by `cbs`.
 *
 * @param {object} cbs - Callback object described above.
 */
function Tokenizer(cbs) {
  this._state = "TEXT";
  this._buffer = "";
  // Start of the token currently being collected (index into _buffer).
  this._sectionStart = 0;
  // Position of the character currently being processed.
  this._index = 0;
  this._cbs = cbs;
}

/**
 * TEXT state: jumps straight to the next "<" and emits the text before it.
 *
 * @param {string} c - Current character (unused; the state scans the buffer).
 */
Tokenizer.prototype.TEXT = function (c) {
  const tagStart = this._buffer.indexOf("<", this._index);
  if (tagStart === -1) {
    // No more markup in the buffer; parse() flushes the remaining text.
    this._index = this._buffer.length;
    return;
  }
  this._index = tagStart;
  // Only report text that actually contains characters.
  if (this._index > this._sectionStart) {
    this._cbs.ontext(this._getSection());
  }
  this._state = "BeforeTag";
  this._sectionStart = this._index;
};

/**
 * BeforeTag state: decides what kind of construct follows a "<".
 *
 * @param {string} c - Character right after "<".
 */
Tokenizer.prototype.BeforeTag = function (c) {
  switch (c) {
    case "/":
      this._state = "BeforeCloseTag";
      break;
    case "!":
      this._state = "BeforeDeclaration";
      break;
    case "?":
      // Processing instruction: skipped entirely, no callback fires.
      this._skipPast(">");
      break;
    case ">":
      // "<>" is not a tag; _sectionStart still points at "<", so it
      // becomes part of the following text.
      this._state = "TEXT";
      break;
    case "<":
      // The previous "<" was literal text; this one may start a tag.
      this._cbs.ontext(this._getSection());
      this._sectionStart = this._index;
      break;
    default:
      if (isWhitespace(c)) {
        // "< " is literal text as well.
        this._state = "TEXT";
      } else {
        this._state = "InTag";
        this._sectionStart = this._index;
      }
  }
};

/**
 * InTag state: collects an opening tag name until "/", ">" or whitespace.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.InTag = function (c) {
  if (c === "/" || c === ">" || isWhitespace(c)) {
    this._cbs.onopentagname(this._getSection());
    this._state = "BeforeAttrsName";
    // Step back so the terminator is re-dispatched in the new state.
    this._index--;
  }
};

/**
 * BeforeAttrsName state: between attributes inside an opening tag.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.BeforeAttrsName = function (c) {
  if (c === ">") {
    this._cbs.onopentagend();
    this._state = "TEXT";
    this._sectionStart = this._index + 1;
  } else if (c === "/") {
    this._state = "InSelfCloseTag";
  } else if (!isWhitespace(c)) {
    this._state = "InAttrsName";
    this._sectionStart = this._index;
  }
};

/**
 * InAttrsName state: collects an attribute name and stores it lower-cased
 * in `cbs._attribname`.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.InAttrsName = function (c) {
  if (c === "=" || c === "/" || c === ">" || isWhitespace(c)) {
    this._cbs._attribname = this._getSection().toLowerCase();
    this._sectionStart = -1;
    this._state = "AfterAttrsName";
    // Re-dispatch the terminator in AfterAttrsName.
    this._index--;
  }
};

/**
 * AfterAttrsName state: an attribute name has ended; either a value follows
 * ("=") or the attribute is value-less.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.AfterAttrsName = function (c) {
  if (c === "=") {
    this._state = "BeforeAttrsValue";
  } else if (c === "/" || c === ">") {
    this._cbs.onattribend();
    this._state = "BeforeAttrsName";
    // Let BeforeAttrsName handle the "/" or ">".
    this._index--;
  } else if (!isWhitespace(c)) {
    // Another attribute name starts right away.
    this._cbs.onattribend();
    this._state = "InAttrsName";
    this._sectionStart = this._index;
  }
};

/**
 * BeforeAttrsValue state: picks the quoting style of the attribute value.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.BeforeAttrsValue = function (c) {
  if (c === '"') {
    this._state = "InAttrsValueDQ";
    this._sectionStart = this._index + 1;
  } else if (c === "'") {
    this._state = "InAttrsValueSQ";
    this._sectionStart = this._index + 1;
  } else if (!isWhitespace(c)) {
    this._state = "InAttrsValueNQ";
    this._sectionStart = this._index;
    // The first value character must also be seen by InAttrsValueNQ.
    this._index--;
  }
};

/**
 * InAttrsValueDQ state: double-quoted attribute value.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.InAttrsValueDQ = function (c) {
  if (c === '"') {
    this._cbs._attribvalue += this._getSection();
    this._cbs.onattribend();
    this._state = "BeforeAttrsName";
  }
};

/**
 * InAttrsValueSQ state: single-quoted attribute value.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.InAttrsValueSQ = function (c) {
  if (c === "'") {
    this._cbs._attribvalue += this._getSection();
    this._cbs.onattribend();
    this._state = "BeforeAttrsName";
  }
};

/**
 * InAttrsValueNQ state: unquoted attribute value, ended by whitespace or ">".
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.InAttrsValueNQ = function (c) {
  if (isWhitespace(c) || c === ">") {
    this._cbs._attribvalue += this._getSection();
    this._cbs.onattribend();
    this._state = "BeforeAttrsName";
    // Re-dispatch the terminator in BeforeAttrsName.
    this._index--;
  }
};

/**
 * BeforeCloseTag state: right after "</".
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.BeforeCloseTag = function (c) {
  if (isWhitespace(c)) {
    return;
  }
  if (c === ">") {
    // "</>" carries no name; it is left to be reported as text.
    this._state = "TEXT";
  } else {
    this._state = "InCloseTag";
    this._sectionStart = this._index;
  }
};

/**
 * InCloseTag state: collects a closing tag name until ">" or whitespace.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.InCloseTag = function (c) {
  if (c === ">" || isWhitespace(c)) {
    this._cbs.onclosetag(this._getSection());
    this._state = "AfterCloseTag";
    // Re-dispatch the terminator in AfterCloseTag.
    this._index--;
  }
};

/**
 * InSelfCloseTag state: a "/" was seen inside an opening tag.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.InSelfCloseTag = function (c) {
  if (c === ">") {
    this._cbs.onopentagend();
    this._state = "TEXT";
    this._sectionStart = this._index + 1;
  } else if (!isWhitespace(c)) {
    // The "/" was not the end of the tag; keep reading attributes.
    this._state = "BeforeAttrsName";
    this._index--;
  }
};

/**
 * AfterCloseTag state: ignores everything up to the ">" of a closing tag.
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.AfterCloseTag = function (c) {
  if (c === ">") {
    this._state = "TEXT";
    this._sectionStart = this._index + 1;
  }
};

/**
 * BeforeDeclaration state: right after "<!".
 *
 * @param {string} c - Current character.
 */
Tokenizer.prototype.BeforeDeclaration = function (c) {
  if (c === "-") {
    this._state = "InComment";
  } else if (c === "[") {
    this._state = "BeforeCDATA1";
  } else {
    this._enterDeclaration();
  }
};

/**
 * InDeclaration state: skips a declaration such as `<!DOCTYPE ...>` up to
 * and including its ">"; no callback fires.
 *
 * @param {string} c - Current character (unused; the state scans the buffer).
 */
Tokenizer.prototype.InDeclaration = function (c) {
  this._skipPast(">");
};

/**
 * InComment state: entered after "<!-". A second "-" means a real comment
 * ending at "-->"; anything else is skipped up to the next ">".
 *
 * @param {string} c - Character following "<!-".
 */
Tokenizer.prototype.InComment = function (c) {
  this._skipPast(c === "-" ? "-->" : ">");
};

// The five states below match the letters of "CDATA" after "<![" one by one.
Tokenizer.prototype.BeforeCDATA1 = cdataPrefixState("C", "BeforeCDATA2");
Tokenizer.prototype.BeforeCDATA2 = cdataPrefixState("D", "BeforeCDATA3");
Tokenizer.prototype.BeforeCDATA3 = cdataPrefixState("A", "BeforeCDATA4");
Tokenizer.prototype.BeforeCDATA4 = cdataPrefixState("T", "BeforeCDATA5");
Tokenizer.prototype.BeforeCDATA5 = cdataPrefixState("A", "InCDATA");

/**
 * InCDATA state: entered after "<![CDATA". With a following "[" the section
 * runs to "]]>"; otherwise it is skipped up to the next ">".
 *
 * @param {string} c - Character following "<![CDATA".
 */
Tokenizer.prototype.InCDATA = function (c) {
  this._skipPast(c === "[" ? "]]>" : ">");
};

/**
 * Feeds `chunk` through the state machine, flushes any trailing text and
 * then calls `onend()`. Because `onend()` fires on every call, the whole
 * document is expected to be passed in a single call.
 *
 * @param {string} chunk - Markup to tokenize; appended to the internal buffer.
 */
Tokenizer.prototype.parse = function (chunk) {
  this._buffer += chunk;
  for (; this._index < this._buffer.length; this._index++) {
    this[this._state](this._buffer[this._index]);
  }
  // Scanning states park the cursor at the buffer end and the loop then
  // increments once more; pull it back so it never points past the data.
  if (this._index > this._buffer.length) {
    this._index = this._buffer.length;
  }
  if (this._state === "TEXT" && this._sectionStart < this._buffer.length) {
    this._cbs.ontext(this._buffer.substring(this._sectionStart));
    // Mark the tail as delivered so it is not reported again.
    this._sectionStart = this._buffer.length;
  }
  this._cbs.onend();
};

/**
 * Returns the buffer slice from the section start up to (not including)
 * the current index.
 *
 * @returns {string}
 */
Tokenizer.prototype._getSection = function () {
  return this._buffer.substring(this._sectionStart, this._index);
};

/**
 * Skips ahead to the end of `terminator` (searched from the current index)
 * and returns to the TEXT state. When the terminator is missing, the rest
 * of the buffer is discarded.
 *
 * @param {string} terminator - Literal sequence that closes the construct.
 */
Tokenizer.prototype._skipPast = function (terminator) {
  const found = this._buffer.indexOf(terminator, this._index);
  if (found === -1) {
    this._sectionStart = this._index = this._buffer.length;
  } else {
    // Land on the terminator's last character; the parse loop steps past it.
    this._index = found + terminator.length - 1;
    this._sectionStart = this._index + 1;
  }
  this._state = "TEXT";
};

/**
 * Switches to InDeclaration and re-dispatches the current character there,
 * so a ">" directly after "<!" or a partial "<![CDATA" still ends the
 * declaration instead of being swallowed.
 */
Tokenizer.prototype._enterDeclaration = function () {
  this._state = "InDeclaration";
  this._index--;
};
// Expose the Tokenizer constructor as this module's export (CommonJS).
module.exports = Tokenizer;