930 lines
28 KiB
JavaScript
930 lines
28 KiB
JavaScript
'use strict';
|
|
|
|
var conventions = require('./conventions');
|
|
var g = require('./grammar');
|
|
var errors = require('./errors');
|
|
|
|
var isHTMLEscapableRawTextElement = conventions.isHTMLEscapableRawTextElement;
|
|
var isHTMLMimeType = conventions.isHTMLMimeType;
|
|
var isHTMLRawTextElement = conventions.isHTMLRawTextElement;
|
|
var hasOwn = conventions.hasOwn;
|
|
var NAMESPACE = conventions.NAMESPACE;
|
|
var ParseError = errors.ParseError;
|
|
var DOMException = errors.DOMException;
|
|
|
|
//var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')
|
|
|
|
// States of the start-tag scanner in `parseElementStartPart`.
|
|
// Each constant names what the scanner is currently reading or has just finished reading.
|
|
var S_TAG = 0; // reading the tag name
|
|
var S_ATTR = 1; // reading an attribute name
|
|
var S_ATTR_SPACE = 2; // attribute name ended, whitespace seen after it
|
|
var S_EQ = 3; // `=` seen, optional whitespace may follow
|
|
var S_ATTR_NOQUOT_VALUE = 4; // reading an attribute value that is not quoted
|
|
var S_ATTR_END = 5; // closing quote of an attribute value seen, no whitespace yet
|
|
var S_TAG_SPACE = 6; // whitespace seen after the tag name or after an attribute value
|
|
var S_TAG_CLOSE = 7; // `/` of a self-closing element (`<el />`) seen
|
|
|
|
/**
 * SAX-style reader.
 * `parse` reads `this.domBuilder` and `this.errorHandler`,
 * so both have to be assigned on the instance before it is called.
 */
function XMLReader() {}

XMLReader.prototype = {
	/**
	 * Parses `source` and reports everything found to `this.domBuilder`,
	 * wrapped in `startDocument()` / `endDocument()` calls.
	 *
	 * @param source
	 * The markup to parse.
	 * @param defaultNSMap
	 * The initial namespace map; it is copied, the passed object is never modified here.
	 * @param entityMap
	 * Passed through unchanged to the module level `parse` function.
	 */
	parse: function (source, defaultNSMap, entityMap) {
		var builder = this.domBuilder;
		builder.startDocument();

		// hand a prototype-less copy to the parser, so the caller's map stays untouched
		var nsMapCopy = Object.create(null);
		_copy(defaultNSMap, nsMapCopy);

		parse(source, nsMapCopy, entityMap, builder, this.errorHandler);
		builder.endDocument();
	},
};
|
|
|
|
/**
 * Matches everything that might be a character or entity reference (`&name;`, `&#10;`, ...),
 * including occurrences without the terminating `;`, since those are allowed in HTML.
 * The `entityReplacer` (defined inside `parse`) verifies and transforms each occurrence
 * and reports the ones that are not OK to the errorHandler, depending on the context.
 *
 * The pattern carries the `g` flag; in the code visible here it is only ever passed to
 * `String.prototype.replace`.
 */
|
|
var ENTITY_REG = /&#?\w+;?/g;
|
|
|
|
/**
 * Main scanning loop of the reader: walks `source` from left to right,
 * splits it into text and markup and reports what it finds to `domBuilder`.
 * Problems are reported through `errorHandler.warning`, `.error` and `.fatalError`.
 *
 * @param {string} source
 * The complete markup.
 * @param defaultNSMapCopy
 * Namespace map that becomes the `currentNSMap` of the bottom entry of the parse stack.
 * @param entityMap
 * Lookup table for named entities, consulted by `entityReplacer`.
 * @param domBuilder
 * Receives the callbacks; its `mimeType`, `locator`, `doc` and `currentElement` are read here.
 * @param errorHandler
 * Receives the error reports.
 */
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
|
|
var isHTML = isHTMLMimeType(domBuilder.mimeType);
|
|
if (source.indexOf(g.UNICODE_REPLACEMENT_CHARACTER) >= 0) {
|
|
errorHandler.warning('Unicode replacement character detected, source encoding issues?');
|
|
}
|
|
|
|
// Converts a numeric code to a string, building a surrogate pair for values above 0xFFFF.
function fixedFromCharCode(code) {
|
|
// String.fromCharCode works on UTF-16 code units,
|
|
// so codes above 0xFFFF are split into a surrogate pair first
|
|
if (code > 0xffff) {
|
|
code -= 0x10000;
|
|
var surrogate1 = 0xd800 + (code >> 10),
|
|
surrogate2 = 0xdc00 + (code & 0x3ff);
|
|
|
|
|
return String.fromCharCode(surrogate1, surrogate2);
|
|
} else {
|
|
return String.fromCharCode(code);
|
|
}
|
|
}
|
|
|
|
// Replacement callback used together with ENTITY_REG: returns the replacement text for the
// matched reference `a`, or `a` itself (after reporting an error) when it can not be resolved.
function entityReplacer(a) {
|
|
// a reference without the terminating `;` is only tolerated in HTML
var complete = a[a.length - 1] === ';' ? a : a + ';';
|
|
if (!isHTML && complete !== a) {
|
|
errorHandler.error('EntityRef: expecting ;');
|
|
return a;
|
|
}
|
|
var match = g.Reference.exec(complete);
|
|
if (!match || match[0].length !== complete.length) {
|
|
errorHandler.error('entity not matching Reference production: ' + a);
|
|
return a;
|
|
}
|
|
// the reference without the leading `&` and the trailing `;`
var k = complete.slice(1, -1);
|
|
if (hasOwn(entityMap, k)) {
|
|
return entityMap[k];
|
|
} else if (k.charAt(0) === '#') {
|
|
// numeric reference: `#x..` is turned into `0x..` so parseInt reads it as hexadecimal
return fixedFromCharCode(parseInt(k.substring(1).replace('x', '0x')));
|
|
} else {
|
|
errorHandler.error('entity not found:' + a);
|
|
return a;
|
|
}
|
|
}
|
|
|
|
// Reports the text between `start` and `end` (with references replaced) as characters
// and moves `start` forward to `end`. Does nothing when `end` is not after `start`.
function appendText(end) {
|
|
// NOTE: the original author marked this function as "has some bugs" without further details
|
|
if (end > start) {
|
|
var xt = source.substring(start, end).replace(ENTITY_REG, entityReplacer);
|
|
locator && position(start);
|
|
domBuilder.characters(xt, 0, end - start);
|
|
start = end;
|
|
}
|
|
}
|
|
|
|
// Line tracking state for `position`: start and end index of the most recently scanned line.
var lineStart = 0;
|
|
var lineEnd = 0;
|
|
var linePattern = /\r\n?|\n|$/g;
|
|
var locator = domBuilder.locator;
|
|
|
|
// Updates `locator` for the source index `p`: scans forward line by line (incrementing
// `locator.lineNumber`) until the scanned line end is beyond `p`, then sets the 1-based column.
// `m` is only used as a local variable.
function position(p, m) {
|
|
while (p >= lineEnd && (m = linePattern.exec(source))) {
|
|
lineStart = lineEnd;
|
|
lineEnd = m.index + m[0].length;
|
|
locator.lineNumber++;
|
|
}
|
|
locator.columnNumber = p - lineStart + 1;
|
|
}
|
|
|
|
// One entry per open element that was pushed; the bottom entry only carries the initial namespace map.
var parseStack = [{ currentNSMap: defaultNSMapCopy }];
|
|
// tag names of the elements that have been opened but not closed yet
var unclosedTags = [];
|
|
// index in `source` up to which everything has been processed
var start = 0;
|
|
while (true) {
|
|
try {
|
|
var tagStart = source.indexOf('<', start);
|
|
// no more markup: validate/report the remaining text and stop
if (tagStart < 0) {
|
|
if (!isHTML && unclosedTags.length > 0) {
|
|
return errorHandler.fatalError('unclosed xml tag(s): ' + unclosedTags.join(', '));
|
|
}
|
|
if (!source.substring(start).match(/^\s*$/)) {
|
|
var doc = domBuilder.doc;
|
|
var text = doc.createTextNode(source.substring(start));
|
|
if (doc.documentElement) {
|
|
return errorHandler.error('Extra content at the end of the document');
|
|
}
|
|
doc.appendChild(text);
|
|
domBuilder.currentElement = text;
|
|
}
|
|
return;
|
|
}
|
|
// text in front of the next `<`
if (tagStart > start) {
|
|
var fromSource = source.substring(start, tagStart);
|
|
if (!isHTML && unclosedTags.length === 0) {
|
|
// outside of any element only whitespace (g.S_OPT) is acceptable in XML
fromSource = fromSource.replace(new RegExp(g.S_OPT.source, 'g'), '');
|
|
fromSource && errorHandler.error("Unexpected content outside root element: '" + fromSource + "'");
|
|
}
|
|
appendText(tagStart);
|
|
}
|
|
// the character after `<` decides what kind of markup follows
switch (source.charAt(tagStart + 1)) {
|
|
// end tag: `</name>`
case '/':
|
|
var end = source.indexOf('>', tagStart + 2);
|
|
var tagNameRaw = source.substring(tagStart + 2, end > 0 ? end : undefined);
|
|
if (!tagNameRaw) {
|
|
return errorHandler.fatalError('end tag name missing');
|
|
}
|
|
var tagNameMatch = end > 0 && g.reg('^', g.QName_group, g.S_OPT, '$').exec(tagNameRaw);
|
|
if (!tagNameMatch) {
|
|
return errorHandler.fatalError('end tag name contains invalid characters: "' + tagNameRaw + '"');
|
|
}
|
|
if (!domBuilder.currentElement && !domBuilder.doc.documentElement) {
|
|
// not enough information to provide a helpful error message,
|
|
// but parsing will throw since there is no root element
|
|
return;
|
|
}
|
|
var currentTagName =
|
|
unclosedTags[unclosedTags.length - 1] ||
|
|
domBuilder.currentElement.tagName ||
|
|
domBuilder.doc.documentElement.tagName ||
|
|
'';
|
|
// the names have to be identical, HTML additionally accepts a difference in case
if (currentTagName !== tagNameMatch[1]) {
|
|
var tagNameLower = tagNameMatch[1].toLowerCase();
|
|
if (!isHTML || currentTagName.toLowerCase() !== tagNameLower) {
|
|
return errorHandler.fatalError('Opening and ending tag mismatch: "' + currentTagName + '" != "' + tagNameRaw + '"');
|
|
}
|
|
}
|
|
var config = parseStack.pop();
|
|
unclosedTags.pop();
|
|
var localNSMap = config.localNSMap;
|
|
domBuilder.endElement(config.uri, config.localName, currentTagName);
|
|
// end the prefix mappings that were declared on the element being closed
if (localNSMap) {
|
|
for (var prefix in localNSMap) {
|
|
if (hasOwn(localNSMap, prefix)) {
|
|
domBuilder.endPrefixMapping(prefix);
|
|
}
|
|
}
|
|
}
|
|
|
|
// continue after the `>`
end++;
|
|
break;
|
|
// end of end tag handling
|
|
case '?': // <?...?>
|
|
locator && position(tagStart);
|
|
end = parseProcessingInstruction(source, tagStart, domBuilder, errorHandler);
|
|
break;
|
|
case '!': // <!doctype,<![CDATA,<!--
|
|
locator && position(tagStart);
|
|
end = parseDoctypeCommentOrCData(source, tagStart, domBuilder, errorHandler, isHTML);
|
|
break;
|
|
// anything else is treated as a start tag
default:
|
|
locator && position(tagStart);
|
|
var el = new ElementAttributes();
|
|
var currentNSMap = parseStack[parseStack.length - 1].currentNSMap;
|
|
// `end` is the index where the start tag scanner stopped
|
|
var end = parseElementStartPart(source, tagStart, el, currentNSMap, entityReplacer, errorHandler, isHTML);
|
|
var len = el.length;
|
|
|
|
|
if (!el.closed) {
|
|
// HTML void elements are treated as closed even without `/`
if (isHTML && conventions.isHTMLVoidElement(el.tagName)) {
|
|
el.closed = true;
|
|
} else {
|
|
unclosedTags.push(el.tagName);
|
|
}
|
|
}
|
|
if (locator && len) {
|
|
// remember the position of the tag itself, `position` below moves `locator` to each attribute
var locator2 = copyLocator(locator, {});
|
|
// give every attribute its own locator copy
|
|
for (var i = 0; i < len; i++) {
|
|
var a = el[i];
|
|
position(a.offset);
|
|
a.locator = copyLocator(locator, {});
|
|
}
|
|
// report the element with the tag position, then restore the shared locator
domBuilder.locator = locator2;
|
|
if (appendElement(el, domBuilder, currentNSMap)) {
|
|
parseStack.push(el);
|
|
}
|
|
domBuilder.locator = locator;
|
|
} else {
|
|
if (appendElement(el, domBuilder, currentNSMap)) {
|
|
parseStack.push(el);
|
|
}
|
|
}
|
|
|
|
|
if (isHTML && !el.closed) {
|
|
end = parseHtmlSpecialContent(source, end, el.tagName, entityReplacer, domBuilder);
|
|
} else {
|
|
end++;
|
|
}
|
|
}
|
|
} catch (e) {
|
|
// ParseErrors pass through, DOMExceptions are wrapped, anything else is reported as an error
if (e instanceof ParseError) {
|
|
throw e;
|
|
} else if (e instanceof DOMException) {
|
|
throw new ParseError(e.name + ': ' + e.message, domBuilder.locator, e);
|
|
}
|
|
errorHandler.error('element parse error: ' + e);
|
|
// forces the fallback branch below
end = -1;
|
|
}
|
|
if (end > start) {
|
|
start = end;
|
|
} else {
|
|
// No progress was made: report the `<` as text and continue behind it.
// Original note: "Possible sax fallback here, risk of positional error"
|
|
appendText(Math.max(tagStart, start) + 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Copies `lineNumber` and `columnNumber` from the locator `f` onto `t`.
 *
 * @returns
 * `t`, the object that received the values.
 */
function copyLocator(f, t) {
	['lineNumber', 'columnNumber'].forEach(function (field) {
		t[field] = f[field];
	});
	return t;
}
|
|
|
|
/**
 * Scans a start tag character by character, beginning right after the `<` at index `start`,
 * and stores the tag name and the attributes in `el`.
 * The scanner state `s` is one of the `S_*` constants.
 *
 * @returns
 * the index at which scanning stopped: the `>` that ends the start tag
 * (which is also the end of the element for a self closed one),
 * or the end of `source` when the input ends inside the tag.
 * The `fatalError` branches return whatever `errorHandler.fatalError` returns.
 * @see {@link #appendElement}
 */
|
|
function parseElementStartPart(source, start, el, currentNSMap, entityReplacer, errorHandler, isHTML) {
|
|
/**
 * Validates an attribute and adds it to `el`.
 * A repeated name and (outside of HTML) a `<` inside the value are reported as fatal errors.
 *
 * @param {string} qname
 * @param {string} value
 * @param {number} startIndex
 */
|
|
function addAttribute(qname, value, startIndex) {
|
|
if (hasOwn(el.attributeNames, qname)) {
|
|
return errorHandler.fatalError('Attribute ' + qname + ' redefined');
|
|
}
|
|
if (!isHTML && value.indexOf('<') >= 0) {
|
|
return errorHandler.fatalError("Unescaped '<' not allowed in attributes values");
|
|
}
|
|
el.addValue(
|
|
qname,
|
|
// @see https://www.w3.org/TR/xml/#AVNormalize
|
|
// since the xmldom sax parser does not "interpret" DTD the following is not implemented:
|
|
// - recursive replacement of (DTD) entity references
|
|
// - trimming and collapsing multiple spaces into a single one for attributes that are not of type CDATA
|
|
value.replace(/[\t\n\r]/g, ' ').replace(ENTITY_REG, entityReplacer),
|
|
startIndex
|
|
);
|
|
}
|
|
|
|
// name of the attribute currently being read
var attrName;
|
|
var value;
|
|
// `p` is the scan position; `start` marks where the current token (name or value) began
var p = ++start;
|
|
var s = S_TAG; // current scanner state
|
|
while (true) {
|
|
var c = source.charAt(p);
|
|
switch (c) {
|
|
case '=':
|
|
if (s === S_ATTR) {
|
|
// the attribute name ends directly in front of the `=`
|
|
attrName = source.slice(start, p);
|
|
s = S_EQ;
|
|
} else if (s === S_ATTR_SPACE) {
|
|
s = S_EQ;
|
|
} else {
|
|
// `=` is only valid after an attribute name or the whitespace following it
|
|
throw new Error('attribute equal must after attrName'); // No known test case
|
|
}
|
|
break;
|
|
case "'":
|
|
case '"':
|
|
if (
|
|
s === S_EQ ||
|
|
s === S_ATTR //|| s == S_ATTR_SPACE
|
|
) {
|
|
// start of a quoted value
|
|
if (s === S_ATTR) {
|
|
errorHandler.warning('attribute value must after "="');
|
|
attrName = source.slice(start, p);
|
|
}
|
|
start = p + 1;
|
|
// jump to the matching closing quote
p = source.indexOf(c, start);
|
|
if (p > 0) {
|
|
value = source.slice(start, p);
|
|
addAttribute(attrName, value, start - 1);
|
|
s = S_ATTR_END;
|
|
} else {
|
|
// no matching closing quote
|
|
throw new Error("attribute value no end '" + c + "' match");
|
|
}
|
|
} else if (s == S_ATTR_NOQUOT_VALUE) {
|
|
// a quote that ends a value which was started without an opening quote
value = source.slice(start, p);
|
|
addAttribute(attrName, value, start);
|
|
errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!');
|
|
start = p + 1;
|
|
s = S_ATTR_END;
|
|
} else {
|
|
// a quote without a preceding `=`
|
|
throw new Error('attribute value must after "="'); // No known test case
|
|
}
|
|
break;
|
|
case '/':
|
|
switch (s) {
|
|
case S_TAG:
|
|
el.setTagName(source.slice(start, p));
|
|
// intentional fall through: the tag is self closing
case S_ATTR_END:
|
|
case S_TAG_SPACE:
|
|
case S_TAG_CLOSE:
|
|
s = S_TAG_CLOSE;
|
|
el.closed = true;
|
|
// intentional fall through: inside an unquoted value or attribute name `/` is kept as part of the token
case S_ATTR_NOQUOT_VALUE:
|
|
case S_ATTR:
|
|
break;
|
|
case S_ATTR_SPACE:
|
|
el.closed = true;
|
|
break;
|
|
//case S_EQ:
|
|
default:
|
|
throw new Error("attribute invalid close char('/')"); // No known test case
|
|
}
|
|
break;
|
|
case '': // charAt returned '' => end of the source
|
|
errorHandler.error('unexpected end of input');
|
|
if (s == S_TAG) {
|
|
el.setTagName(source.slice(start, p));
|
|
}
|
|
return p;
|
|
case '>':
|
|
switch (s) {
|
|
case S_TAG:
|
|
el.setTagName(source.slice(start, p));
|
|
// intentional fall through
case S_ATTR_END:
|
|
case S_TAG_SPACE:
|
|
case S_TAG_CLOSE:
|
|
break; // nothing pending
|
|
case S_ATTR_NOQUOT_VALUE: // unquoted value ends at `>`
|
|
case S_ATTR:
|
|
value = source.slice(start, p);
|
|
// a trailing `/` belongs to the tag (`/>`), not to the value or name
if (value.slice(-1) === '/') {
|
|
el.closed = true;
|
|
value = value.slice(0, -1);
|
|
}
|
|
// intentional fall through
case S_ATTR_SPACE:
|
|
if (s === S_ATTR_SPACE) {
|
|
value = attrName;
|
|
}
|
|
if (s == S_ATTR_NOQUOT_VALUE) {
|
|
errorHandler.warning('attribute "' + value + '" missed quot(")!');
|
|
addAttribute(attrName, value, start);
|
|
} else {
|
|
// attribute without a value: the name is used as the value, which only warrants a warning outside of HTML
if (!isHTML) {
|
|
errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!');
|
|
}
|
|
addAttribute(value, value, start);
|
|
}
|
|
break;
|
|
case S_EQ:
|
|
if (!isHTML) {
|
|
return errorHandler.fatalError('AttValue: \' or " expected');
|
|
}
|
|
}
|
|
return p;
|
|
/*xml space '\x20' | #x9 | #xD | #xA; */
|
|
// U+0080 is handled like a space by falling through to the default branch
case '\u0080':
|
|
c = ' ';
|
|
default:
|
|
if (c <= ' ') {
|
|
// whitespace (any character up to and including the space character)
|
|
switch (s) {
|
|
case S_TAG:
|
|
el.setTagName(source.slice(start, p)); // the tag name ends here
|
|
s = S_TAG_SPACE;
|
|
break;
|
|
case S_ATTR:
|
|
attrName = source.slice(start, p);
|
|
s = S_ATTR_SPACE;
|
|
break;
|
|
case S_ATTR_NOQUOT_VALUE:
|
|
var value = source.slice(start, p);
|
|
errorHandler.warning('attribute "' + value + '" missed quot(")!!');
|
|
addAttribute(attrName, value, start);
|
|
// intentional fall through
case S_ATTR_END:
|
|
s = S_TAG_SPACE;
|
|
break;
|
|
// in all remaining states (S_TAG_SPACE, S_EQ, S_ATTR_SPACE, S_TAG_CLOSE)
|
|
// whitespace is skipped:
|
|
//case S_ATTR_SPACE:
|
|
// void();break;
|
|
//case S_TAG_CLOSE:
|
|
//ignore warning
|
|
}
|
|
} else {
|
|
// any other character
|
|
//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
|
|
//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
|
|
switch (s) {
|
|
// in S_TAG, S_ATTR and S_ATTR_NOQUOT_VALUE the character just extends the current token:
|
|
//case S_ATTR:void();break;
|
|
//case S_ATTR_NOQUOT_VALUE:void();break;
|
|
case S_ATTR_SPACE:
|
|
// the previous attribute had no value: add it with its name as value and start the next name
if (!isHTML) {
|
|
errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead2!!');
|
|
}
|
|
addAttribute(attrName, attrName, start);
|
|
start = p;
|
|
s = S_ATTR;
|
|
break;
|
|
case S_ATTR_END:
|
|
errorHandler.warning('attribute space is required"' + attrName + '"!!');
|
|
// intentional fall through: a new attribute name starts
case S_TAG_SPACE:
|
|
s = S_ATTR;
|
|
start = p;
|
|
break;
|
|
case S_EQ:
|
|
s = S_ATTR_NOQUOT_VALUE;
|
|
start = p;
|
|
break;
|
|
case S_TAG_CLOSE:
|
|
throw new Error("elements closed character '/' and '>' must be connected to");
|
|
}
|
|
}
|
|
} //end outer switch
|
|
p++;
|
|
}
|
|
}
|
|
|
|
/**
 * Resolves the namespaces of a scanned start tag and reports it to `domBuilder`.
 *
 * The attributes in `el` get their `localName`, `prefix` (only when prefixed) and `uri` set,
 * every namespace declaration (`xmlns` / `xmlns:prefix`) is reported via `startPrefixMapping`,
 * and the element itself is reported via `startElement`.
 * For a closed element `endElement` and the matching `endPrefixMapping` calls follow immediately.
 *
 * The passed `currentNSMap` is never modified: as soon as the element declares a namespace,
 * a prototype-less copy is created and used instead.
 *
 * @param el
 * The scanned start tag, see `ElementAttributes`.
 * @param domBuilder
 * Receives the callbacks.
 * @param currentNSMap
 * The namespace map (prefix -> URI, `''` for the default namespace) in scope for the parent.
 * @returns {true | undefined}
 * `true` if the element is still open, in which case `el.currentNSMap` and `el.localNSMap`
 * have been set and the caller is expected to push `el` onto its parse stack;
 * `undefined` if the element was closed.
 */
function appendElement(el, domBuilder, currentNSMap) {
	var tagName = el.tagName;
	// namespaces declared on this element, created lazily
	var localNSMap = null;
	var a;
	var prefix;
	var localName;
	var nsPrefix;
	var nsp;

	// First pass: split the attribute names and collect the namespace declarations.
	var i = el.length;
	while (i--) {
		a = el[i];
		var qName = a.qName;
		var value = a.value;
		nsp = qName.indexOf(':');
		if (nsp > 0) {
			prefix = a.prefix = qName.slice(0, nsp);
			localName = qName.slice(nsp + 1);
			// `xmlns:foo` declares the prefix `foo`
			nsPrefix = prefix === 'xmlns' && localName;
		} else {
			localName = qName;
			prefix = null;
			// a plain `xmlns` declares the default namespace, stored under ''
			nsPrefix = qName === 'xmlns' && '';
		}
		// `a.prefix` is deliberately left unset for attributes without a prefix
		a.localName = localName;
		// `nsPrefix` is `false` for anything that is not a namespace declaration;
		// the strict comparison keeps '' (the default namespace) working
		if (nsPrefix !== false) {
			if (localNSMap == null) {
				localNSMap = Object.create(null);
				// copy on first write, so the map of the parent stays untouched
				var inheritedNSMap = currentNSMap;
				currentNSMap = Object.create(null);
				_copy(inheritedNSMap, currentNSMap);
			}
			currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
			a.uri = NAMESPACE.XMLNS;
			domBuilder.startPrefixMapping(nsPrefix, value);
		}
	}

	// Second pass: resolve the prefixed attributes now that all declarations are known.
	// Attributes without a prefix have no namespace.
	i = el.length;
	while (i--) {
		a = el[i];
		if (a.prefix) {
			if (a.prefix === 'xml') {
				// The `xml` prefix is bound to the XML namespace by definition. It must not be
				// looked up in the map, which might not contain an entry for it.
				a.uri = NAMESPACE.XML;
			} else if (a.prefix !== 'xmlns') {
				a.uri = currentNSMap[a.prefix];
			}
		}
	}

	nsp = tagName.indexOf(':');
	if (nsp > 0) {
		prefix = el.prefix = tagName.slice(0, nsp);
		localName = el.localName = tagName.slice(nsp + 1);
	} else {
		// has to be reset, it still holds the value from the attribute loop
		prefix = null;
		localName = el.localName = tagName;
	}
	// an element without prefix is in the default namespace
	var ns = (el.uri = currentNSMap[prefix || '']);
	domBuilder.startElement(ns, localName, tagName, el);

	if (!el.closed) {
		el.currentNSMap = currentNSMap;
		el.localNSMap = localNSMap;
		return true;
	}

	domBuilder.endElement(ns, localName, tagName);
	if (localNSMap) {
		for (prefix in localNSMap) {
			if (hasOwn(localNSMap, prefix)) {
				domBuilder.endPrefixMapping(prefix);
			}
		}
	}
}
|
|
|
|
/**
 * Handles the content of HTML raw text elements and escapable raw text elements:
 * everything up to the matching end tag is reported as a single chunk of characters
 * instead of being parsed as markup. Only in escapable raw text elements references are replaced.
 *
 * When the end tag is missing, the content extends to the end of `source`.
 * (Before, the `-1` from `indexOf` was passed to `substring`, which swapped the arguments
 * and reported the whole document *preceding* the element as its text.)
 *
 * @param {string} source
 * The complete markup.
 * @param {number} elStartEnd
 * The index of the `>` that ends the start tag.
 * @param {string} tagName
 * The tag name as written in the start tag; the end tag is searched with this exact spelling.
 * @param {function(string): string} entityReplacer
 * Replacement callback for references.
 * @param domBuilder
 * Receives the `characters` call.
 * @returns {number}
 * The index at which parsing continues: the start of the end tag (or the end of `source`
 * when there is none) for the special elements, `elStartEnd + 1` for any other element.
 * @see https://html.spec.whatwg.org/#raw-text-elements
 * @see https://html.spec.whatwg.org/#escapable-raw-text-elements
 * @see https://html.spec.whatwg.org/#cdata-rcdata-restrictions:raw-text-elements
 */
function parseHtmlSpecialContent(source, elStartEnd, tagName, entityReplacer, domBuilder) {
	// TODO: https://html.spec.whatwg.org/#cdata-rcdata-restrictions
	var isEscapableRaw = isHTMLEscapableRawTextElement(tagName);
	if (!isEscapableRaw && !isHTMLRawTextElement(tagName)) {
		return elStartEnd + 1;
	}

	var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
	if (elEndStart < 0) {
		// unterminated element: everything that is left is its content
		elEndStart = source.length;
	}
	var text = source.substring(elStartEnd + 1, elEndStart);
	if (isEscapableRaw) {
		text = text.replace(ENTITY_REG, entityReplacer);
	}
	domBuilder.characters(text, 0, text.length);
	return elEndStart;
}
|
|
|
|
/**
 * Shallow copy: assigns every property of `source` that is reached by `for...in`
 * and accepted by `hasOwn` to `target`. Existing properties of `target` are overwritten.
 *
 * @param source
 * The object to read from.
 * @param target
 * The object to write to.
 */
function _copy(source, target) {
	for (var key in source) {
		if (!hasOwn(source, key)) {
			// skip whatever is inherited from the prototype chain
			continue;
		}
		target[key] = source[key];
	}
}
|
|
|
|
/**
 * @typedef ParseUtils
 * @property {function(relativeIndex: number?): string} char
 * Provides look ahead access to a single character relative to the current index.
 * Returns the empty string when that position is outside of `source`.
 * @property {function(): number} getIndex
 * Provides read-only access to the current index.
 * @property {function(reg: RegExp): string | null} getMatch
 * Applies the provided regular expression enforcing that it starts at the current index and
 * returns the complete matching string,
 * and moves the current index by the length of the matching string.
 * Returns `null` and leaves the index untouched when there is no match.
 * @property {function(): string} getSource
 * Provides read-only access to the complete source.
 * @property {function(places: number?): void} skip
 * Moves the current index by places (defaults to 1, an explicit 0 does not move it).
 * @property {function(): number} skipBlanks
 * Moves the current index by the amount of white space that directly follows the current index
 * and returns the amount of whitespace chars skipped (0..n),
 * or -1 if the end of the source was reached.
 * @property {function(): string} substringFromIndex
 * Creates a substring from the current index to the end of `source`.
 * @property {function(compareWith: string): boolean} substringStartsWith
 * Checks if `source` contains `compareWith`, starting from the current index.
 * @property {function(compareWith: string): boolean} substringStartsWithCaseInsensitive
 * Checks if `source` contains `compareWith`, starting from the current index,
 * comparing the upper case of both sides.
 * @see {@link parseUtils}
 */

/**
 * A temporary scope for parsing and look ahead operations in `source`,
 * starting from index `start`.
 *
 * Some operations move the current index by a number of positions,
 * after which `getIndex` returns the new index.
 *
 * @param {string} source
 * @param {number} start
 * @returns {ParseUtils}
 */
function parseUtils(source, start) {
	var index = start;

	function char(n) {
		n = n || 0;
		return source.charAt(index + n);
	}

	function skip(n) {
		// A missing argument means "one place". An explicit 0 has to be honored,
		// otherwise a zero-length match in `getMatch` would silently swallow a character.
		index += n === 0 ? 0 : n || 1;
	}

	function skipBlanks() {
		var blanks = 0;
		while (index < source.length) {
			var c = char();
			if (c !== ' ' && c !== '\n' && c !== '\t' && c !== '\r') {
				return blanks;
			}
			blanks++;
			skip();
		}
		// the end of the source was reached, no matter how many blanks were skipped
		return -1;
	}

	function substringFromIndex() {
		return source.substring(index);
	}

	function substringStartsWith(text) {
		return source.substring(index, index + text.length) === text;
	}

	function substringStartsWithCaseInsensitive(text) {
		return source.substring(index, index + text.length).toUpperCase() === text.toUpperCase();
	}

	function getMatch(args) {
		// anchoring at the start of the remaining source enforces a match at the current index
		var expr = g.reg('^', args);
		var match = expr.exec(substringFromIndex());
		if (match) {
			skip(match[0].length);
			return match[0];
		}
		return null;
	}

	return {
		char: char,
		getIndex: function () {
			return index;
		},
		getMatch: getMatch,
		getSource: function () {
			return source;
		},
		skip: skip,
		skipBlanks: skipBlanks,
		substringFromIndex: substringFromIndex,
		substringStartsWith: substringStartsWith,
		substringStartsWithCaseInsensitive: substringStartsWithCaseInsensitive,
	};
}
|
|
|
|
/**
 * Consumes the internal subset of a doctype declaration (`[ ... ]`), if there is one at the
 * current index of `p`, checking one markup declaration after the other.
 *
 * @param {ParseUtils} p
 * Positioned where the `[` is expected; afterwards positioned behind the closing `]`.
 * @param {DOMHandler} errorHandler
 * Its `fatalError` is called for everything that is not well-formed.
 * @returns {string | undefined}
 * The text between `[` and `]`, or `undefined` (with `p` untouched) if the current character
 * is not `[`. The error branches return whatever `errorHandler.fatalError` returns.
 */
function parseDoctypeInternalSubset(p, errorHandler) {
	/**
	 * Consumes a processing instruction, which must not be an xml declaration.
	 *
	 * @param {ParseUtils} p
	 * @param {DOMHandler} errorHandler
	 * @returns {string}
	 */
	function parsePI(p, errorHandler) {
		var piMatch = g.PI.exec(p.substringFromIndex());
		if (!piMatch) {
			return errorHandler.fatalError('processing instruction is not well-formed at position ' + p.getIndex());
		}
		var isXmlDeclaration = piMatch[1].toLowerCase() === 'xml';
		if (isXmlDeclaration) {
			return errorHandler.fatalError(
				'xml declaration is only allowed at the start of the document, but found at position ' + p.getIndex()
			);
		}
		p.skip(piMatch[0].length);
		return piMatch[0];
	}

	/**
	 * Consumes the declaration or comment starting with `<!` at the current index.
	 * The character(s) after `<!` decide which production is applied.
	 *
	 * @returns {string | null}
	 * The matched text, `null` if nothing matched.
	 */
	function matchDeclaration() {
		var kind = p.char(2);
		if (kind === 'E') {
			// ELEMENT | ENTITY
			var second = p.char(3);
			if (second === 'L') {
				return p.getMatch(g.elementdecl);
			}
			if (second === 'N') {
				return p.getMatch(g.EntityDecl);
			}
			return null;
		}
		if (kind === 'A') {
			// ATTLIST
			return p.getMatch(g.AttlistDecl);
		}
		if (kind === 'N') {
			// NOTATION
			return p.getMatch(g.NotationDecl);
		}
		if (kind === '-') {
			// comment
			return p.getMatch(g.Comment);
		}
		return null;
	}

	var source = p.getSource();
	if (p.char() !== '[') {
		// no internal subset
		return;
	}
	p.skip(1);
	var subsetStart = p.getIndex();

	while (p.getIndex() < source.length) {
		p.skipBlanks();
		var first = p.char();
		if (first === ']') {
			var internalSubset = source.substring(subsetStart, p.getIndex());
			p.skip(1);
			return internalSubset;
		}

		// conditional sections (`<![`) are only allowed in the external subset, so they are not handled
		var declaration;
		if (first === '<' && p.char(1) === '!') {
			declaration = matchDeclaration();
		} else if (first === '<' && p.char(1) === '?') {
			declaration = parsePI(p, errorHandler);
		} else if (first === '%') {
			declaration = p.getMatch(g.PEReference);
		} else {
			return errorHandler.fatalError('Error detected in Markup declaration');
		}

		if (!declaration) {
			return errorHandler.fatalError('Error in internal subset at position ' + p.getIndex());
		}
	}
	return errorHandler.fatalError('doctype internal subset is not well-formed, missing ]');
}
|
|
|
|
/**
 * Called when the parser encounters an element starting with '<!'.
 * The third character decides whether a comment (`-`), a CDATA section (`[`)
 * or a doctype declaration (`D`, for HTML also `d`) is expected.
 *
 * @param {string} source
 * The xml.
 * @param {number} start
 * the start index of the '<!'
 * @param {DOMHandler} domBuilder
 * Receives `comment`, `startCDATA`/`characters`/`endCDATA` or `startDTD`/`endDTD`.
 * @param {DOMHandler} errorHandler
 * Receives the warnings and fatal errors.
 * @param {boolean} isHTML
 * Whether the more lenient HTML rules apply.
 * @returns {number | never}
 * The end index of the element.
 * @throws {ParseError}
 * In case the element is not well-formed.
 */
function parseDoctypeCommentOrCData(source, start, domBuilder, errorHandler, isHTML) {
	var p = parseUtils(source, start);
	var marker = p.char(2);
	if (isHTML) {
		marker = marker.toUpperCase();
	}

	if (marker === '-') {
		// should be a comment
		var comment = p.getMatch(g.Comment);
		if (!comment) {
			return errorHandler.fatalError('comment is not well-formed at position ' + p.getIndex());
		}
		domBuilder.comment(comment, g.COMMENT_START.length, comment.length - g.COMMENT_START.length - g.COMMENT_END.length);
		return p.getIndex();
	}

	if (marker === '[') {
		// should be CDATA
		var cdata = p.getMatch(g.CDSect);
		if (!cdata) {
			return errorHandler.fatalError('Invalid CDATA starting at position ' + start);
		}
		if (!isHTML && !domBuilder.currentElement) {
			return errorHandler.fatalError('CDATA outside of element');
		}
		domBuilder.startCDATA();
		domBuilder.characters(cdata, g.CDATA_START.length, cdata.length - g.CDATA_START.length - g.CDATA_END.length);
		domBuilder.endCDATA();
		return p.getIndex();
	}

	if (marker !== 'D') {
		return errorHandler.fatalError('Not well-formed XML starting with "<!" at position ' + start);
	}

	// should be DOCTYPE
	if (domBuilder.doc && domBuilder.doc.documentElement) {
		return errorHandler.fatalError('Doctype not allowed inside or after documentElement at position ' + p.getIndex());
	}
	var hasDeclStart = isHTML
		? p.substringStartsWithCaseInsensitive(g.DOCTYPE_DECL_START)
		: p.substringStartsWith(g.DOCTYPE_DECL_START);
	if (!hasDeclStart) {
		return errorHandler.fatalError('Expected ' + g.DOCTYPE_DECL_START + ' at position ' + p.getIndex());
	}
	p.skip(g.DOCTYPE_DECL_START.length);
	if (p.skipBlanks() < 1) {
		return errorHandler.fatalError('Expected whitespace after ' + g.DOCTYPE_DECL_START + ' at position ' + p.getIndex());
	}

	// the parts of the doctype, all but the name are optional
	var publicId;
	var systemId;
	var internalSubset;

	var name = p.getMatch(g.Name);
	if (!name) {
		return errorHandler.fatalError('doctype name missing or contains unexpected characters at position ' + p.getIndex());
	}
	if (isHTML && name.toLowerCase() !== 'html') {
		errorHandler.warning('Unexpected DOCTYPE in HTML document at position ' + p.getIndex());
	}
	p.skipBlanks();

	// Check for ExternalID
	if (p.substringStartsWith(g.PUBLIC) || p.substringStartsWith(g.SYSTEM)) {
		var externalId = g.ExternalID_match.exec(p.substringFromIndex());
		if (!externalId) {
			return errorHandler.fatalError('doctype external id is not well-formed at position ' + p.getIndex());
		}
		var literals = externalId.groups;
		if (literals.SystemLiteralOnly !== undefined) {
			systemId = literals.SystemLiteralOnly;
		} else {
			systemId = literals.SystemLiteral;
			publicId = literals.PubidLiteral;
		}
		p.skip(externalId[0].length);
	} else if (isHTML && p.substringStartsWithCaseInsensitive(g.SYSTEM)) {
		// https://html.spec.whatwg.org/multipage/syntax.html#doctype-legacy-string
		p.skip(g.SYSTEM.length);
		if (p.skipBlanks() < 1) {
			return errorHandler.fatalError('Expected whitespace after ' + g.SYSTEM + ' at position ' + p.getIndex());
		}
		systemId = p.getMatch(g.ABOUT_LEGACY_COMPAT_SystemLiteral);
		if (!systemId) {
			return errorHandler.fatalError(
				'Expected ' + g.ABOUT_LEGACY_COMPAT + ' in single or double quotes after ' + g.SYSTEM + ' at position ' + p.getIndex()
			);
		}
	}

	if (isHTML && systemId && !g.ABOUT_LEGACY_COMPAT_SystemLiteral.test(systemId)) {
		errorHandler.warning('Unexpected doctype.systemId in HTML document at position ' + p.getIndex());
	}
	// an internal subset is only looked for outside of HTML
	if (!isHTML) {
		p.skipBlanks();
		internalSubset = parseDoctypeInternalSubset(p, errorHandler);
	}
	p.skipBlanks();
	if (p.char() !== '>') {
		return errorHandler.fatalError('doctype not terminated with > at position ' + p.getIndex());
	}
	p.skip(1);
	domBuilder.startDTD(name, publicId, systemId, internalSubset);
	domBuilder.endDTD();
	return p.getIndex();
}
|
|
|
|
/**
 * Called when the parser encounters markup starting with '<?'.
 * Reports the processing instruction to `domBuilder.processingInstruction`.
 * A target of `xml` (in any casing) is only accepted at index 0 and has to match `g.XMLDecl`.
 *
 * @param {string} source
 * The complete markup.
 * @param {number} start
 * The index of the '<?'.
 * @param domBuilder
 * Receives the processing instruction.
 * @param errorHandler
 * Its `fatalError` is called when the instruction is not acceptable.
 * @returns
 * The index right behind the processing instruction,
 * or whatever `errorHandler.fatalError` returns in the error cases.
 */
function parseProcessingInstruction(source, start, domBuilder, errorHandler) {
	var remaining = source.substring(start);
	var pi = remaining.match(g.PI);
	if (!pi) {
		return errorHandler.fatalError('Invalid processing instruction starting at position ' + start);
	}

	var target = pi[1];
	if (target.toLowerCase() === 'xml') {
		if (start > 0) {
			return errorHandler.fatalError(
				'processing instruction at position ' + start + ' is an xml declaration which is only at the start of the document'
			);
		}
		if (!g.XMLDecl.test(remaining)) {
			return errorHandler.fatalError('xml declaration is not well-formed');
		}
	}

	domBuilder.processingInstruction(target, pi[2]);
	return start + pi[0].length;
}
|
|
|
|
/**
 * Array-like collection of the attributes of one start tag.
 * Attributes are stored under the indexes `0..length-1` as `{ qName, value, offset }` objects;
 * `attributeNames` maps each qualified name to its index.
 * The tag name is stored in `tagName` once `setTagName` has been called.
 */
function ElementAttributes() {
	// prototype-less, so attribute names can never collide with inherited properties
	this.attributeNames = Object.create(null);
}

ElementAttributes.prototype = {
	/**
	 * Stores the tag name.
	 *
	 * @throws {Error} if `tagName` does not match `g.QName_exact`
	 */
	setTagName: function (tagName) {
		var isQName = g.QName_exact.test(tagName);
		if (!isQName) {
			throw new Error('invalid tagName:' + tagName);
		}
		this.tagName = tagName;
	},
	/**
	 * Appends an attribute.
	 *
	 * @param qName the qualified name of the attribute
	 * @param value the value of the attribute
	 * @param offset stored as is on the attribute entry
	 * @throws {Error} if `qName` does not match `g.QName_exact`
	 */
	addValue: function (qName, value, offset) {
		var isQName = g.QName_exact.test(qName);
		if (!isQName) {
			throw new Error('invalid attribute:' + qName);
		}
		var index = this.length;
		this.attributeNames[qName] = index;
		this[index] = { qName: qName, value: value, offset: offset };
		this.length = index + 1;
	},
	// shared initial value, the first `addValue` call creates an own `length` on the instance
	length: 0,
	// Index based accessors. The fields `localName`, `locator` and `uri` are not set by
	// `addValue`; they are only available after other code has assigned them to the entry.
	getLocalName: function (i) {
		var attribute = this[i];
		return attribute.localName;
	},
	getLocator: function (i) {
		var attribute = this[i];
		return attribute.locator;
	},
	getQName: function (i) {
		var attribute = this[i];
		return attribute.qName;
	},
	getURI: function (i) {
		var attribute = this[i];
		return attribute.uri;
	},
	getValue: function (i) {
		var attribute = this[i];
		return attribute.value;
	},
	// Lookups by name (`getIndex(uri, localName)`, `getValue(uri, localName)`) and
	// `getType` are not implemented.
};
|
|
|
|
// Public API of this module (CommonJS exports).
exports.XMLReader = XMLReader;
|
|
// `parseUtils` and `parseDoctypeCommentOrCData` are exported in addition to the reader itself.
exports.parseUtils = parseUtils;
|
|
exports.parseDoctypeCommentOrCData = parseDoctypeCommentOrCData;
|