diff options
| author | nsfisis <nsfisis@gmail.com> | 2023-09-07 22:27:48 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2023-09-07 22:35:53 +0900 |
| commit | 994e0114d76ae19768d5c303874a968cf6369fd0 (patch) | |
| tree | 5fd3f8b169eea00084b24fbae820f75273864d2a /vhosts/blog/nuldoc-src/xml.ts | |
| parent | 57f015992f678bfd7281f171fb9d71349c96a1a0 (diff) | |
| download | nsfisis.dev-994e0114d76ae19768d5c303874a968cf6369fd0.tar.gz nsfisis.dev-994e0114d76ae19768d5c303874a968cf6369fd0.tar.zst nsfisis.dev-994e0114d76ae19768d5c303874a968cf6369fd0.zip | |
meta: migrate to monorepo
Diffstat (limited to 'vhosts/blog/nuldoc-src/xml.ts')
| -rw-r--r-- | vhosts/blog/nuldoc-src/xml.ts | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/vhosts/blog/nuldoc-src/xml.ts b/vhosts/blog/nuldoc-src/xml.ts new file mode 100644 index 00000000..847b5e12 --- /dev/null +++ b/vhosts/blog/nuldoc-src/xml.ts @@ -0,0 +1,269 @@ +import { Element, Node, Text } from "./dom.ts"; +import { XmlParseError } from "./errors.ts"; + +export async function parseXmlFile(filePath: string): Promise<Element> { + return parseXmlString(await Deno.readTextFile(filePath)); +} + +export function parseXmlString(source: string): Element { + return parse({ source: source, index: 0 }); +} + +type Parser = { + source: string; + index: number; +}; + +function parse(p: Parser): Element { + parseXmlDeclaration(p); + skipWhitespaces(p); + const e = parseXmlElement(p); + const root: Element = { + kind: "element", + name: "__root__", + attributes: new Map(), + children: [e], + }; + return root; +} + +function parseXmlDeclaration(p: Parser) { + expect(p, "<?xml "); + skipTo(p, "?>"); + next(p, 2); +} + +function parseXmlElement(p: Parser): Element { + const { name, attributes, closed } = parseStartTag(p); + if (closed) { + return { + kind: "element", + name: name, + attributes: attributes, + children: [], + }; + } + const children = parseChildNodes(p); + parseEndTag(p, name); + + const thisElement: Element = { + kind: "element", + name: name, + attributes: attributes, + children: children, + }; + return thisElement; +} + +function parseChildNodes(p: Parser): Node[] { + const nodes = []; + while (true) { + const c = peek(p); + const c2 = peekN(p, 2); + const c3 = peekN(p, 3); + if (c === "<") { + if (c2 === "/") { + break; + } else if (c2 === "!") { + if (c3 === "[") { + // <![CDATA[ + nodes.push(parseCdata(p)); + } else { + // <!-- + skipComment(p); + } + } else { + nodes.push(parseXmlElement(p)); + } + } else { + nodes.push(parseTextNode(p)); + } + } + return nodes; +} + +function parseTextNode(p: Parser): Text { + const content = skipTo(p, "<"); + return { + kind: "text", + content: replaceEntityReferences(content), + raw: false, + }; +} + +function parseCdata(p: Parser): Text { + expect(p, "<![CDATA["); + const content = skipTo(p, "]]>"); + next(p, "]]>".length); + return { + kind: "text", + content: formatCdata(content), + raw: false, + }; +} + +function skipComment(p: Parser) { + expect(p, "<!--"); + skipTo(p, "-->"); + next(p, "-->".length); +} + +function formatCdata(s: string): string { + // <![CDATA[ + // foo + // bar + // baz + // ]]> + // => "foo\n bar\nbaz" + s = s.replace(/^\n(.*)\n *$/s, "$1"); + const ls = s.split("\n"); + const n = Math.min( + ...ls.filter((l) => l !== "").map((l) => + l.match(/^( *)/)?.[0]?.length ?? 0 + ), + ); + let z = ""; + for (const p of s.split("\n")) { + z += p.slice(n) + "\n"; + } + return z.slice(0, -1); +} + +function parseStartTag( + p: Parser, +): { name: string; attributes: Map<string, string>; closed: boolean } { + expect(p, "<"); + const name = parseIdentifier(p); + skipWhitespaces(p); + if (peek(p) === "/") { + expect(p, "/>"); + return { name: name, attributes: new Map(), closed: true }; + } + if (peek(p) === ">") { + next(p); + return { name: name, attributes: new Map(), closed: false }; + } + const attributes = new Map(); + while (peek(p) !== ">" && peek(p) !== "/") { + const { name, value } = parseAttribute(p); + attributes.set(name, value); + } + let closed = false; + if (peek(p) === "/") { + next(p); + closed = true; + } + expect(p, ">"); + return { name: name, attributes: attributes, closed: closed }; +} + +function parseEndTag(p: Parser, name: string) { + expect(p, `</${name}>`); +} + +function parseAttribute(p: Parser): { name: string; value: string } { + skipWhitespaces(p); + let name = parseIdentifier(p); + if (peek(p) === ":") { + next(p); + const name2 = parseIdentifier(p); + name += ":" + name2; + } + expect(p, "="); + const value = parseQuotedString(p); + skipWhitespaces(p); + return { name: name, value: replaceEntityReferences(value) }; +} + +function parseQuotedString(p: Parser): string { + expect(p, '"'); + const content = skipTo(p, '"'); + next(p); + return content; +} + +function parseIdentifier(p: Parser): string { + let id = ""; + while (p.index < p.source.length) { + const c = peek(p); + if (!c || !/[A-Za-z]/.test(c)) { + break; + } + id += c; + next(p); + } + return id; +} + +function expect(p: Parser, expected: string) { + let actual = ""; + for (let i = 0; i < expected.length; i++) { + actual += peek(p); + next(p); + } + if (actual !== expected) { + throw new XmlParseError( + `[parse.expect] expected ${expected}, but actually got ${ + escapeForHuman(actual) + } (pos: ${p.index})`, + ); + } +} + +function skipTo(p: Parser, delimiter: string): string { + const indexStart = p.index; + let i = 0; + while (i < delimiter.length) { + if (peek(p) === delimiter[i]) { + i++; + } else { + i = 0; + } + next(p); + } + back(p, delimiter.length); + return p.source.substring(indexStart, p.index); +} + +function skipWhitespaces(p: Parser) { + while (p.index < p.source.length) { + const c = peek(p); + if (!c || !/[ \n\t]/.test(c)) { + break; + } + next(p); + } +} + +function peek(p: Parser): string | null { + return peekN(p, 1); +} + +function peekN(p: Parser, n: number): string | null { + return (p.index + n - 1 < p.source.length) ? p.source[p.index + n - 1] : null; +} + +function next(p: Parser, n = 1) { + p.index += n; +} + +function back(p: Parser, n = 1) { + p.index -= n; +} + +function replaceEntityReferences(s: string): string { + return s + .replaceAll(/&/g, "&") + .replaceAll(/</g, "<") + .replaceAll(/>/g, ">") + .replaceAll(/'/g, "'") + .replaceAll(/"/g, '"'); +} + +function escapeForHuman(s: string): string { + // support more characters? + return s + .replaceAll("\n", "\\n") + .replaceAll("\t", "\\t") + .replaceAll("\r", "\\r"); +} |
