/**
 * @import {
 *   Chunk,
 *   Code,
 *   ConstructRecord,
 *   Construct,
 *   Effects,
 *   InitialConstruct,
 *   ParseContext,
 *   Point,
 *   State,
 *   TokenizeContext,
 *   Token
 * } from 'micromark-util-types'
 */

/**
 * @callback Restore
 *   Restore the state.
 * @returns {undefined}
 *   Nothing.
 *
 * @typedef Info
 *   Info.
 * @property {Restore} restore
 *   Restore.
 * @property {number} from
 *   From.
 *
 * @callback ReturnHandle
 *   Handle a successful run.
 * @param {Construct} construct
 *   Construct.
 * @param {Info} info
 *   Info.
 * @returns {undefined}
 *   Nothing.
 */

import { markdownLineEnding } from 'micromark-util-character';
import { push, splice } from 'micromark-util-chunked';
import { resolveAll } from 'micromark-util-resolve-all';

/**
 * Create a tokenizer.
 * Tokenizers deal with one type of data (e.g., containers, flow, text).
 * The parser is the object dealing with it all.
 * `initialize` works like other constructs, except that only its `tokenize`
 * function is used, in which case it doesn’t receive an `ok` or `nok`.
 * `from` can be given to set the point before the first character, although
 * when further lines are indented, they must be set with `defineSkip`.
 *
 * @param {ParseContext} parser
 *   Parser.
 * @param {InitialConstruct} initialize
 *   Construct.
 * @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
 *   Point (optional).
 * @returns {TokenizeContext}
 *   Context.
 */
export function createTokenizer(parser, initialize, from) {
  /** @type {Point} */
  let point = {
    _bufferIndex: -1,
    _index: 0,
    line: (from && from.line) || 1,
    column: (from && from.column) || 1,
    offset: (from && from.offset) || 0
  };
  /** @type {Record<string, number>} */
  const columnStart = {};
  /** @type {Array<Construct>} */
  const resolveAllConstructs = [];
  /** @type {Array<Chunk>} */
  let chunks = [];
  /** @type {Array<Token>} */
  let stack = [];
  /** @type {boolean | undefined} */
  let consumed = true;

  /**
   * Tools used for tokenizing.
   *
   * @type {Effects}
   */
  const effects = {
    attempt: constructFactory(onsuccessfulconstruct),
    check: constructFactory(onsuccessfulcheck),
    consume,
    enter,
    exit,
    interrupt: constructFactory(onsuccessfulcheck, { interrupt: true })
  };

  /**
   * State and tools for resolving and serializing.
   *
   * @type {TokenizeContext}
   */
  const context = {
    code: null,
    containerState: {},
    defineSkip,
    events: [],
    now,
    parser,
    previous: null,
    sliceSerialize,
    sliceStream,
    write
  };

  /**
   * The state function.
   *
   * @type {State | undefined}
   */
  let state = initialize.tokenize.call(context, effects);

  /**
   * Track which character we expect to be consumed, to catch bugs.
   *
   * @type {Code}
   */
  let expectedCode;

  if (initialize.resolveAll) {
    resolveAllConstructs.push(initialize);
  }

  return context;

  /** @type {TokenizeContext['write']} */
  function write(slice) {
    chunks = push(chunks, slice);
    main();

    // Exit if we’re not done, resolve might change stuff.
    if (chunks[chunks.length - 1] !== null) {
      return [];
    }

    addResult(initialize, 0);

    // Otherwise, resolve, and exit.
    context.events = resolveAll(resolveAllConstructs, context.events, context);
    return context.events;
  }
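  // Illustrative usage sketch (not part of the module): roughly how a caller
  // might drive this tokenizer. `someInitialConstruct` is a hypothetical
  // name; in real use the parser’s document/flow/text streams create the
  // context and the chunks come from the preprocessor.
  //
  //   const context = createTokenizer(parser, someInitialConstruct)
  //   context.write(['a chunk'])            // Not done yet: returns `[]`.
  //   const events = context.write([null])  // `null` (EOF) flushes, resolves
  //                                         // all events, and returns them.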
  //
  // Tools.
  //

  /** @type {TokenizeContext['sliceSerialize']} */
  function sliceSerialize(token, expandTabs) {
    return serializeChunks(sliceStream(token), expandTabs);
  }

  /** @type {TokenizeContext['sliceStream']} */
  function sliceStream(token) {
    return sliceChunks(chunks, token);
  }

  /** @type {TokenizeContext['now']} */
  function now() {
    // This is a hot path, so we clone manually instead of `Object.assign({}, point)`.
    const { _bufferIndex, _index, line, column, offset } = point;
    return { _bufferIndex, _index, line, column, offset };
  }

  /** @type {TokenizeContext['defineSkip']} */
  function defineSkip(value) {
    columnStart[value.line] = value.column;
    accountForPotentialSkip();
  }

  //
  // State management.
  //

  /**
   * Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
   * `consume`).
   * Here is where we walk through the chunks, which either include strings of
   * several characters, or numerical character codes.
   * The reason to do this in a loop instead of a call is so the stack can
   * drain.
   *
   * @returns {undefined}
   *   Nothing.
   */
  function main() {
    /** @type {number} */
    let chunkIndex;

    while (point._index < chunks.length) {
      const chunk = chunks[point._index];

      // If we’re in a buffer chunk, loop through it.
      if (typeof chunk === 'string') {
        chunkIndex = point._index;

        if (point._bufferIndex < 0) {
          point._bufferIndex = 0;
        }

        while (
          point._index === chunkIndex &&
          point._bufferIndex < chunk.length
        ) {
          go(chunk.charCodeAt(point._bufferIndex));
        }
      } else {
        go(chunk);
      }
    }
  }

  /**
   * Deal with one code.
   *
   * @param {Code} code
   *   Code.
   * @returns {undefined}
   *   Nothing.
   */
  function go(code) {
    consumed = undefined;
    expectedCode = code;
    state = state(code);
  }

  /** @type {Effects['consume']} */
  function consume(code) {
    if (markdownLineEnding(code)) {
      point.line++;
      point.column = 1;
      point.offset += code === -3 ? 2 : 1;
      accountForPotentialSkip();
    } else if (code !== -1) {
      point.column++;
      point.offset++;
    }

    // Not in a string chunk.
    if (point._bufferIndex < 0) {
      point._index++;
    } else {
      point._bufferIndex++;

      // At end of string chunk.
      if (
        point._bufferIndex ===
        // Points w/ non-negative `_bufferIndex` reference
        // strings.
        /** @type {string} */ (chunks[point._index]).length
      ) {
        point._bufferIndex = -1;
        point._index++;
      }
    }

    // Expose the previous character.
    context.previous = code;

    // Mark as consumed.
    consumed = true;
  }

  /** @type {Effects['enter']} */
  function enter(type, fields) {
    /** @type {Token} */
    // @ts-expect-error Patch instead of assign required fields to help GC.
    const token = fields || {};
    token.type = type;
    token.start = now();
    context.events.push(['enter', token, context]);
    stack.push(token);
    return token;
  }

  /** @type {Effects['exit']} */
  function exit(type) {
    const token = stack.pop();
    token.end = now();
    context.events.push(['exit', token, context]);
    return token;
  }

  /**
   * Use results.
   *
   * @type {ReturnHandle}
   */
  function onsuccessfulconstruct(construct, info) {
    addResult(construct, info.from);
  }

  /**
   * Discard results.
   *
   * @type {ReturnHandle}
   */
  function onsuccessfulcheck(_, info) {
    info.restore();
  }
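  // Illustrative sketch (not part of the module): how the two handlers above
  // differ as seen from a construct’s `tokenize` function. An `attempt` keeps
  // its results (`addResult`), a `check` throws them away (`info.restore()`);
  // in both cases tokenization continues in the given state. `someConstruct`,
  // `whenOk`, and `whenNok` are hypothetical names.
  //
  //   return effects.attempt(someConstruct, whenOk, whenNok)(code)
  //   return effects.check(someConstruct, whenOk, whenNok)(code)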
  /**
   * Factory to attempt/check/interrupt.
   *
   * @param {ReturnHandle} onreturn
   *   Callback.
   * @param {{interrupt?: boolean | undefined} | undefined} [fields]
   *   Fields.
   */
  function constructFactory(onreturn, fields) {
    return hook;

    /**
     * Handle either an object mapping codes to constructs, a list of
     * constructs, or a single construct.
     *
     * @param {Array<Construct> | ConstructRecord | Construct} constructs
     *   Constructs.
     * @param {State} returnState
     *   State.
     * @param {State | undefined} [bogusState]
     *   State.
     * @returns {State}
     *   State.
     */
    function hook(constructs, returnState, bogusState) {
      /** @type {ReadonlyArray<Construct>} */
      let listOfConstructs;
      /** @type {number} */
      let constructIndex;
      /** @type {Construct} */
      let currentConstruct;
      /** @type {Info} */
      let info;

      return Array.isArray(constructs)
        ? /* c8 ignore next 1 */
          handleListOfConstructs(constructs)
        : 'tokenize' in constructs
          ? // Looks like a construct.
            handleListOfConstructs([/** @type {Construct} */ (constructs)])
          : handleMapOfConstructs(constructs);

      /**
       * Handle a map of constructs.
       *
       * @param {ConstructRecord} map
       *   Constructs.
       * @returns {State}
       *   State.
       */
      function handleMapOfConstructs(map) {
        return start;

        /** @type {State} */
        function start(code) {
          const left = code !== null && map[code];
          const all = code !== null && map.null;
          const list = [
            // To do: add more extension tests.
            /* c8 ignore next 2 */
            ...(Array.isArray(left) ? left : left ? [left] : []),
            ...(Array.isArray(all) ? all : all ? [all] : [])
          ];

          return handleListOfConstructs(list)(code);
        }
      }

      /**
       * Handle a list of constructs.
       *
       * @param {ReadonlyArray<Construct>} list
       *   Constructs.
       * @returns {State}
       *   State.
       */
      function handleListOfConstructs(list) {
        listOfConstructs = list;
        constructIndex = 0;

        if (list.length === 0) {
          return bogusState;
        }

        return handleConstruct(list[constructIndex]);
      }

      /**
       * Handle a single construct.
       *
       * @param {Construct} construct
       *   Construct.
       * @returns {State}
       *   State.
       */
      function handleConstruct(construct) {
        return start;

        /** @type {State} */
        function start(code) {
          // To do: not needed to store if there is no bogus state, probably?
          // Currently doesn’t work because `inspect` in document does a check
          // w/o a bogus, which doesn’t make sense. But it does seem to help perf
          // by not storing.
          info = store();
          currentConstruct = construct;

          if (!construct.partial) {
            context.currentConstruct = construct;
          }

          // Always populated by defaults.
          if (
            construct.name &&
            context.parser.constructs.disable.null.includes(construct.name)
          ) {
            return nok(code);
          }

          return construct.tokenize.call(
            // If we do have fields, create an object w/ `context` as its
            // prototype.
            // This allows a “live binding”, which is needed for `interrupt`.
            fields ? Object.assign(Object.create(context), fields) : context,
            effects,
            ok,
            nok
          )(code);
        }
      }

      /** @type {State} */
      function ok(code) {
        consumed = true;
        onreturn(currentConstruct, info);
        return returnState;
      }

      /** @type {State} */
      function nok(code) {
        consumed = true;
        info.restore();

        if (++constructIndex < listOfConstructs.length) {
          return handleConstruct(listOfConstructs[constructIndex]);
        }

        return bogusState;
      }
    }
  }

  /**
   * @param {Construct} construct
   *   Construct.
   * @param {number} from
   *   From.
   * @returns {undefined}
   *   Nothing.
   */
  function addResult(construct, from) {
    if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
      resolveAllConstructs.push(construct);
    }

    if (construct.resolve) {
      splice(
        context.events,
        from,
        context.events.length - from,
        construct.resolve(context.events.slice(from), context)
      );
    }

    if (construct.resolveTo) {
      context.events = construct.resolveTo(context.events, context);
    }
  }
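  // Illustrative sketch (not part of the module): the shape of a hypothetical
  // construct that `addResult` acts on. `resolve` rewrites only the events of
  // this run (spliced back in place), `resolveTo` can rewrite all events so
  // far, and `resolveAll` is queued once and run at the end of `write`.
  //
  //   const someConstruct = {
  //     name: 'someConstruct',
  //     tokenize(effects, ok, nok) { /* … */ },
  //     resolve(events, context) { return events },
  //     resolveAll(events, context) { return events }
  //   }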
  /**
   * Store state.
   *
   * @returns {Info}
   *   Info.
   */
  function store() {
    const startPoint = now();
    const startPrevious = context.previous;
    const startCurrentConstruct = context.currentConstruct;
    const startEventsIndex = context.events.length;
    const startStack = Array.from(stack);

    return { from: startEventsIndex, restore };

    /**
     * Restore state.
     *
     * @returns {undefined}
     *   Nothing.
     */
    function restore() {
      point = startPoint;
      context.previous = startPrevious;
      context.currentConstruct = startCurrentConstruct;
      context.events.length = startEventsIndex;
      stack = startStack;
      accountForPotentialSkip();
    }
  }

  /**
   * Move the current point a bit forward in the line when it’s on a column
   * skip.
   *
   * @returns {undefined}
   *   Nothing.
   */
  function accountForPotentialSkip() {
    if (point.line in columnStart && point.column < 2) {
      point.column = columnStart[point.line];
      point.offset += columnStart[point.line] - 1;
    }
  }
}

/**
 * Get the chunks from a slice of chunks in the range of a token.
 *
 * @param {ReadonlyArray<Chunk>} chunks
 *   Chunks.
 * @param {Pick<Token, 'end' | 'start'>} token
 *   Token.
 * @returns {Array<Chunk>}
 *   Chunks.
 */
function sliceChunks(chunks, token) {
  const startIndex = token.start._index;
  const startBufferIndex = token.start._bufferIndex;
  const endIndex = token.end._index;
  const endBufferIndex = token.end._bufferIndex;
  /** @type {Array<Chunk>} */
  let view;

  if (startIndex === endIndex) {
    // @ts-expect-error `_bufferIndex` is used on string chunks.
    view = [chunks[startIndex].slice(startBufferIndex, endBufferIndex)];
  } else {
    view = chunks.slice(startIndex, endIndex);

    if (startBufferIndex > -1) {
      const head = view[0];
      if (typeof head === 'string') {
        view[0] = head.slice(startBufferIndex);
        /* c8 ignore next 4 -- used to be used, no longer */
      } else {
        view.shift();
      }
    }

    if (endBufferIndex > 0) {
      // @ts-expect-error `_bufferIndex` is used on string chunks.
      view.push(chunks[endIndex].slice(0, endBufferIndex));
    }
  }

  return view;
}

/**
 * Get the string value of a slice of chunks.
 *
 * @param {ReadonlyArray<Chunk>} chunks
 *   Chunks.
 * @param {boolean | undefined} [expandTabs=false]
 *   Whether to expand tabs (default: `false`).
 * @returns {string}
 *   Result.
 */
function serializeChunks(chunks, expandTabs) {
  let index = -1;
  /** @type {Array<string>} */
  const result = [];
  /** @type {boolean | undefined} */
  let atTab;

  while (++index < chunks.length) {
    const chunk = chunks[index];
    /** @type {string} */
    let value;

    if (typeof chunk === 'string') {
      value = chunk;
    } else
      switch (chunk) {
        case -5: {
          value = "\r";
          break;
        }

        case -4: {
          value = "\n";
          break;
        }

        case -3: {
          value = "\r" + "\n";
          break;
        }

        case -2: {
          value = expandTabs ? " " : "\t";
          break;
        }

        case -1: {
          if (!expandTabs && atTab) continue;
          value = " ";
          break;
        }

        default: {
          // Currently only replacement character.
          value = String.fromCharCode(chunk);
        }
      }

    atTab = chunk === -2;
    result.push(value);
  }

  return result.join('');
}
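// Illustrative sketch (not part of the module): how the numeric codes above
// serialize. A tab is chunked as `-2` followed by virtual-space codes (`-1`);
// without `expandTabs` those virtual spaces are skipped, with it they become
// spaces.
//
//   serializeChunks(['a', -4, 'b'], false) // => 'a\nb'
//   serializeChunks([-2, -1, 'b'], false)  // => '\tb'
//   serializeChunks([-2, -1, 'b'], true)   // => '  b'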