knowledgebase_law/node_modules/micromark/lib/create-tokenizer.js

/**
* @import {
* Chunk,
* Code,
* ConstructRecord,
* Construct,
* Effects,
* InitialConstruct,
* ParseContext,
* Point,
* State,
* TokenizeContext,
* Token
* } from 'micromark-util-types'
*/
/**
* @callback Restore
* Restore the state.
* @returns {undefined}
* Nothing.
*
* @typedef Info
* Info.
* @property {Restore} restore
* Restore.
* @property {number} from
* From.
*
* @callback ReturnHandle
* Handle a successful run.
* @param {Construct} construct
* Construct.
* @param {Info} info
* Info.
* @returns {undefined}
* Nothing.
*/
import { markdownLineEnding } from 'micromark-util-character';
import { push, splice } from 'micromark-util-chunked';
import { resolveAll } from 'micromark-util-resolve-all';
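// How this module is typically driven inside micromark (a sketch for
// orientation only; `parse`, `preprocess`, and `postprocess` are sibling
// modules, not imports of this file, and the variable names are illustrative):
//
//   const parser = parse(options)
//   // `parser.document()` and friends call `createTokenizer` with the
//   // matching initial construct.
//   const tokenizer = parser.document()
//   // Feed preprocessed chunks; the final `null` chunk flags the end of
//   // input, which makes `write` resolve and return all events.
//   const events = postprocess(tokenizer.write(preprocess()(value, encoding, true)))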
/**
* Create a tokenizer.
* Tokenizers deal with one type of data (e.g., containers, flow, text).
* The parser is the object dealing with it all.
* `initialize` works like other constructs, except that only its `tokenize`
* function is used, in which case it doesn't receive an `ok` or `nok`.
* `from` can be given to set the point before the first character, although
* when further lines are indented, they must be set with `defineSkip`.
*
* @param {ParseContext} parser
* Parser.
* @param {InitialConstruct} initialize
* Construct.
* @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
* Point (optional).
* @returns {TokenizeContext}
* Context.
*/
export function createTokenizer(parser, initialize, from) {
/** @type {Point} */
let point = {
_bufferIndex: -1,
_index: 0,
line: from && from.line || 1,
column: from && from.column || 1,
offset: from && from.offset || 0
};
/** @type {Record<string, number>} */
const columnStart = {};
/** @type {Array<Construct>} */
const resolveAllConstructs = [];
/** @type {Array<Chunk>} */
let chunks = [];
/** @type {Array<Token>} */
let stack = [];
/** @type {boolean | undefined} */
let consumed = true;
/**
* Tools used for tokenizing.
*
* @type {Effects}
*/
const effects = {
attempt: constructFactory(onsuccessfulconstruct),
check: constructFactory(onsuccessfulcheck),
consume,
enter,
exit,
interrupt: constructFactory(onsuccessfulcheck, {
interrupt: true
})
};
/**
* State and tools for resolving and serializing.
*
* @type {TokenizeContext}
*/
const context = {
code: null,
containerState: {},
defineSkip,
events: [],
now,
parser,
previous: null,
sliceSerialize,
sliceStream,
write
};
/**
* The state function.
*
* @type {State | undefined}
*/
let state = initialize.tokenize.call(context, effects);
/**
* Track which character we expect to be consumed, to catch bugs.
*
* @type {Code}
*/
let expectedCode;
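// (`expectedCode` is only written in this build; the development build
// presumably checks in `consume` that the consumed code matches it.)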
if (initialize.resolveAll) {
resolveAllConstructs.push(initialize);
}
return context;
/** @type {TokenizeContext['write']} */
function write(slice) {
chunks = push(chunks, slice);
main();
// Exit if we're not done, resolve might change stuff.
if (chunks[chunks.length - 1] !== null) {
return [];
}
addResult(initialize, 0);
// Otherwise, resolve, and exit.
context.events = resolveAll(resolveAllConstructs, context.events, context);
return context.events;
}
//
// Tools.
//
/** @type {TokenizeContext['sliceSerialize']} */
function sliceSerialize(token, expandTabs) {
return serializeChunks(sliceStream(token), expandTabs);
}
/** @type {TokenizeContext['sliceStream']} */
function sliceStream(token) {
return sliceChunks(chunks, token);
}
/** @type {TokenizeContext['now']} */
function now() {
// This is a hot path, so we clone manually instead of `Object.assign({}, point)`
const {
_bufferIndex,
_index,
line,
column,
offset
} = point;
return {
_bufferIndex,
_index,
line,
column,
offset
};
}
/** @type {TokenizeContext['defineSkip']} */
function defineSkip(value) {
columnStart[value.line] = value.column;
accountForPotentialSkip();
}
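// For example, after `defineSkip({line: 2, column: 5, offset: 12})` (values
// illustrative), `accountForPotentialSkip` bumps a point that lands on line 2,
// column 1 to column 5 and advances `offset` by 4, so indented container
// content is skipped transparently.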
//
// State management.
//
/**
* Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
* `consume`).
* Here is where we walk through the chunks, which either include strings of
* several characters, or numerical character codes.
* The reason to do this in a loop instead of a call is so the stack can
* drain.
*
* @returns {undefined}
* Nothing.
*/
function main() {
/** @type {number} */
let chunkIndex;
while (point._index < chunks.length) {
const chunk = chunks[point._index];
// If we're in a buffer chunk, loop through it.
if (typeof chunk === 'string') {
chunkIndex = point._index;
if (point._bufferIndex < 0) {
point._bufferIndex = 0;
}
while (point._index === chunkIndex && point._bufferIndex < chunk.length) {
go(chunk.charCodeAt(point._bufferIndex));
}
} else {
go(chunk);
}
}
}
/**
* Deal with one code.
*
* @param {Code} code
* Code.
* @returns {undefined}
* Nothing.
*/
function go(code) {
consumed = undefined;
expectedCode = code;
state = state(code);
}
/** @type {Effects['consume']} */
function consume(code) {
if (markdownLineEnding(code)) {
point.line++;
point.column = 1;
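// `-3` is the combined CR+LF code, which spans two characters.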
point.offset += code === -3 ? 2 : 1;
accountForPotentialSkip();
} else if (code !== -1) {
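// Virtual spaces (`-1`, produced when tabs are expanded) do not advance
// the point.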
point.column++;
point.offset++;
}
// Not in a string chunk.
if (point._bufferIndex < 0) {
point._index++;
} else {
point._bufferIndex++;
// At end of string chunk.
// Points w/ non-negative `_bufferIndex` reference strings.
if (point._bufferIndex === /** @type {string} */ (chunks[point._index]).length) {
point._bufferIndex = -1;
point._index++;
}
}
// Expose the previous character.
context.previous = code;
// Mark as consumed.
consumed = true;
}
/** @type {Effects['enter']} */
function enter(type, fields) {
/** @type {Token} */
// @ts-expect-error Patch instead of assign required fields to help GC.
const token = fields || {};
token.type = type;
token.start = now();
context.events.push(['enter', token, context]);
stack.push(token);
return token;
}
/** @type {Effects['exit']} */
function exit(type) {
const token = stack.pop();
token.end = now();
context.events.push(['exit', token, context]);
return token;
}
/**
* Use results.
*
* @type {ReturnHandle}
*/
function onsuccessfulconstruct(construct, info) {
addResult(construct, info.from);
}
/**
* Discard results.
*
* @type {ReturnHandle}
*/
function onsuccessfulcheck(_, info) {
info.restore();
}
/**
* Factory to attempt/check/interrupt.
*
* @param {ReturnHandle} onreturn
* Callback.
* @param {{interrupt?: boolean | undefined} | undefined} [fields]
* Fields.
*/
function constructFactory(onreturn, fields) {
return hook;
/**
* Handle either an object mapping codes to constructs, a list of
* constructs, or a single construct.
*
* @param {Array<Construct> | ConstructRecord | Construct} constructs
* Constructs.
* @param {State} returnState
* State.
* @param {State | undefined} [bogusState]
* State.
* @returns {State}
* State.
*/
function hook(constructs, returnState, bogusState) {
/** @type {ReadonlyArray<Construct>} */
let listOfConstructs;
/** @type {number} */
let constructIndex;
/** @type {Construct} */
let currentConstruct;
/** @type {Info} */
let info;
return Array.isArray(constructs)
  ? /* c8 ignore next 1 */
    handleListOfConstructs(constructs)
  : 'tokenize' in constructs
    ? // Looks like a construct.
      handleListOfConstructs([/** @type {Construct} */ (constructs)])
    : handleMapOfConstructs(constructs);
/**
* Handle a map of constructs.
*
* @param {ConstructRecord} map
* Constructs.
* @returns {State}
* State.
*/
function handleMapOfConstructs(map) {
return start;
/** @type {State} */
function start(code) {
const left = code !== null && map[code];
const all = code !== null && map.null;
const list = [
// To do: add more extension tests.
/* c8 ignore next 2 */
...(Array.isArray(left) ? left : left ? [left] : []), ...(Array.isArray(all) ? all : all ? [all] : [])];
return handleListOfConstructs(list)(code);
}
}
/**
* Handle a list of constructs.
*
* @param {ReadonlyArray<Construct>} list
* Constructs.
* @returns {State}
* State.
*/
function handleListOfConstructs(list) {
listOfConstructs = list;
constructIndex = 0;
if (list.length === 0) {
return bogusState;
}
return handleConstruct(list[constructIndex]);
}
/**
* Handle a single construct.
*
* @param {Construct} construct
* Construct.
* @returns {State}
* State.
*/
function handleConstruct(construct) {
return start;
/** @type {State} */
function start(code) {
// To do: not needed to store if there is no bogus state, probably?
// Currently doesn't work because `inspect` in document does a check
// w/o a bogus, which doesn't make sense. But it does seem to help perf
// by not storing.
info = store();
currentConstruct = construct;
if (!construct.partial) {
context.currentConstruct = construct;
}
// Always populated by defaults.
if (construct.name && context.parser.constructs.disable.null.includes(construct.name)) {
return nok(code);
}
return construct.tokenize.call(
// If we do have fields, create an object w/ `context` as its
// prototype.
// This allows a “live binding”, which is needed for `interrupt`.
fields ? Object.assign(Object.create(context), fields) : context, effects, ok, nok)(code);
}
}
/** @type {State} */
function ok(code) {
consumed = true;
onreturn(currentConstruct, info);
return returnState;
}
/** @type {State} */
function nok(code) {
consumed = true;
info.restore();
if (++constructIndex < listOfConstructs.length) {
return handleConstruct(listOfConstructs[constructIndex]);
}
return bogusState;
}
}
}
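// Inside a construct's `tokenize(effects, ok, nok)`, these factories are used
// roughly like this (a hedged sketch; `other` and `after` are placeholders,
// not names from this file):
//
//   return effects.attempt(other, after, nok)(code)
//
// `attempt` keeps the events of a successful run, `check` and `interrupt`
// always rewind, and the third argument is the state to move to when every
// candidate construct fails.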
/**
* @param {Construct} construct
* Construct.
* @param {number} from
* From.
* @returns {undefined}
* Nothing.
*/
function addResult(construct, from) {
if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
resolveAllConstructs.push(construct);
}
if (construct.resolve) {
splice(context.events, from, context.events.length - from, construct.resolve(context.events.slice(from), context));
}
if (construct.resolveTo) {
context.events = construct.resolveTo(context.events, context);
}
}
/**
* Store state.
*
* @returns {Info}
* Info.
*/
function store() {
const startPoint = now();
const startPrevious = context.previous;
const startCurrentConstruct = context.currentConstruct;
const startEventsIndex = context.events.length;
const startStack = Array.from(stack);
return {
from: startEventsIndex,
restore
};
/**
* Restore state.
*
* @returns {undefined}
* Nothing.
*/
function restore() {
point = startPoint;
context.previous = startPrevious;
context.currentConstruct = startCurrentConstruct;
context.events.length = startEventsIndex;
stack = startStack;
accountForPotentialSkip();
}
}
/**
* Move the current point a bit forward in the line when it's on a column
* skip.
*
* @returns {undefined}
* Nothing.
*/
function accountForPotentialSkip() {
if (point.line in columnStart && point.column < 2) {
point.column = columnStart[point.line];
point.offset += columnStart[point.line] - 1;
}
}
}
/**
* Get the chunks from a slice of chunks in the range of a token.
*
* @param {ReadonlyArray<Chunk>} chunks
* Chunks.
* @param {Pick<Token, 'end' | 'start'>} token
* Token.
* @returns {Array<Chunk>}
* Chunks.
*/
function sliceChunks(chunks, token) {
const startIndex = token.start._index;
const startBufferIndex = token.start._bufferIndex;
const endIndex = token.end._index;
const endBufferIndex = token.end._bufferIndex;
/** @type {Array<Chunk>} */
let view;
if (startIndex === endIndex) {
// @ts-expect-error `_bufferIndex` is used on string chunks.
view = [chunks[startIndex].slice(startBufferIndex, endBufferIndex)];
} else {
view = chunks.slice(startIndex, endIndex);
if (startBufferIndex > -1) {
const head = view[0];
if (typeof head === 'string') {
view[0] = head.slice(startBufferIndex);
/* c8 ignore next 4 -- used to be used, no longer */
} else {
view.shift();
}
}
if (endBufferIndex > 0) {
// @ts-expect-error `_bufferIndex` is used on string chunks.
view.push(chunks[endIndex].slice(0, endBufferIndex));
}
}
return view;
}
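// For instance, with chunks `['foo', 'bar']`, a token starting at
// `{_index: 0, _bufferIndex: 1}` and ending at `{_index: 1, _bufferIndex: 2}`
// slices to `['oo', 'ba']`.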
/**
* Get the string value of a slice of chunks.
*
* @param {ReadonlyArray<Chunk>} chunks
* Chunks.
* @param {boolean | undefined} [expandTabs=false]
* Whether to expand tabs (default: `false`).
* @returns {string}
* Result.
*/
function serializeChunks(chunks, expandTabs) {
let index = -1;
/** @type {Array<string>} */
const result = [];
/** @type {boolean | undefined} */
let atTab;
while (++index < chunks.length) {
const chunk = chunks[index];
/** @type {string} */
let value;
if (typeof chunk === 'string') {
value = chunk;
} else switch (chunk) {
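// Numerical chunks are character codes (see `micromark-util-symbol`):
// -5 carriage return, -4 line feed, -3 CR+LF, -2 horizontal tab,
// -1 virtual space.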
case -5:
{
value = "\r";
break;
}
case -4:
{
value = "\n";
break;
}
case -3:
{
value = "\r" + "\n";
break;
}
case -2:
{
value = expandTabs ? " " : "\t";
break;
}
case -1:
{
if (!expandTabs && atTab) continue;
value = " ";
break;
}
default:
{
// Currently only replacement character.
value = String.fromCharCode(chunk);
}
}
atTab = chunk === -2;
result.push(value);
}
return result.join('');
}
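// For instance, the chunks `['a', -2, -1, -1, 'b']` (a tab padded with two
// virtual spaces) serialize to `'a\tb'` by default and to `'a   b'` with
// `expandTabs`.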