/** * While micromark is a lexer/tokenizer, the common case of going from markdown * to html is currently built in as this module, even though the parts can be * used separately to build ASTs, CSTs, or many other output formats. * * Having an HTML compiler built in is useful because it allows us to check for * compliancy to CommonMark, the de facto norm of markdown, specified in roughly * 600 input/output cases. * * This module has an interface that accepts lists of events instead of the * whole at once, however, because markdown can’t be truly streaming, we buffer * events before processing and outputting the final result. */ /** * @import { * CompileContext, * CompileData, * CompileOptions, * Compile, * Definition, * Event, * Handle, * HtmlExtension, * LineEnding, * NormalizedHtmlExtension, * Token * } from 'micromark-util-types' */ /** * @typedef Media * @property {boolean | undefined} [image] * @property {string | undefined} [labelId] * @property {string | undefined} [label] * @property {string | undefined} [referenceId] * @property {string | undefined} [destination] * @property {string | undefined} [title] */ import { decodeNamedCharacterReference } from 'decode-named-character-reference'; import { push } from 'micromark-util-chunked'; import { combineHtmlExtensions } from 'micromark-util-combine-extensions'; import { decodeNumericCharacterReference } from 'micromark-util-decode-numeric-character-reference'; import { encode as _encode } from 'micromark-util-encode'; import { normalizeIdentifier } from 'micromark-util-normalize-identifier'; import { sanitizeUri } from 'micromark-util-sanitize-uri'; const hasOwnProperty = {}.hasOwnProperty; /** * These two are allowlists of safe protocols for full URLs in respectively the * `href` (on ``) and `src` (on ``) attributes. 
* They are based on what is allowed on GitHub, * */ const protocolHref = /^(https?|ircs?|mailto|xmpp)$/i; const protocolSource = /^https?$/i; /** * @param {CompileOptions | null | undefined} [options] * @returns {Compile} */ export function compile(options) { const settings = options || {}; /** * Tags is needed because according to markdown, links and emphasis and * whatnot can exist in images, however, as HTML doesn’t allow content in * images, the tags are ignored in the `alt` attribute, but the content * remains. * * @type {boolean | undefined} */ let tags = true; /** * An object to track identifiers to media (URLs and titles) defined with * definitions. * * @type {Record} */ const definitions = {}; /** * A lot of the handlers need to capture some of the output data, modify it * somehow, and then deal with it. * We do that by tracking a stack of buffers, that can be opened (with * `buffer`) and closed (with `resume`) to access them. * * @type {Array>} */ const buffers = [[]]; /** * As we can have links in images and the other way around, where the deepest * ones are closed first, we need to track which one we’re in. * * @type {Array} */ const mediaStack = []; /** * Same as `mediaStack` for tightness, which is specific to lists. * We need to track if we’re currently in a tight or loose container. 
* * @type {Array} */ const tightStack = []; /** @type {HtmlExtension} */ const defaultHandlers = { enter: { blockQuote: onenterblockquote, codeFenced: onentercodefenced, codeFencedFenceInfo: buffer, codeFencedFenceMeta: buffer, codeIndented: onentercodeindented, codeText: onentercodetext, content: onentercontent, definition: onenterdefinition, definitionDestinationString: onenterdefinitiondestinationstring, definitionLabelString: buffer, definitionTitleString: buffer, emphasis: onenteremphasis, htmlFlow: onenterhtmlflow, htmlText: onenterhtml, image: onenterimage, label: buffer, link: onenterlink, listItemMarker: onenterlistitemmarker, listItemValue: onenterlistitemvalue, listOrdered: onenterlistordered, listUnordered: onenterlistunordered, paragraph: onenterparagraph, reference: buffer, resource: onenterresource, resourceDestinationString: onenterresourcedestinationstring, resourceTitleString: buffer, setextHeading: onentersetextheading, strong: onenterstrong }, exit: { atxHeading: onexitatxheading, atxHeadingSequence: onexitatxheadingsequence, autolinkEmail: onexitautolinkemail, autolinkProtocol: onexitautolinkprotocol, blockQuote: onexitblockquote, characterEscapeValue: onexitdata, characterReferenceMarkerHexadecimal: onexitcharacterreferencemarker, characterReferenceMarkerNumeric: onexitcharacterreferencemarker, characterReferenceValue: onexitcharacterreferencevalue, codeFenced: onexitflowcode, codeFencedFence: onexitcodefencedfence, codeFencedFenceInfo: onexitcodefencedfenceinfo, codeFencedFenceMeta: onresumedrop, codeFlowValue: onexitcodeflowvalue, codeIndented: onexitflowcode, codeText: onexitcodetext, codeTextData: onexitdata, data: onexitdata, definition: onexitdefinition, definitionDestinationString: onexitdefinitiondestinationstring, definitionLabelString: onexitdefinitionlabelstring, definitionTitleString: onexitdefinitiontitlestring, emphasis: onexitemphasis, hardBreakEscape: onexithardbreak, hardBreakTrailing: onexithardbreak, htmlFlow: onexithtml, 
htmlFlowData: onexitdata, htmlText: onexithtml, htmlTextData: onexitdata, image: onexitmedia, label: onexitlabel, labelText: onexitlabeltext, lineEnding: onexitlineending, link: onexitmedia, listOrdered: onexitlistordered, listUnordered: onexitlistunordered, paragraph: onexitparagraph, reference: onresumedrop, referenceString: onexitreferencestring, resource: onresumedrop, resourceDestinationString: onexitresourcedestinationstring, resourceTitleString: onexitresourcetitlestring, setextHeading: onexitsetextheading, setextHeadingLineSequence: onexitsetextheadinglinesequence, setextHeadingText: onexitsetextheadingtext, strong: onexitstrong, thematicBreak: onexitthematicbreak } }; /** * Combine the HTML extensions with the default handlers. * An HTML extension is an object whose fields are either `enter` or `exit` * (reflecting whether a token is entered or exited). * The values at such objects are names of tokens mapping to handlers. * Handlers are called, respectively when a token is opener or closed, with * that token, and a context as `this`. */ const handlers = /** @type {NormalizedHtmlExtension} */ combineHtmlExtensions([defaultHandlers, ...(settings.htmlExtensions || [])]); /** * Handlers do often need to keep track of some state. * That state is provided here as a key-value store (an object). * * @type {CompileData} */ const data = { definitions, tightStack }; /** * The context for handlers references a couple of useful functions. * In handlers from extensions, those can be accessed at `this`. * For the handlers here, they can be accessed directly. * * @type {Omit} */ const context = { buffer, encode, getData, lineEndingIfNeeded, options: settings, raw, resume, setData, tag }; /** * Generally, micromark copies line endings (`'\r'`, `'\n'`, `'\r\n'`) in the * markdown document over to the compiled HTML. * In some cases, such as `> a`, CommonMark requires that extra line endings * are added: `

\n
a
\n

'); setData('expectFirstItem'); // “Hack” to prevent a line ending from showing up if the item is empty. setData('lastWasTag'); } /** * @returns {undefined} */ function onexitlistordered() { onexitlistitem(); tightStack.pop(); lineEnding(); tag(''); } /** * @returns {undefined} */ function onexitlistunordered() { onexitlistitem(); tightStack.pop(); lineEnding(); tag(''); } /** * @returns {undefined} */ function onexitlistitem() { if (getData('lastWasTag') && !getData('slurpAllLineEndings')) { lineEndingIfNeeded(); } tag('

'); setData('slurpAllLineEndings'); } /** * @this {CompileContext} * @type {Handle} */ function onenterblockquote() { tightStack.push(false); lineEndingIfNeeded(); tag('

'); } /** * @this {CompileContext} * @type {Handle} */ function onexitblockquote() { tightStack.pop(); lineEndingIfNeeded(); tag('

'); setData('slurpAllLineEndings'); } /** * @this {CompileContext} * @type {Handle} */ function onenterparagraph() { if (!tightStack[tightStack.length - 1]) { lineEndingIfNeeded(); tag('

'); } setData('slurpAllLineEndings'); } /** * @this {CompileContext} * @type {Handle} */ function onexitparagraph() { if (tightStack[tightStack.length - 1]) { setData('slurpAllLineEndings', true); } else { tag('

'); } } /** * @this {CompileContext} * @type {Handle} */ function onentercodefenced() { lineEndingIfNeeded(); tag('

');
      setData('slurpOneLineEnding', true);
    }
    setData('fencesCount', count + 1);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onentercodeindented() {
    // Flow code is a block: make sure it starts on its own line, then open
    // the wrapper that `onexitflowcode` closes again.
    lineEndingIfNeeded();
    tag('<pre><code>');
  }

  /**
   * Close flow code (this handler is registered for both `codeFenced` and
   * `codeIndented`) and reset the bookkeeping used while inside it.
   *
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitflowcode() {
    const count = getData('fencesCount');
    // Fewer than two fences seen means the fenced code was never closed.
    const unclosedFence = count !== undefined && count < 2;

    // One special case is if we are inside a container, and the fenced code was
    // not closed (meaning it runs to the end).
    // In that case, the following line ending, is considered *outside* the
    // fenced code and block quote by micromark, but CM wants to treat that
    // ending as part of the code.
    if (unclosedFence && data.tightStack.length > 0 && !getData('lastWasTag')) {
      lineEnding();
    }

    // But in most cases, it’s simpler: when we’ve seen some data, emit an extra
    // line ending when needed.
    if (getData('flowCodeSeenData')) {
      lineEndingIfNeeded();
    }

    tag('</code></pre>');

    if (unclosedFence) {
      lineEndingIfNeeded();
    }

    // Calling `setData` without a value clears the field.
    setData('flowCodeSeenData');
    setData('fencesCount');
    setData('slurpOneLineEnding');
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onenterimage() {
    // Track a new media entry flagged as an image.
    const entry = { image: true };
    mediaStack.push(entry);
    // While inside an image, tags are switched off; `onexitmedia` decides
    // whether to turn them back on.
    tags = undefined;
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onenterlink() {
    // A link starts out as an empty record; later handlers fill in its fields.
    const entry = {};
    mediaStack.push(entry);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitlabeltext(token) {
    // Remember the label's source text; `onexitmedia` falls back to it as the
    // identifier when there is no `referenceId`.
    const current = mediaStack[mediaStack.length - 1];
    current.labelId = this.sliceSerialize(token);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitlabel() {
    // `label` opened a buffer on enter; close it and keep the result as the
    // label of the innermost media entry.
    const current = mediaStack[mediaStack.length - 1];
    current.label = resume();
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitreferencestring(token) {
    // Store the reference's source text on the innermost media entry.
    const current = mediaStack[mediaStack.length - 1];
    current.referenceId = this.sliceSerialize(token);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onenterresource() {
    // We can have line endings in the resource, ignore them.
    buffer();
    // A defined (even if empty) destination makes `onexitmedia` use this entry
    // itself rather than looking up a definition.
    const current = mediaStack[mediaStack.length - 1];
    current.destination = '';
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onenterresourcedestinationstring() {
    // Capture the destination in its own buffer.
    buffer();
    // Ignore encoding the result, as we’ll first percent encode the url and
    // encode manually after.
    const ignore = true;
    setData('ignoreEncode', ignore);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitresourcedestinationstring() {
    // Close the destination buffer and keep its content on the media entry.
    const current = mediaStack[mediaStack.length - 1];
    current.destination = resume();
    // Clear the flag set when the destination string was entered.
    setData('ignoreEncode');
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitresourcetitlestring() {
    // `resourceTitleString` opened a buffer on enter; its content is the title.
    const current = mediaStack[mediaStack.length - 1];
    current.title = resume();
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitmedia() {
    let index = mediaStack.length - 1; // Skip current.
    const media = mediaStack[index];
    const id = media.referenceId || media.labelId;
    // Resources carry their own destination/title; references borrow them from
    // the matching definition.
    const context = media.destination === undefined ? definitions[normalizeIdentifier(id)] : media;

    // Re-enable tags, unless an ancestor is an image (whose `alt` must stay
    // free of markup).
    tags = true;
    while (index--) {
      if (mediaStack[index].image) {
        tags = undefined;
        break;
      }
    }

    // URLs are limited to the allowlisted protocols unless the user opted in
    // to dangerous protocols.
    if (media.image) {
      const source = sanitizeUri(context.destination, settings.allowDangerousProtocol ? undefined : protocolSource);
      tag(`<img src="${source}" alt="`);
      raw(media.label);
      tag('"');
    } else {
      const href = sanitizeUri(context.destination, settings.allowDangerousProtocol ? undefined : protocolHref);
      tag(`<a href="${href}"`);
    }

    tag(context.title ? ` title="${context.title}"` : '');

    if (media.image) {
      tag(' />');
    } else {
      tag('>');
      raw(media.label);
      tag('</a>');
    }

    mediaStack.pop();
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onenterdefinition() {
    // Open a buffer for the definition; `onexitdefinition` closes it again.
    buffer();
    // Definitions reuse the media stack to collect label, destination, title.
    const entry = {};
    mediaStack.push(entry);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitdefinitionlabelstring(token) {
    // Discard label, use the source content instead.
    resume();
    const current = mediaStack[mediaStack.length - 1];
    current.labelId = this.sliceSerialize(token);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onenterdefinitiondestinationstring() {
    // Capture the destination in its own buffer, with encoding switched off
    // until `onexitdefinitiondestinationstring` clears the flag.
    buffer();
    const ignore = true;
    setData('ignoreEncode', ignore);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitdefinitiondestinationstring() {
    // Close the destination buffer and keep its content on the definition.
    const current = mediaStack[mediaStack.length - 1];
    current.destination = resume();
    // Clear the flag set when the destination string was entered.
    setData('ignoreEncode');
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitdefinitiontitlestring() {
    // `definitionTitleString` opened a buffer on enter; its content is the title.
    const current = mediaStack[mediaStack.length - 1];
    current.title = resume();
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitdefinition() {
    const identifier = normalizeIdentifier(mediaStack[mediaStack.length - 1].labelId);

    // Close the buffer opened in `onenterdefinition`; its content is not used.
    resume();

    // The definition is done either way, so take it off the stack.
    const definition = mediaStack.pop();

    // Only the first definition for an identifier is kept.
    if (!hasOwnProperty.call(definitions, identifier)) {
      definitions[identifier] = definition;
    }
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onentercontent() {
    // While this flag is set, `onexitlineending` drops every line ending.
    const slurp = true;
    setData('slurpAllLineEndings', slurp);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitatxheadingsequence(token) {
    // Exit for further sequences.
    if (getData('headingRank')) return;

    // The number of markers in the opening sequence is the heading rank.
    setData('headingRank', this.sliceSerialize(token).length);
    lineEndingIfNeeded();
    tag(`<h${getData('headingRank')}>`);
  }

  /**
   * Start capturing a setext heading: its rank is only known once the
   * underline is seen, so the content is buffered first.
   *
   * @this {CompileContext}
   * @type {Handle}
   */
  function onentersetextheading() {
    buffer();
    setData('slurpAllLineEndings');
  }

  /**
   * After the heading text, drop line endings (see `onexitlineending`) until
   * the heading is closed.
   *
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitsetextheadingtext() {
    setData('slurpAllLineEndings', true);
  }

  /**
   * Close an ATX heading with the rank recorded by its opening sequence.
   *
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitatxheading() {
    tag(`</h${getData('headingRank')}>`);
    setData('headingRank');
  }

  /**
   * Derive the setext heading rank from its underline.
   *
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitsetextheadinglinesequence(token) {
    // Character code 61 is `=` (rank 1); any other underline means rank 2.
    setData('headingRank', this.sliceSerialize(token).charCodeAt(0) === 61 ? 1 : 2);
  }

  /**
   * Emit the buffered setext heading wrapped in the heading element for its
   * rank, then reset the heading state.
   *
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitsetextheading() {
    const value = resume();
    const rank = getData('headingRank');
    lineEndingIfNeeded();
    tag(`<h${rank}>`);
    raw(value);
    tag(`</h${rank}>`);
    setData('slurpAllLineEndings');
    setData('headingRank');
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitdata(token) {
    // Emit the token's source text after passing it through `encode`.
    const text = this.sliceSerialize(token);
    const encoded = encode(text);
    raw(encoded);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitlineending(token) {
    // All line endings are being dropped.
    if (getData('slurpAllLineEndings')) return;

    // Exactly one line ending is dropped; the flag is used up by doing so.
    if (getData('slurpOneLineEnding')) {
      setData('slurpOneLineEnding');
      return;
    }

    // Inside code text a line ending becomes a single space; elsewhere the
    // original line ending is copied over.
    const output = getData('inCodeText') ? ' ' : encode(this.sliceSerialize(token));
    raw(output);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitcodeflowvalue(token) {
    const text = this.sliceSerialize(token);
    raw(encode(text));
    // `onexitflowcode` checks this flag to decide on a trailing line ending.
    const seen = true;
    setData('flowCodeSeenData', seen);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexithardbreak() {
    // Registered for both `hardBreakEscape` and `hardBreakTrailing`: a hard
    // break is rendered as a `<br />` element.
    tag('<br />');
  }

  /**
   * @returns {undefined}
   */
  function onenterhtmlflow() {
    // Flow HTML starts on its own line; otherwise it is handled exactly like
    // text HTML.
    lineEndingIfNeeded();
    return onenterhtml();
  }

  /**
   * @returns {undefined}
   */
  function onexithtml() {
    // Clear the flag that `onenterhtml` may have set.
    const field = 'ignoreEncode';
    setData(field);
  }

  /**
   * @returns {undefined}
   */
  function onenterhtml() {
    // Encoding is only switched off when the user opted in to dangerous HTML.
    if (!settings.allowDangerousHtml) return;
    setData('ignoreEncode', true);
  }

  /**
   * @returns {undefined}
   */
  function onenteremphasis() {
    // Emphasis is rendered as an `<em>` element; `onexitemphasis` closes it.
    tag('<em>');
  }

  /**
   * Open strong importance (`<strong>`); closed by `onexitstrong`.
   *
   * @returns {undefined}
   */
  function onenterstrong() {
    tag('<strong>');
  }

  /**
   * Open inline code (`<code>`) and flag that we are inside it, so that
   * `onexitlineending` turns line endings into spaces.
   *
   * @returns {undefined}
   */
  function onentercodetext() {
    setData('inCodeText', true);
    tag('<code>');
  }

  /**
   * Close inline code and clear the `inCodeText` flag.
   *
   * @returns {undefined}
   */
  function onexitcodetext() {
    setData('inCodeText');
    tag('</code>');
  }

  /**
   * Close the `<em>` opened by `onenteremphasis`.
   *
   * @returns {undefined}
   */
  function onexitemphasis() {
    tag('</em>');
  }

  /**
   * Close the `<strong>` opened by `onenterstrong`.
   *
   * @returns {undefined}
   */
  function onexitstrong() {
    tag('</strong>');
  }

  /**
   * @returns {undefined}
   */
  function onexitthematicbreak() {
    // A thematic break is a block: put it on its own line and render it as a
    // void `<hr />` element.
    lineEndingIfNeeded();
    tag('<hr />');
  }

  /**
   * @this {CompileContext}
   * @param {Token} token
   * @returns {undefined}
   */
  function onexitcharacterreferencemarker(token) {
    // Remember which marker was seen, so that `onexitcharacterreferencevalue`
    // knows how to decode the value that follows.
    const { type } = token;
    setData('characterReferenceType', type);
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitcharacterreferencevalue(token) {
    const source = this.sliceSerialize(token);
    const markerType = getData('characterReferenceType');
    let decoded;

    if (markerType) {
      // A marker was seen, so the value is numeric: base 10 for the numeric
      // marker, base 16 otherwise.
      const base = markerType === "characterReferenceMarkerNumeric" ? 10 : 16;
      decoded = decodeNumericCharacterReference(source, base);
    } else {
      // No marker: a named reference.
      // `decodeNamedCharacterReference` can return `false` for invalid named
      // character references,
      // but everything we’ve tokenized is valid.
      decoded = decodeNamedCharacterReference(source);
    }

    raw(encode(/** @type {string} */ (decoded)));
    setData('characterReferenceType');
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitautolinkprotocol(token) {
    const uri = this.sliceSerialize(token);
    // The URL is both the link target and the link text. The target is limited
    // to the allowlisted protocols unless the user opted in to dangerous ones.
    const href = sanitizeUri(uri, settings.allowDangerousProtocol ? undefined : protocolHref);
    tag(`<a href="${href}">`);
    raw(encode(uri));
    tag('</a>');
  }

  /**
   * @this {CompileContext}
   * @type {Handle}
   */
  function onexitautolinkemail(token) {
    const uri = this.sliceSerialize(token);
    // Email autolinks point at a `mailto:` URL but display the bare address.
    const href = sanitizeUri('mailto:' + uri);
    tag(`<a href="${href}">`);
    raw(encode(uri));
    tag('</a>');
  }
}