/**
 * @import {Code, ConstructRecord, Event, Extension, Previous, State, TokenizeContext, Tokenizer} from 'micromark-util-types'
 */

import {
  asciiAlpha,
  asciiAlphanumeric,
  asciiControl,
  markdownLineEndingOrSpace,
  unicodePunctuation,
  unicodeWhitespace
} from 'micromark-util-character';

// Partial constructs: only used through `effects.check` / `effects.attempt`
// from inside the three autolink constructs below.
const wwwPrefix = { tokenize: tokenizeWwwPrefix, partial: true };
const domain = { tokenize: tokenizeDomain, partial: true };
const path = { tokenize: tokenizePath, partial: true };
const trail = { tokenize: tokenizeTrail, partial: true };
const emailDomainDotTrail = {
  tokenize: tokenizeEmailDomainDotTrail,
  partial: true
};

// The three constructs that are registered in the `text` record.
const wwwAutolink = {
  name: 'wwwAutolink',
  tokenize: tokenizeWwwAutolink,
  previous: previousWww
};
const protocolAutolink = {
  name: 'protocolAutolink',
  tokenize: tokenizeProtocolAutolink,
  previous: previousProtocol
};
const emailAutolink = {
  name: 'emailAutolink',
  tokenize: tokenizeEmailAutolink,
  previous: previousEmail
};

/**
 * Map of character code to the construct(s) that may start at that code.
 *
 * @type {ConstructRecord}
 */
const text = {};

/**
 * Create an extension for `micromark` to support GitHub autolink literal
 * syntax.
 *
 * @returns {Extension}
 *   Extension for `micromark` that can be passed in `extensions` to enable GFM
 *   autolink literal syntax.
 */
export function gfmAutolinkLiteral() {
  return { text };
}

/** @type {Code} */
let code = 48; // `0`

// Add alphanumerics: walk `0`-`9`, then jump to `A`-`Z`, then to `a`-`z`.
// Every one of them can start an email autolink literal.
while (code < 123) {
  text[code] = emailAutolink;
  code++;

  if (code === 58) {
    // Just past `9`: continue at `A`.
    code = 65;
  } else if (code === 91) {
    // Just past `Z`: continue at `a`.
    code = 97;
  }
}

// The other characters that are allowed in email "atext": `+`, `-`, `.`, `_`.
text[43] = emailAutolink;
text[45] = emailAutolink;
text[46] = emailAutolink;
text[95] = emailAutolink;

// `H` / `h` can additionally start a protocol (`http`, `https`) literal.
text[72] = [emailAutolink, protocolAutolink];
text[104] = [emailAutolink, protocolAutolink];

// `W` / `w` can additionally start a `www.` literal.
text[87] = [emailAutolink, wwwAutolink];
text[119] = [emailAutolink, wwwAutolink];

// To do: perform email autolink literals on events, afterwards.
// That's where `markdown-rs` and `cmark-gfm` perform it.
// It should look for `@`, then for atext backwards, and then for a label
// forwards.

// To do: `mailto:`, `xmpp:` protocol as prefix.
/**
 * Email autolink literal.
 *
 * ```markdown
 * > | a contact@example.org b
 *       ^^^^^^^^^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeEmailAutolink(effects, ok, nok) {
  const self = this;
  /** @type {boolean | undefined} Whether a non-trailing dot was seen in the domain. */
  let dot;
  /** @type {boolean | undefined} Whether any domain character was seen. */
  let data;

  return start;

  /**
   * Start of email autolink literal.
   *
   * ```markdown
   * > | a contact@example.org b
   *       ^
   * ```
   *
   * @type {State}
   */
  function start(code) {
    if (
      !gfmAtext(code) ||
      !previousEmail.call(self, self.previous) ||
      previousUnbalanced(self.events)
    ) {
      return nok(code);
    }

    effects.enter('literalAutolink');
    effects.enter('literalAutolinkEmail');
    return atext(code);
  }

  /**
   * In email atext.
   *
   * ```markdown
   * > | a contact@example.org b
   *       ^
   * ```
   *
   * @type {State}
   */
  function atext(code) {
    if (gfmAtext(code)) {
      effects.consume(code);
      return atext;
    }

    // `@`
    if (code === 64) {
      effects.consume(code);
      return emailDomain;
    }

    return nok(code);
  }

  /**
   * In email domain.
   *
   * The reference code (`cmark-gfm`) is a bit overly complex as it handles
   * the `@`, of which there may be just one.
   *
   * ```markdown
   * > | a contact@example.org b
   *               ^
   * ```
   *
   * @type {State}
   */
  function emailDomain(code) {
    // Dot followed by alphanumerical (not `-` or `_`).
    if (code === 46) {
      return effects.check(
        emailDomainDotTrail,
        emailDomainAfter,
        emailDomainDot
      )(code);
    }

    // Alphanumerical, `-`, and `_`.
    if (code === 45 || code === 95 || asciiAlphanumeric(code)) {
      data = true;
      effects.consume(code);
      return emailDomain;
    }

    // To do: `/` if xmpp.

    // Note: normally we'd truncate trailing punctuation from the link.
    // However, email autolink literals cannot contain any of those markers,
    // except for `.`, but that can only occur if it isn't trailing.
    // So we can ignore truncating!
    return emailDomainAfter(code);
  }

  /**
   * In email domain, on dot that is not a trail.
   *
   * ```markdown
   * > | a contact@example.org b
   *                      ^
   * ```
   *
   * @type {State}
   */
  function emailDomainDot(code) {
    effects.consume(code);
    dot = true;
    return emailDomain;
  }

  /**
   * After email domain.
   *
   * ```markdown
   * > | a contact@example.org b
   *                          ^
   * ```
   *
   * @type {State}
   */
  function emailDomainAfter(code) {
    // Domain must not be empty, must include a dot, and must end in
    // alphabetical (this follows the `cmark-gfm` reference behavior).
    if (data && dot && asciiAlpha(self.previous)) {
      effects.exit('literalAutolinkEmail');
      effects.exit('literalAutolink');
      return ok(code);
    }

    return nok(code);
  }
}

/**
 * `www` autolink literal.
 *
 * ```markdown
 * > | a www.example.org b
 *       ^^^^^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeWwwAutolink(effects, ok, nok) {
  const self = this;

  return wwwStart;

  /**
   * Start of www autolink literal.
   *
   * ```markdown
   * > | www.example.com/a?b#c
   *     ^
   * ```
   *
   * @type {State}
   */
  function wwwStart(code) {
    // Must be `W` or `w`, after an allowed character, and not inside an
    // unbalanced label.
    if (
      (code !== 87 && code !== 119) ||
      !previousWww.call(self, self.previous) ||
      previousUnbalanced(self.events)
    ) {
      return nok(code);
    }

    effects.enter('literalAutolink');
    effects.enter('literalAutolinkWww');
    // Note: we *check*, so we can discard the `www.` we parsed.
    // If it worked, we consider it as a part of the domain.
    return effects.check(
      wwwPrefix,
      effects.attempt(domain, effects.attempt(path, wwwAfter), nok),
      nok
    )(code);
  }

  /**
   * After a www autolink literal.
   *
   * ```markdown
   * > | www.example.com/a?b#c
   *                          ^
   * ```
   *
   * @type {State}
   */
  function wwwAfter(code) {
    effects.exit('literalAutolinkWww');
    effects.exit('literalAutolink');
    return ok(code);
  }
}

/**
 * Protocol autolink literal.
 *
 * ```markdown
 * > | a https://example.org b
 *       ^^^^^^^^^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeProtocolAutolink(effects, ok, nok) {
  const self = this;
  // Protocol characters consumed so far (without the `:`).
  let buffer = '';
  // Whether the first of the two slashes was already consumed.
  let seen = false;

  return protocolStart;

  /**
   * Start of protocol autolink literal.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *     ^
   * ```
   *
   * @type {State}
   */
  function protocolStart(code) {
    // `H` or `h`.
    if (
      (code === 72 || code === 104) &&
      previousProtocol.call(self, self.previous) &&
      !previousUnbalanced(self.events)
    ) {
      effects.enter('literalAutolink');
      effects.enter('literalAutolinkHttp');
      buffer += String.fromCodePoint(code);
      effects.consume(code);
      return protocolPrefixInside;
    }

    return nok(code);
  }

  /**
   * In protocol.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *     ^^^^^
   * ```
   *
   * @type {State}
   */
  function protocolPrefixInside(code) {
    // `5` is size of `https`
    if (asciiAlpha(code) && buffer.length < 5) {
      // @ts-expect-error: definitely number.
      buffer += String.fromCodePoint(code);
      effects.consume(code);
      return protocolPrefixInside;
    }

    // `:`
    if (code === 58) {
      const protocol = buffer.toLowerCase();

      if (protocol === 'http' || protocol === 'https') {
        effects.consume(code);
        return protocolSlashesInside;
      }
    }

    return nok(code);
  }

  /**
   * In slashes.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *           ^^
   * ```
   *
   * @type {State}
   */
  function protocolSlashesInside(code) {
    // `/`
    if (code === 47) {
      effects.consume(code);

      if (seen) {
        return afterProtocol;
      }

      seen = true;
      return protocolSlashesInside;
    }

    return nok(code);
  }

  /**
   * After protocol, before domain.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *             ^
   * ```
   *
   * @type {State}
   */
  function afterProtocol(code) {
    // To do: this is different from `markdown-rs`:
    // https://github.com/wooorm/markdown-rs/blob/b3a921c761309ae00a51fe348d8a43adbc54b518/src/construct/gfm_autolink_literal.rs#L172-L182
    return code === null ||
      asciiControl(code) ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code) ||
      unicodePunctuation(code)
      ? nok(code)
      : effects.attempt(domain, effects.attempt(path, protocolAfter), nok)(
          code
        );
  }

  /**
   * After a protocol autolink literal.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *                              ^
   * ```
   *
   * @type {State}
   */
  function protocolAfter(code) {
    effects.exit('literalAutolinkHttp');
    effects.exit('literalAutolink');
    return ok(code);
  }
}

/**
 * `www` prefix.
 *
 * ```markdown
 * > | a www.example.org b
 *       ^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeWwwPrefix(effects, ok, nok) {
  // Number of `w` / `W` characters consumed so far.
  let size = 0;

  return wwwPrefixInside;

  /**
   * In www prefix.
   *
   * ```markdown
   * > | www.example.com
   *     ^^^^
   * ```
   *
   * @type {State}
   */
  function wwwPrefixInside(code) {
    // `W` or `w`, at most three of them.
    if ((code === 87 || code === 119) && size < 3) {
      size++;
      effects.consume(code);
      return wwwPrefixInside;
    }

    // `.`, only directly after exactly three `w`s.
    if (code === 46 && size === 3) {
      effects.consume(code);
      return wwwPrefixAfter;
    }

    return nok(code);
  }

  /**
   * After www prefix.
   *
   * ```markdown
   * > | www.example.com
   *         ^
   * ```
   *
   * @type {State}
   */
  function wwwPrefixAfter(code) {
    // If there is *anything*, we can link.
    return code === null ? nok(code) : ok(code);
  }
}

/**
 * Domain.
 *
 * ```markdown
 * > | a https://example.org b
 *               ^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeDomain(effects, ok, nok) {
  /** @type {boolean | undefined} */
  let underscoreInLastSegment;
  /** @type {boolean | undefined} */
  let underscoreInLastLastSegment;
  /** @type {boolean | undefined} Whether any regular domain character was consumed. */
  let seen;

  return domainInside;

  /**
   * In domain.
   *
   * ```markdown
   * > | https://example.com/a
   *             ^^^^^^^^^^^
   * ```
   *
   * @type {State}
   */
  function domainInside(code) {
    // Check whether this marker, which is a trailing punctuation
    // marker, optionally followed by more trailing markers, and then
    // followed by an end.
    if (code === 46 || code === 95) {
      return effects.check(trail, domainAfter, domainAtPunctuation)(code);
    }

    // GH documents that only alphanumerics (other than `-`, `.`, and `_`) can
    // occur, which sounds like ASCII only, but they also support `www.點看.com`,
    // so that's Unicode.
    // Instead of some new production for Unicode alphanumerics, markdown
    // already has that for Unicode punctuation and whitespace, so use those.
    if (
      code === null ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code) ||
      (code !== 45 && unicodePunctuation(code))
    ) {
      return domainAfter(code);
    }

    seen = true;
    effects.consume(code);
    return domainInside;
  }

  /**
   * In domain, at potential trailing punctuation, that was not trailing.
   *
   * ```markdown
   * > | https://example.com
   *                    ^
   * ```
   *
   * @type {State}
   */
  function domainAtPunctuation(code) {
    if (code === 95) {
      // There is an underscore in the last segment of the domain.
      underscoreInLastSegment = true;
    } else {
      // Otherwise, it's a `.`: save the last segment underscore in the
      // penultimate segment slot.
      underscoreInLastLastSegment = underscoreInLastSegment;
      underscoreInLastSegment = undefined;
    }

    effects.consume(code);
    return domainInside;
  }

  /**
   * After domain.
   *
   * ```markdown
   * > | https://example.com/a
   *                        ^
   * ```
   *
   * @type {State}
   */
  function domainAfter(code) {
    // Note: GH says a dot is needed, but that is not true in practice, so a
    // dot is not required here.
    // Underscores are not allowed in the last two segments, and the domain
    // must not be empty.
    if (underscoreInLastLastSegment || underscoreInLastSegment || !seen) {
      return nok(code);
    }

    return ok(code);
  }
}

/**
 * Path.
 *
 * This tokenizer never fails: it takes no `nok` and always ends in `ok`.
 *
 * ```markdown
 * > | a https://example.org/stuff b
 *                          ^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizePath(effects, ok) {
  // Number of `(` seen.
  let sizeOpen = 0;
  // Number of `)` consumed as part of the path.
  let sizeClose = 0;

  return pathInside;

  /**
   * In path.
   *
   * ```markdown
   * > | https://example.com/a
   *                        ^^
   * ```
   *
   * @type {State}
   */
  function pathInside(code) {
    // `(`
    if (code === 40) {
      sizeOpen++;
      effects.consume(code);
      return pathInside;
    }

    // To do: `markdown-rs` also needs this.
    // If this is a paren, and there are less closings than openings,
    // we don't check for a trail.
    if (code === 41 && sizeClose < sizeOpen) {
      return pathAtPunctuation(code);
    }

    // Check whether this trailing punctuation marker is optionally
    // followed by more trailing markers, and then followed
    // by an end.
    if (
      code === 33 || // `!`
      code === 34 || // `"`
      code === 38 || // `&`
      code === 39 || // `'`
      code === 41 || // `)`
      code === 42 || // `*`
      code === 44 || // `,`
      code === 46 || // `.`
      code === 58 || // `:`
      code === 59 || // `;`
      code === 60 || // `<`
      code === 63 || // `?`
      code === 93 || // `]`
      code === 95 || // `_`
      code === 126 // `~`
    ) {
      return effects.check(trail, ok, pathAtPunctuation)(code);
    }

    if (
      code === null ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code)
    ) {
      return ok(code);
    }

    effects.consume(code);
    return pathInside;
  }

  /**
   * In path, at potential trailing punctuation, that was not trailing.
   *
   * ```markdown
   * > | https://example.com/a"b
   *                          ^
   * ```
   *
   * @type {State}
   */
  function pathAtPunctuation(code) {
    // Count closing parens.
    if (code === 41) {
      sizeClose++;
    }

    effects.consume(code);
    return pathInside;
  }
}

/**
 * Trail.
 *
 * This calls `ok` if this *is* the trail, followed by an end, which means
 * the entire trail is not part of the link.
 * It calls `nok` if this *is* part of the link.
 *
 * ```markdown
 * > | https://example.com").
 *                        ^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeTrail(effects, ok, nok) {
  return trail;

  /**
   * In trail of domain or path.
   *
   * ```markdown
   * > | https://example.com").
   *                        ^
   * ```
   *
   * @type {State}
   */
  function trail(code) {
    // Regular trailing punctuation.
    if (
      code === 33 || // `!`
      code === 34 || // `"`
      code === 39 || // `'`
      code === 41 || // `)`
      code === 42 || // `*`
      code === 44 || // `,`
      code === 46 || // `.`
      code === 58 || // `:`
      code === 59 || // `;`
      code === 63 || // `?`
      code === 95 || // `_`
      code === 126 // `~`
    ) {
      effects.consume(code);
      return trail;
    }

    // `&` followed by one or more alphabeticals and then a `;`, is
    // as a whole considered as trailing punctuation.
    // In all other cases, it is considered as continuation of the URL.
    if (code === 38) {
      effects.consume(code);
      return trailCharacterReferenceStart;
    }

    // Needed because we allow literals after `[` (a deliberate deviation
    // from `cmark-gfm`).
    // Check that it is not followed by `(` or `[`.
    if (code === 93) {
      effects.consume(code);
      return trailBracketAfter;
    }

    if (
      // `<` is an end.
      code === 60 ||
      // So is whitespace.
      code === null ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code)
    ) {
      return ok(code);
    }

    return nok(code);
  }

  /**
   * In trail, after `]`.
   *
   * > 👉 **Note**: this deviates from `cmark-gfm` to fix a bug.
   *
   * ```markdown
   * > | https://example.com](
   *                         ^
   * ```
   *
   * @type {State}
   */
  function trailBracketAfter(code) {
    // Whitespace or something that could start a resource or reference is the end.
    // Switch back to trail otherwise.
    if (
      code === null ||
      code === 40 || // `(`
      code === 91 || // `[`
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code)
    ) {
      return ok(code);
    }

    return trail(code);
  }

  /**
   * In character-reference like trail, after `&`.
   *
   * ```markdown
   * > | https://example.com&amp;).
   *                         ^
   * ```
   *
   * @type {State}
   */
  function trailCharacterReferenceStart(code) {
    // When non-alpha, it's not a trail.
    return asciiAlpha(code) ? trailCharacterReferenceInside(code) : nok(code);
  }

  /**
   * In character-reference like trail.
   *
   * ```markdown
   * > | https://example.com&amp;).
   *                         ^
   * ```
   *
   * @type {State}
   */
  function trailCharacterReferenceInside(code) {
    // Switch back to trail if this is well-formed (`;`).
    if (code === 59) {
      effects.consume(code);
      return trail;
    }

    if (asciiAlpha(code)) {
      effects.consume(code);
      return trailCharacterReferenceInside;
    }

    // It's not a trail.
    return nok(code);
  }
}

/**
 * Dot in email domain trail.
 *
 * This calls `ok` if this *is* the trail, followed by an end, which means
 * the trail is not part of the link.
 * It calls `nok` if this *is* part of the link.
 *
 * ```markdown
 * > | contact@example.org.
 *                        ^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeEmailDomainDotTrail(effects, ok, nok) {
  return start;

  /**
   * Dot.
   *
   * ```markdown
   * > | contact@example.org.
   *                    ^   ^
   * ```
   *
   * @type {State}
   */
  function start(code) {
    // Must be dot.
    effects.consume(code);
    return after;
  }

  /**
   * After dot.
   *
   * ```markdown
   * > | contact@example.org.
   *                     ^   ^
   * ```
   *
   * @type {State}
   */
  function after(code) {
    // Not a trail if alphanumeric.
    return asciiAlphanumeric(code) ? nok(code) : ok(code);
  }
}

/**
 * Check whether a `www` autolink literal may start after `code`.
 *
 * Allowed: start of content (`null`), `(`, `*`, `_`, `[`, `]`, `~`, and
 * line endings or spaces.
 *
 * @type {Previous}
 */
function previousWww(code) {
  return (
    code === null ||
    code === 40 ||
    code === 42 ||
    code === 95 ||
    code === 91 ||
    code === 93 ||
    code === 126 ||
    markdownLineEndingOrSpace(code)
  );
}

/**
 * Check whether a protocol autolink literal may start after `code`:
 * anything that is not an ASCII letter.
 *
 * @type {Previous}
 */
function previousProtocol(code) {
  return !asciiAlpha(code);
}

/**
 * Check whether an email autolink literal may start after `code`.
 *
 * @this {TokenizeContext}
 * @type {Previous}
 */
function previousEmail(code) {
  // Do not allow a slash "inside" atext.
  // The reference code is a bit weird, but that's what it results in.
  // Other than slash, every preceding character is allowed.
  return !(code === 47 || gfmAtext(code));
}

/**
 * Check whether `code` is allowed in the local part of an email literal:
 * `+`, `-`, `.`, `_`, or an ASCII alphanumeric.
 *
 * @param {Code} code
 * @returns {boolean}
 */
function gfmAtext(code) {
  return (
    code === 43 ||
    code === 45 ||
    code === 46 ||
    code === 95 ||
    asciiAlphanumeric(code)
  );
}

/**
 * Check whether there is an unbalanced label (`labelLink` / `labelImage`
 * token without `_balanced`) in the events seen so far.
 *
 * As a side effect, when nothing unbalanced is found, the last event's token
 * is flagged with `_gfmAutolinkLiteralWalkedInto`, so that later calls can
 * stop walking back when they reach it.
 *
 * @param {Array<Event>} events
 * @returns {boolean}
 */
function previousUnbalanced(events) {
  let index = events.length;
  let result = false;

  while (index--) {
    const token = events[index][1];

    if (
      (token.type === 'labelLink' || token.type === 'labelImage') &&
      !token._balanced
    ) {
      result = true;
      break;
    }

    // If we've seen this token, and it was marked as not having any unbalanced
    // bracket before it, we can exit.
    if (token._gfmAutolinkLiteralWalkedInto) {
      result = false;
      break;
    }
  }

  if (events.length > 0 && !result) {
    // Mark the last token as "walked into" w/o finding
    // anything.
    events[events.length - 1][1]._gfmAutolinkLiteralWalkedInto = true;
  }

  return result;
}