role_based_system/venv/Lib/site-packages/twisted/web/sux.py

# -*- test-case-name: twisted.web.test.test_xml -*-
#
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.


"""
*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network
protocol.  It is not at all clever.  Its main features are that it
does not:

  - support namespaces
  - mung mnemonic entity references
  - validate
  - perform *any* external actions (such as fetching URLs or writing files)
    under *any* circumstances
  - has lots and lots of horrible hacks for supporting broken HTML (as an
    option, they're not on by default).
"""


from twisted.internet.protocol import Protocol
from twisted.python.reflect import prefixedMethodNames

# Elements of the three-tuples in the state table.
BEGIN_HANDLER = 0
DO_HANDLER = 1
END_HANDLER = 2

identChars = ".-_:"
lenientIdentChars = identChars + ";+#/%~"


def nop(*args, **kw):
    """Accept any positional and keyword arguments, do nothing, return None."""
    return None


def unionlist(*args):
    """
    Merge several iterables into one duplicate-free collection of items.

    Items appear in the order in which they are first seen.  Despite the
    name, the result is a dictionary keys view rather than a list.
    """
    seen = {}
    for group in args:
        for item in group:
            seen[item] = 1
    return seen.keys()


def zipfndict(*args, **kw):
    """
    Zip several dictionaries together, key by key.

    Every key found in any of the input dictionaries maps to a tuple holding
    one entry per input dictionary, in argument order.  Where an input
    dictionary lacks the key, the value of the "default" keyword argument is
    used instead (nop when that keyword is not supplied).
    """
    default = kw.get("default", nop)
    allKeys = unionlist(*(fndict.keys() for fndict in args))
    return {
        key: tuple(fndict.get(key, default) for fndict in args) for key in allKeys
    }


def prefixedMethodClassDict(clazz, prefix):
    """
    Map each name reported by prefixedMethodNames(clazz, prefix) to the
    attribute called prefix + name, looked up on the class itself.
    """
    found = {}
    for name in prefixedMethodNames(clazz, prefix):
        found[name] = getattr(clazz, prefix + name)
    return found


def prefixedMethodObjDict(obj, prefix):
    """
    Map each name reported by prefixedMethodNames for obj's class to the
    attribute called prefix + name, looked up on the instance obj.
    """
    found = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        found[name] = getattr(obj, prefix + name)
    return found


class ParseError(Exception):
    """
    Error describing a problem found at a position in the parsed input.

    The constructor arguments are stored as the attributes filename, line,
    col and message; str() renders them as "filename:line:col: message".
    """

    def __init__(self, filename, line, col, message):
        # Hand the values to Exception too, so that args is populated and
        # the exception can be copied or pickled (both rebuild it from args).
        super().__init__(filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self) -> str:
        return f"{self.filename}:{self.line}:{self.col}: {self.message}"


class XMLParser(Protocol):
    state = None
    encodings = None
    filename = "<xml />"
    beExtremelyLenient = 0
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    def connectionMade(self):
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """Get the line number and column of the last character parsed"""
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """Raise a ParseError for message at the position given by saveMark()."""
        position = self.saveMark()
        raise ParseError(self.filename, *position, message)

    def _buildStateTable(self):
        """Return a dictionary of begin, do, end state function tuples"""
        # _buildStateTable leaves something to be desired but it does what it
        # does.. probably slowly, so I'm doing some evil caching so it doesn't
        # get called more than once per class.
        stateTable = getattr(self.__class__, "__stateTable", None)
        if stateTable is None:
            stateTable = self.__class__.__stateTable = zipfndict(
                *(
                    prefixedMethodObjDict(self, prefix)
                    for prefix in ("begin_", "do_", "end_")
                )
            )
        return stateTable

    def _decode(self, data):
        if "UTF-16" in self.encodings or "UCS-2" in self.encodings:
            assert not len(data) & 1, "UTF-16 must come in pairs for now"
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = str(data, encoding)
        return data

    def maybeBodyData(self):
        if self.endtag:
            return "bodydata"

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == "script" and "src" not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return "waitforendscript"
        return "bodydata"

    def dataReceived(self, data):
        """
        Decode a chunk of input and feed it, one character at a time, to the
        handlers of the current state.

        On the first call (no state yet) a leading UTF-16 byte-order mark is
        detected, remembered in self._prepend and stripped, and the state is
        set to "begin".  For every character the current state's do_ handler
        is called; when it returns the name of a different state, the old
        state's end_ handler and then the new state's begin_ handler (given
        the same character) are called.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith((b"\xff\xfe", b"\xfe\xff")):
                self._prepend = data[0:2]
                self.encodings.append("UTF-16")
                data = data[2:]
            self.state = "begin"
        # NOTE(review): every chunk is decoded on its own, so a multi-byte
        # UTF-8 sequence split across two calls would presumably fail to
        # decode -- confirm whether callers always deliver whole characters.
        if self.encodings:
            data = self._decode(data)
        else:
            data = data.decode("utf-8")
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark

        def saveMark():
            # Reads the enclosing locals, so errors raised mid-chunk report
            # the position reached so far in this call.
            return (lineno, colno)

        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == "\n":
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            # Always restore the real saveMark and publish the position.
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.
        """
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        if byte.isspace():
            return
        if byte != "<":
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return "bodydata"
            self._parseError(f"First char of document [{byte!r}] wasn't <")
        return "tagstart"

    def begin_comment(self, byte):
        self.commentbuf = ""

    def do_comment(self, byte):
        self.commentbuf += byte
        if self.commentbuf.endswith("-->"):
            self.gotComment(self.commentbuf[:-3])
            return "bodydata"

    def begin_tagstart(self, byte):
        self.tagName = ""  # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0  # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == "!--":
                return "comment"
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return "waitforgt"
                return "attrs"
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == ">":
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            else:
                self.gotTagStart(self.tagName, {})
                return (
                    (not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()
                )
        elif byte == "/":
            if self.tagName:
                return "afterslash"
            else:
                self.endtag = 1
        elif byte in "!?":
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == "[":
            if self.tagName == "!":
                return "expectcdata"
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                self.bodydata = "<"
                return "unentity"
            self._parseError("Invalid tag character: %r" % byte)

    def begin_unentity(self, byte):
        self.bodydata += byte

    def do_unentity(self, byte):
        self.bodydata += byte
        return "bodydata"

    def end_unentity(self):
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = "[CDATA["
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return "waitforgt"
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ""
            return "cdata"
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return "bodydata"

    def end_cdata(self):
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ""

    def do_attrs(self, byte):
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == "!DOCTYPE":
                return "doctype"
            if self.tagName[0] in "!?":
                return "waitforgt"
            return "attrname"
        elif byte.isspace():
            return
        elif byte == ">":
            self.gotTagStart(self.tagName, self.tagAttributes)
            return (not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()
        elif byte == "/":
            return "afterslash"
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        self.doctype = byte

    def do_doctype(self, byte):
        if byte == ">":
            return "bodydata"
        self.doctype += byte

    def end_doctype(self):
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        if byte == ">":
            if self.endtag or not self.beExtremelyLenient:
                return "bodydata"
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return "beforeeq"
        elif self.beExtremelyLenient:
            if byte in "\"'":
                return "attrval"
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            if byte == "/":
                self._attrname_termtag = 1
                return
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError(f"Invalid attribute name: {self.attrname!r} {byte!r}")

    def do_beforeattrval(self, byte):
        if byte in "\"'":
            return "attrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return "messyattr"
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == "\\":
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError(
            "Invalid initial attribute value: %r; Attribute values must be quoted."
            % byte
        )

    attrname = ""
    attrval = ""

    def begin_beforeeq(self, byte):
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        if byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                return "attrname"
            elif byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            elif byte == "/":
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        self.quotetype = byte
        self.attrval = ""

    def do_attrval(self, byte):
        if byte == self.quotetype:
            return "attrs"
        self.attrval += byte

    def end_attrval(self):
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ""

    def begin_messyattr(self, byte):
        self.attrval = byte

    def do_messyattr(self, byte):
        if byte.isspace():
            return "attrs"
        elif byte == ">":
            endTag = 0
            if self.attrval.endswith("/"):
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != ">":
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return "bodydata"

    def begin_bodydata(self, byte):
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            del self._leadingBodyData
        else:
            self.bodydata = ""

    def do_bodydata(self, byte):
        if byte == "<":
            return "tagstart"
        if byte == "&":
            return "entityref"
        self.bodydata += byte

    def end_bodydata(self):
        self.gotText(self.bodydata)
        self.bodydata = ""

    def do_waitforendscript(self, byte):
        if byte == "<":
            return "waitscriptendtag"
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        self.temptagdata = ""
        self.tagName = ""
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == "/":
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not "script".startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return "waitforendscript"
            elif self.tagName == "script":
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return "waitforgt"
        # 3
        elif byte.isspace():
            return "waitscriptendtag"
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"

    def begin_entityref(self, byte):
        self.erefbuf = ""
        self.erefextra = ""  # extra bit for lenient mode

    def do_entityref(self, byte):
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return "spacebodydata"
            self._parseError("Bad entity reference")
        elif byte != ";":
            self.erefbuf += byte
        else:
            return "bodydata"

    def end_entityref(self):
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        self.bodydata = self.erefextra
        self.erefextra = None

    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """Encountered an opening tag.

        Default behaviour is to print."""
        print("begin", name, attributes)

    def gotText(self, data):
        """Encountered text

        Default behaviour is to print."""
        print("text:", repr(data))

    def gotEntityReference(self, entityRef):
        """Encountered mnemonic entity reference

        Default behaviour is to print."""
        print("entityRef: &%s;" % entityRef)

    def gotComment(self, comment):
        """Encountered comment.

        Default behaviour is to ignore."""
        pass

    def gotCData(self, cdata):
        """Encountered CDATA

        Default behaviour is to call the gotText method"""
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print("!DOCTYPE", repr(doctype))

    def gotTagEnd(self, name):
        """Encountered closing tag

        Default behaviour is to print."""
        print("end", name)
修改知识库为表中数据,完善了权限通知功能 2025-02-26 21:05:55 +08:00			`# -- test-case-name: twisted.web.test.test_xml --`
			`#`
			`# Copyright (c) Twisted Matrix Laboratories.`
			`# See LICENSE for details.`


			`"""`
			`Small, Uncomplicated XML.`

			`This is a very simple implementation of XML/HTML as a network`
			`protocol. It is not at all clever. Its main features are that it`
			`does not:`

			`- support namespaces`
			`- mung mnemonic entity references`
			`- validate`
			`- perform any external actions (such as fetching URLs or writing files)`
			`under any circumstances`
			`- has lots and lots of horrible hacks for supporting broken HTML (as an`
			`option, they're not on by default).`
			`"""`


			`from twisted.internet.protocol import Protocol`
			`from twisted.python.reflect import prefixedMethodNames`

			`# Elements of the three-tuples in the state table.`
			`BEGIN_HANDLER = 0`
			`DO_HANDLER = 1`
			`END_HANDLER = 2`

			`identChars = ".-_:"`
			`lenientIdentChars = identChars + ";+#/%~"`


			`def nop(args, *kw):`
			`"Do nothing."`


			`def unionlist(*args):`
			`l = []`
			`for x in args:`
			`l.extend(x)`
			`d = {x: 1 for x in l}`
			`return d.keys()`


			`def zipfndict(args, *kw):`
			`default = kw.get("default", nop)`
			`d = {}`
			`for key in unionlist(*(fndict.keys() for fndict in args)):`
			`d[key] = tuple(x.get(key, default) for x in args)`
			`return d`


			`def prefixedMethodClassDict(clazz, prefix):`
			`return {`
			`name: getattr(clazz, prefix + name)`
			`for name in prefixedMethodNames(clazz, prefix)`
			`}`


			`def prefixedMethodObjDict(obj, prefix):`
			`return {`
			`name: getattr(obj, prefix + name)`
			`for name in prefixedMethodNames(obj.__class__, prefix)`
			`}`


			`class ParseError(Exception):`
			`def __init__(self, filename, line, col, message):`
			`self.filename = filename`
			`self.line = line`
			`self.col = col`
			`self.message = message`

			`def __str__(self) -> str:`
			`return f"{self.filename}:{self.line}:{self.col}: {self.message}"`


			`class XMLParser(Protocol):`
			`state = None`
			`encodings = None`
			`filename = "<xml />"`
			`beExtremelyLenient = 0`
			`_prepend = None`

			`# _leadingBodyData will sometimes be set before switching to the`
			`# 'bodydata' state, when we "accidentally" read a byte of bodydata`
			`# in a different state.`
			`_leadingBodyData = None`

			`def connectionMade(self):`
			`self.lineno = 1`
			`self.colno = 0`
			`self.encodings = []`

			`def saveMark(self):`
			`"""Get the line number and column of the last character parsed"""`
			`# This gets replaced during dataReceived, restored afterwards`
			`return (self.lineno, self.colno)`

			`def _parseError(self, message):`
			`raise ParseError(*((self.filename,) + self.saveMark() + (message,)))`

			`def _buildStateTable(self):`
			`"""Return a dictionary of begin, do, end state function tuples"""`
			`# _buildStateTable leaves something to be desired but it does what it`
			`# does.. probably slowly, so I'm doing some evil caching so it doesn't`
			`# get called more than once per class.`
			`stateTable = getattr(self.__class__, "__stateTable", None)`
			`if stateTable is None:`
			`stateTable = self.__class__.__stateTable = zipfndict(`
			`*(`
			`prefixedMethodObjDict(self, prefix)`
			`for prefix in ("begin_", "do_", "end_")`
			`)`
			`)`
			`return stateTable`

			`def _decode(self, data):`
			`if "UTF-16" in self.encodings or "UCS-2" in self.encodings:`
			`assert not len(data) & 1, "UTF-16 must come in pairs for now"`
			`if self._prepend:`
			`data = self._prepend + data`
			`for encoding in self.encodings:`
			`data = str(data, encoding)`
			`return data`

			`def maybeBodyData(self):`
			`if self.endtag:`
			`return "bodydata"`

			`# Get ready for fun! We're going to allow`
			`# <script>if (foo < bar)</script> to work!`
			`# We do this by making everything between <script> and`
			`# </script> a Text`
			`# BUT <script src="foo"> will be special-cased to do regular,`
			`# lenient behavior, because those may not have </script>`
			`# -radix`

			`if self.tagName == "script" and "src" not in self.tagAttributes:`
			`# we do this ourselves rather than having begin_waitforendscript`
			`# because that can get called multiple times and we don't want`
			`# bodydata to get reset other than the first time.`
			`self.begin_bodydata(None)`
			`return "waitforendscript"`
			`return "bodydata"`

			`def dataReceived(self, data):`
			`stateTable = self._buildStateTable()`
			`if not self.state:`
			`# all UTF-16 starts with this string`
			`if data.startswith((b"\xff\xfe", b"\xfe\xff")):`
			`self._prepend = data[0:2]`
			`self.encodings.append("UTF-16")`
			`data = data[2:]`
			`self.state = "begin"`
			`if self.encodings:`
			`data = self._decode(data)`
			`else:`
			`data = data.decode("utf-8")`
			`# bring state, lineno, colno into local scope`
			`lineno, colno = self.lineno, self.colno`
			`curState = self.state`
			`# replace saveMark with a nested scope function`
			`_saveMark = self.saveMark`

			`def saveMark():`
			`return (lineno, colno)`

			`self.saveMark = saveMark`
			`# fetch functions from the stateTable`
			`beginFn, doFn, endFn = stateTable[curState]`
			`try:`
			`for byte in data:`
			`# do newline stuff`
			`if byte == "\n":`
			`lineno += 1`
			`colno = 0`
			`else:`
			`colno += 1`
			`newState = doFn(byte)`
			`if newState is not None and newState != curState:`
			`# this is the endFn from the previous state`
			`endFn()`
			`curState = newState`
			`beginFn, doFn, endFn = stateTable[curState]`
			`beginFn(byte)`
			`finally:`
			`self.saveMark = _saveMark`
			`self.lineno, self.colno = lineno, colno`
			`# state doesn't make sense if there's an exception..`
			`self.state = curState`

			`def connectionLost(self, reason):`
			`"""`
			`End the last state we were in.`
			`"""`
			`stateTable = self._buildStateTable()`
			`stateTable[self.state][END_HANDLER]()`

			`# state methods`

			`def do_begin(self, byte):`
			`if byte.isspace():`
			`return`
			`if byte != "<":`
			`if self.beExtremelyLenient:`
			`self._leadingBodyData = byte`
			`return "bodydata"`
			`self._parseError(f"First char of document [{byte!r}] wasn't <")`
			`return "tagstart"`

			`def begin_comment(self, byte):`
			`self.commentbuf = ""`

			`def do_comment(self, byte):`
			`self.commentbuf += byte`
			`if self.commentbuf.endswith("-->"):`
			`self.gotComment(self.commentbuf[:-3])`
			`return "bodydata"`

			`def begin_tagstart(self, byte):`
			`self.tagName = "" # name of the tag`
			`self.tagAttributes = {} # attributes of the tag`
			`self.termtag = 0 # is the tag self-terminating`
			`self.endtag = 0`

			`def do_tagstart(self, byte):`
			`if byte.isalnum() or byte in identChars:`
			`self.tagName += byte`
			`if self.tagName == "!--":`
			`return "comment"`
			`elif byte.isspace():`
			`if self.tagName:`
			`if self.endtag:`
			`# properly strict thing to do here is probably to only`
			`# accept whitespace`
			`return "waitforgt"`
			`return "attrs"`
			`else:`
			`self._parseError("Whitespace before tag-name")`
			`elif byte == ">":`
			`if self.endtag:`
			`self.gotTagEnd(self.tagName)`
			`return "bodydata"`
			`else:`
			`self.gotTagStart(self.tagName, {})`
			`return (`
			`(not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()`
			`)`
			`elif byte == "/":`
			`if self.tagName:`
			`return "afterslash"`
			`else:`
			`self.endtag = 1`
			`elif byte in "!?":`
			`if self.tagName:`
			`if not self.beExtremelyLenient:`
			`self._parseError("Invalid character in tag-name")`
			`else:`
			`self.tagName += byte`
			`self.termtag = 1`
			`elif byte == "[":`
			`if self.tagName == "!":`
			`return "expectcdata"`
			`else:`
			`self._parseError("Invalid '[' in tag-name")`
			`else:`
			`if self.beExtremelyLenient:`
			`self.bodydata = "<"`
			`return "unentity"`
			`self._parseError("Invalid tag character: %r" % byte)`

			`def begin_unentity(self, byte):`
			`self.bodydata += byte`

			`def do_unentity(self, byte):`
			`self.bodydata += byte`
			`return "bodydata"`

			`def end_unentity(self):`
			`self.gotText(self.bodydata)`

			`def begin_expectcdata(self, byte):`
			`self.cdatabuf = byte`

			`def do_expectcdata(self, byte):`
			`self.cdatabuf += byte`
			`cdb = self.cdatabuf`
			`cd = "[CDATA["`
			`if len(cd) > len(cdb):`
			`if cd.startswith(cdb):`
			`return`
			`elif self.beExtremelyLenient:`
			`## WHAT THE CRAP!? MSWord9 generates HTML that includes these`
			`## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore`
			`## 'em as best I can. this should really be a separate parse`
			`## state but I don't even have any idea what these _are_.`
			`return "waitforgt"`
			`else:`
			`self._parseError("Mal-formed CDATA header")`
			`if cd == cdb:`
			`self.cdatabuf = ""`
			`return "cdata"`
			`self._parseError("Mal-formed CDATA header")`

			`def do_cdata(self, byte):`
			`self.cdatabuf += byte`
			`if self.cdatabuf.endswith("]]>"):`
			`self.cdatabuf = self.cdatabuf[:-3]`
			`return "bodydata"`

			`def end_cdata(self):`
			`self.gotCData(self.cdatabuf)`
			`self.cdatabuf = ""`

			`def do_attrs(self, byte):`
			`if byte.isalnum() or byte in identChars:`
			`# XXX FIXME really handle !DOCTYPE at some point`
			`if self.tagName == "!DOCTYPE":`
			`return "doctype"`
			`if self.tagName[0] in "!?":`
			`return "waitforgt"`
			`return "attrname"`
			`elif byte.isspace():`
			`return`
			`elif byte == ">":`
			`self.gotTagStart(self.tagName, self.tagAttributes)`
			`return (not self.beExtremelyLenient) and "bodydata" or self.maybeBodyData()`
			`elif byte == "/":`
			`return "afterslash"`
			`elif self.beExtremelyLenient:`
			`# discard and move on? Only case I've seen of this so far was:`
			`# <foo bar="baz"">`
			`return`
			`self._parseError("Unexpected character: %r" % byte)`

			`def begin_doctype(self, byte):`
			`self.doctype = byte`

			`def do_doctype(self, byte):`
			`if byte == ">":`
			`return "bodydata"`
			`self.doctype += byte`

			`def end_doctype(self):`
			`self.gotDoctype(self.doctype)`
			`self.doctype = None`

			`def do_waitforgt(self, byte):`
			`if byte == ">":`
			`if self.endtag or not self.beExtremelyLenient:`
			`return "bodydata"`
			`return self.maybeBodyData()`

			`def begin_attrname(self, byte):`
			`self.attrname = byte`
			`self._attrname_termtag = 0`

			`def do_attrname(self, byte):`
			`if byte.isalnum() or byte in identChars:`
			`self.attrname += byte`
			`return`
			`elif byte == "=":`
			`return "beforeattrval"`
			`elif byte.isspace():`
			`return "beforeeq"`
			`elif self.beExtremelyLenient:`
			`if byte in "\"'":`
			`return "attrval"`
			`if byte in lenientIdentChars or byte.isalnum():`
			`self.attrname += byte`
			`return`
			`if byte == "/":`
			`self._attrname_termtag = 1`
			`return`
			`if byte == ">":`
			`self.attrval = "True"`
			`self.tagAttributes[self.attrname] = self.attrval`
			`self.gotTagStart(self.tagName, self.tagAttributes)`
			`if self._attrname_termtag:`
			`self.gotTagEnd(self.tagName)`
			`return "bodydata"`
			`return self.maybeBodyData()`
			`# something is really broken. let's leave this attribute where it`
			`# is and move on to the next thing`
			`return`
			`self._parseError(f"Invalid attribute name: {self.attrname!r} {byte!r}")`

			`def do_beforeattrval(self, byte):`
			`if byte in "\"'":`
			`return "attrval"`
			`elif byte.isspace():`
			`return`
			`elif self.beExtremelyLenient:`
			`if byte in lenientIdentChars or byte.isalnum():`
			`return "messyattr"`
			`if byte == ">":`
			`self.attrval = "True"`
			`self.tagAttributes[self.attrname] = self.attrval`
			`self.gotTagStart(self.tagName, self.tagAttributes)`
			`return self.maybeBodyData()`
			`if byte == "\\":`
			`# I saw this in actual HTML once:`
			`# <font size=\"3\"><sup>SM</sup></font>`
			`return`
			`self._parseError(`
			`"Invalid initial attribute value: %r; Attribute values must be quoted."`
			`% byte`
			`)`

			# Empty defaults for the attribute name and attribute value buffers.
			attrname = attrval = ""

			`def begin_beforeeq(self, byte):`
			`self._beforeeq_termtag = 0`

			`def do_beforeeq(self, byte):`
			`if byte == "=":`
			`return "beforeattrval"`
			`elif byte.isspace():`
			`return`
			`elif self.beExtremelyLenient:`
			`if byte.isalnum() or byte in identChars:`
			`self.attrval = "True"`
			`self.tagAttributes[self.attrname] = self.attrval`
			`return "attrname"`
			`elif byte == ">":`
			`self.attrval = "True"`
			`self.tagAttributes[self.attrname] = self.attrval`
			`self.gotTagStart(self.tagName, self.tagAttributes)`
			`if self._beforeeq_termtag:`
			`self.gotTagEnd(self.tagName)`
			`return "bodydata"`
			`return self.maybeBodyData()`
			`elif byte == "/":`
			`self._beforeeq_termtag = 1`
			`return`
			`self._parseError("Invalid attribute")`

			`def begin_attrval(self, byte):`
			`self.quotetype = byte`
			`self.attrval = ""`

			`def do_attrval(self, byte):`
			`if byte == self.quotetype:`
			`return "attrs"`
			`self.attrval += byte`

			`def end_attrval(self):`
			`self.tagAttributes[self.attrname] = self.attrval`
			`self.attrname = self.attrval = ""`

			`def begin_messyattr(self, byte):`
			`self.attrval = byte`

			`def do_messyattr(self, byte):`
			`if byte.isspace():`
			`return "attrs"`
			`elif byte == ">":`
			`endTag = 0`
			`if self.attrval.endswith("/"):`
			`endTag = 1`
			`self.attrval = self.attrval[:-1]`
			`self.tagAttributes[self.attrname] = self.attrval`
			`self.gotTagStart(self.tagName, self.tagAttributes)`
			`if endTag:`
			`self.gotTagEnd(self.tagName)`
			`return "bodydata"`
			`return self.maybeBodyData()`
			`else:`
			`self.attrval += byte`

			`def end_messyattr(self):`
			`if self.attrval:`
			`self.tagAttributes[self.attrname] = self.attrval`

			`def begin_afterslash(self, byte):`
			`self._after_slash_closed = 0`

			`def do_afterslash(self, byte):`
			`# this state is only after a self-terminating slash, e.g. <foo/>`
			`if self._after_slash_closed:`
			`self._parseError("Mal-formed") # XXX When does this happen??`
			`if byte != ">":`
			`if self.beExtremelyLenient:`
			`return`
			`else:`
			`self._parseError("No data allowed after '/'")`
			`self._after_slash_closed = 1`
			`self.gotTagStart(self.tagName, self.tagAttributes)`
			`self.gotTagEnd(self.tagName)`
			`# don't need maybeBodyData here because there better not be`
			`# any javascript code after a <script/>... we'll see :(`
			`return "bodydata"`

			`def begin_bodydata(self, byte):`
			`if self._leadingBodyData:`
			`self.bodydata = self._leadingBodyData`
			`del self._leadingBodyData`
			`else:`
			`self.bodydata = ""`

			`def do_bodydata(self, byte):`
			`if byte == "<":`
			`return "tagstart"`
			`if byte == "&":`
			`return "entityref"`
			`self.bodydata += byte`

			`def end_bodydata(self):`
			`self.gotText(self.bodydata)`
			`self.bodydata = ""`

			`def do_waitforendscript(self, byte):`
			`if byte == "<":`
			`return "waitscriptendtag"`
			`self.bodydata += byte`

			`def begin_waitscriptendtag(self, byte):`
			`self.temptagdata = ""`
			`self.tagName = ""`
			`self.endtag = 0`

			`def do_waitscriptendtag(self, byte):`
			`# 1 enforce / as first byte read`
			`# 2 enforce following bytes to be subset of "script" until`
			`# tagName == "script"`
			`# 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)`
			`# 3 spaces can happen anywhere, they're ignored`
			`# e.g. < / script >`
			`# 4 anything else causes all data I've read to be moved to the`
			`# bodydata, and switch back to waitforendscript state`

			`# If it turns out this _isn't_ a </script>, we need to`
			`# remember all the data we've been through so we can append it`
			`# to bodydata`
			`self.temptagdata += byte`

			`# 1`
			`if byte == "/":`
			`self.endtag = True`
			`elif not self.endtag:`
			`self.bodydata += "<" + self.temptagdata`
			`return "waitforendscript"`
			`# 2`
			`elif byte.isalnum() or byte in identChars:`
			`self.tagName += byte`
			`if not "script".startswith(self.tagName):`
			`self.bodydata += "<" + self.temptagdata`
			`return "waitforendscript"`
			`elif self.tagName == "script":`
			`self.gotText(self.bodydata)`
			`self.gotTagEnd(self.tagName)`
			`return "waitforgt"`
			`# 3`
			`elif byte.isspace():`
			`return "waitscriptendtag"`
			`# 4`
			`else:`
			`self.bodydata += "<" + self.temptagdata`
			`return "waitforendscript"`

			`def begin_entityref(self, byte):`
			`self.erefbuf = ""`
			`self.erefextra = "" # extra bit for lenient mode`

			`def do_entityref(self, byte):`
			`if byte.isspace() or byte == "<":`
			`if self.beExtremelyLenient:`
			`# '&foo' probably was '&foo'`
			`if self.erefbuf and self.erefbuf != "amp":`
			`self.erefextra = self.erefbuf`
			`self.erefbuf = "amp"`
			`if byte == "<":`
			`return "tagstart"`
			`else:`
			`self.erefextra += byte`
			`return "spacebodydata"`
			`self._parseError("Bad entity reference")`
			`elif byte != ";":`
			`self.erefbuf += byte`
			`else:`
			`return "bodydata"`

			`def end_entityref(self):`
			`self.gotEntityReference(self.erefbuf)`

			# Hacky support for whitespace after "&" in an entity reference.  This
			# state should only be entered when beExtremelyLenient is set.
			`def begin_spacebodydata(self, byte):`
			`self.bodydata = self.erefextra`
			`self.erefextra = None`

			# Once entered, spacebodydata handles and flushes text exactly like bodydata.
			do_spacebodydata = do_bodydata
			end_spacebodydata = end_bodydata

			`# Sorta SAX-ish API`

			def gotTagStart(self, name, attributes):
				"""Called for each opening tag.

				The default implementation prints the tag name and its attributes."""
				print("begin", name, attributes)

			def gotText(self, data):
				"""Called for each run of text.

				The default implementation prints the repr of the text."""
				print("text:", repr(data))

			def gotEntityReference(self, entityRef):
				"""Called for each mnemonic entity reference.

				The default implementation prints it back in "&name;" form."""
				rendered = "entityRef: &%s;" % entityRef
				print(rendered)

			def gotComment(self, comment):
				"""Called for each comment.

				The default implementation ignores it."""
				return None

			`def gotCData(self, cdata):`
			`"""Encountered CDATA`

			`Default behaviour is to call the gotText method"""`
			`self.gotText(cdata)`

			def gotDoctype(self, doctype):
				"""Called for a DOCTYPE declaration.

				This is really grotty: the argument is simply everything found
				between '<!DOCTYPE' and '>'.  The default implementation prints
				its repr.
				"""
				print("!DOCTYPE", repr(doctype))

			def gotTagEnd(self, name):
				"""Called for each closing tag.

				The default implementation prints the tag name."""
				print("end", name)