322 lines
9.3 KiB
Python
322 lines
9.3 KiB
Python
![]() |
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
Hypothesis strategies.
|
||
|
"""
|
||
|
from __future__ import absolute_import
|
||
|
|
||
|
try:
|
||
|
import hypothesis
|
||
|
|
||
|
del hypothesis
|
||
|
except ImportError:
|
||
|
from typing import Tuple
|
||
|
|
||
|
__all__ = () # type: Tuple[str, ...]
|
||
|
else:
|
||
|
from csv import reader as csv_reader
|
||
|
from os.path import dirname, join
|
||
|
from string import ascii_letters, digits
|
||
|
from sys import maxunicode
|
||
|
from typing import (
|
||
|
Callable,
|
||
|
Iterable,
|
||
|
List,
|
||
|
Optional,
|
||
|
Sequence,
|
||
|
Text,
|
||
|
TypeVar,
|
||
|
cast,
|
||
|
)
|
||
|
from gzip import open as open_gzip
|
||
|
|
||
|
from . import DecodedURL, EncodedURL
|
||
|
|
||
|
from hypothesis import assume
|
||
|
from hypothesis.strategies import (
|
||
|
composite,
|
||
|
integers,
|
||
|
lists,
|
||
|
sampled_from,
|
||
|
text,
|
||
|
)
|
||
|
|
||
|
from idna import IDNAError, check_label, encode as idna_encode
|
||
|
|
||
|
# Public strategy names exported by this module on the branch where the
# ``hypothesis`` import succeeded.
__all__ = (
    "decoded_urls",
    "encoded_urls",
    "hostname_labels",
    "hostnames",
    "idna_text",
    "paths",
    "port_numbers",
)

T = TypeVar("T")
# Type alias used in the type comments of the composite strategies below for
# their first (``draw``) argument.
DrawCallable = Callable[[Callable[..., T]], T]

# Provide a ``unichr`` name on interpreters that lack the builtin, so the code
# below can turn integer code points into text uniformly.
try:
    unichr
except NameError:  # Py3
    unichr = chr  # type: Callable[[int], Text]
|
||
|
|
||
|
def idna_characters():
    # type: () -> Text
    """
    Returns a string containing IDNA characters.

    The string is built on first use from the rows marked C{PVALID} in the
    C{idna-tables-properties.csv.gz} file that sits next to this module, and
    is then kept in a module-level variable so later calls return it
    directly.
    """
    global _idnaCharacters

    if _idnaCharacters:
        return _idnaCharacters

    # Data source "IDNA Derived Properties":
    # https://www.iana.org/assignments/idna-tables-6.3.0/
    # idna-tables-6.3.0.xhtml#idna-tables-properties
    tablePath = join(dirname(__file__), "idna-tables-properties.csv.gz")
    collected = []

    with open_gzip(tablePath) as tableFile:
        rows = csv_reader(
            (rawLine.decode("utf-8") for rawLine in tableFile),
            delimiter=",",
        )
        next(rows)  # Skip header row

        for row in rows:
            # Unpacking doubles as a check that each row has exactly three
            # columns; only the first two are used afterwards.
            codes, prop, description = row

            # CONTEXTO or CONTEXTJ are also allowed, but they come
            # with rules, so we're punting on those here.
            # See: https://tools.ietf.org/html/rfc5892
            if prop == "PVALID":
                first, dash, last = codes.partition("-")
                if not dash:
                    # No end of range given; use start
                    last = first
                start, end = int(first, 16), int(last, 16)

                # Code points above maxunicode cannot be represented
                # (happens using Py2 on Windows), so clamp the range.
                collected.extend(
                    unichr(point)
                    for point in range(start, min(end, maxunicode) + 1)
                )

    _idnaCharacters = u"".join(collected)
    return _idnaCharacters


# Cache for idna_characters(); empty until the first call populates it.
_idnaCharacters = ""  # type: Text
|
||
|
|
||
|
@composite
def idna_text(draw, min_size=1, max_size=None):
    # type: (DrawCallable, int, Optional[int]) -> Text
    """
    A strategy which generates IDNA-encodable text.

    @param min_size: The minimum number of characters in the text.
        Must be at least C{1}.

    @param max_size: The maximum number of characters in the text.
        Use C{None} for an unbounded size; otherwise must be at least C{1}.
    """
    alphabet = idna_characters()

    assert min_size >= 1
    assert max_size is None or max_size >= 1

    candidate = cast(
        Text,
        draw(text(alphabet=alphabet, min_size=min_size, max_size=max_size)),
    )

    # FIXME: There should be a more efficient way to ensure we produce
    # valid IDNA text.
    try:
        idna_encode(candidate)
    except IDNAError:
        encodable = False
    else:
        encodable = True

    # Reject the example when the drawn text cannot be IDNA-encoded.
    assume(encodable)

    return candidate
|
||
|
|
||
|
@composite
def port_numbers(draw, allow_zero=False):
    # type: (DrawCallable, bool) -> int
    """
    A strategy which generates port numbers.

    Values are drawn from C{1} (or C{0}, see below) up to and including
    C{65535}.

    @param allow_zero: Whether to allow port C{0} as a possible value.
    """
    lowest = 0 if allow_zero else 1
    port_strategy = integers(min_value=lowest, max_value=65535)

    return cast(int, draw(port_strategy))
|
||
|
|
||
|
@composite
def hostname_labels(draw, allow_idn=True):
    # type: (DrawCallable, bool) -> Text
    """
    A strategy which generates host name labels.

    Labels are between 1 and 63 characters long when drawn; a drawn label
    that C{check_label} rejects causes the example to be discarded.

    @param allow_idn: Whether to allow non-ASCII characters as allowed by
        internationalized domain names (IDNs).
    """
    if not allow_idn:
        ascii_alphabet = Text(ascii_letters + digits + u"-")
        label = cast(
            Text,
            draw(text(min_size=1, max_size=63, alphabet=ascii_alphabet)),
        )
    else:
        label = cast(Text, draw(idna_text(min_size=1, max_size=63)))

        try:
            label.encode("ascii")
        except UnicodeEncodeError:
            # If the label doesn't encode to ASCII, then we need to check
            # the length of the label after encoding to punycode and adding
            # the xn-- prefix.
            encoded_limit = 63 - len("xn--")

            # Rather than bombing out, just trim from the end until it
            # is short enough, so hypothesis doesn't have to generate
            # new data.
            while len(label.encode("punycode")) > encoded_limit:
                label = label[:-1]

    # Filter invalid labels.
    # It would be better to reliably avoid generation of bogus labels in
    # the first place, but it's hard...
    try:
        check_label(label)
    except UnicodeError:  # pragma: no cover (not always drawn)
        assume(False)

    return label
|
||
|
|
||
|
@composite
def hostnames(draw, allow_leading_digit=True, allow_idn=True):
    # type: (DrawCallable, bool, bool) -> Text
    """
    A strategy which generates host names.

    A host name is made of one leading label followed by one to four more
    labels, joined with dots; trailing labels are dropped while the joined
    text is longer than 252 characters.

    @param allow_leading_digit: Whether to allow a leading digit in host
        names; they were not allowed prior to RFC 1123.

    @param allow_idn: Whether to allow non-ASCII characters as allowed by
        internationalized domain names (IDNs).
    """

    def first_label_ok(candidate):
        # type: (Text) -> bool
        # Only the first character is inspected, and only when leading
        # digits are disallowed.
        return allow_leading_digit or candidate[0] not in digits

    # Draw first label, filtering out labels with leading digits if needed
    leading_strategy = hostname_labels(allow_idn=allow_idn).filter(
        first_label_ok
    )
    labels = [cast(Text, draw(leading_strategy))]

    # Draw remaining labels
    trailing_strategy = lists(
        hostname_labels(allow_idn=allow_idn),
        min_size=1,
        max_size=4,
    )
    labels.extend(cast(List[Text], draw(trailing_strategy)))

    # Trim off labels until the total host name length fits in 252
    # characters.  This avoids having to filter the data.
    # (len(labels) - 1 accounts for the dots between labels.)
    while sum(map(len, labels)) + len(labels) - 1 > 252:
        labels.pop()

    return u".".join(labels)
|
||
|
|
||
|
def path_characters():
    # type: () -> str
    """
    Returns a string containing valid URL path characters.

    The string is computed on the first call and cached in a module-level
    variable; later calls return the cached value.
    """
    global _path_characters

    if _path_characters is not None:
        return _path_characters

    def usable(char):
        # type: (Text) -> bool
        # Exclude reserved characters
        if char in "#/?":
            return False

        # Exclude anything not UTF-8 compatible
        try:
            char.encode("utf-8")
        except UnicodeEncodeError:
            return False

        return True

    candidates = (unichr(code) for code in range(maxunicode))
    _path_characters = "".join(char for char in candidates if usable(char))

    return _path_characters


# Cache for path_characters(); None until the first call populates it.
_path_characters = None  # type: Optional[str]
|
||
|
|
||
|
@composite
def paths(draw):
    # type: (DrawCallable) -> Sequence[Text]
    """
    A strategy which generates URL paths as lists of segments.

    Each path has at most 10 segments, and each segment is non-empty text
    drawn from L{path_characters}.
    """
    segment_strategy = text(min_size=1, alphabet=path_characters())
    path_strategy = lists(segment_strategy, max_size=10)

    return cast(List[Text], draw(path_strategy))
|
||
|
|
||
|
@composite
def encoded_urls(draw):
    # type: (DrawCallable) -> EncodedURL
    """
    A strategy which generates L{EncodedURL}s.

    The scheme is either C{http} or C{https}.  A drawn port of C{0} is
    passed to L{EncodedURL} as C{None}.

    Call the L{EncodedURL.to_uri} method on each URL to get an HTTP
    protocol-friendly URI.
    """
    # Keep the draws in this order: port, host, path, scheme.
    drawn_port = cast(int, draw(port_numbers(allow_zero=True)))
    host = cast(Text, draw(hostnames()))
    path = cast(Sequence[Text], draw(paths()))
    scheme = cast(Text, draw(sampled_from((u"http", u"https"))))

    port = None if drawn_port == 0 else drawn_port  # type: Optional[int]

    return EncodedURL(scheme=scheme, host=host, port=port, path=path)
|
||
|
|
||
|
@composite
def decoded_urls(draw):
    # type: (DrawCallable) -> DecodedURL
    """
    A strategy which generates L{DecodedURL}s.

    Each value wraps a URL drawn from L{encoded_urls}.

    Call the L{EncodedURL.to_uri} method on each URL to get an HTTP
    protocol-friendly URI.
    """
    encoded = draw(encoded_urls())

    return DecodedURL(encoded)
|