174 lines
6.6 KiB
Python
174 lines
6.6 KiB
Python
# This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the
|
|
# URL Standard.
|
|
#
|
|
# The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to
|
|
# these issues:
|
|
#
|
|
# * https://github.com/whatwg/url/issues/341
|
|
# * https://github.com/whatwg/url/issues/543
|
|
# * https://github.com/whatwg/url/issues/733
|
|
# * https://github.com/whatwg/url/issues/744
|
|
#
|
|
# Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues.
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import requests
|
|
|
|
def get_IdnaTestV2_lines():
    """Return the lines of IdnaTestV2.txt, downloading the file first if needed.

    The file is looked up next to this script. When it is missing it is fetched
    from unicode.org and stored at that location so later runs reuse it.

    Returns:
        The file's lines as a list of strings, line endings included.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    IdnaTestV2 = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt")
    if not os.path.exists(IdnaTestV2):
        # Download IdnaTestV2.txt if it doesn't exist yet
        response = requests.get("https://unicode.org/Public/idna/latest/IdnaTestV2.txt")
        # Fail loudly instead of caching an error page as if it were test data.
        response.raise_for_status()
        # Explicit encoding: the data contains non-ASCII code points, so the
        # platform default encoding must not decide how they are stored.
        with open(IdnaTestV2, "w", encoding="utf-8") as handle:
            handle.write(response.text)
    with open(IdnaTestV2, "r", encoding="utf-8") as handle:
        return handle.readlines()
|
|
|
|
def remove_escapes(input):
    """Resolve backslash escapes in *input* by decoding it as a JSON string.

    The text is wrapped in double quotes and handed to the JSON parser, so
    escapes such as ``\\uXXXX`` are replaced by the characters they denote.
    """
    quoted = f'"{input}"'
    return json.loads(quoted)
|
|
|
|
def ends_in_a_number(input):
    """Report whether the last label of *input* consists only of ASCII digits.

    Labels are separated by U+002E, U+FF0E, U+3002 or U+FF61. A single trailing
    separator is ignored when looking for the last label.
    """
    # This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation but there
    # are likely other ways to end up with a dot, e.g., through decomposition or percent-decoding.
    # It also does not entirely match https://url.spec.whatwg.org/#ends-in-a-number-checker. It
    # appears to suffice for the tests in question though.
    labels = re.split(r"\u002E|\uFF0E|\u3002|\uFF61", input)

    # re.split() always yields at least one element, so labels[-1] is safe.
    if labels[-1] == "":
        if len(labels) == 1:
            # The whole input was empty.
            return False
        # Drop the empty label produced by a trailing separator.
        del labels[-1]

    last_label = labels[-1]
    return last_label.isascii() and last_label.isdigit()
|
|
|
|
def contains_bidi_status(statuses):
    """Return True when any entry of *statuses* is one of B1 through B6."""
    bidi_statuses = ["B1", "B2", "B3", "B4", "B5", "B6"]
    return any(candidate in bidi_statuses for candidate in statuses)
|
|
|
|
def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi):
    """Convert the lines of IdnaTestV2.txt into the JSON test structure.

    Args:
        lines: Iterable of raw lines from IdnaTestV2.txt.
        exclude_ipv4_like: Skip tests whose source ends in an ASCII-digit label.
        exclude_std3: Skip tests whose toUnicode value matches the
            UseSTD3ASCIIRules heuristic used below.
        exclude_bidi: Skip tests that still carry a B1-B6 status after the
            ignored statuses have been removed.

    Returns:
        A dict with two keys. "tests" is a list that starts with two header
        strings, followed by one dict per kept test with "input", "output"
        (None when a non-ignored status remains) and, when there are statuses,
        "comment". "unique_statuses" is the sorted list of statuses seen on
        lines that were not skipped by the IPv4-like or STD3 exclusions.

    Raises:
        ValueError: If a non-empty status column does not start with "[".
    """
    # Main quest.
    output = ["THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead."]
    # Spell every flag the way it is written on the command line (hyphens, not underscores).
    output.append(f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude-bidi: {exclude_bidi}")

    # Side quest.
    unique_statuses = set()

    # The URL Standard has
    #
    # * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
    # * CheckHyphens=false; thus ignore V2, V3?
    # * VerifyDnsLength=false; thus ignore A4_1 and A4_2
    ignorable_statuses = ("A4_1", "A4_2", "U1", "V2", "V3")

    for line in lines:
        # Remove newlines
        line = line.rstrip()

        # Remove lines that are comments or empty
        if line.startswith("#") or line == "":
            continue

        # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
        line = remove_escapes(line)

        # Normalize columns
        #
        # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care
        # about the following columns:
        #
        # * Column 1 (source)
        # * Column 4 (toAsciiN)
        # * Column 5 (toAsciiNStatus)
        #
        # We also store Column 2 (toUnicode) to help with UseSTD3ASCIIRules exclusion.
        columns = [column.strip() for column in line.split(";")]

        # Column 1 (source) and Column 2 (toUnicode; if empty, Column 1 (source))
        source = columns[0]
        to_unicode = columns[1]
        if to_unicode == "":
            to_unicode = source

        # Immediately exclude IPv4-like tests when desired. While we could force all their
        # expectations to be failure instead, it's not clear we need that many additional tests that
        # were actually trying to test something else.
        if exclude_ipv4_like and ends_in_a_number(source):
            continue

        if exclude_std3 and re.search(r"\u2260|\u226E|\u226F|\<|\>|\$|,", to_unicode):
            continue

        # Column 4 (toAsciiN; if empty, use Column 2 (toUnicode))
        to_ascii = columns[3]
        if to_ascii == "":
            to_ascii = to_unicode

        # Column 5 (toAsciiNStatus; if empty, use Column 3 (toUnicodeStatus))
        temp_statuses = columns[4]
        if temp_statuses == "":
            temp_statuses = columns[2]

        statuses = []
        if temp_statuses != "":
            # Raise instead of assert so the check also runs under "python -O".
            if not temp_statuses.startswith("["):
                raise ValueError(f"Unexpected status column {temp_statuses!r} in line {line!r}")
            statuses = [status.strip() for status in temp_statuses[1:-1].split(",")]

        # Side quest time. Statuses are recorded before ignored ones are dropped.
        unique_statuses.update(statuses)

        # Split the statuses into those the URL Standard ignores and those that remain.
        ignored_statuses = [status for status in statuses if status in ignorable_statuses]
        statuses = [status for status in statuses if status not in ignorable_statuses]

        if exclude_bidi and contains_bidi_status(statuses):
            continue

        # Any remaining status means the test is expected to fail.
        if statuses:
            to_ascii = None

        test = {"input": source, "output": to_ascii}
        comment_parts = statuses + [f"{status} (ignored)" for status in ignored_statuses]
        if comment_parts:
            test["comment"] = "; ".join(comment_parts)
        output.append(test)

    return {"tests": output, "unique_statuses": sorted(unique_statuses)}
|
|
|
|
def to_json(data):
    """Write *data* as indented JSON to ../resources/IdnaTestV2.json.

    The path is resolved relative to this script's directory. The output is
    followed by a single trailing newline.

    Args:
        data: A JSON-serializable value. NaN and infinite floats are rejected
            because ``allow_nan=False`` is passed to the serializer.
    """
    path = os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json")
    # The context manager closes the file even if serializing or writing fails.
    with open(path, "w") as handle:
        handle.write(json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': ')))
        handle.write("\n")
|
|
|
|
def main():
    """Command line entry point.

    With --statuses, print the unique statuses found in IdnaTestV2.txt.
    Otherwise, with --generate, write the JSON resource. With neither flag,
    print the usage line.
    """
    parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!")
    # All options are plain boolean switches, registered in this order.
    switches = (
        ("--generate", "Generate the JSON resource."),
        ("--exclude-ipv4-like", "Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)"),
        ("--exclude-std3", "Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)"),
        ("--exclude-bidi", "Exclude tests impacted by CheckBidi."),
        ("--statuses", "Print the unique statuses in IdnaTestV2.txt."),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)
    args = parser.parse_args()

    # Nothing to do without one of the two action flags.
    if not (args.generate or args.statuses):
        parser.print_usage()
        return

    output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi)

    # --statuses takes precedence over --generate when both are given.
    if args.statuses:
        print(output["unique_statuses"])
        return

    assert args.generate
    to_json(output["tests"])
|
|
|
|
# Run the command line interface only when executed as a script, so that
# loading this file as a module does not parse arguments or touch the network.
if __name__ == "__main__":
    main()
|