pcre2 / maint /GenerateTest.py

Upload folder using huggingface_hub

864071c verified 3 months ago

5.65 kB

	#! /usr/bin/env python3

	# PCRE2 UNICODE PROPERTY SUPPORT
	# ------------------------------
	#
	# This file auto-generates Unicode property tests and their expected output.
	# It is recommended to re-run this generator after the Unicode files are
	# updated. The names of the generated files are `testinput` and `testoutput`
	# and should be copied over to replace either test26 or test27 files.

	import re
	import sys

	from GenerateCommon import \
	script_names, \
	script_abbrevs

	def write_both(text):
	    """Append *text* to both generated files.

	    The text is written first to the module-level ``input_file`` and then
	    to the module-level ``output_file``.
	    """
	    for sink in (input_file, output_file):
	        sink.write(text)

	def to_string_char(ch_idx):
	    """Return the text used to represent code point *ch_idx* in a test line.

	    Code points 32..127 are returned as the literal character. Everything
	    else is returned as a ``\\x{...}`` hexadecimal escape; values below 16
	    get a leading ``0`` so the escape has at least two hex digits.
	    """
	    if 32 <= ch_idx < 128:
	        return chr(ch_idx)

	    # Only single-digit values need the explicit zero padding.
	    template = "\\x{0%x}" if ch_idx < 16 else "\\x{%x}"
	    return template % ch_idx

	# Create both generated files up front; give up with a non-zero exit
	# status if either of them cannot be opened for writing.
	try:
	    input_file = open("testinput", "w")
	    output_file = open("testoutput", "w")
	except IOError:
	    print("** Couldn't create output files")
	    sys.exit(1)

	# Banner comment that starts both generated files.
	for banner_line in (
	    "# These tests were generated by maint/GenerateTest.py using PCRE2's UCP\n",
	    "# data, do not edit unless that data has changed and they are reflecting\n",
	    "# a previous version.\n\n",
	):
	    write_both(banner_line)

	# ---------------------------------------------------------------------------
	# UNICODE SCRIPT EXTENSION TESTS
	# ---------------------------------------------------------------------------


	def gen_script_tests():
	    """Generate the Unicode Script and Script Extension property tests.

	    Reads ``Unicode.tables/Scripts.txt`` and
	    ``Unicode.tables/ScriptExtensions.txt``, picks a few representative
	    code points for every script except "Unknown", and writes one test
	    case per code point: the pattern and subject go to both generated
	    files, the expected result only to ``output_file``.

	    Exits with status 1 if the first line of Scripts.txt does not carry the
	    expected ``# Scripts-X.Y.Z.txt`` version header. Scripts that have
	    script extension data but no "extension only" character are reported
	    on standard output.
	    """
	    # One record per script, indexed like script_names / script_abbrevs:
	    #   [0] first code point of the first Scripts.txt range of the script
	    #   [1] last code point of the latest Scripts.txt range of the script
	    #   [2] lowest code point listing the script in ScriptExtensions.txt
	    #   [3] highest code point listing the script in ScriptExtensions.txt
	    #   [4] first code point found in ScriptExtensions.txt that lists the
	    #       script although its base script (Scripts.txt) is different
	    # A record stays None until the script is seen in Scripts.txt.
	    script_data = [None] * len(script_names)

	    # Base script name of every code point (None when not listed).
	    char_data = [None] * 0x110000

	    property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+[A-Za-z]) +#")

	    def parse_range(match_obj):
	        # A line holds either a single code point or a "low..high" range.
	        low = int(match_obj.group(1), 16)
	        if match_obj.group(2) is None:
	            return low, low
	        return low, int(match_obj.group(2), 16)

	    def emit_case(pattern, code_point, expect_match):
	        # Pattern and subject go to both files; the result line is only
	        # part of the expected output.
	        subject = to_string_char(code_point)
	        write_both(pattern)
	        write_both(" %s\n" % subject)
	        if expect_match:
	            output_file.write(" 0: %s\n" % subject)
	        else:
	            output_file.write("No match\n")
	        write_both("\n")

	    prev_name = ""
	    script_idx = -1

	    # The Unicode data files are UTF-8; do not depend on the locale default.
	    with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
	        version_match = re.match(r"^# Scripts-(\d+\.\d+\.\d+)\.txt$", f.readline())
	        if version_match is None:
	            # Fail with a clear message instead of an AttributeError.
	            print("** Couldn't find the Unicode version in Unicode.tables/Scripts.txt")
	            sys.exit(1)
	        unicode_version = version_match.group(1)

	        write_both("# Unicode Script Extension tests for version " + unicode_version + "\n\n")
	        write_both("#perltest\n\n")

	        for line in f:
	            match_obj = property_re.match(line)
	            if match_obj is None:
	                continue

	            # Consecutive lines usually share a script, so the list search
	            # is only repeated when the name changes.
	            name = match_obj.group(3)
	            if name != prev_name:
	                script_idx = script_names.index(name)
	                prev_name = name

	            low, high = parse_range(match_obj)
	            for code_point in range(low, high + 1):
	                char_data[code_point] = name

	            if script_data[script_idx] is None:
	                script_data[script_idx] = [low, None, None, None, None]
	            # Overwritten on every range, so the last range seen wins.
	            script_data[script_idx][1] = high

	    # Maps a script abbreviation to its index once it has been seen in
	    # ScriptExtensions.txt.
	    extended_script_indices = {}

	    with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
	        for line in f:
	            match_obj = property_re.match(line)
	            if match_obj is None:
	                continue

	            low, high = parse_range(match_obj)

	            # The property field is a space separated list of abbreviations.
	            for abbrev in match_obj.group(3).split(" "):
	                script_idx = extended_script_indices.get(abbrev)
	                if script_idx is None:
	                    # First occurrence: start the extension range here.
	                    script_idx = script_abbrevs.index(abbrev)
	                    extended_script_indices[abbrev] = script_idx
	                    rec = script_data[script_idx]
	                    rec[2] = low
	                    rec[3] = high
	                else:
	                    # Later occurrence: widen the extension range.
	                    rec = script_data[script_idx]
	                    rec[2] = min(rec[2], low)
	                    rec[3] = max(rec[3], high)

	                if rec[4] is None:
	                    # Look for a code point that belongs to the script only
	                    # through its script extensions.
	                    name = script_names[script_idx]
	                    for code_point in range(low, high + 1):
	                        if char_data[code_point] != name:
	                            rec[4] = code_point
	                            break

	    # Alternate between the short and the long property name so that both
	    # spellings appear in the generated tests.
	    long_property_name = False

	    for script_idx, rec in enumerate(script_data):
	        script_name = script_names[script_idx]

	        if script_name == "Unknown":
	            continue

	        script_abbrev = script_abbrevs[script_idx]

	        write_both("# Base script check\n")
	        emit_case("/^\\p{sc=%s}/utf\n" % script_name, rec[0], True)
	        emit_case("/^\\p{Script=%s}/utf\n" % script_abbrev, rec[1], True)

	        if rec[2] is not None:
	            property_name = "Script_Extensions" if long_property_name else "scx"
	            long_property_name = not long_property_name

	            write_both("# Script extension check\n")
	            emit_case("/^\\p{%s}/utf\n" % script_name, rec[2], True)
	            emit_case("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev), rec[3], True)

	            if rec[4] is not None:
	                write_both("# Script extension only character\n")
	                emit_case("/^\\p{%s}/utf\n" % script_name, rec[4], True)
	                emit_case("/^\\p{sc=%s}/utf\n" % script_name, rec[4], False)
	            else:
	                print("External character has not found for %s" % script_name)

	        # The code point right after the highest recorded one (base range
	        # or extension range, whichever ends later) is used as a subject
	        # that must not match.
	        high = rec[1]
	        if rec[3] is not None and rec[3] > rec[1]:
	            high = rec[3]
	        write_both("# Character not in script\n")
	        emit_case("/^\\p{%s}/utf\n" % script_name, high + 1, False)

	# Generate all tests, then terminate both files with the end marker.
	gen_script_tests()

	write_both("# End of test\n")

	# Close the generated files explicitly so that their contents are flushed
	# to disk here rather than left to interpreter shutdown.
	input_file.close()
	output_file.close()