wav2vec2-xls-r-1b-ca-lm / text /soros.py

add LM to 1b model

04b0dd9 almost 3 years ago

5.78 kB

	"Soros interpreter (see http://numbertext.org)"
	from __future__ import unicode_literals
	from __future__ import print_function
	import re
	import sys


	def run(program, data, lang):
	    """Compile ``program`` for ``lang`` and apply it to ``data``.

	    One-shot convenience wrapper: equivalent to
	    ``compile(program, lang).run(data)``.  ``compile`` here is this
	    module's own function, not the Python builtin.

	    Returns whatever the compiled interpreter's ``run`` returns for
	    ``data``.
	    """
	    return compile(program, lang).run(data)


	def compile(program, lang):
	    """Return a ``_Soros`` interpreter built from ``program`` for ``lang``.

	    The name shadows the builtin ``compile`` inside this module; it is
	    kept because it is part of the module's public interface.
	    """
	    return _Soros(program, lang)

	# conversion function


	def _tr(text, chars, chars2, delim):
	for i in range(0, len(chars)):
	text = text.replace(delim + chars[i], chars2[i])
	return text


	# string literals for metacharacter encoding:
	# backslash, double quote, semicolon, hash, dollar, parentheses, pipe and
	# square brackets.  The table must have exactly one character per entry of
	# _c below (10 each); a stray backslash before the pipe would shift every
	# slice taken from it.
	_m = "\\\";#$()|[]"
	# Unicode private area: one placeholder code point per entry of _m
	_c = u"\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007\uE008\uE009"
	# placeholder that stands for "|" once "$", "(", ")", "|" are mapped to _c[:4]
	_pipe = u"\uE003"
	# separator prefix = \uE00A

	# pattern to recognize function calls in the replacement string.
	# It is written with ordinary escaped metacharacters and then translated by
	# _tr so that it matches the private-area placeholders instead:
	# \$, \(, \), \| -> \uE000..\uE003
	_func = re.compile(_tr(r"""(?:\|?(?:\$\()+)?   # optional nested calls
	    (\|?\$\(([^\(\)]*)\)\|?)                   # inner call (2 subgroups)
	    (?:\)+\|?)?""",                            # optional nested calls
	    _m[4:8], _c[:4], "\\"), re.X)


	class _Soros:
	    """A Soros program compiled into an ordered list of rewrite rules.

	    ``self.lines`` keeps the rules in program order.  Each rule is a list
	    ``[regex, replacement, begin_only, end_only]``: ``regex`` is a compiled
	    pattern anchored at both ends, ``replacement`` is a template for
	    ``Match.expand``, and the two booleans record whether the source
	    pattern started with ``^`` or ended with ``$``.
	    """

	    def __init__(self, prg, lang):
	        """Parse the program text ``prg`` and fill ``self.lines``.

	        ``lang`` selects the language-dependent lines of the program: an
	        underscore in it is replaced by a hyphen and the result is inserted
	        verbatim into a regular expression.

	        A pattern that fails to compile is reported on stderr and the
	        exception is re-raised.
	        """
	        # NOTE(review): this file reached review with its regex literals
	        # mangled ("|" written as "\|", "*" quantifiers missing, escaped
	        # parentheses replaced by "$").  The "|" and parenthesis fixes
	        # follow from the comments and table sizes in this module; the
	        # restored "*" positions should be confirmed against the upstream
	        # Soros.py.
	        self.lines = []
	        if prg.find("__numbertext__") == -1:
	            prg = "__numbertext__;" + prg
	        # default left zero deletion
	        # and separator function (no separation, if subcall returns with
	        # empty string).  Built line by line so that the rules do not pick
	        # up whatever indentation this source file happens to have.
	        default_rules = (
	            u"0+(0|[1-9]\\d*) $1\n"
	            u"\"([a-z][-a-z]* )0+(0|[1-9]\\d*)\" $(\\1\\2)\n"
	            u"\"\uE00A(.*)\uE00A(.+)\uE00A(.*)\" \\1\\2\\3\n"
	            u"\"\uE00A.*\uE00A\uE00A.*\"\n"
	        )
	        prg = prg.replace("__numbertext__", default_rules)
	        # \\, \", \;, \# -> \uE000..\uE003
	        prg = _tr(prg, _m[:4], _c[:4], "\\")
	        # switch off all country-dependent lines, and switch on the
	        # requested ones
	        prg = re.sub(
	            r"(^|[\n;])([^\n;#]*#[^\n]*[\[]:[^\n:\]]*:][^\n]*)",
	            r"\1#\2", prg)
	        prg = re.sub(
	            r"(^|[\n;])#([^\n;#]*#[^\n]*[\[]:" +
	            lang.replace("_", "-") + r":][^\n]*)",
	            r"\1\2", prg)
	        # group 1: the (optionally quoted) pattern, group 2: the replacement
	        matchline = re.compile(r'^\s*("[^"]*"|[^\s]*)\s*(.*[^\s])?\s*$')
	        prefix = ""
	        # strip comments, then treat newlines and ";" alike as separators
	        for s in re.sub("(#[^\n]*)?(\n|$)", ";", prg).split(";"):
	            macro = re.match("== *(.*[^ ]?) ==", s)
	            if macro is not None:
	                # "== name ==" header: following rules get "name" prefixed
	                prefix = macro.group(1)
	                continue
	            m = matchline.match(s)
	            if prefix != "" and s != "" and m is not None:
	                # rebuild the line as  "[^]prefix pattern" replacement
	                s = m.group(1).strip("\"")
	                space = " " if s != "" else ""
	                caret = ""
	                if s[0:1] == "^":
	                    s = s[1:]
	                    caret = "^"
	                s2 = m.group(2) if m.group(2) is not None else ""
	                s = "\"" + caret + prefix + space + s + "\" " + s2
	                m = matchline.match(s)
	            if m is None:
	                continue
	            # -> \\, ", ;, #
	            s = _tr(m.group(1).strip("\""), _c[1:4], _m[1:4], "") \
	                .replace(_c[_m.find("\\")], "\\\\")
	            if m.group(2) is not None:
	                s2 = m.group(2).strip("\"")
	            else:
	                s2 = ""
	            # \$, \(, \), \|, \[, \] -> \uE004..\uE009
	            s2 = _tr(s2, _m[4:], _c[4:], "\\")
	            # call inner separator:
	            # [ ... $1 ... ] -> $(\uE00A ... \uE00A$1\uE00A ... )
	            s2 = re.sub(r"[\[]\$(\d\d?|\([^\)]+\))",
	                        u"$(\uE00A\uE00A|$\\1\uE00A", s2)
	            s2 = re.sub(r"[\[]([^\$[\\]*)\$(\d\d?|\([^\)]+\))",
	                        u"$(\uE00A\\1\uE00A$\\2\uE00A", s2)
	            # add "|" in terminating position
	            s2 = re.sub(r"\uE00A]$", "|\uE00A)", s2)
	            s2 = re.sub(r"]", ")", s2)
	            # $()|$() -> $()||$()
	            s2 = re.sub(r"(\$\d|\))\|\$", r"\1||$", s2)
	            # \uE000..\uE003 -> \, ", ;, #
	            s2 = _tr(s2, _c[:4], _m[:4], "")
	            # $, (, ), | -> \uE000..\uE003
	            s2 = _tr(s2, _m[4:8], _c[:4], "")
	            # \uE004..\uE009 -> $, (, ), |, [, ]
	            s2 = _tr(s2, _c[4:], _m[4:], "")
	            # Encoded "$N" becomes an encoded call "$(\g<N>)", then every
	            # remaining "\N" becomes "\g<N>" for Match.expand.
	            s2 = re.sub(
	                r"\\(\d)", r"\\g<\1>",
	                re.sub(r"\uE000(\d)", "\uE000\uE001\\\\g<\\1>\uE002", s2))
	            try:
	                regex = re.compile("^" + s.lstrip("^").rstrip("$") + "$")
	            except Exception:
	                print("Error in following regex line: " + s, file=sys.stderr)
	                raise
	            self.lines.append([regex, s2, s[:1] == "^", s[-1:] == "$"])

	    def run(self, data):
	        """Rewrite ``data`` with the program, treating it as a whole input.

	        Both the begin and the end flag are set, so rules marked with
	        ``^`` or ``$`` are eligible.
	        """
	        return self._run(data, True, True)

	    def _run(self, data, begin, end):
	        """Apply the first eligible rule that matches ``data``.

	        Rules marked begin-only are skipped when ``begin`` is false, and
	        rules marked end-only are skipped when ``end`` is false.  Calls
	        embedded in the expanded replacement are evaluated recursively
	        until none is left.  Returns the resulting string, or ``""`` if
	        no rule matches.
	        """
	        for regex, template, begin_only, end_only in self.lines:
	            if (not begin and begin_only) or (not end and end_only):
	                continue
	            m = regex.match(data)
	            if not m:
	                continue
	            try:
	                s = m.expand(template)
	            except Exception:
	                print("Error for the following input: " + data,
	                      file=sys.stderr)
	                raise
	            n = _func.search(s)
	            while n:
	                b = False
	                e = False
	                # An explicit pipe next to the call forces the flag;
	                # otherwise a call at the very start/end of the string
	                # inherits the caller's flag.
	                if n.group(1)[0:1] == _pipe or n.group()[0:1] == _pipe:
	                    b = True
	                elif n.start() == 0:
	                    b = begin
	                if n.group(1)[-1:] == _pipe or n.group()[-1:] == _pipe:
	                    e = True
	                elif n.end() == len(s):
	                    e = end
	                # replace only the inner call (group 1) with its result
	                s = s[:n.start(1)] + self._run(n.group(2), b, e) \
	                    + s[n.end(1):]
	                n = _func.search(s)
	            return s
	        return ""