# PereLluis13's picture
# add LM to 1b model
# 04b0dd9
"Soros interpreter (see http://numbertext.org)"
from __future__ import unicode_literals
from __future__ import print_function
import re
import sys
def run(program, data, lang):
    """Compile the Soros ``program`` for ``lang`` and apply it to ``data``.

    Convenience wrapper: builds an interpreter with the module-level
    ``compile`` function and returns what its ``run`` method produces
    for ``data``.
    """
    interpreter = compile(program, lang)
    return interpreter.run(data)
def compile(program, lang):
    """Return a ``_Soros`` interpreter built from ``program`` for ``lang``.

    NOTE: inside this module the name shadows the ``compile`` builtin; it
    is kept because it is part of the module's public interface.
    """
    interpreter = _Soros(program, lang)
    return interpreter
# conversion function
def _tr(text, chars, chars2, delim):
for i in range(0, len(chars)):
text = text.replace(delim + chars[i], chars2[i])
return text
# String literals for metacharacter encoding: the ten metacharacters
#   \ " ; # $ ( ) | [ ]
# listed in the order in which they are paired with the code points of _c.
_m = "\\\";#$()|[]"
# Unicode private area: ten code points, the position-by-position
# counterparts of the metacharacters in _m.
_c = u"\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007\uE008\uE009"
# Same code point as _c[3]; it is what "|" is encoded to in the strings
# that _func scans.
_pipe = u"\uE003"
# separator prefix = \uE00A
# Pattern to recognize function calls in the replacement string.
# It is written with the readable escapes \$ \( \) \| and then converted by
# _tr so that it matches their private-area encodings instead. Group 1 is
# the innermost call including optional adjacent pipes, group 2 its argument.
_func = re.compile(_tr(r"""(?:\|?(?:\$\()+)? # optional nested calls
(\|?\$\(([^\(\)]*)\)\|?) # inner call (2 subgroups)
(?:\)+\|?)?""", # optional nested calls
_m[4:8], _c[:4], "\\"), re.X) # \$, \(, \), \| -> \uE000..\uE003
class _Soros:
    """A compiled Soros program: an ordered list of rewrite rules.

    ``lines`` holds one entry per rule, in source order, of the form
    ``[compiled_pattern, replacement_template, begins_with_caret,
    ends_with_dollar]``. ``run`` applies the first rule whose pattern
    matches the whole input and recursively resolves the function calls
    contained in its replacement.
    """

    def __init__(self, prg, lang):
        """Parse the Soros source ``prg`` and compile its rules for ``lang``.

        Args:
            prg: Soros program text. Rules are separated by newlines or
                semicolons; ``#`` starts a comment. If the text has no
                ``__numbertext__`` marker, one is prepended, and the
                marker is then replaced by the default rules.
            lang: language code; rule lines whose comment carries a
                ``[:code:]`` tag are only kept when the tag equals
                ``lang`` with underscores replaced by hyphens.

        Raises:
            Exception: whatever ``re.compile`` raises for an invalid rule
                pattern; the offending line is printed to stderr first.
        """
        self.lines = []
        if "__numbertext__" not in prg:
            prg = "__numbertext__;" + prg
        # default left zero deletion
        # and separator function (no separation, if subcall returns with empty string)
        prg = prg.replace("__numbertext__", u"""0+(0|[1-9]\\d*) $1
\"([a-z][-a-z]* )0+(0|[1-9]\\d*)\" $(\\1\\2)
\"\uE00A(.*)\uE00A(.+)\uE00A(.*)\" \\1\\2\\3
\"\uE00A.*\uE00A\uE00A.*\"
""")
        # \\, \", \;, \# -> \uE000..\uE003
        prg = _tr(prg, _m[:4], _c[:4], "\\")
        # switch off all country-dependent lines (prefix them with "#") ...
        prg = re.sub(
            r"(^|[\n;])([^\n;#]*#[^\n]*[\[]:[^\n:\]]*:][^\n]*)", r"\1#\2", prg)
        # ... and switch on the requested ones (drop that leading "#").
        # NOTE(review): lang is interpolated into the pattern unescaped, so
        # regex metacharacters in it would be interpreted as such.
        prg = re.sub(r"(^|[\n;])#([^\n;#]*#[^\n]*[\[]:" +
                     lang.replace("_", "-") + r":][^\n]*)", r"\1\2", prg)
        # group 1: pattern (optionally double-quoted), group 2: replacement
        matchline = re.compile(r'^\s*("[^"]*"|[^\s]*)\s*(.*[^\s])?\s*$')
        # "== name ==" header: sets the prefix for the rules that follow
        macroline = re.compile("== *(.*[^ ]?) ==")
        prefix = ""
        # strip comments, then split the program into rules at ";" and newlines
        for s in re.sub("(#[^\n]*)?(\n|$)", ";", prg).split(";"):
            macro = macroline.match(s)
            if macro is not None:
                prefix = macro.group(1)
                continue
            m = matchline.match(s)
            if prefix and s and m is not None:
                # rewrite the rule so that its pattern is "<prefix> <pattern>",
                # keeping a leading "^" in front of the prefix
                s = m.group(1).strip("\"")
                space = " " if s != "" else ""
                caret = ""
                if s[0:1] == "^":
                    s = s[1:]
                    caret = "^"
                s2 = m.group(2) if m.group(2) is not None else ""
                s = "\"" + caret + prefix + space + s + "\" " + s2
                m = matchline.match(s)
            if m is None:
                continue
            # pattern: \uE000..\uE003 -> \\, ", ;, #
            s = _tr(m.group(1).strip("\""), _c[1:4], _m[1:4], "")
            s = s.replace(_c[_m.find("\\")], "\\\\")
            s2 = m.group(2).strip("\"") if m.group(2) is not None else ""
            # \$, \(, \), \|, \[, \] -> \uE004..\uE009
            s2 = _tr(s2, _m[4:], _c[4:], "\\")
            # call inner separator: [ ... $1 ... ] -> $(\uE00A ... \uE00A$1\uE00A ... )
            s2 = re.sub(r"[\[]\$(\d\d?|\([^\)]+\))",
                        u"$(\uE00A\uE00A|$\\1\uE00A", s2)
            s2 = re.sub(r"[\[]([^\$[\\]*)\$(\d\d?|\([^\)]+\))",
                        u"$(\uE00A\\1\uE00A$\\2\uE00A", s2)
            # add "|" in terminating position
            s2 = re.sub(r"\uE00A]$", "|\uE00A)", s2)
            s2 = re.sub(r"]", ")", s2)
            # $()|$() -> $()||$()
            s2 = re.sub(r"(\$\d|\))\|\$", r"\1||$", s2)
            # \uE000..\uE003-> \, ", ;, #
            s2 = _tr(s2, _c[:4], _m[:4], "")
            # $, (, ), | -> \uE000..\uE003
            s2 = _tr(s2, _m[4:8], _c[:4], "")
            # \uE004..\uE009 -> $, (, ), |, [, ]
            s2 = _tr(s2, _c[4:], _m[4:], "")
            # (encoded) $N -> call on \g<N>, then every remaining \N -> \g<N>
            s2 = re.sub(r"\\(\d)", r"\\g<\1>",
                        re.sub(r"\uE000(\d)", "\uE000\uE001\\\\g<\\1>\uE002", s2))
            try:
                # rules always match the whole input; the original "^" / "$"
                # anchors are only remembered as flags
                pattern = re.compile("^" + s.lstrip("^").rstrip("$") + "$")
            except Exception:
                print("Error in following regex line: " + s, file=sys.stderr)
                raise
            self.lines.append([pattern, s2, s[:1] == "^", s[-1:] == "$"])

    def run(self, data):
        """Return the conversion of ``data``, or ``""`` if no rule matches."""
        return self._run(data, True, True)

    def _run(self, data, begin, end):
        """Apply the first matching rule to ``data`` and resolve its calls.

        Args:
            data: the input string; a rule applies only if its pattern
                matches all of it.
            begin: when false, rules whose pattern started with ``^`` are
                skipped.
            end: when false, rules whose pattern ended with ``$`` are
                skipped.

        Returns:
            The expanded replacement of the first applicable rule with all
            function calls replaced by their results, or ``""`` when no
            rule applies.
        """
        for pattern, replacement, needs_begin, needs_end in self.lines:
            if (not begin and needs_begin) or (not end and needs_end):
                continue
            m = pattern.match(data)
            if not m:
                continue
            try:
                s = m.expand(replacement)
            except Exception:
                print("Error for the following input: " +
                      data, file=sys.stderr)
                raise
            # resolve the function calls one at a time, innermost first
            n = _func.search(s)
            while n:
                call = n.group(1)
                whole = n.group()
                b = False
                e = False
                # an adjacent pipe forces the flag; otherwise the flag is
                # inherited only if the match touches that edge of s
                if call[0:1] == _pipe or whole[0:1] == _pipe:
                    b = True
                elif n.start() == 0:
                    b = begin
                if call[-1:] == _pipe or whole[-1:] == _pipe:
                    e = True
                elif n.end() == len(s):
                    e = end
                s = s[:n.start(1)] + self._run(n.group(2), b, e) + s[n.end(1):]
                n = _func.search(s)
            return s
        return ""