Spaces:
Build error
Build error
# https://github.com/arXiv/arxiv-base@32e6ad0
"""
Copyright 2017 Cornell University
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
"""Convert between TeX escapes and UTF8."""
import re | |
from typing import Pattern, Dict, Match | |
# Lookup table for accented letters.  A key is the TeX accent command without
# its leading backslash immediately followed by the base letter (so \'A is
# stored as "'A" and \c{c} as 'cc'); the value is the Unicode code point of
# the precomposed character.
accents: Dict[str, int] = {
    # ---- accents whose TeX command is a non-letter, e.g. \'A ----
    # acute: \'
    "'A": 0x00c1, "'C": 0x0106, "'E": 0x00c9, "'I": 0x00cd, "'L": 0x0139,
    "'N": 0x0143, "'O": 0x00d3, "'R": 0x0154, "'S": 0x015a, "'U": 0x00da,
    "'Y": 0x00dd, "'Z": 0x0179,
    "'a": 0x00e1, "'c": 0x0107, "'e": 0x00e9, "'i": 0x00ed, "'l": 0x013a,
    "'n": 0x0144, "'o": 0x00f3, "'r": 0x0155, "'s": 0x015b, "'u": 0x00fa,
    "'y": 0x00fd, "'z": 0x017a,
    # diaeresis: \"
    '"A': 0x00c4, '"E': 0x00cb, '"I': 0x00cf, '"O': 0x00d6, '"U': 0x00dc,
    '"Y': 0x0178,
    '"a': 0x00e4, '"e': 0x00eb, '"i': 0x00ef, '"o': 0x00f6, '"u': 0x00fc,
    '"y': 0x00ff,
    # dot above: \.
    '.A': 0x0226, '.C': 0x010a, '.E': 0x0116, '.G': 0x0120, '.I': 0x0130,
    '.O': 0x022e, '.Z': 0x017b,
    '.a': 0x0227, '.c': 0x010b, '.e': 0x0117, '.g': 0x0121, '.o': 0x022f,
    '.z': 0x017c,
    # macron: \=
    '=A': 0x0100, '=E': 0x0112, '=I': 0x012a, '=O': 0x014c, '=U': 0x016a,
    '=Y': 0x0232,
    '=a': 0x0101, '=e': 0x0113, '=i': 0x012b, '=o': 0x014d, '=u': 0x016b,
    '=y': 0x0233,
    # circumflex: \^
    '^A': 0x00c2, '^C': 0x0108, '^E': 0x00ca, '^G': 0x011c, '^H': 0x0124,
    '^I': 0x00ce, '^J': 0x0134, '^O': 0x00d4, '^S': 0x015c, '^U': 0x00db,
    '^W': 0x0174, '^Y': 0x0176,
    '^a': 0x00e2, '^c': 0x0109, '^e': 0x00ea, '^g': 0x011d, '^h': 0x0125,
    '^i': 0x00ee, '^j': 0x0135, '^o': 0x00f4, '^s': 0x015d, '^u': 0x00fb,
    '^w': 0x0175, '^y': 0x0177,
    # grave: \`
    '`A': 0x00c0, '`E': 0x00c8, '`I': 0x00cc, '`O': 0x00d2, '`U': 0x00d9,
    '`a': 0x00e0, '`e': 0x00e8, '`i': 0x00ec, '`o': 0x00f2, '`u': 0x00f9,
    # tilde: \~
    '~A': 0x00c3, '~I': 0x0128, '~N': 0x00d1, '~O': 0x00d5, '~U': 0x0168,
    '~a': 0x00e3, '~i': 0x0129, '~n': 0x00f1, '~o': 0x00f5, '~u': 0x0169,

    # ---- accents whose TeX command is a letter, e.g. \c{c} ----
    # double acute: \H
    'HO': 0x0150, 'HU': 0x0170, 'Ho': 0x0151, 'Hu': 0x0171,
    # cedilla: \c
    'cC': 0x00c7, 'cE': 0x0228, 'cG': 0x0122, 'cK': 0x0136, 'cL': 0x013b,
    'cN': 0x0145, 'cR': 0x0156, 'cS': 0x015e, 'cT': 0x0162,
    'cc': 0x00e7, 'ce': 0x0229, 'cg': 0x0123, 'ck': 0x0137, 'cl': 0x013c,
    # 'ci' (i followed by combining cedilla U+0327) is deliberately absent:
    # it was commented out due to ARXIVDEV-2322 (bug reported by PG).
    'cn': 0x0146, 'cr': 0x0157, 'cs': 0x015f, 'ct': 0x0163,
    # ogonek: \k
    'kA': 0x0104, 'kE': 0x0118, 'kI': 0x012e, 'kO': 0x01ea, 'kU': 0x0172,
    'ka': 0x0105, 'ke': 0x0119, 'ki': 0x012f, 'ko': 0x01eb, 'ku': 0x0173,
    # ring above: \r
    'rA': 0x00c5, 'rU': 0x016e, 'ra': 0x00e5, 'ru': 0x016f,
    # breve: \u
    'uA': 0x0102, 'uE': 0x0114, 'uG': 0x011e, 'uI': 0x012c, 'uO': 0x014e,
    'uU': 0x016c,
    'ua': 0x0103, 'ue': 0x0115, 'ug': 0x011f, 'ui': 0x012d, 'uo': 0x014f,
    'uu': 0x016d,
    # caron: \v
    'vA': 0x01cd, 'vC': 0x010c, 'vD': 0x010e, 'vE': 0x011a, 'vG': 0x01e6,
    'vH': 0x021e, 'vI': 0x01cf, 'vK': 0x01e8, 'vL': 0x013d, 'vN': 0x0147,
    'vO': 0x01d1, 'vR': 0x0158, 'vS': 0x0160, 'vT': 0x0164, 'vU': 0x01d3,
    'vZ': 0x017d,
    'va': 0x01ce, 'vc': 0x010d, 'vd': 0x010f, 've': 0x011b, 'vg': 0x01e7,
    'vh': 0x021f, 'vi': 0x01d0, 'vk': 0x01e9, 'vl': 0x013e, 'vn': 0x0148,
    'vo': 0x01d2, 'vr': 0x0159, 'vs': 0x0161, 'vt': 0x0165, 'vu': 0x01d4,
    'vz': 0x017e,
}
r"""
Hash to lookup tex markup and convert to Unicode.
macron: a line above character (overbar \={} in TeX)
caron: v-shape above character (\v{ } in TeX)
See: http://www.unicode.org/charts/
"""
# TeX commands that stand for a single letter, ligature or Greek letter.
# Keys are command names without the leading backslash; values are Unicode
# code points.  The keys are joined, in this order, into `textlet_pattern`.
textlet: Dict[str, int] = {
    # Latin letters and ligatures
    'AA': 0x00c5, 'AE': 0x00c6, 'DH': 0x00d0, 'DJ': 0x0110,
    'ETH': 0x00d0, 'L': 0x0141, 'NG': 0x014a, 'O': 0x00d8,
    'oe': 0x0153, 'OE': 0x0152, 'TH': 0x00de, 'aa': 0x00e5,
    'ae': 0x00e6,
    'dh': 0x00f0, 'dj': 0x0111, 'eth': 0x00f0, 'i': 0x0131,
    'l': 0x0142, 'ng': 0x014b, 'o': 0x00f8, 'ss': 0x00df,
    'th': 0x00fe,
    # Greek (upper)
    'Gamma': 0x0393, 'Delta': 0x0394, 'Theta': 0x0398,
    'Lambda': 0x039b, 'Xi': 0x039e, 'Pi': 0x03a0,
    'Sigma': 0x03a3, 'Upsilon': 0x03a5, 'Phi': 0x03a6,
    'Psi': 0x03a8, 'Omega': 0x03a9,
    # Greek (lower)
    'alpha': 0x03b1, 'beta': 0x03b2, 'gamma': 0x03b3,
    'delta': 0x03b4, 'epsilon': 0x03b5, 'zeta': 0x03b6,
    'eta': 0x03b7, 'theta': 0x03b8, 'iota': 0x03b9,
    'kappa': 0x03ba, 'lambda': 0x03bb, 'mu': 0x03bc,
    'nu': 0x03bd, 'xi': 0x03be, 'omicron': 0x03bf,
    'pi': 0x03c0, 'rho': 0x03c1, 'varsigma': 0x03c2,
    # 'upsilon' was previously misspelled 'upsion', so \upsilon was never
    # converted.
    'sigma': 0x03c3, 'tau': 0x03c4, 'upsilon': 0x03c5,
    'varphi': 0x03c6,  # φ
    'phi': 0x03d5,  # ϕ
    'chi': 0x03c7, 'psi': 0x03c8, 'omega': 0x03c9,
}
def _p_to_match(tex_to_chr: Dict[str, int]) -> Pattern: | |
# textsym and textlet both use the same sort of regex pattern. | |
keys = r'\\(' + '|'.join(tex_to_chr.keys()) + ')' | |
pstr = r'({)?' + keys + r'(\b|(?=_))(?(1)}|(\\(?= )| |{}|)?)' | |
return re.compile(pstr) | |
# Compiled once at import time: matches any TeX command named in `textlet`.
textlet_pattern = _p_to_match(textlet)
# TeX commands that stand for a single text symbol.  Keys are command names
# without the leading backslash; values are Unicode code points.
textsym: Dict[str, int] = {
    'P': 0x00b6,               # ¶
    'S': 0x00a7,               # §
    'copyright': 0x00a9,       # ©
    'guillemotleft': 0x00ab,   # «
    'guillemotright': 0x00bb,  # »
    'pounds': 0x00a3,          # £
    'dag': 0x2020,             # †
    'ddag': 0x2021,            # ‡
    'div': 0x00f7,             # ÷
    'deg': 0x00b0,             # °
}
# Compiled once at import time: matches any TeX command named in `textsym`.
textsym_pattern = _p_to_match(textsym)
def _textlet_sub(match: Match) -> str:
    """Return the replacement character for a ``textlet_pattern`` match.

    Group 2 of the match is the TeX command name (without the backslash); it
    is looked up in ``textlet`` and the code point is turned into a string.
    """
    return chr(textlet[match.group(2)])
def _textsym_sub(match: Match) -> str:
    """Return the replacement character for a ``textsym_pattern`` match.

    Group 2 of the match is the TeX command name (without the backslash); it
    is looked up in ``textsym`` and the code point is turned into a string.
    """
    return chr(textsym[match.group(2)])
def texch2UTF(acc: str) -> str:
    """Convert a single-character TeX accent sequence to its Unicode character.

    :param acc: accent key as stored in ``accents``: the accent command
        without its backslash followed by the base letter, e.g. ``'e`` or
        ``vc``.
    :returns: the accented character when ``acc`` is a known key.  Otherwise
        ``acc`` with every non-word character stripped, which can be an
        empty string if the input holds no word characters.

    chr(num) will automatically create a UTF8 string for big num.
    """
    code_point = accents.get(acc)
    if code_point is not None:
        return chr(code_point)
    # Unknown accent/letter combination: drop the accent punctuation and keep
    # only the word characters (for '.x' that is just the base letter).
    return re.sub(r'\W+', '', acc)
def tex2utf(tex: str, letters: bool = True) -> str:
    r"""Convert some TeX accents and greek symbols to UTF-8 characters.

    :param tex: Text to filter.
    :param letters: If False, do not convert greek letters or
        ligatures.  Greek symbols can cause problems. Ex. \phi is not
        supposed to look like φ. φ looks like \varphi. See ARXIVNG-1612
    :returns: string, possibly with some TeX replaced with UTF8
    """
    def _accent(m: Match) -> str:
        # Group 1 is the accent command (without backslash), group 2 the
        # base letter; together they form the key used by texch2UTF.
        return texch2UTF(m.group(1) + m.group(2))

    # Do dotless i,j -> plain i,j where they are part of an accented i or j,
    # e.g. \'{\i} -> \'{i}, so the accent passes below can convert them.
    # (The pattern used to start with a stray '/' and the replacement emitted
    # literal backslashes before the braces, so this step never took effect.)
    utf = re.sub(r"(\\['`\^\"\~\=\.uvH])\{\\([ij])\}", r"\g<1>{\g<2>}", tex)

    # Now work on the TeX sequences, first those with letters only match
    if letters:
        utf = textlet_pattern.sub(_textlet_sub, utf)

    utf = textsym_pattern.sub(_textsym_sub, utf)

    # Remaining dotless j ({\j}, or \j followed by whitespace) becomes a
    # plain j; the whitespace after \j is consumed.
    utf = re.sub(r'\{\\j\}|\\j\s', 'j', utf)

    # reduce {{x}}, {{{x}}}, ... down to {x}
    nested = re.compile(r'\{\{([^\}]*)\}\}')
    while nested.search(utf):
        utf = nested.sub(r'{\g<1>}', utf)

    # Accents which have a non-letter prefix in TeX, first \'e
    utf = re.sub(r'\\([\'`^"~=.])([a-zA-Z])', _accent, utf)

    # then \'{e} form:
    utf = re.sub(r'\\([\'`^"~=.])\{([a-zA-Z])\}', _accent, utf)

    # Accents which have a letter prefix in TeX
    # \u{x} u above (breve), \v{x} v above (caron), \H{x} double acute...
    utf = re.sub(r'\\([Hckoruv])\{([a-zA-Z])\}', _accent, utf)

    # Don't do \t{oo} yet: only \t{x} with a single character is unwrapped,
    # longer arguments are left untouched.
    utf = re.sub(r'\\t\{([^\}])\}', r'\g<1>', utf)

    # bdc34: commented out in original Perl
    # $utf =~ s/\{(.)\}/$1/g; # remove { } from around {x}
    return utf