karim23657 Matthijs committed on
Commit
1c3a7ff
0 Parent(s):

Duplicate from Matthijs/mms-tts-demo

Browse files

Co-authored-by: Mathijs Hollemans <Matthijs@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. README.md +14 -0
  3. app.py +140 -0
  4. requirements.txt +8 -0
  5. uroman/.gitignore +35 -0
  6. uroman/LICENSE.txt +11 -0
  7. uroman/README.md +165 -0
  8. uroman/README.txt +141 -0
  9. uroman/bin/de-accent.pl +201 -0
  10. uroman/bin/string-distance.pl +99 -0
  11. uroman/bin/uroman-quick.pl +58 -0
  12. uroman/bin/uroman-tsv.sh +28 -0
  13. uroman/bin/uroman.pl +138 -0
  14. uroman/data/Chinese_to_Pinyin.txt +0 -0
  15. uroman/data/Scripts.txt +135 -0
  16. uroman/data/UnicodeData.txt +0 -0
  17. uroman/data/UnicodeDataOverwrite.txt +442 -0
  18. uroman/data/romanization-table-arabic-block.txt +179 -0
  19. uroman/data/romanization-table.txt +2019 -0
  20. uroman/data/romanization-table.v1.2.1.txt +814 -0
  21. uroman/data/string-distance-cost-rules.txt +896 -0
  22. uroman/lib/JSON.pm +2317 -0
  23. uroman/lib/JSON/backportPP.pm +2806 -0
  24. uroman/lib/JSON/backportPP/Boolean.pm +27 -0
  25. uroman/lib/JSON/backportPP/Compat5005.pm +131 -0
  26. uroman/lib/JSON/backportPP/Compat5006.pm +173 -0
  27. uroman/lib/NLP/Chinese.pm +239 -0
  28. uroman/lib/NLP/English.pm +0 -0
  29. uroman/lib/NLP/Romanizer.pm +2020 -0
  30. uroman/lib/NLP/UTF8.pm +1404 -0
  31. uroman/lib/NLP/stringDistance.pm +724 -0
  32. uroman/lib/NLP/utilities.pm +0 -0
  33. uroman/tarballs/uroman-v1.0.tar.gz +3 -0
  34. uroman/tarballs/uroman-v1.1.tar.gz +3 -0
  35. uroman/tarballs/uroman-v1.2.4.tar.gz +3 -0
  36. uroman/tarballs/uroman-v1.2.5.tar.gz +3 -0
  37. uroman/tarballs/uroman-v1.2.6.tar.gz +3 -0
  38. uroman/tarballs/uroman-v1.2.7.tar.gz +3 -0
  39. uroman/tarballs/uroman-v1.2.tar.gz +3 -0
  40. uroman/test/multi-script.txt +32 -0
  41. uroman/test/multi-script.uroman-ref.txt +32 -0
  42. uroman/test/string-similarity-test-input.txt +7 -0
  43. uroman/test/string-similarity-test-output-ref.txt +8 -0
  44. uroman/text/amh.txt +7 -0
  45. uroman/text/ara.txt +3 -0
  46. uroman/text/ben.txt +8 -0
  47. uroman/text/bod.txt +3 -0
  48. uroman/text/egy.txt +5 -0
  49. uroman/text/ell.txt +8 -0
  50. uroman/text/fas.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MMS-TTS Demo
3
+ emoji: 🥳
4
+ colorFrom: indigo
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.35.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: Matthijs/mms-tts-demo
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ import os
5
+ import re
6
+ import tempfile
7
+
8
+ from transformers import VitsModel, VitsTokenizer
9
+
10
+
11
# Hub checkpoint that backs each language offered in the UI.
_CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

# Everything is loaded once at import time: first one VITS model per
# language, then one tokenizer per language, both keyed by language name.
models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}

tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}
22
+
23
+
24
+ # For certain checkpoints, the text needs to be romanized.
25
+ # MMS-TTS uses uroman.pl for this from https://github.com/isi-nlp/uroman
26
+ # This needs to be installed in the folder "uroman"
27
def uromanize(text, uroman_pl):
    """Romanize ``text`` by piping it through the uroman Perl script.

    The text is sent to ``perl <uroman_pl> -l xxx`` on stdin as UTF-8 and the
    script's stdout is read back as UTF-8. No shell and no temporary files
    are involved, so the result does not depend on the locale's default
    encoding or on shell quoting of the script path.

    Args:
        text: The text to romanize.
        uroman_pl: Path to the ``uroman.pl`` script.

    Returns:
        The script's output with every run of whitespace (including line
        breaks) collapsed to a single space and the ends stripped. An empty
        string is returned if the script prints nothing.

    Raises:
        FileNotFoundError: If no ``perl`` executable can be found.
        subprocess.CalledProcessError: If perl exits with a non-zero status.
    """
    # Local import so this function is self-contained; the module's import
    # block does not provide ``subprocess``.
    import subprocess

    iso = "xxx"  # language code passed to uroman's ``-l`` option
    completed = subprocess.run(
        ["perl", uroman_pl, "-l", iso],
        input=text,
        stdout=subprocess.PIPE,  # stderr is left attached to the parent's
        encoding="utf-8",
        check=True,  # surface a failing perl run instead of reading nothing
    )
    # Collapsing across line breaks keeps all of a multi-line input instead
    # of returning only its first romanized line.
    return re.sub(r"\s+", " ", completed.stdout).strip()
43
+
44
+
45
def predict(text, language=None):
    """Synthesize speech for ``text`` with the model registered for ``language``.

    Args:
        text: The text to speak.
        language: Key into the module-level ``tokenizers`` and ``models``
            dicts. A value that is not a key there (including the default
            ``None``) raises ``KeyError`` for non-blank text.

    Returns:
        A pair ``((sample_rate, samples), processed_text)``. ``sample_rate``
        is always 16000 and ``samples`` is an int16 NumPy array. For Korean,
        ``processed_text`` is the romanized input; for other languages it is
        the tokenizer's decoding of the generated input ids. For blank input
        the samples array is empty and ``processed_text`` is ``""``.
    """
    if not text.strip():
        # Same (audio, text) pair shape as the normal path, so a caller that
        # expects two outputs also gets two for blank input.
        return (16000, np.zeros(0).astype(np.int16)), ""

    if language == "Korean":
        # The Korean checkpoint is fed romanized text.
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    if language != "Korean":
        # Report the text as the tokenizer actually encoded it.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    speech = outputs.audio[0]
    # Scale the waveform by 32767 and store it as 16-bit integers.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech), text
67
+
68
+
69
+ title = "MMS-TTS speech synthesis"
70
+
71
+ description = """
72
+ Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
73
+ speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
74
+ over 1000 text-to-speech (TTS) models.
75
+
76
+ This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
77
+ model, this code can also be used to run VITS checkpoints.
78
+ For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).
79
+
80
+ As the model performs random sampling, the generated speech is slightly different each time.
81
+ The voice may also vary between runs, or sometimes even in the same sentence.
82
+ (Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
83
+ are not conditioned on a speaker ID.)
84
+ """
85
+
86
+ article = """
87
+ <div style='margin:20px auto;'>
88
+
89
+ <p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
90
+ <a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
91
+ <a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
92
+ <a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
93
+ </p>
94
+
95
+ <pre>
96
+ @article{pratap2023mms,
97
+ title={Scaling Speech Technology to 1,000+ Languages},
98
+ author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
99
+ journal={arXiv},
100
+ year={2023}
101
+ }
102
+ </pre>
103
+
104
+ </div>
105
+ """
106
+
107
+ examples = [
108
+ ["It is not in the stars to hold our destiny but in ourselves.", "English"],
109
+ ["The octopus and Oliver went to the opera in October.", "English"],
110
+ ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "English"],
111
+ ["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "English"],
112
+ ["A synonym for cinnamon is a cinnamon synonym.", "English"],
113
+ ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "English"],
114
+
115
+ ["Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.", "German"],
116
+ ["Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.", "German"],
117
+
118
+ ["안녕 세상, 날씨는 아름다워", "Korean"], # Hello world, the weather is beautiful (Google Translate)
119
+ ]
120
+
121
# Build the UI components first (inputs, then outputs), wire them to
# predict(), and start the Gradio server.
text_input = gr.Text(label="Input Text")
language_input = gr.Radio(
    label="Language",
    choices=["English", "German", "Korean"],
    value="English",
)
speech_output = gr.Audio(label="Generated Speech", type="numpy")
processed_output = gr.Text(label="Processed text")

demo = gr.Interface(
    fn=predict,
    inputs=[text_input, language_input],
    outputs=[speech_output, processed_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/hollance/transformers.git@vits
2
+ torch
3
+ torchaudio
4
+ soundfile
5
+ librosa
6
+ samplerate
7
+ resampy
8
+ sentencepiece
uroman/.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !Build/
2
+ .last_cover_stats
3
+ /META.yml
4
+ /META.json
5
+ /MYMETA.*
6
+ *.o
7
+ *.pm.tdy
8
+ *.bs
9
+
10
+ # Devel::Cover
11
+ cover_db/
12
+
13
+ # Devel::NYTProf
14
+ nytprof.out
15
+
16
+ # Dist::Zilla
17
+ /.build/
18
+
19
+ # Module::Build
20
+ _build/
21
+ Build
22
+ Build.bat
23
+
24
+ # Module::Install
25
+ inc/
26
+
27
+ # ExtUtils::MakeMaker
28
+ /blib/
29
+ /_eumm/
30
+ /*.gz
31
+ /Makefile
32
+ /Makefile.old
33
+ /MANIFEST.bak
34
+ /pm_to_blib
35
+ /*.zip
uroman/LICENSE.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (C) 2015-2020 Ulf Hermjakob, USC Information Sciences Institute
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ Any publication of projects using uroman shall acknowledge its use: "This project uses the universal romanizer software 'uroman' written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)".
8
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11
+
uroman/README.md ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # uroman
2
+
3
+ *uroman* is a *universal romanizer*. It converts text in any script to the Latin alphabet.
4
+
5
+ Version: 1.2.8
6
+ Release date: April 23, 2021
7
+ Author: Ulf Hermjakob, USC Information Sciences Institute
8
+
9
+
10
+ ### Usage
11
+ ```bash
12
+ $ uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
13
+ where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
14
+ grc, ell, eng, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
15
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
16
+ --no-cache disables caching.
17
+ ```
18
+ ### Examples
19
+ ```bash
20
+ $ bin/uroman.pl < text/zho.txt
21
+ $ bin/uroman.pl -l tur < text/tur.txt
22
+ $ bin/uroman.pl -l heb --chart < text/heb.txt
23
+ $ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
24
+ ```
25
+
26
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
27
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
28
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or
29
+ Yiddish will improve romanization for those languages as some letters in those
30
+ languages have different sound values from other languages using the same script
31
+ (French, Russian, Hebrew respectively).
32
+ No effect for other languages in this version.
33
+
34
+ ### Bibliography
35
+ Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. ACL-2018 Best Demo Paper Award. [Paper in ACL Anthology](https://www.aclweb.org/anthology/P18-4003) | [Poster](https://www.isi.edu/~ulf/papers/poster-uroman-acl2018.pdf) | [BibTex](https://www.aclweb.org/anthology/P18-4003.bib)
36
+
37
+ ### Change History
38
+ Changes in version 1.2.8
39
+ * Updated to Unicode 13.0 (2021), which supports several new scripts (10% larger UnicodeData.txt).
40
+ * Improved support for Georgian.
41
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
42
+ * Various small improvements.
43
+
44
+ Changes in version 1.2.7
45
+ * Improved support for Pashto.
46
+
47
+ Changes in version 1.2.6
48
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
49
+ * Added support for English Braille.
50
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
51
+ reflecting a casual style that many native speakers of those languages use
52
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
53
+ rather than phonetically motivated combinations of letters (e.g. "sh").
54
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
55
+ that language for that line. This is used for the new reference test file.
56
+ * Various small improvements.
57
+
58
+ Changes in version 1.2.5
59
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
60
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
61
+ and Latin scripts, uroman will map both official versions to the same
62
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
63
+ properly reflects the pronunciation of the city's name).
64
+ For both Serbian and Macedonian, casual writers often use a simplified
65
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
66
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
67
+ other such pairs. The casual romanization can be simulated by using
68
+ alternative uroman language codes "srp2" and "mkd2", which romanize
69
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
70
+ * Various small improvements.
71
+
72
+ Changes in version 1.2.4
73
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
74
+
75
+ Changes in version 1.2
76
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
77
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
78
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
79
+ large size texts.
80
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
81
+ languages.
82
+ * Richer lattice structure (more alternatives) for "Romanization" of English
83
+ to support better matching to romanizations of other languages.
84
+ Changes output only when --chart option is specified. No change in output for
85
+ default 1-best output, which for ASCII characters is always the input string.
86
+
87
+ Changes in version 1.1 (major upgrade)
88
+ * Offers chart output (in JSON format) to represent alternative romanizations.
89
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
90
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
91
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
92
+ -- Shows corresponding original and romanization text in red
93
+ when hovering over a text segment.
94
+ -- Shows alternative romanizations when hovering over romanized text
95
+ marked by dotted underline.
96
+ -- Added right-to-left script detection and improved display for right-to-left
97
+ script text (as determined line by line).
98
+ -- On-page support for some scripts that are often not pre-installed on users'
99
+ computers (Burmese, Egyptian, Klingon).
100
+
101
+ Changes in version 1.0 (major upgrade)
102
+ * Upgraded principal internal data structure from string to lattice.
103
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
104
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
105
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
106
+ * Japanese Katakana middle dots now mapped to ASCII space.
107
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
108
+ * Some corrections regarding analysis of Chinese numbers.
109
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
110
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
111
+ * Spaces normalized to ASCII space.
112
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
113
+ * Tested against previous version of uroman with a new uroman visual diff tool.
114
+ * Almost an order of magnitude faster.
115
+
116
+ Changes in version 0.7 (minor upgrade)
117
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
118
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
119
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
120
+ or Chinese characters in Uyghur texts.
121
+
122
+ Changes in version 0.6 (minor upgrade)
123
+ * Added support for two letter characters used in Uzbek:
124
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
125
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
126
+ Both are now mapped to "'" (plain ASCII apostrophe).
127
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
128
+ even when they are not preceded by "ئ" (yeh with hamza above).
129
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
130
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
131
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
132
+ However, it is strongly recommended to normalize any presentation form Arabic letters
133
+ to their non-presentation form before calling uroman.
134
+ * Added force flush directive ($|=1;).
135
+
136
+ Changes in version 0.5 (minor upgrade)
137
+ * Improvements for Uyghur (make sure to use language option: -l uig)
138
+
139
+ Changes in version 0.4 (minor upgrade)
140
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
141
+ * Minor change for Arabic (added "alef+fathatan" = "an")
142
+
143
+ New features in version 0.3
144
+ * Covers Mandarin (Chinese)
145
+ * Improved romanization for numerous languages
146
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
147
+ * Maps from native digits to Western numbers
148
+ * Faster for South Asian languages
149
+
150
+ ### Other features
151
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
152
+ * Vowelization is provided when locally computable, e.g. for many South Asian languages and Tibetan.
153
+
154
+ ### Limitations
155
+ * The current version of uroman has a few limitations, some of which we plan to address in future versions.
156
+ For Japanese, *uroman* currently romanizes hiragana and katakana as expected, but kanji are interpreted as Chinese characters and romanized as such.
157
+ For Egyptian hieroglyphs, only single-sound phonetic characters and numbers are currently romanized.
158
+ For Linear B, only phonetic syllabic characters are romanized.
159
+ For some other extinct scripts such as cuneiform, no romanization is provided.
160
+ * A romanizer is not a full transliterator. For example, this version of
161
+ uroman does not vowelize text that lacks explicit vowelization such as
162
+ normal text in Arabic and Hebrew (without diacritics/points).
163
+
164
+ ### Acknowledgments
165
+ This research is based upon work supported in part by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via contract # FA8650-17-C-9116, and by research sponsored by Air Force Research Laboratory (AFRL) under agreement number FA8750-19-1-1000. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies, either expressed or implied, of ODNI, IARPA, Air Force Laboratory, DARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for governmental purposes notwithstanding any copyright annotation therein.
uroman/README.txt ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ uroman version 1.2.8
2
+ Release date: April 23, 2021
3
+ Author: Ulf Hermjakob, USC Information Sciences Institute
4
+
5
+ uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
6
+
7
+ Usage: uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
8
+ where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
9
+ grc, ell, eng, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
10
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
11
+ --no-cache disables caching.
12
+ Examples: bin/uroman.pl < text/zho.txt
13
+ bin/uroman.pl -l tur < text/tur.txt
14
+ bin/uroman.pl -l heb --chart < text/heb.txt
15
+ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
16
+
17
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
18
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
19
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or Yiddish
20
+ will improve romanization for those languages as some letters in those languages
21
+ have different sound values from other languages using the same script.
22
+ No effect for other languages in this version.
23
+
24
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. [Best Demo Paper Award]
25
+
26
+ Changes in version 1.2.8
27
+ * Improved support for Georgian.
28
+ * Updated UnicodeData.txt to version 13 (2021) with several new scripts (10% larger).
29
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
30
+ * Various small improvements.
31
+ Changes in version 1.2.7
32
+ * Improved support for Pashto.
33
+ Changes in version 1.2.6
34
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
35
+ * Added support for English Braille.
36
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
37
+ reflecting a casual style that many native speakers of those languages use
38
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
39
+ rather than phonetically motivated combinations of letters (e.g. "sh").
40
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
41
+ that language for that line. This is used for the new reference test file.
42
+ * Various small improvements.
43
+ Changes in version 1.2.5
44
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
45
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
46
+ and Latin scripts, uroman will map both official versions to the same
47
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
48
+ properly reflects the pronunciation of the city's name).
49
+ For both Serbian and Macedonian, casual writers often use a simplified
50
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
51
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
52
+ other such pairs. The casual romanization can be simulated by using
53
+ alternative uroman language codes "srp2" and "mkd2", which romanize
54
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
55
+ * Various small improvements.
56
+ Changes in version 1.2.4
57
+ * Added support for Tifinagh (a script used for Berber languages).
58
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
59
+ Changes in version 1.2.3
60
+ * Exclude emojis, dingbats, many other pictographs from being romanized (e.g. to "face")
61
+ Changes in version 1.2
62
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
63
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
64
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
65
+ large size texts.
66
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
67
+ languages.
68
+ * Richer lattice structure (more alternatives) for "Romanization" of English
69
+ to support better matching to romanizations of other languages.
70
+ Changes output only when --chart option is specified. No change in output for
71
+ default 1-best output, which for ASCII characters is always the input string.
72
+ Changes in version 1.1 (major upgrade)
73
+ * Offers chart output (in JSON format) to represent alternative romanizations.
74
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
75
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
76
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
77
+ -- Shows corresponding original and romanization text in red
78
+ when hovering over a text segment.
79
+ -- Shows alternative romanizations when hovering over romanized text
80
+ marked by dotted underline.
81
+ -- Added right-to-left script detection and improved display for right-to-left
82
+ script text (as determined line by line).
83
+ -- On-page support for some scripts that are often not pre-installed on users'
84
+ computers (Burmese, Egyptian, Klingon).
85
+ Changes in version 1.0 (major upgrade)
86
+ * Upgraded principal internal data structure from string to lattice.
87
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
88
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
89
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
90
+ * Japanese Katakana middle dots now mapped to ASCII space.
91
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
92
+ * Some corrections regarding analysis of Chinese numbers.
93
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
94
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
95
+ * Spaces normalized to ASCII space.
96
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
97
+ * Tested against previous version of uroman with a new uroman visual diff tool.
98
+ * Almost an order of magnitude faster.
99
+ Changes in version 0.7 (minor upgrade)
100
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
101
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
102
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
103
+ or Chinese characters in Uyghur texts.
104
+ Changes in version 0.6 (minor upgrade)
105
+ * Added support for two letter characters used in Uzbek:
106
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
107
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
108
+ Both are now mapped to "'" (plain ASCII apostrophe).
109
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
110
+ even when they are not preceded by "ئ" (yeh with hamza above).
111
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
112
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
113
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
114
+ However, it is strongly recommended to normalize any presentation form Arabic letters
115
+ to their non-presentation form before calling uroman.
116
+ * Added force flush directive ($|=1;).
117
+ Changes in version 0.5 (minor upgrade)
118
+ * Improvements for Uyghur (make sure to use language option: -l uig)
119
+ Changes in version 0.4 (minor upgrade)
120
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
121
+ * Minor change for Arabic (added "alef+fathatan" = "an")
122
+ New features in version 0.3
123
+ * Covers Mandarin (Chinese)
124
+ * Improved romanization for numerous languages
125
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
126
+ * Maps from native digits to Western numbers
127
+ * Faster for South Asian languages
128
+
129
+ Other features
130
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
131
+ * Vowelization is provided when locally computable, e.g. for many South Asian
132
+ languages and Tibetan.
133
+
134
+ Limitations
135
+ * This version of uroman assumes all CJK ideographs to be Mandarin (Chinese).
136
+ This means that Japanese kanji are incorrectly romanized; however, Japanese
137
+ hiragana and katakana are properly romanized.
138
+ * A romanizer is not a full transliterator. For example, this version of
139
+ uroman does not vowelize text that lacks explicit vowelization such as
140
+ normal text in Arabic and Hebrew (without diacritics/points).
141
+
uroman/bin/de-accent.pl ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/perl -w
2
+
3
# Report the program name, version, author and last-change date on STDERR.
sub print_version {
    my $banner = "$0 version 1.1\n"
               . " Author: Ulf Hermjakob\n"
               . " Last changed: March 14, 2011\n";
    print STDERR $banner;
}
8
+
9
# Print a short usage summary (invocation form and supported options) on STDERR.
sub print_usage {
    my $usage = "$0 [options] < with_accents.txt > without_accents.txt\n"
              . " -h or -help\n"
              . " -v or -version\n";
    print STDERR $usage;
}
14
+
15
# Remove diacritics from a UTF-8 encoded byte string.
#
# Argument: $s - the text to process. The script has no "use utf8", so the
#           literals below are matched as raw UTF-8 byte sequences.
# Returns:  a copy of $s in which accented Latin letters are replaced by plain
#           ASCII letters (ligatures and a few special letters expand to two
#           ASCII letters, e.g. AE -> "Ae", sharp s -> "ss"), and accented
#           Greek and Cyrillic letters are replaced by their unaccented
#           counterparts in the same script. Everything else is left untouched.
#
# Each script block is guarded by a cheap byte-range test so that the long
# substitution lists only run on strings that can contain a matching letter.
sub de_accent_string {
    my ($s) = @_;

    # $s =~ tr/A-Z/a-z/;

    # Latin-1 Supplement (U+00C0 - U+00FF)
    if ($s =~ /\xC3[\x80-\xBF]/) {
        $s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g;
        $s =~ s/Æ/Ae/g;
        $s =~ s/Ç/C/g;
        $s =~ s/Ð/D/g;
        $s =~ s/(È|É|Ê|Ë)/E/g;
        $s =~ s/(Ì|Í|Î|Ï)/I/g;
        $s =~ s/Ñ/N/g;
        $s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g;
        $s =~ s/(Ù|Ú|Û|Ü)/U/g;
        $s =~ s/Þ/Th/g;
        $s =~ s/Ý/Y/g;
        $s =~ s/(à|á|â|ã|ä|å)/a/g;
        $s =~ s/æ/ae/g;
        $s =~ s/ç/c/g;
        $s =~ s/(è|é|ê|ë)/e/g;
        $s =~ s/(ì|í|î|ï)/i/g;
        $s =~ s/ð/d/g;
        $s =~ s/ñ/n/g;
        # "ø" added so that lowercase mirrors the uppercase list (which has "Ø").
        $s =~ s/(ò|ó|ô|õ|ö|ø)/o/g;
        $s =~ s/ß/ss/g;
        $s =~ s/þ/th/g;
        $s =~ s/(ù|ú|û|ü)/u/g;
        $s =~ s/(ý|ÿ)/y/g;
    }

    # Latin Extended-A (U+0100 - U+017F)
    if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
        $s =~ s/(Ā|Ă|Ą)/A/g;
        $s =~ s/(ā|ă|ą)/a/g;
        $s =~ s/(Ć|Ĉ|Ċ|Č)/C/g;
        $s =~ s/(ć|ĉ|ċ|č)/c/g;
        $s =~ s/(Ď|Đ)/D/g;
        $s =~ s/(ď|đ)/d/g;
        $s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g;
        $s =~ s/(ē|ĕ|ė|ę|ě)/e/g;
        $s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g;
        $s =~ s/(ĝ|ğ|ġ|ģ)/g/g;
        $s =~ s/(Ĥ|Ħ)/H/g;
        $s =~ s/(ĥ|ħ)/h/g;
        $s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g;
        $s =~ s/(ĩ|ī|ĭ|į|ı)/i/g;
        # U+0132 / U+0133 (IJ ligatures), written as byte escapes so that the
        # patterns cannot degrade into plain ASCII "IJ"/"ij" and rewrite
        # ordinary text.
        $s =~ s/\xC4\xB2/Ij/g;
        $s =~ s/\xC4\xB3/ij/g;
        $s =~ s/Ĵ/J/g;
        $s =~ s/ĵ/j/g;
        $s =~ s/Ķ/K/g;
        $s =~ s/(ķ|ĸ)/k/g;
        $s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g;
        # "ĺ" added so that lowercase mirrors the uppercase list (which has "Ĺ").
        $s =~ s/(ĺ|ļ|ľ|ŀ|ł)/l/g;
        $s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g;
        # \xC5\x89 is U+0149 (n preceded by apostrophe), kept as a byte escape
        # for the same reason as the ligatures above.
        $s =~ s/(ń|ņ|ň|\xC5\x89|ŋ)/n/g;
        $s =~ s/(Ō|Ŏ|Ő)/O/g;
        $s =~ s/(ō|ŏ|ő)/o/g;
        $s =~ s/Œ/Oe/g;
        $s =~ s/œ/oe/g;
        $s =~ s/(Ŕ|Ŗ|Ř)/R/g;
        $s =~ s/(ŕ|ŗ|ř)/r/g;
        $s =~ s/(Ś|Ŝ|Ş|Š)/S/g;
        $s =~ s/(ś|ŝ|ş|š|ſ)/s/g;
        $s =~ s/(Ţ|Ť|Ŧ)/T/g;
        $s =~ s/(ţ|ť|ŧ)/t/g;
        $s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g;
        $s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g;
        $s =~ s/Ŵ/W/g;
        $s =~ s/ŵ/w/g;
        $s =~ s/(Ŷ|Ÿ)/Y/g;
        $s =~ s/ŷ/y/g;
        $s =~ s/(Ź|Ż|Ž)/Z/g;
        $s =~ s/(ź|ż|ž)/z/g;
    }

    # Latin Extended Additional (U+1E00 - U+1EFF)
    if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) {
        $s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g;
        $s =~ s/(ḃ|ḅ|ḇ)/b/g;
        $s =~ s/(ḉ)/c/g;
        $s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g;
        $s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g;
        $s =~ s/(ḟ)/f/g;
        $s =~ s/(ḡ)/g/g;
        $s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g;
        $s =~ s/(ḭ|ḯ|ỉ|ị)/i/g;
        $s =~ s/(ḱ|ḳ|ḵ)/k/g;
        $s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g;
        $s =~ s/(ḿ|ṁ|ṃ)/m/g;
        # These are accented forms of "n"; they were previously mapped to "m".
        $s =~ s/(ṅ|ṇ|ṉ|ṋ)/n/g;
        $s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g;
        $s =~ s/(ṕ|ṗ)/p/g;
        $s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g;
        $s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g;
        $s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g;
        $s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g;
        $s =~ s/(ṽ|ṿ)/v/g;
        $s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g;
        $s =~ s/(ẋ|ẍ)/x/g;
        $s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g;
        $s =~ s/(ẑ|ẓ|ẕ)/z/g;
        $s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g;
        $s =~ s/(Ḃ|Ḅ|Ḇ)/B/g;
        $s =~ s/(Ḉ)/C/g;
        $s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g;
        $s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g;
        $s =~ s/(Ḟ)/F/g;
        $s =~ s/(Ḡ)/G/g;
        $s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g;
        $s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g;
        $s =~ s/(Ḱ|Ḳ|Ḵ)/K/g;
        $s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g;
        $s =~ s/(Ḿ|Ṁ|Ṃ)/M/g;
        $s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g;
        $s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g;
        $s =~ s/(Ṕ|Ṗ)/P/g;
        $s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g;
        $s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g;
        $s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g;
        $s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g;
        $s =~ s/(Ṽ|Ṿ)/V/g;
        $s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g;
        # "Ẋ" added so that uppercase mirrors the lowercase list (which has "ẋ").
        $s =~ s/(Ẋ|Ẍ)/X/g;
        $s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g;
        $s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g;
    }

    # Greek letters. The guard covers U+0386 - U+03B0 (CE 86 - CE B0) and
    # U+03CA - U+03CE (CF 8A - CF 8E); the former guard stopped at CE AB and
    # therefore skipped strings containing only lowercase accented letters.
    if ($s =~ /\xCE[\x86-\xB0]|\xCF[\x8A-\x8E]/) {
        $s =~ s/ά/α/g;
        $s =~ s/έ/ε/g;
        $s =~ s/ή/η/g;   # added: counterpart of the existing uppercase rule
        $s =~ s/ί/ι/g;
        $s =~ s/ϊ/ι/g;
        $s =~ s/ΐ/ι/g;
        $s =~ s/ό/ο/g;
        $s =~ s/ύ/υ/g;
        $s =~ s/ϋ/υ/g;
        $s =~ s/ΰ/υ/g;
        $s =~ s/ώ/ω/g;
        $s =~ s/Ά/Α/g;
        $s =~ s/Έ/Ε/g;
        $s =~ s/Ή/Η/g;
        $s =~ s/Ί/Ι/g;
        $s =~ s/Ϊ/Ι/g;
        $s =~ s/Ό/Ο/g;   # added: counterpart of the existing lowercase rule
        $s =~ s/Ύ/Υ/g;
        $s =~ s/Ϋ/Υ/g;
        $s =~ s/Ώ/Ω/g;
    }

    # Cyrillic letters. The guard covers D0 80 - D0 BF and D1 90 - D1 9F so
    # that the lowercase letters handled below also trigger the block; the
    # former guard (D0 80 - D0 AF) only matched uppercase letters.
    if ($s =~ /\xD0[\x80-\xBF]|\xD1[\x90-\x9F]/) {
        $s =~ s/Ѐ/Е/g;
        $s =~ s/Ё/Е/g;
        $s =~ s/Ѓ/Г/g;
        $s =~ s/Ќ/К/g;
        $s =~ s/Ѝ/И/g;
        $s =~ s/Й/И/g;
        $s =~ s/ѐ/е/g;
        $s =~ s/ё/е/g;
        $s =~ s/ѓ/г/g;
        $s =~ s/ќ/к/g;
        $s =~ s/ѝ/и/g;
        $s =~ s/й/и/g;
    }

    return $s;
}
181
+
182
# Command-line handling: help and version requests print their text and end
# the program with status 1; any other argument is reported and skipped.
# splice empties @ARGV, so the input loop below reads from STDIN.
for my $arg (splice @ARGV) {
    if ($arg =~ /^-*(h|help)$/i) {
        print_usage();
        exit 1;
    }
    if ($arg =~ /^-*(v|version)$/i) {
        print_version();
        exit 1;
    }
    print STDERR "Ignoring unrecognized argument $arg\n";
}

# Filter the input line by line, writing the de-accented text to STDOUT.
$line_number = 0;
while (my $input_line = <>) {
    $line_number++;
    print de_accent_string($input_line);
}
exit 0;
201
+
uroman/bin/string-distance.pl ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/perl -w
2
+
3
+ # Author: Ulf Hermjakob
4
+ # Release date: October 13, 2019
5
+
6
+ # Usage: string-distance.pl {-lc1 <language-code>} {-lc2 <language-code>} < STDIN > STDOUT
7
+ # Example: string-distance.pl -lc1 rus -lc2 ukr < STDIN > STDOUT
8
+ # Example: string-distance.pl < ../test/string-similarity-test-input.txt
9
+ # Input format: two strings per line (tab-separated, in Latin script)
10
+ # Strings in non-Latin scripts should first be romanized. (Recommended script: uroman.pl)
11
+ # Output format: repetition of the two input strings, plus the string distance between them (tab-separated).
12
+ # Additional output meta info lines at the top are marked with an initial #.
13
+ #
14
+ # The script uses data from a string-distance-cost-rules file that lists costs,
15
+ # where the default cost is "1" with lower costs for differences in vowels,
16
+ # duplicate consonants, "f" vs. "ph" etc.
17
+ # Language cost rules can be language-specific and context-sensitive.
18
+
19
$|=1;   # flush STDOUT after every write

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Directories are resolved relative to the location of this script.
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use List::Util qw(min max);
use NLP::utilities;
use NLP::stringDistance;
$util = NLP::utilities;
$sd = NLP::stringDistance;
$verbose = 0;
$separator = "\t";

$cost_rule_filename = File::Spec->catfile($data_dir, "string-distance-cost-rules.txt");

# Both language codes default to English; %ht receives the loaded cost rules.
$lang_code1 = "eng";
$lang_code2 = "eng";
%ht = ();

# Options: -lc1 <code>, -lc2 <code>, -v|-verbose <value>.
# A language code is only accepted if it consists of exactly three lowercase
# letters; a missing or malformed value leaves the default in place.
while (@ARGV) {
    $arg = shift @ARGV;
    if ($arg =~ /^-+lc1$/) {
        $lang_code_candidate = shift @ARGV;
        # defined() avoids an "uninitialized value" warning when -lc1 is the last argument.
        $lang_code1 = $lang_code_candidate
            if defined($lang_code_candidate) && $lang_code_candidate =~ /^[a-z]{3,3}$/;
    } elsif ($arg =~ /^-+lc2$/) {
        $lang_code_candidate = shift @ARGV;
        $lang_code2 = $lang_code_candidate
            if defined($lang_code_candidate) && $lang_code_candidate =~ /^[a-z]{3,3}$/;
    } elsif ($arg =~ /^-+(v|verbose)$/) {
        $verbose = shift @ARGV;
    } else {
        print STDERR "Ignoring unrecognized arg $arg\n";
    }
}

$sd->load_string_distance_data($cost_rule_filename, *ht, $verbose);
print STDERR "Loaded resources.\n" if $verbose;

my $line_number = 0;
print "# Lang-code-1: $lang_code1 Lang-code-2: $lang_code2\n";
while (<>) {
    $line_number++;
    # Progress indicator in verbose mode: a dot every 1,000 lines and the
    # line count every 10,000 lines.
    if ($verbose) {
        if ($line_number =~ /000$/) {
            if ($line_number =~ /0000$/) {
                print STDERR $line_number;
            } else {
                print STDERR ".";
            }
        }
    }
    my $line = $_;
    $line =~ s/^\xEF\xBB\xBF//;          # drop a UTF-8 byte-order mark
    next if $line =~ /^\s*(\#.*)?$/;     # skip blank lines and comment lines
    my $s1;
    my $s2;
    # A line holds two fields separated by $separator; each field is either a
    # double-quoted string (with \" escapes) or a run of non-whitespace.
    if (($s1, $s2) = ($line =~ /^("(?:\\"|[^"])*"|\S+)$separator("(?:\\"|[^"])*"|\S+)\s*$/)) {
        $s1 = $util->dequote_string($s1);
        $s2 = $util->dequote_string($s2);
    } else {
        # Unparsable line: emit an empty output line so that the output stays
        # aligned with the non-skipped input lines.
        print STDERR "Could not process line $line_number: $line" if $verbose;
        print "\n";
        next;
    }

    $cost = $sd->quick_romanized_string_distance_by_chart($s1, $s2, *ht, "", $lang_code1, $lang_code2);
    print "$s1\t$s2\t$cost\n";
}
print STDERR "\n" if $verbose;

exit 0;
99
+
uroman/bin/uroman-quick.pl ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/perl -w
2
+
3
+ # uroman Nov. 12, 2015 - July 25, 2016
4
+ # version v0.7
5
+ # Author: Ulf Hermjakob
6
+
7
+ # Usage: uroman-quick.pl {-l [tur|uig|ukr|yid]} < STDIN
8
+ # currently only for Arabic script languages, incl. Uyghur
9
+
10
$| = 1;   # flush STDOUT after every write

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Directories are resolved relative to the location of this script.
my $bin_dir  = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir  = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use NLP::Romanizer;
use NLP::UTF8;
$romanizer = NLP::Romanizer;
%ht = ();
$lang_code = "";

# The only recognized option is the language code (-l, -lc or -lang-code),
# which is lowercased; every other argument is reported and skipped.
while (@ARGV) {
    $arg = shift @ARGV;
    unless ($arg =~ /^-+(l|lc|lang-code)$/) {
        print STDERR "Ignoring unrecognized arg $arg\n";
        next;
    }
    $lang_code = lc(shift @ARGV || "");
}

$romanization_table_arabic_block_filename = File::Spec->catfile($data_dir, "romanization-table-arabic-block.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");

# Load the Arabic-block table first, then the general table, into %ht.
$romanizer->load_romanization_table(*ht, $romanization_table_arabic_block_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);

# Romanize the input line by line. Progress goes to STDERR: the line count
# every 10,000 lines and a dot at every other multiple of 1,000.
$line_number = 0;
while (<>) {
    $line_number++;
    my $line = $_;
    print $romanizer->quick_romanize($line, $lang_code, *ht) . "\n";
    next if $line_number % 1000;
    print STDERR (($line_number % 10000) ? "." : $line_number);
}
print STDERR "\n";

exit 0;
58
+
uroman/bin/uroman-tsv.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Created by Thamme Gowda on June 17, 2019
3
+
4
# Directory containing this script; uroman.pl is looked up next to it.
DIR=$(dirname "${BASH_SOURCE[0]}")  # get the directory name
# DIR=$(realpath "${DIR}")  # resolve its full path if need be

# Exactly one or two arguments are accepted: the input TSV and an optional output TSV.
if [[ $# -lt 1 || $# -gt 2 ]]; then
    >&2 echo "ERROR: invalid args"
    >&2 echo "Usage: <input.tsv> [<output.tsv>]"
    exit 2
fi

INP=$1
OUT=$2

CMD="$DIR/uroman.pl"

# Emit column 1 of the input unchanged next to the romanized column 2.
# All expansions are quoted so that paths containing spaces or glob
# characters are passed through as single words.
function romanize(){
    paste <(cut -f1 "$INP") <(cut -f2 "$INP" | "$CMD")
}

# Write to the output file when one was given, otherwise to STDOUT.
if [[ -n $OUT ]]; then
    romanize > "$OUT"
else
    romanize
fi
27
+
28
+
uroman/bin/uroman.pl ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/perl -w
2
+
3
+ # uroman Nov. 12, 2015 - Apr. 23, 2021
4
+ $version = "v1.2.8";
5
+ # Author: Ulf Hermjakob
6
+
7
+ # Usage: uroman.pl {-l [ara|bel|bul|deu|ell|eng|fas|grc|heb|kaz|kir|lav|lit|mkd|mkd2|oss|pnt|rus|srp|srp2|tur|uig|ukr|yid]} {--chart|--offset-mapping} {--no-cache} {--workset} < STDIN
8
+ # Example: cat workset.txt | uroman.pl --offset-mapping --workset
9
+
10
$|=1;   # flush STDOUT after every write

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Directories are resolved relative to the location of this script.
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use NLP::Chinese;
use NLP::Romanizer;
use NLP::UTF8;
use NLP::utilities;
use JSON;
$chinesePM = NLP::Chinese;
$romanizer = NLP::Romanizer;
$util = NLP::utilities;
# %ht and %pinyin_ht are package variables because they are handed to the
# library routines as typeglobs (*ht, *pinyin_ht).
%ht = ();
%pinyin_ht = ();
$lang_code = "";
$return_chart_p = 0;
$return_offset_mappings_p = 0;
$workset_p = 0;
$cache_rom_tokens_p = 1;

$script_data_filename = File::Spec->catfile($data_dir, "Scripts.txt");
$unicode_data_overwrite_filename = File::Spec->catfile($data_dir, "UnicodeDataOverwrite.txt");
$unicode_data_filename = File::Spec->catfile($data_dir, "UnicodeData.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
$chinese_tonal_pinyin_filename = File::Spec->catfile($data_dir, "Chinese_to_Pinyin.txt");

# Command-line options; unrecognized arguments are reported and skipped.
while (@ARGV) {
    $arg = shift @ARGV;
    if ($arg =~ /^-+(l|lc|lang-code)$/) {
        $lang_code = lc (shift @ARGV || "")
    } elsif ($arg =~ /^-+chart$/i) {
        $return_chart_p = 1;
    } elsif ($arg =~ /^-+workset$/i) {
        $workset_p = 1;
    } elsif ($arg =~ /^-+offset[-_]*map/i) {
        $return_offset_mappings_p = 1;
    } elsif ($arg =~ /^-+unicode[-_]?data/i) {
        $filename = shift @ARGV;
        if (! defined($filename)) {
            # Option given as the last argument: nothing to test or report by name.
            print STDERR "Ignoring $arg: missing UnicodeData filename\n";
        } elsif (-r $filename) {
            $unicode_data_filename = $filename;
        } else {
            print STDERR "Ignoring invalid UnicodeData filename $filename\n";
        }
    } elsif ($arg =~ /^-+(no-tok-cach|no-cach)/i) {
        $cache_rom_tokens_p = 0;
    } else {
        print STDERR "Ignoring unrecognized arg $arg\n";
    }
}

# Load the romanization resources into %ht.
$romanizer->load_script_data(*ht, $script_data_filename);
$romanizer->load_unicode_data(*ht, $unicode_data_filename);
$romanizer->load_unicode_overwrite_romanization(*ht, $unicode_data_overwrite_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
# The pinyin table is only read once the first line with a CJK ideograph is seen.
$chinese_to_pinyin_not_yet_loaded_p = 1;
$current_date = $util->datetime("dateTtime");
$lang_code_clause = ($lang_code) ? " \"lang-code\":\"$lang_code\",\n" : "";

# In chart mode the output is one JSON object; print its header first.
print "{\n \"romanizer\":\"uroman $version (Ulf Hermjakob, USC/ISI)\",\n \"date\":\"$current_date\",\n$lang_code_clause \"romanization\": [\n" if $return_chart_p;
my $line_number = 0;
my $chart_result = "";
while (<>) {
    $line_number++;
    my $line = $_;
    my $snt_id = "";
    if ($workset_p) {
        # Workset input: "<id>.<number> <text>"; comment lines and lines of
        # any other shape are skipped.
        next if $line =~ /^#/;
        if (($i_value, $s_value) = ($line =~ /^(\S+\.\d+)\s(.*)$/)) {
            $snt_id = $i_value;
            $line = "$s_value\n";
        } else {
            next;
        }
    }
    if ($chinese_to_pinyin_not_yet_loaded_p && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($line)) {
        $chinesePM->read_chinese_tonal_pinyin_files(*pinyin_ht, $chinese_tonal_pinyin_filename);
        $chinese_to_pinyin_not_yet_loaded_p = 0;
    }
    if ($return_chart_p) {
        # Output lags one line behind: the previous line's JSON is printed
        # here, so that the trailing comma of the final element can be removed
        # after the loop.
        print $chart_result;
        *chart_ht = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return chart", $line_number);
        $chart_result = $romanizer->chart_to_json_romanization_elements(0, $chart_ht{N_CHARS}, *chart_ht, $line_number);
    } elsif ($return_offset_mappings_p) {
        ($best_romanization, $offset_mappings) = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return offset mappings", $line_number, 0);
        print "::snt-id $snt_id\n" if $workset_p;
        print "::orig $line";
        print "::rom $best_romanization\n";
        print "::align $offset_mappings\n\n";
    } elsif ($cache_rom_tokens_p) {
        print $romanizer->romanize_by_token_with_caching($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
    } else {
        print $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
    }
}
# Strip the trailing comma from the last chart element, then close the JSON.
$chart_result =~ s/,(\s*)$/$1/;
print $chart_result;
print " ]\n}\n" if $return_chart_p;

# Developer diagnostics (disabled): list the entries recorded under
# $ht{SUSPICIOUS_ROMANIZATION}, ordered by code point.
$dev_test_p = 0;
if ($dev_test_p) {
    $n_suspicious_code_points = 0;
    $n_instances = 0;
    foreach $char_name (sort { hex($ht{UTF_NAME_TO_UNICODE}->{$a}) <=> hex($ht{UTF_NAME_TO_UNICODE}->{$b}) }
                        keys %{$ht{SUSPICIOUS_ROMANIZATION}}) {
        $unicode_value = $ht{UTF_NAME_TO_UNICODE}->{$char_name};
        $utf8_string = $ht{UTF_NAME_TO_CODE}->{$char_name};
        foreach $romanization (sort keys %{$ht{SUSPICIOUS_ROMANIZATION}->{$char_name}}) {
            $count = $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization};
            print STDERR "*** Suspiciously lengthy romanization:\n" unless $n_suspicious_code_points;
            print STDERR "::s $utf8_string ::t $romanization ::comment $char_name (U+$unicode_value)\n";
            $n_suspicious_code_points++;
            $n_instances += $count;
        }
    }
    # The plural suffix follows the total instance count; it was previously
    # derived from the count of whichever entry happened to be processed last.
    $s = ($n_instances == 1) ? "" : "s";
    print STDERR " *** Total of $n_suspicious_code_points suspicious code points ($n_instances instance$s)\n" if $n_suspicious_code_points;
}

exit 0;
138
+
uroman/data/Chinese_to_Pinyin.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman/data/Scripts.txt ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::script-name Aegean
2
+ ::script-name Ahom
3
+ ::script-name Anatolian Hieroglyph
4
+ ::script-name Arabic ::direction right-to-left
5
+ ::script-name Armenian
6
+ ::script-name Avestan
7
+ ::script-name Balinese
8
+ ::script-name Bamum
9
+ ::script-name Bassa Vah
10
+ ::script-name Batak
11
+ ::script-name Bengali ::abugida-default-vowel a
12
+ ::script-name Bhaiksuki
13
+ ::script-name Bopomofo ::language Chinese
14
+ ::script-name Brahmi ::abugida-default-vowel a
15
+ ::script-name Braille
16
+ ::script-name Buginese
17
+ ::script-name Buhid
18
+ ::script-name Canadian Syllabics
19
+ ::script-name Carian
20
+ ::script-name Caucasian Albanian
21
+ ::script-name Chakma
22
+ ::script-name Cham
23
+ ::script-name Cherokee
24
+ ::script-name Coptic
25
+ ::script-name Cuneiform
26
+ ::script-name Cypriot
27
+ ::script-name Cyrillic
28
+ ::script-name CJK ::alt-script-name Chinese, Kanji ::language Chinese, Japanese, Korean, Mandarin
29
+ ::script-name Deseret
30
+ ::script-name Devanagari ::abugida-default-vowel a
31
+ ::script-name Duployan
32
+ ::script-name Egyptian Hieroglyph
33
+ ::script-name Elbasan
34
+ ::script-name Ethiopic
35
+ ::script-name Georgian
36
+ ::script-name Glagolitic
37
+ ::script-name Gothic
38
+ ::script-name Grantha
39
+ ::script-name Greek
40
+ ::script-name Gujarati ::abugida-default-vowel a
41
+ ::script-name Gurmukhi ::abugida-default-vowel a
42
+ ::script-name Hangul ::language Korean
43
+ ::script-name Hanunoo
44
+ ::script-name Hatran
45
+ ::script-name Hebrew ::direction right-to-left
46
+ ::script-name Hiragana ::language Japanese
47
+ ::script-name Imperial Aramaic
48
+ ::script-name Inscriptional Pahlavi
49
+ ::script-name Inscriptional Parthian
50
+ ::script-name Javanese
51
+ ::script-name Kaithi
52
+ ::script-name Kannada ::abugida-default-vowel a
53
+ ::script-name Katakana ::language Japanese
54
+ ::script-name Kayah Li
55
+ ::script-name Kharoshthi
56
+ ::script-name Khmer ::abugida-default-vowel a, o
57
+ ::script-name Khojki
58
+ ::script-name Khudawadi
59
+ ::script-name Klingon
60
+ ::script-name Lao
61
+ ::script-name Lepcha
62
+ ::script-name Latin
63
+ ::script-name Limbu
64
+ ::script-name Linear A
65
+ ::script-name Linear B
66
+ ::script-name Lycian
67
+ ::script-name Lydian
68
+ ::script-name Mahajani
69
+ ::script-name Malayalam ::abugida-default-vowel a
70
+ ::script-name Mandaic
71
+ ::script-name Manichaean
72
+ ::script-name Marchen
73
+ ::script-name Meetei Mayek
74
+ ::script-name Meroitic Cursive
75
+ ::script-name Meroitic Hieroglyphic
76
+ ::script-name Miao
77
+ ::script-name Modi ::abugida-default-vowel a
78
+ ::script-name Mongolian
79
+ ::script-name Mro
80
+ ::script-name Multani
81
+ ::script-name Myanmar ::alt-script-name Burmese ::abugida-default-vowel a
82
+ ::script-name Nabataean
83
+ ::script-name New Tai Lue
84
+ ::script-name Newa
85
+ ::script-name Nko ::direction right-to-left
86
+ ::script-name Ogham
87
+ ::script-name Ol Chiki
88
+ ::script-name Old Hungarian
89
+ ::script-name Old Italic
90
+ ::script-name Old Permic
91
+ ::script-name Old Persian
92
+ ::script-name Old North Arabian
93
+ ::script-name Old South Arabian
94
+ ::script-name Old Turkic
95
+ ::script-name Oriya ::alt-script-name Odia ::abugida-default-vowel a
96
+ ::script-name Osage
97
+ ::script-name Osmanya
98
+ ::script-name Pahawh Hmong
99
+ ::script-name Palmyrene
100
+ ::script-name Pau Cin Hau
101
+ ::script-name Phags-pa
102
+ ::script-name Phaistos Disc
103
+ ::script-name Phoenician
104
+ ::script-name Psalter Pahlavi
105
+ ::script-name Rejang
106
+ ::script-name Runic
107
+ ::script-name Samaritan
108
+ ::script-name Saurashtra
109
+ ::script-name Sharada
110
+ ::script-name Shavian
111
+ ::script-name Siddham
112
+ ::script-name Sinhala ::abugida-default-vowel a
113
+ ::script-name Sora Sompeng
114
+ ::script-name Sundanese ::abugida-default-vowel a
115
+ ::script-name Syloti Nagri
116
+ ::script-name Syriac
117
+ ::script-name Tagalog
118
+ ::script-name Tagbanwa
119
+ ::script-name Tai Le
120
+ ::script-name Tai Tham
121
+ ::script-name Tai Viet
122
+ ::script-name Takri
123
+ ::script-name Tamil ::abugida-default-vowel a
124
+ ::script-name Tangut
125
+ ::script-name Telugu ::abugida-default-vowel a
126
+ ::script-name Thaana ::direction right-to-left
127
+ ::script-name Thai
128
+ ::script-name Tibetan ::abugida-default-vowel a
129
+ ::script-name Tifinagh
130
+ ::script-name Tirhuta
131
+ ::script-name Ugaritic
132
+ ::script-name Vai
133
+ ::script-name Vedic
134
+ ::script-name Warang Citi
135
+ ::script-name Yi
uroman/data/UnicodeData.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman/data/UnicodeDataOverwrite.txt ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## UnicodeDataOverwrite.txt
2
+ ::u 00A0 ::r " " ::comment no-break space
3
+ ::u 01BF ::r w ::comment ƿ Latin Character Wynn (Old English)
4
+ ::u 0294 ::r ' ::comment glottal stop
5
+ ::u 0295 ::r ' ::comment ʕ voiced pharyngeal fricative
6
+ ::u 0305 ::r "" ::comment ̅ Combining overline
7
+ ::u 0306 ::r "" ::comment ̆ Combining breve
8
+ ::u 0307 ::r "" ::comment ̇ Combining dot above
9
+ ::u 030A ::r "" ::comment ̊ Combining ring above
10
+ ::u 030C ::r "" ::comment ̌ Combining caron
11
+ ::u 0311 ::r "" ::comment ̑ Combining inverted breve
12
+ ::u 031D ::r "" ::comment ̝ Combining up tack below
13
+ ::u 031E ::r "" ::comment ̞ Combining down tack below
14
+ ::u 031F ::r "" ::comment ̟ Combining plus sign below
15
+ ::u 0323 ::r "" ::comment ̣ Combining dot below
16
+ ::u 0325 ::r "" ::comment ̥ Combining ring below
17
+ ::u 0329 ::r "" ::comment ̩ Combining vertical line below
18
+ ::u 032A ::r "" ::comment ̪ Combining bridge below
19
+ ::u 032F ::r "" ::comment ̯ Combining inverted breve below
20
+ ::u 0342 ::r "" ::comment ͂ Combining Greek perispomeni (circumflex accent)
21
+ ::u 0343 ::r "" ::comment ̓ Combining Greek koronis
22
+ ::u 0361 ::r "" ::comment Combining double inverted breve
23
+ ::u 0384 ::r "" ::comment ΄ Greek tonos
24
+ ::u 0482 ::r 1000· ::comment ҂ Cyrillic thousands sign
25
+ ::u 0483 ::r "" ::comment ҃ Combining Cyrillic Titlo ::annotation titlo
26
+ ::u 0484 ::r "" ::comment ҄ Combining Cyrillic Palatalization ::annotation palatalization
27
+ ::u 055B ::r "" ::comment ՛ Armenian emphasis mark
28
+ ::u 055F ::r "" ::comment ՟ Armenian abbreviation mark ::annotation abbreviation
29
+
30
+ ::u 0901 ::r +m ::comment Devanagari sign candrabindu
31
+ ::u 0902 ::r +m ::comment Devanagari sign anusvara
32
+ ::u 0903 ::r +h ::comment Devanagari sign visarga
33
+ ::u 093D ::r ' ::comment Devanagari sign avagraha
34
+ ::u 0950 ::r om ::comment ॐ Devanagari om symbol
35
+ ::u 0951 ::r "" ::comment ॑ Devanagari stress sign "udatta"
36
+ ::u 0952 ::r "" ::comment ॒ Devanagari stress sign "anudatta"
37
+ ::u 0981 ::r +n ::comment Bengali sign candrabindu ("chôndrôbindu")
38
+ ::u 0982 ::r +ng ::comment Bengali sign anusvara ("ônushar")
39
+ ::u 0983 ::r +h ::comment Bengali sign visarga ("bishôrgô")
40
+ ::u 099A ::r ch ::comment instead of Bengali C(A)
41
+ ::u 099B ::r chh ::comment instead of Bengali CC(A)
42
+ ::u 0A02 ::r +m ::comment Gurmukhi sign bindi
43
+ ::u 0A70 ::r +m ::comment Gurmukhi tippi
44
+ # ::u 0A71 ::r "" ::comment Gurmukhi addak
45
+ ::u 0A72 ::r "" ::comment Gurmukhi iri
46
+ ::u 0A73 ::r "" ::comment Gurmukhi ura
47
+ ::u 0B01 ::r +m ::comment Oriya sign candrabindu
48
+ ::u 0B03 ::r +h ::comment Oriya sign visarga
49
+ ::u 0B5F ::r ya ::comment ୟ Oriya letter yya
50
+ ::u 0B82 ::r +m ::comment Tamil sign anusvara (not to be used?)
51
+ ::u 0B83 ::r +h ::comment Tamil sign visarga ("āytam")
52
+ ::u 0B9F ::r t ::comment instead of Tamil TT(A)
53
+ ::u 0BA3 ::r n ::comment instead of Tamil NN(A)
54
+ ::u 0BA9 ::r n ::comment instead of Tamil NNN(A)
55
+ ::u 0BB1 ::r r ::comment instead of Tamil RR(A)
56
+ ::u 0BB3 ::r l ::comment instead of Tamil LL(A)
57
+ ::u 0BB4 ::r l ::comment instead of Tamil LLL(A)
58
+ ::u 0C03 ::r +h ::comment ః Telugu sign visarga
59
+ ::u 0C83 ::r +h ::comment Kannada sign visarga
60
+ ::u 0D02 ::r +m ::comment Malayalam sign anusvara
61
+ ::u 0D03 ::r +h ::comment Malayalam sign visarga
62
+ ::u 0D82 ::r +n ::comment Sinhala sign anusvaraya
63
+ ::u 0DA4 ::r ny ::comment Sinhala ඤ
64
+ ::u 0DA5 ::r gn ::comment Sinhala ඥ
65
+ ::u 0DCA ::r "" ::comment Sinhala sign al-lakuna (virama = no vowel)
66
+ ::u 0DCF ::r aa ::comment Sinhala ා
67
+ ::u 0DD0 ::r ae ::comment Sinhala ැ
68
+ ::u 0DD1 ::r ae ::comment Sinhala ෑ
69
+ ::u 0DD2 ::r i ::comment Sinhala ි
70
+ ::u 0DD3 ::r ii ::comment Sinhala ී
71
+ ::u 0DD4 ::r u ::comment Sinhala ු
72
+ ::u 0DD6 ::r uu ::comment Sinhala ූ
73
+ ::u 0DD8 ::r r ::comment Sinhala ෘ
74
+ ::u 0DD9 ::r e ::comment Sinhala ෙ
75
+ ::u 0DDA ::r ee ::comment Sinhala ේ
76
+ ::u 0DDB ::r ai ::comment Sinhala ෛ
77
+ ::u 0DDC ::r o ::comment Sinhala ො
78
+ ::u 0DDD ::r oo ::comment Sinhala ෝ
79
+ ::u 0DDE ::r au ::comment Sinhala ෞ
80
+ ::u 0DDF ::r aa ::comment Sinhala ා
81
+ ::u 0DF2 ::r rr ::comment Sinhala ෲ
82
+
83
+ ::u 0E02 ::r k ::comment Thai character KHO KHAI
84
+ ::u 0E03 ::r k ::comment Thai character KHO KHUAT
85
+ ::u 0E04 ::r k ::comment Thai character KHO KHWAI
86
+ ::u 0E05 ::r k ::comment Thai character KHO KHON
87
+ ::u 0E06 ::r k ::comment Thai character KHO RAKHANG
88
+ ::u 0E10 ::r t ::comment Thai character THO THAN
89
+ ::u 0E11 ::r t ::comment Thai character THO NANGMONTHO
90
+ ::u 0E12 ::r t ::comment Thai character THO PHUTHAO
91
+ ::u 0E16 ::r t ::comment Thai character THO THUNG
92
+ ::u 0E17 ::r t ::comment Thai character THO THAHAN
93
+ ::u 0E18 ::r t ::comment Thai character THO THONG
94
+ ::u 0E1C ::r p ::comment Thai character PHO PHUNG
95
+ ::u 0E1E ::r p ::comment Thai character PHO PHAN
96
+ ::u 0E20 ::r p ::comment Thai character PHO SAMPHAO
97
+ ::u 0E2D ::r o ::comment Thai character O ANG
98
+ ::u 0E2F ::r ... ::comment ฯ Thai character PAIYANNOI (ellipsis, abbreviation)
99
+ ::u 0E31 ::r a ::comment Thai character MAI HAN-AKAT
100
+ ::u 0E3A ::r "" ::comment Thai character PHINTHU (Pali virama)
101
+ ::u 0E40 ::r e ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA E
102
+ ::u 0E41 ::r ae ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AE
103
+ ::u 0E42 ::r o ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA O
104
+ ::u 0E43 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMUAN
105
+ ::u 0E44 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMALAI
106
+ ::u 0E45 ::r "" ::comment Thai character LAKKHANGYAO vowel lengthener
107
+ ::u 0E47 ::r o ::comment Thai character MAITAIKHU vowel shortener
108
+ ::u 0E48 ::r "" ::tone-mark non-standard ::comment Thai tone mark MAI EK
109
+ ::u 0E49 ::r "" ::tone-mark standard ::comment Thai tone mark MAI THO
110
+ ::u 0E4A ::r "" ::tone-mark high ::comment Thai tone mark MAI TRI
111
+ ::u 0E4B ::r "" ::tone-mark rising ::comment Thai tone mark MAI CHATTAWA
112
+ ::u 0E4C ::r "" ::comment Thai character THANTHAKHAT cancellation mark (cf. virama)
113
+ ::u 0E4D ::r +m ::comment ํ Thai character NIKHAHIT final nasal (cf. anusvara)
114
+ ::u 0ECC ::r "" ::comment ໌ Lao cancellation mark ::annotation cancellation
115
+ ::u 0F0B ::r · ::comment ་ Tibetan mark intersyllabic tsheg
116
+ ::u 0F0C ::r "" ::comment ༌ Tibetan mark delimiter tsheg bstar
117
+ ::u 0F84 ::r "" ::comment ྄ Tibetan halanta
118
+ ::u 1036 ::r +n ::comment Myanmar sign anusvara ("auk myit")
119
+ ::u 1037 ::r "" ::tone-mark creaky ::comment Myanmar sign dot below
120
+ ::u 1038 ::r "" ::tone-mark high ::comment Myanmar sign visarga
121
+
122
+ ::u 16A0 ::r f ::comment ᚠ RUNIC LETTER FEHU FEOH FE F
123
+ ::u 16A1 ::r v ::comment ᚡ RUNIC LETTER V
124
+ ::u 16A2 ::r u ::comment ᚢ RUNIC LETTER URUZ UR U
125
+ ::u 16A3 ::r y ::comment ᚣ RUNIC LETTER YR
126
+ ::u 16A4 ::r y ::comment ᚤ RUNIC LETTER Y
127
+ ::u 16A5 ::r w ::comment ᚥ RUNIC LETTER W
128
+ ::u 16A6 ::r th ::comment ᚦ RUNIC LETTER THURISAZ THURS THORN
129
+ ::u 16A7 ::r th ::comment ᚧ RUNIC LETTER ETH
130
+ ::u 16A8 ::r a ::comment ᚨ RUNIC LETTER ANSUZ A
131
+ ::u 16A9 ::r o ::comment ᚩ RUNIC LETTER OS O
132
+ ::u 16AA ::r a ::comment ᚪ RUNIC LETTER AC A
133
+ ::u 16AB ::r ae ::comment ᚫ RUNIC LETTER AESC
134
+ ::u 16AC ::r o ::comment ᚬ RUNIC LETTER LONG-BRANCH-OSS O
135
+ ::u 16AD ::r o ::comment ᚭ RUNIC LETTER SHORT-TWIG-OSS O
136
+ ::u 16AE ::r o ::comment ᚮ RUNIC LETTER O
137
+ ::u 16AF ::r oe ::comment ᚯ RUNIC LETTER OE
138
+ ::u 16B0 ::r on ::comment ᚰ RUNIC LETTER ON
139
+ ::u 16B1 ::r r ::comment ᚱ RUNIC LETTER RAIDO RAD REID R
140
+ ::u 16B2 ::r k ::comment ᚲ RUNIC LETTER KAUNA
141
+ ::u 16B3 ::r c ::comment ᚳ RUNIC LETTER CEN
142
+ ::u 16B4 ::r k ::comment ᚴ RUNIC LETTER KAUN K
143
+ ::u 16B5 ::r g ::comment ᚵ RUNIC LETTER G
144
+ ::u 16B6 ::r ng ::comment ᚶ RUNIC LETTER ENG
145
+ ::u 16B7 ::r g ::comment ᚷ RUNIC LETTER GEBO GYFU G
146
+ ::u 16B8 ::r g ::comment ᚸ RUNIC LETTER GAR
147
+ ::u 16B9 ::r w ::comment ᚹ RUNIC LETTER WUNJO WYNN W
148
+ ::u 16BA ::r h ::comment ᚺ RUNIC LETTER HAGLAZ H
149
+ ::u 16BB ::r h ::comment ᚻ RUNIC LETTER HAEGL H
150
+ ::u 16BC ::r h ::comment ᚼ RUNIC LETTER LONG-BRANCH-HAGALL H
151
+ ::u 16BD ::r h ::comment ᚽ RUNIC LETTER SHORT-TWIG-HAGALL H
152
+ ::u 16BE ::r n ::comment ᚾ RUNIC LETTER NAUDIZ NYD NAUD N
153
+ ::u 16BF ::r n ::comment ᚿ RUNIC LETTER SHORT-TWIG-NAUD N
154
+ ::u 16C0 ::r n ::comment ᛀ RUNIC LETTER DOTTED-N
155
+ ::u 16C1 ::r i ::comment ᛁ RUNIC LETTER ISAZ IS ISS I
156
+ ::u 16C2 ::r e ::comment ᛂ RUNIC LETTER E
157
+ ::u 16C3 ::r j ::comment ᛃ RUNIC LETTER JERAN J
158
+ ::u 16C4 ::r j ::comment ᛄ RUNIC LETTER GER
159
+ ::u 16C5 ::r ae ::comment ᛅ RUNIC LETTER LONG-BRANCH-AR AE
160
+ ::u 16C6 ::r a ::comment ᛆ RUNIC LETTER SHORT-TWIG-AR A
161
+ ::u 16C7 ::r i ::comment ᛇ RUNIC LETTER IWAZ EOH
162
+ ::u 16C8 ::r p ::comment ᛈ RUNIC LETTER PERTHO PEORTH P
163
+ ::u 16C9 ::r z ::comment ᛉ RUNIC LETTER ALGIZ EOLHX
164
+ ::u 16CA ::r s ::comment ᛊ RUNIC LETTER SOWILO S
165
+ ::u 16CB ::r s ::comment ᛋ RUNIC LETTER SIGEL LONG-BRANCH-SOL S
166
+ ::u 16CC ::r s ::comment ᛌ RUNIC LETTER SHORT-TWIG-SOL S
167
+ ::u 16CD ::r c ::comment ᛍ RUNIC LETTER C
168
+ ::u 16CE ::r z ::comment ᛎ RUNIC LETTER Z
169
+ ::u 16CF ::r t ::comment ᛏ RUNIC LETTER TIWAZ TIR TYR T
170
+ ::u 16D0 ::r t ::comment ᛐ RUNIC LETTER SHORT-TWIG-TYR T
171
+ ::u 16D1 ::r d ::comment ᛑ RUNIC LETTER D
172
+ ::u 16D2 ::r b ::comment ᛒ RUNIC LETTER BERKANAN BEORC BJARKAN B
173
+ ::u 16D3 ::r b ::comment ᛓ RUNIC LETTER SHORT-TWIG-BJARKAN B
174
+ ::u 16D4 ::r p ::comment ᛔ RUNIC LETTER DOTTED-P
175
+ ::u 16D5 ::r p ::comment ᛕ RUNIC LETTER OPEN-P
176
+ ::u 16D6 ::r e ::comment ᛖ RUNIC LETTER EHWAZ EH E
177
+ ::u 16D7 ::r m ::comment ᛗ RUNIC LETTER MANNAZ MAN M
178
+ ::u 16D8 ::r m ::comment ᛘ RUNIC LETTER LONG-BRANCH-MADR M
179
+ ::u 16D9 ::r m ::comment ᛙ RUNIC LETTER SHORT-TWIG-MADR M
180
+ ::u 16DA ::r l ::comment ᛚ RUNIC LETTER LAUKAZ LAGU LOGR L
181
+ ::u 16DB ::r l ::comment ᛛ RUNIC LETTER DOTTED-L
182
+ ::u 16DC ::r ng ::comment ᛜ RUNIC LETTER INGWAZ
183
+ ::u 16DD ::r ng ::comment ᛝ RUNIC LETTER ING
184
+ ::u 16DE ::r d ::comment ᛞ RUNIC LETTER DAGAZ DAEG D
185
+ ::u 16DF ::r o ::comment ᛟ RUNIC LETTER OTHALAN ETHEL O
186
+ ::u 16E0 ::r ea ::comment ᛠ RUNIC LETTER EAR
187
+ ::u 16E1 ::r io ::comment ᛡ RUNIC LETTER IOR
188
+ ::u 16E2 ::r q ::comment ᛢ RUNIC LETTER CWEORTH
189
+ ::u 16E3 ::r k ::comment ᛣ RUNIC LETTER CALC
190
+ ::u 16E4 ::r k ::comment ᛤ RUNIC LETTER CEALC
191
+ ::u 16E5 ::r st ::comment ᛥ RUNIC LETTER STAN
192
+ ::u 16E6 ::r r ::comment ᛦ RUNIC LETTER LONG-BRANCH-YR
193
+ ::u 16E7 ::r r ::comment ᛧ RUNIC LETTER SHORT-TWIG-YR
194
+ ::u 16E8 ::r r ::comment ᛨ RUNIC LETTER ICELANDIC-YR
195
+ ::u 16E9 ::r q ::comment ᛩ RUNIC LETTER Q
196
+ ::u 16EA ::r x ::comment ᛪ RUNIC LETTER X
197
+
198
+ ::u 17B9 ::r oe ::comment Khmer vowel sign y (short)
199
+ ::u 17BA ::r oe ::comment Khmer vowel sign yy (long)
200
+ ::u 17C6 ::r +m ::comment Khmer sign nikahit (cf. anusvara)
201
+ ::u 17C7 ::r +h ::comment Khmer sign reahmuk (cf. visarga)
202
+ ::u 17C8 ::r ' ::comment Khmer sign yuukaleapintu (short vowel and glottal stop)
203
+ ::u 17C9 ::r "" ::comment Khmer sign muusikatoan: changes the second register to the first
204
+ ::u 17CA ::r "" ::comment Khmer sign triisap: changes the first register to the second
205
+ ::u 17CB ::r "" ::comment Khmer sign bantoc (vowel shortener)
206
+ ::u 17D2 ::r "" ::comment Khmer sign coeng (foot/subscript, cf. virama = no vowel)
207
+ ::u 17D5 ::r . ::comment Khmer sign bariyoosan; period ending entire text or chapter
208
+
209
+ ::u 180E ::r ' ::comment ᠎ Mongolian vowel separator
210
+
211
+ ::u 1B80 ::r +ng ::comment ᮀ Sundanese sign panyecek
212
+ ::u 1B81 ::r +r ::comment ᮁ Sundanese sign panglayar
213
+ ::u 1B82 ::r +h ::comment ᮂ Sundanese sign pangwisad
214
+ ::u 1BA1 ::r ya ::comment ᮡ Sundanese consonant sign pamingkal
215
+ ::u 1BA2 ::r ra ::comment ᮢ Sundanese consonant sign panyakra
216
+ ::u 1BA3 ::r la ::comment ᮣ Sundanese consonant sign panyiku
217
+ ::u 1BA4 ::r i ::comment ᮤ Sundanese vowel sign panghulu
218
+ ::u 1BA5 ::r u ::comment ᮥ Sundanese vowel sign panyuku
219
+ ::u 1BA6 ::r e ::comment ᮦ Sundanese vowel sign panaelaeng
220
+ ::u 1BA7 ::r o ::comment ᮧ Sundanese vowel sign panolong
221
+ ::u 1BA8 ::r e ::comment ᮨ Sundanese vowel sign pamepet
222
+ ::u 1BA9 ::r eu ::comment ᮩ Sundanese vowel sign paneuleung
223
+ ::u 1BAA ::r "" ::comment ᮪ Sundanese sign pamaaeh or patén (no vowel/virama)
224
+
225
+ ::u 1FBD ::r "" ::comment ᾽ Greek koronis
226
+ ::u 1FFE ::r "" ::comment Greek dasia (rough breathing)
227
+
228
+ ::u 2002 ::r " " ::comment en space
229
+ ::u 2003 ::r " " ::comment em space
230
+ ::u 2004 ::r " " ::comment three-per-em space
231
+ ::u 2005 ::r " " ::comment four-per-em space
232
+ ::u 2006 ::r " " ::comment six-per-em space
233
+ ::u 2007 ::r " " ::comment figure space
234
+ ::u 2008 ::r " " ::comment punctuation space
235
+ ::u 2009 ::r " " ::comment thin space
236
+ ::u 200A ::r " " ::comment hair space
237
+ ::u 202F ::r " " ::comment narrow no-break space
238
+
239
+ ::u 2D30 ::r a ::comment TIFINAGH LETTER YA ⴰ
240
+ ::u 2D31 ::r b ::comment TIFINAGH LETTER YAB ⴱ
241
+ ::u 2D32 ::r bh ::comment TIFINAGH LETTER YABH ⴲ
242
+ ::u 2D33 ::r g ::comment TIFINAGH LETTER YAG ⴳ
243
+ ::u 2D34 ::r ghh ::comment TIFINAGH LETTER YAGHH ⴴ
244
+ ::u 2D35 ::r j ::comment TIFINAGH LETTER BERBER ACADEMY YAJ ⴵ
245
+ ::u 2D36 ::r j ::comment TIFINAGH LETTER YAJ ⴶ
246
+ ::u 2D37 ::r d ::comment TIFINAGH LETTER YAD ⴷ
247
+ ::u 2D38 ::r dh ::comment TIFINAGH LETTER YADH ⴸ
248
+ ::u 2D39 ::r dd ::comment TIFINAGH LETTER YADD ⴹ
249
+ ::u 2D3A ::r ddh ::comment TIFINAGH LETTER YADDH ⴺ
250
+ ::u 2D3B ::r e ::comment TIFINAGH LETTER YEY ⴻ
251
+ ::u 2D3C ::r f ::comment TIFINAGH LETTER YAF ⴼ
252
+ ::u 2D3D ::r k ::comment TIFINAGH LETTER YAK ⴽ
253
+ ::u 2D3E ::r k ::comment TIFINAGH LETTER TUAREG YAK ⴾ
254
+ ::u 2D3F ::r khh ::comment TIFINAGH LETTER YAKHH ⴿ
255
+ ::u 2D40 ::r h ::comment TIFINAGH LETTER YAH ⵀ
256
+ ::u 2D41 ::r h ::comment TIFINAGH LETTER BERBER ACADEMY YAH ⵁ
257
+ ::u 2D42 ::r h ::comment TIFINAGH LETTER TUAREG YAH ⵂ
258
+ ::u 2D43 ::r hh ::comment TIFINAGH LETTER YAHH ⵃ
259
+ ::u 2D44 ::r ' ::comment TIFINAGH LETTER YAA ⵄ
260
+ ::u 2D45 ::r kh ::comment TIFINAGH LETTER YAKH ⵅ
261
+ ::u 2D46 ::r kh ::comment TIFINAGH LETTER TUAREG YAKH ⵆ
262
+ ::u 2D47 ::r q ::comment TIFINAGH LETTER YAQ ⵇ
263
+ ::u 2D48 ::r q ::comment TIFINAGH LETTER TUAREG YAQ ⵈ
264
+ ::u 2D49 ::r i ::comment TIFINAGH LETTER YI ⵉ
265
+ ::u 2D4A ::r zh ::comment TIFINAGH LETTER YAZH ⵊ
266
+ ::u 2D4B ::r zh ::comment TIFINAGH LETTER AHAGGAR YAZH ⵋ
267
+ ::u 2D4C ::r zh ::comment TIFINAGH LETTER TUAREG YAZH ⵌ
268
+ ::u 2D4D ::r l ::comment TIFINAGH LETTER YAL ⵍ
269
+ ::u 2D4E ::r m ::comment TIFINAGH LETTER YAM ⵎ
270
+ ::u 2D4F ::r n ::comment TIFINAGH LETTER YAN ⵏ
271
+ ::u 2D50 ::r gn ::comment TIFINAGH LETTER TUAREG YAGN ⵐ
272
+ ::u 2D51 ::r ng ::comment TIFINAGH LETTER TUAREG YANG ⵑ
273
+ ::u 2D52 ::r p ::comment TIFINAGH LETTER YAP ⵒ
274
+ ::u 2D53 ::r u ::comment TIFINAGH LETTER YU ⵓ
275
+ ::u 2D54 ::r r ::comment TIFINAGH LETTER YAR ⵔ
276
+ ::u 2D55 ::r rr ::comment TIFINAGH LETTER YARR ⵕ
277
+ ::u 2D56 ::r gh ::comment TIFINAGH LETTER YAGH ⵖ
278
+ ::u 2D57 ::r gh ::comment TIFINAGH LETTER TUAREG YAGH ⵗ
279
+ ::u 2D58 ::r gh ::comment TIFINAGH LETTER AYER YAGH ⵘ
280
+ ::u 2D59 ::r s ::comment TIFINAGH LETTER YAS ⵙ
281
+ ::u 2D5A ::r ss ::comment TIFINAGH LETTER YASS ⵚ
282
+ ::u 2D5B ::r sh ::comment TIFINAGH LETTER YASH ⵛ
283
+ ::u 2D5C ::r t ::comment TIFINAGH LETTER YAT ⵜ
284
+ ::u 2D5D ::r th ::comment TIFINAGH LETTER YATH ⵝ
285
+ ::u 2D5E ::r ch ::comment TIFINAGH LETTER YACH ⵞ
286
+ ::u 2D5F ::r tt ::comment TIFINAGH LETTER YATT ⵟ
287
+ ::u 2D60 ::r v ::comment TIFINAGH LETTER YAV ⵠ
288
+ ::u 2D61 ::r w ::comment TIFINAGH LETTER YAW ⵡ
289
+ ::u 2D62 ::r y ::comment TIFINAGH LETTER YAY ⵢ
290
+ ::u 2D63 ::r z ::comment TIFINAGH LETTER YAZ ⵣ
291
+ ::u 2D64 ::r z ::comment TIFINAGH LETTER TAWELLEMET YAZ ⵤ
292
+ ::u 2D65 ::r zz ::comment TIFINAGH LETTER YAZZ ⵥ
293
+ ::u 2D66 ::r ye ::comment TIFINAGH LETTER YE ⵦ
294
+ ::u 2D67 ::r yo ::comment TIFINAGH LETTER YO ⵧ
295
+ ::u 2D6F ::r "" ::comment TIFINAGH MODIFIER LETTER LABIALIZATION MARK ⵯ
296
+ ::u 2D70 ::r "" ::comment TIFINAGH SEPARATOR MARK ⵰
297
+ ::u 2D7F ::r "" ::comment TIFINAGH CONSONANT JOINER ⵿
298
+
299
+ ::u 3063 ::r tsu ::comment Hiragana letter small tsu
300
+ ::u 30C3 ::r tsu ::comment Katakana letter small tsu
301
+
302
+ ::u ABE3 ::r o ::comment ꯣ Meetei Mayek vowel sign onap
303
+ ::u ABE7 ::r ou ::comment ꯧ Meetei Mayek vowel sign sounap
304
+
305
+ ::u F008 ::r "" ::comment Yoruba diacritic in private use area
306
+ ::u F00F ::r "" ::comment Yoruba diacritic in private use area
307
+ ::u F023 ::r "" ::comment Yoruba diacritic in private use area
308
+ ::u F025 ::r "" ::comment Yoruba diacritic in private use area
309
+
310
+ ::u F8D0 ::r a ::name KLINGON LETTER A
311
+ ::u F8D1 ::r b ::name KLINGON LETTER B
312
+ ::u F8D2 ::r ch ::name KLINGON LETTER CH
313
+ ::u F8D3 ::r D ::name KLINGON LETTER D
314
+ ::u F8D4 ::r e ::name KLINGON LETTER E
315
+ ::u F8D5 ::r gh ::name KLINGON LETTER GH
316
+ ::u F8D6 ::r H ::name KLINGON LETTER H
317
+ ::u F8D7 ::r I ::name KLINGON LETTER I
318
+ ::u F8D8 ::r j ::name KLINGON LETTER J
319
+ ::u F8D9 ::r l ::name KLINGON LETTER L
320
+ ::u F8DA ::r m ::name KLINGON LETTER M
321
+ ::u F8DB ::r n ::name KLINGON LETTER N
322
+ ::u F8DC ::r ng ::name KLINGON LETTER NG
323
+ ::u F8DD ::r o ::name KLINGON LETTER O
324
+ ::u F8DE ::r p ::name KLINGON LETTER P
325
+ ::u F8DF ::r q ::name KLINGON LETTER Q
326
+ ::u F8E0 ::r Q ::name KLINGON LETTER Q
327
+ ::u F8E1 ::r r ::name KLINGON LETTER R
328
+ ::u F8E2 ::r S ::name KLINGON LETTER S
329
+ ::u F8E3 ::r t ::name KLINGON LETTER T
330
+ ::u F8E4 ::r tlh ::name KLINGON LETTER TLH
331
+ ::u F8E5 ::r u ::name KLINGON LETTER U
332
+ ::u F8E6 ::r v ::name KLINGON LETTER V
333
+ ::u F8E7 ::r w ::name KLINGON LETTER W
334
+ ::u F8E8 ::r y ::name KLINGON LETTER Y
335
+ ::u F8E9 ::r ' ::name KLINGON LETTER GLOTTAL STOP
336
+ ::u F8F0 ::num 0 ::name KLINGON DIGIT ZERO
337
+ ::u F8F1 ::num 1 ::name KLINGON DIGIT ONE
338
+ ::u F8F2 ::num 2 ::name KLINGON DIGIT TWO
339
+ ::u F8F3 ::num 3 ::name KLINGON DIGIT THREE
340
+ ::u F8F4 ::num 4 ::name KLINGON DIGIT FOUR
341
+ ::u F8F5 ::num 5 ::name KLINGON DIGIT FIVE
342
+ ::u F8F6 ::num 6 ::name KLINGON DIGIT SIX
343
+ ::u F8F7 ::num 7 ::name KLINGON DIGIT SEVEN
344
+ ::u F8F8 ::num 8 ::name KLINGON DIGIT EIGHT
345
+ ::u F8F9 ::num 9 ::name KLINGON DIGIT NINE
346
+ ::u F8FD ::r , ::name KLINGON COMMA
347
+ ::u F8FE ::r . ::name KLINGON FULL STOP
348
+ ::u F8FF ::name KLINGON MUMMIFICATION GLYPH
349
+
350
+ ::u 1163D ::r +m ::comment Modi sign anusvara
351
+ ::u 1163E ::r +h ::comment Modi sign visarga
352
+
353
+ ::u 13068 ::num 1000000 ::comment Egyptian Hieroglyph
354
+ ::u 1308B ::r r ::comment Egyptian Hieroglyph ::pic mouth
355
+ ::u 1309D ::r ' ::comment Egyptian Hieroglyph (ayn) ::pic forearm
356
+ ::u 130A7 ::r d ::comment Egyptian Hieroglyph ::pic hand
357
+ ::u 130AD ::num 10000 ::comment Egyptian Hieroglyph
358
+ ::u 130AE ::num 20000 ::comment Egyptian Hieroglyph
359
+ ::u 130AF ::num 30000 ::comment Egyptian Hieroglyph
360
+ ::u 130B0 ::num 40000 ::comment Egyptian Hieroglyph
361
+ ::u 130B1 ::num 50000 ::comment Egyptian Hieroglyph
362
+ ::u 130B2 ::num 60000 ::comment Egyptian Hieroglyph
363
+ ::u 130B3 ::num 70000 ::comment Egyptian Hieroglyph
364
+ ::u 130B4 ::num 80000 ::comment Egyptian Hieroglyph
365
+ ::u 130B5 ::num 90000 ::comment Egyptian Hieroglyph
366
+ ::u 130B6 ::num 50000 ::comment Egyptian Hieroglyph
367
+ ::u 130C0 ::r b ::comment Egyptian Hieroglyph ::pic foot
368
+ ::u 130ED ::r l ::comment Egyptian Hieroglyph [also rw] ::pic lion recumbent
369
+ ::u 13121 ::r h ::comment Egyptian Hieroglyph (f-underscore) ::pic animal's belly and udder
370
+ ::u 1313F ::r a ::comment Egyptian Hieroglyph (alef) ::pic vulture
371
+ ::u 13153 ::r m ::comment Egyptian Hieroglyph ::pic owl
372
+ ::u 13171 ::r w ::comment Egyptian Hieroglyph ::pic quail chick
373
+ ::u 13187 ::r ::comment Egyptian Hieroglyph (determinative/son) H8 ::pic egg
374
+ ::u 13190 ::num 100000 ::comment Egyptian Hieroglyph
375
+ ::u 13191 ::r f ::comment Egyptian Hieroglyph ::pic horned viper
376
+ ::u 13193 ::r d ::comment Egyptian Hieroglyph (J) ::pic cobra
377
+ ::u 131BC ::num 1000 ::comment Egyptian Hieroglyph
378
+ ::u 131BD ::num 2000 ::comment Egyptian Hieroglyph
379
+ ::u 131BE ::num 3000 ::comment Egyptian Hieroglyph
380
+ ::u 131BF ::num 4000 ::comment Egyptian Hieroglyph
381
+ ::u 131C0 ::num 5000 ::comment Egyptian Hieroglyph
382
+ ::u 131C1 ::num 6000 ::comment Egyptian Hieroglyph
383
+ ::u 131C2 ::num 7000 ::comment Egyptian Hieroglyph
384
+ ::u 131C3 ::num 8000 ::comment Egyptian Hieroglyph
385
+ ::u 131C4 ::num 9000 ::comment Egyptian Hieroglyph
386
+ ::u 131CB ::r i ::comment Egyptian Hieroglyph (yod) ::pic single reed
387
+ ::u 131CC ::r y ::comment Egyptian Hieroglyph ::pic double reed
388
+ ::u 1320E ::r q ::comment Egyptian Hieroglyph (qaf) ::pic sandy slope
389
+ ::u 13209 ::comment Egyptian Hieroglyph ::pic desert hills
390
+ ::u 13216 ::r n ::comment Egyptian Hieroglyph ::pic ripple of water
391
+ ::u 13219 ::r sh ::comment Egyptian Hieroglyph (š) ::pic basin
392
+ ::u 13254 ::r h ::comment Egyptian Hieroglyph ::pic reed shelter
393
+ ::u 13283 ::r z ::comment Egyptian Hieroglyph [also S?] ::pic door bolt
394
+ ::u 132AA ::r p ::comment Egyptian Hieroglyph ::pic stool
395
+ ::u 132D4 ::r n ::comment Egyptian Hieroglyph ::pic red crown
396
+ ::u 132F4 ::r s ::comment Egyptian Hieroglyph [also Z?] ::pic folded cloth
397
+ ::u 13319 ::comment Egyptian Hieroglyph ::pic throw stick
398
+ ::u 13362 ::num 100 ::comment Egyptian Hieroglyph
399
+ ::u 13363 ::num 200 ::comment Egyptian Hieroglyph
400
+ ::u 13364 ::num 300 ::comment Egyptian Hieroglyph
401
+ ::u 13365 ::num 400 ::comment Egyptian Hieroglyph
402
+ ::u 13366 ::num 500 ::comment Egyptian Hieroglyph
403
+ ::u 13367 ::num 600 ::comment Egyptian Hieroglyph
404
+ ::u 13368 ::num 700 ::comment Egyptian Hieroglyph
405
+ ::u 13369 ::num 800 ::comment Egyptian Hieroglyph
406
+ ::u 1336A ::num 900 ::comment Egyptian Hieroglyph
407
+ ::u 1336B ::num 500 ::comment Egyptian Hieroglyph
408
+ ::u 1336F ::r o ::comment Egyptian Hieroglyph ::pic lasso
409
+ ::u 1337F ::r t ::comment Egyptian Hieroglyph (ṯ) ::pic hobble
410
+ ::u 13386 ::num 10 ::comment Egyptian Hieroglyph
411
+ ::u 13387 ::num 20 ::comment Egyptian Hieroglyph
412
+ ::u 13388 ::num 30 ::comment Egyptian Hieroglyph
413
+ ::u 13389 ::num 40 ::comment Egyptian Hieroglyph
414
+ ::u 1338A ::num 50 ::comment Egyptian Hieroglyph
415
+ ::u 1338B ::num 60 ::comment Egyptian Hieroglyph
416
+ ::u 1338C ::num 70 ::comment Egyptian Hieroglyph
417
+ ::u 1338D ::num 80 ::comment Egyptian Hieroglyph
418
+ ::u 1338E ::num 90 ::comment Egyptian Hieroglyph
419
+ ::u 1338F ::num 20 ::comment Egyptian Hieroglyph
420
+ ::u 13390 ::num 30 ::comment Egyptian Hieroglyph
421
+ ::u 13391 ::num 40 ::comment Egyptian Hieroglyph
422
+ ::u 13392 ::num 50 ::comment Egyptian Hieroglyph
423
+ ::u 1339B ::r h ::comment Egyptian Hieroglyph ::pic twisted flax
424
+ ::u 133A1 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle
425
+ ::u 133A2 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle, variant
426
+ ::u 133A4 ::r g ::comment Egyptian Hieroglyph ::pic bag
427
+ ::u 133BC ::r g ::comment Egyptian Hieroglyph ::pic stand
428
+ ::u 133CF ::r t ::comment Egyptian Hieroglyph ::pic loaf
429
+ ::u 133ED ::r y ::comment Egyptian Hieroglyph ::pic two strokes
430
+ ::u 133F2 ::r w ::comment Egyptian Hieroglyph ::pic quail chick, hieratic variant
431
+ ::u 133FA ::num 1 ::comment Egyptian Hieroglyph
432
+ ::u 133FB ::num 2 ::comment Egyptian Hieroglyph
433
+ ::u 133FC ::num 3 ::comment Egyptian Hieroglyph
434
+ ::u 133FD ::num 4 ::comment Egyptian Hieroglyph
435
+ ::u 133FE ::num 5 ::comment Egyptian Hieroglyph
436
+ ::u 133FF ::num 6 ::comment Egyptian Hieroglyph
437
+ ::u 13400 ::num 7 ::comment Egyptian Hieroglyph
438
+ ::u 13401 ::num 8 ::comment Egyptian Hieroglyph
439
+ ::u 13402 ::num 9 ::comment Egyptian Hieroglyph
440
+ ::u 13403 ::num 5 ::comment Egyptian Hieroglyph
441
+ ::u 1340D ::r kh ::comment Egyptian Hieroglyph (ḫ, khah) ::pic placenta?
442
+ ::u 1341D ::r m ::comment Egyptian Hieroglyph (also jm)
uroman/data/romanization-table-arabic-block.txt ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::s ، ::t , ::comment ARABIC COMMA
2
+ ::s ؛ ::t ; ::comment ARABIC SEMICOLON
3
+ ::s ؟ ::t ? ::comment ARABIC QUESTION MARK
4
+ ::s ء ::t ' ::comment ARABIC LETTER HAMZA
5
+ ::s آ ::t a ::comment ARABIC LETTER ALEF WITH MADDA ABOVE
6
+ ::s أ ::t a ::comment ARABIC LETTER ALEF WITH HAMZA ABOVE
7
+ ::s ؤ ::t w ::comment ARABIC LETTER WAW WITH HAMZA ABOVE
8
+ ::s إ ::t i ::comment ARABIC LETTER ALEF WITH HAMZA BELOW
9
+ ::s ئ ::t ye ::comment ARABIC LETTER YEH WITH HAMZA ABOVE
10
+ ::s ا ::t a ::comment ARABIC LETTER ALEF
11
+ ::s ب ::t b ::comment ARABIC LETTER BEH
12
+ ::s ة ::t a ::comment ARABIC LETTER TEH MARBUTA
13
+ ::s ت ::t t ::comment ARABIC LETTER TEH
14
+ ::s ث ::t th ::comment ARABIC LETTER THEH
15
+ ::s ج ::t j ::comment ARABIC LETTER JEEM
16
+ ::s ح ::t h ::comment ARABIC LETTER HAH
17
+ ::s خ ::t kh ::comment ARABIC LETTER KHAH
18
+ ::s د ::t d ::comment ARABIC LETTER DAL
19
+ ::s ذ ::t th ::comment ARABIC LETTER THAL
20
+ ::s ر ::t r ::comment ARABIC LETTER REH
21
+ ::s ز ::t z ::comment ARABIC LETTER ZAIN
22
+ ::s س ::t s ::comment ARABIC LETTER SEEN
23
+ ::s ش ::t sh ::comment ARABIC LETTER SHEEN
24
+ ::s ص ::t s ::comment ARABIC LETTER SAD
25
+ ::s ض ::t d ::comment ARABIC LETTER DAD
26
+ ::s ط ::t t ::comment ARABIC LETTER TAH
27
+ ::s ظ ::t z ::comment ARABIC LETTER ZAH
28
+ ::s ع ::t ' ::comment ARABIC LETTER AIN
29
+ ::s غ ::t gh ::comment ARABIC LETTER GHAIN
30
+ ::s ـ ::t - ::comment ARABIC TATWEEL
31
+ ::s ف ::t f ::comment ARABIC LETTER FEH
32
+ ::s ق ::t q ::comment ARABIC LETTER QAF
33
+ ::s ك ::t k ::comment ARABIC LETTER KAF
34
+ ::s ل ::t l ::comment ARABIC LETTER LAM
35
+ ::s م ::t m ::comment ARABIC LETTER MEEM
36
+ ::s ن ::t n ::comment ARABIC LETTER NOON
37
+ ::s ه ::t h ::comment ARABIC LETTER HEH
38
+ ::s و ::t w ::comment ARABIC LETTER WAW
39
+ ::s ى ::t a ::comment ARABIC LETTER ALEF MAKSURA
40
+ ::s ي ::t y ::comment ARABIC LETTER YEH
41
+ ::s َ ::t a ::comment ARABIC FATHA
42
+ ::s ُ ::t u ::comment ARABIC DAMMA
43
+ ::s ِ ::t i ::comment ARABIC KASRA
44
+ ::s ْ ::t ::comment ARABIC SUKUN
45
+ ::s ٔ ::t ' ::comment ARABIC HAMZA ABOVE
46
+ ::s ٕ ::t ' ::comment ARABIC HAMZA BELOW
47
+ ::s ٠ ::t 0 ::comment ARABIC-INDIC DIGIT ZERO
48
+ ::s ١ ::t 1 ::comment ARABIC-INDIC DIGIT ONE
49
+ ::s ٢ ::t 2 ::comment ARABIC-INDIC DIGIT TWO
50
+ ::s ٣ ::t 3 ::comment ARABIC-INDIC DIGIT THREE
51
+ ::s ٤ ::t 4 ::comment ARABIC-INDIC DIGIT FOUR
52
+ ::s ٥ ::t 5 ::comment ARABIC-INDIC DIGIT FIVE
53
+ ::s ٦ ::t 6 ::comment ARABIC-INDIC DIGIT SIX
54
+ ::s ٧ ::t 7 ::comment ARABIC-INDIC DIGIT SEVEN
55
+ ::s ٨ ::t 8 ::comment ARABIC-INDIC DIGIT EIGHT
56
+ ::s ٩ ::t 9 ::comment ARABIC-INDIC DIGIT NINE
57
+ ::s ٪ ::t % ::comment ARABIC PERCENT SIGN
58
+ ::s ٫ ::t , ::comment ARABIC DECIMAL SEPARATOR
59
+ ::s ٬ ::t , ::comment ARABIC THOUSANDS SEPARATOR
60
+ ::s ٮ ::t b ::comment ARABIC LETTER DOTLESS BEH
61
+ ::s ٯ ::t q ::comment ARABIC LETTER DOTLESS QAF
62
+ ::s ٰ ::t a ::comment ARABIC LETTER SUPERSCRIPT ALEF
63
+ ::s ٱ ::t a ::comment ARABIC LETTER ALEF WASLA
64
+ ::s ٲ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE
65
+ ::s ٳ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
66
+ ::s ٷ ::t u ::comment ARABIC LETTER U WITH HAMZA ABOVE
67
+ ::s ٹ ::t tt ::comment ARABIC LETTER TTEH
68
+ ::s ٺ ::t tt ::comment ARABIC LETTER TTEHEH
69
+ ::s ٻ ::t b ::comment ARABIC LETTER BEEH
70
+ ::s ټ ::t t ::comment ARABIC LETTER TEH WITH RING
71
+ ::s ٽ ::t t ::comment ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS
72
+ ::s پ ::t p ::comment ARABIC LETTER PEH
73
+ ::s ٿ ::t t ::comment ARABIC LETTER TEHEH
74
+ ::s ڀ ::t b ::comment ARABIC LETTER BEHEH
75
+ ::s ځ ::t h ::comment ARABIC LETTER HAH WITH HAMZA ABOVE
76
+ ::s ڂ ::t h ::comment ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE
77
+ ::s ڃ ::t ny ::comment ARABIC LETTER NYEH
78
+ ::s ڄ ::t dy ::comment ARABIC LETTER DYEH
79
+ ::s څ ::t h ::comment ARABIC LETTER HAH WITH THREE DOTS ABOVE
80
+ ::s چ ::t tch ::comment ARABIC LETTER TCHEH
81
+ ::s ڇ ::t tch ::comment ARABIC LETTER TCHEHEH
82
+ ::s ڈ ::t dd ::comment ARABIC LETTER DDAL
83
+ ::s ډ ::t d ::comment ARABIC LETTER DAL WITH RING
84
+ ::s ڊ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW
85
+ ::s ڋ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH
86
+ ::s ڌ ::t d ::comment ARABIC LETTER DAHAL
87
+ ::s ڍ ::t dd ::comment ARABIC LETTER DDAHAL
88
+ ::s ڎ ::t d ::comment ARABIC LETTER DUL
89
+ ::s ڏ ::t d ::comment ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS
90
+ ::s ڐ ::t d ::comment ARABIC LETTER DAL WITH FOUR DOTS ABOVE
91
+ ::s ڑ ::t rr ::comment ARABIC LETTER RREH
92
+ ::s ڒ ::t r ::comment ARABIC LETTER REH WITH SMALL V
93
+ ::s ړ ::t r ::comment ARABIC LETTER REH WITH RING
94
+ ::s ڔ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW
95
+ ::s ڕ ::t r ::comment ARABIC LETTER REH WITH SMALL V BELOW
96
+ ::s ږ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE
97
+ ::s ڗ ::t r ::comment ARABIC LETTER REH WITH TWO DOTS ABOVE
98
+ ::s ژ ::t j ::comment ARABIC LETTER JEH
99
+ ::s ڙ ::t r ::comment ARABIC LETTER REH WITH FOUR DOTS ABOVE
100
+ ::s ښ ::t s ::comment ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
101
+ ::s ڛ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW
102
+ ::s ڜ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE
103
+ ::s ڝ ::t s ::comment ARABIC LETTER SAD WITH TWO DOTS BELOW
104
+ ::s ڞ ::t s ::comment ARABIC LETTER SAD WITH THREE DOTS ABOVE
105
+ ::s ڟ ::t t ::comment ARABIC LETTER TAH WITH THREE DOTS ABOVE
106
+ ::s ڠ ::t n ::comment ARABIC LETTER AIN WITH THREE DOTS ABOVE
107
+ ::s ڡ ::t f ::comment ARABIC LETTER DOTLESS FEH
108
+ ::s ڢ ::t f ::comment ARABIC LETTER FEH WITH DOT MOVED BELOW
109
+ ::s ڣ ::t f ::comment ARABIC LETTER FEH WITH DOT BELOW
110
+ ::s ڤ ::t v ::comment ARABIC LETTER VEH
111
+ ::s ڥ ::t f ::comment ARABIC LETTER FEH WITH THREE DOTS BELOW
112
+ ::s ڦ ::t p ::comment ARABIC LETTER PEHEH
113
+ ::s ڧ ::t q ::comment ARABIC LETTER QAF WITH DOT ABOVE
114
+ ::s ڨ ::t q ::comment ARABIC LETTER QAF WITH THREE DOTS ABOVE
115
+ ::s ک ::t k ::comment ARABIC LETTER KEHEH
116
+ ::s ڪ ::t k ::comment ARABIC LETTER SWASH KAF
117
+ ::s ګ ::t k ::comment ARABIC LETTER KAF WITH RING
118
+ ::s ڬ ::t k ::comment ARABIC LETTER KAF WITH DOT ABOVE
119
+ ::s ڭ ::t ng ::comment ARABIC LETTER NG
120
+ ::s ڮ ::t k ::comment ARABIC LETTER KAF WITH THREE DOTS BELOW
121
+ ::s گ ::t g ::comment ARABIC LETTER GAF
122
+ ::s ڰ ::t g ::comment ARABIC LETTER GAF WITH RING
123
+ ::s ڱ ::t ng ::comment ARABIC LETTER NGOEH
124
+ ::s ڲ ::t g ::comment ARABIC LETTER GAF WITH TWO DOTS BELOW
125
+ ::s ڳ ::t g ::comment ARABIC LETTER GUEH
126
+ ::s ڴ ::t g ::comment ARABIC LETTER GAF WITH THREE DOTS ABOVE
127
+ ::s ڵ ::t l ::comment ARABIC LETTER LAM WITH SMALL V
128
+ ::s ڶ ::t l ::comment ARABIC LETTER LAM WITH DOT ABOVE
129
+ ::s ڷ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS ABOVE
130
+ ::s ڸ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS BELOW
131
+ ::s ڹ ::t n ::comment ARABIC LETTER NOON WITH DOT BELOW
132
+ ::s ں ::t n ::comment ARABIC LETTER NOON GHUNNA
133
+ ::s ڻ ::t rn ::comment ARABIC LETTER RNOON
134
+ ::s ڼ ::t n ::comment ARABIC LETTER NOON WITH RING
135
+ ::s ڽ ::t n ::comment ARABIC LETTER NOON WITH THREE DOTS ABOVE
136
+ ::s ھ ::t h ::comment ARABIC LETTER HEH DOACHASHMEE
137
+ ::s ڿ ::t tch ::comment ARABIC LETTER TCHEH WITH DOT ABOVE
138
+ ::s ۀ ::t h ::comment ARABIC LETTER HEH WITH YEH ABOVE
139
+ ::s ہ ::t h ::comment ARABIC LETTER HEH GOAL
140
+ ::s ۂ ::t h ::comment ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
141
+ ::s ۃ ::t a ::comment ARABIC LETTER TEH MARBUTA GOAL
142
+ ::s ۄ ::t w ::comment ARABIC LETTER WAW WITH RING
143
+ ::s ۅ ::t oe ::comment ARABIC LETTER KIRGHIZ OE
144
+ ::s ۆ ::t oe ::comment ARABIC LETTER OE
145
+ ::s ۇ ::t u ::comment ARABIC LETTER U
146
+ ::s ۈ ::t yu ::comment ARABIC LETTER YU
147
+ ::s ۉ ::t yu ::comment ARABIC LETTER KIRGHIZ YU
148
+ ::s ۊ ::t w ::comment ARABIC LETTER WAW WITH TWO DOTS ABOVE
149
+ ::s ۋ ::t v ::comment ARABIC LETTER VE
150
+ ::s ی ::t y ::comment ARABIC LETTER FARSI YEH
151
+ ::s ۍ ::t y ::comment ARABIC LETTER YEH WITH TAIL
152
+ ::s ێ ::t y ::comment ARABIC LETTER YEH WITH SMALL V
153
+ ::s ۏ ::t w ::comment ARABIC LETTER WAW WITH DOT ABOVE
154
+ ::s ې ::t e ::comment ARABIC LETTER E
155
+ ::s ۑ ::t y ::comment ARABIC LETTER YEH WITH THREE DOTS BELOW
156
+ ::s ے ::t y ::comment ARABIC LETTER YEH BARREE
157
+ ::s ۓ ::t y ::comment ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
158
+ ::s ۔ ::t . ::comment ARABIC FULL STOP
159
+ ::s ە ::t ae ::comment ARABIC LETTER AE
160
+ ::s ۮ ::t d ::comment ARABIC LETTER DAL WITH INVERTED V
161
+ ::s ۯ ::t r ::comment ARABIC LETTER REH WITH INVERTED V
162
+ ::s ۰ ::t 0 ::comment EXTENDED ARABIC-INDIC DIGIT ZERO
163
+ ::s ۱ ::t 1 ::comment EXTENDED ARABIC-INDIC DIGIT ONE
164
+ ::s ۲ ::t 2 ::comment EXTENDED ARABIC-INDIC DIGIT TWO
165
+ ::s ۳ ::t 3 ::comment EXTENDED ARABIC-INDIC DIGIT THREE
166
+ ::s ۴ ::t 4 ::comment EXTENDED ARABIC-INDIC DIGIT FOUR
167
+ ::s ۵ ::t 5 ::comment EXTENDED ARABIC-INDIC DIGIT FIVE
168
+ ::s ۶ ::t 6 ::comment EXTENDED ARABIC-INDIC DIGIT SIX
169
+ ::s ۷ ::t 7 ::comment EXTENDED ARABIC-INDIC DIGIT SEVEN
170
+ ::s ۸ ::t 8 ::comment EXTENDED ARABIC-INDIC DIGIT EIGHT
171
+ ::s ۹ ::t 9 ::comment EXTENDED ARABIC-INDIC DIGIT NINE
172
+ ::s ۺ ::t sh ::comment ARABIC LETTER SHEEN WITH DOT BELOW
173
+ ::s ۻ ::t d ::comment ARABIC LETTER DAD WITH DOT BELOW
174
+ ::s ۼ ::t gh ::comment ARABIC LETTER GHAIN WITH DOT BELOW
175
+ ::s ۽ ::t & ::comment ARABIC SIGN SINDHI AMPERSAND
176
+ ::s ﷲ ::t allah ::comment ARABIC LIGATURE ALLAH ISOLATED FORM
177
+
178
+ ::s ‌ ::t ::comment ZERO WIDTH NON-JOINER
179
+ ::s ‍ ::t ::comment ZERO WIDTH JOINER
uroman/data/romanization-table.txt ADDED
@@ -0,0 +1,2019 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 
2
+ ## European Latin extensions
3
+ # Vowels
4
+ ::s Ä ::t Ae
5
+ ::s Ö ::t Oe
6
+ ::s Ü ::t Ue
7
+ ::s Å ::t Aa
8
+ ::s Æ ::t Ae
9
+ ::s Ø ::t oe
10
+ ::s Œ ::t Oe
11
+ ::s ä ::t ae
12
+ ::s ö ::t oe
13
+ ::s ü ::t ue
14
+ ::s å ::t aa
15
+ ::s æ ::t ae
16
+ ::s ø ::t oe
17
+ ::s œ ::t oe
18
+ # Consonants
19
+ ::s Ç ::t S
20
+ ::s ç ::t s
21
+ ::s Ç ::t Ch ::lcode tur
22
+ ::s ç ::t ch ::lcode tur
23
+ ::s Ş ::t Sh
24
+ ::s ş ::t sh
25
+ ::s Ș ::t Sh
26
+ ::s ș ::t sh
27
+ ::s ß ::t ss
28
+ ::s Ț ::t Ts
29
+ ::s ț ::t ts
30
+
31
+ # Digraphs
32
+ # ::s ʣ ::t dz
33
+ ::s ʤ ::t dzh ::comment Latin small letter dezh digraph
34
+ # ::s ʥ ::t dz
35
+ # ::s ʦ ::t ts
36
+ ::s ʧ ::t tsh ::comment Latin small letter tesh digraph
37
+ # ::s ʨ ::t tc
38
+
39
+ # Miscellaneous
40
+ ::s ə ::t e
41
+
42
+ # English
43
+ ::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
44
+ ::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
45
+ ::s eight ::t eight ::t-alt eit ::example eight, weight
46
+ ::s Eight ::t Eight ::t-alt Eit ::example Eighteen
47
+ ::s ight ::t ight ::t-alt ait ::example Knight
48
+ ::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
49
+ ::s high ::t high ::t-alt hai ::example highlight
50
+ ::s High ::t High ::t-alt Hai ::example High School
51
+ ::s Isle ::t Isle ::t-alt Ail ::use-only-for-whole-word ::example Isle
52
+ ::s Island ::t Island ::t-alt Ailand ::use-only-for-whole-word ::example Island
53
+ ::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
54
+ ::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
55
+ ::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
56
+ ::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
57
+ ::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
58
+ ::s ph ::t ph ::t-alt f ::example alpha
59
+ ::s Ph ::t Ph ::t-alt F ::example Philip
60
+ ::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
61
+ ::s tion ::t tion ::t-alt shen ::example
62
+ ::s Sean ::t Sean ::t-alt Shawn ::use-only-for-whole-word
63
+ ::s ssion ::t ssion ::t-alt shen ::example Sessions
64
+ ::s St ::t St ::t-alt Saint ::use-only-for-whole-word
65
+ ::s St. ::t St. ::t-alt Saint ::use-only-for-whole-word
66
+ ::s Wr ::t Wr ::t-alt R ::example Wren
67
+ ::s wr ::t wr ::t-alt r ::example Cartwright
68
+ ::s x ::t x ::t-alt ks ::example Mexico
69
+ ::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
70
+
71
+ # French
72
+ ::s â ::t a ::t-alt as ::example pâte/paste, pastry
73
+ ::s ê ::t e ::t-alt es ::example fête/feast
74
+ ::s î ::t i ::t-alt is ::example île/isle
75
+ ::s ô ::t o ::t-alt os ::example côte/coast
76
+ ::s û ::t u ::t-alt us ::example août/August
77
+ ::s eaux ::t eaux ::t-alt o ::example Bordeaux
78
+ ::s eau ::t eau ::t-alt o ::example Chateau
79
+ ::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
80
+ ::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
81
+ ::s oux ::t oux ::t-alt u
82
+ ::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
83
+
84
+ # German
85
+ ::s Sch ::t Sch ::t-alt Sh
86
+ ::s sch ::t sch ::t-alt sh
87
+ ::s stein ::t stein ::t-alt shtain
88
+ ::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
89
+
90
+ # Dutch
91
+ ::s ij ::t ij ::t-alt ai
92
+ ::s Ij ::t Ij ::t-alt Ai
93
+
94
+ # Latvian
95
+ ::s Ā ::t A ::t-alt Aa ::lcode lav
96
+ ::s ā ::t a ::t-alt aa ::lcode lav
97
+ ::s Ē ::t E ::t-alt Ee ::lcode lav
98
+ ::s ē ::t e ::t-alt ee ::lcode lav
99
+ ::s Ī ::t I ::t-alt Ii ::lcode lav
100
+ ::s ī ::t i ::t-alt ii ::lcode lav
101
+ ::s Ū ::t U ::t-alt Uu ::lcode lav
102
+ ::s ū ::t u ::t-alt uu ::lcode lav
103
+ ::s Ģ ::t G ::t-alt Gj ::lcode lav
104
+ ::s ģ ::t g ::t-alt gj ::lcode lav
105
+ ::s Ķ ::t K ::t-alt Kj ::lcode lav
106
+ ::s ķ ::t k ::t-alt kj ::lcode lav
107
+ ::s Ļ ::t L ::t-alt Lj ::lcode lav
108
+ ::s ļ ::t l ::t-alt lj ::lcode lav
109
+ ::s Ņ ::t N ::t-alt Nj ::lcode lav
110
+ ::s ņ ::t n ::t-alt nj ::lcode lav
111
+ ::s C ::t C ::t-alt Ts ::lcode lav
112
+ ::s c ::t c ::t-alt ts ::lcode lav
113
+ ::s Č ::t C ::t-alt Tsh ::lcode lav
114
+ ::s č ::t c ::t-alt tsh ::lcode lav
115
+ ::s Š ::t Sh ::t-alt s ::lcode lav
116
+ ::s š ::t sh ::t-alt s ::lcode lav
117
+ ::s Ž ::t Z ::t-alt Zh ::lcode lav
118
+ ::s ž ::t z ::t-alt zh ::lcode lav
119
+
120
+ # Lithuanian
121
+ ::s C ::t C ::t-alt Ts ::lcode lit
122
+ ::s c ::t c ::t-alt ts ::lcode lit
123
+ ::s Č ::t C ::t-alt Tsh ::lcode lit
124
+ ::s č ::t c ::t-alt tsh ::lcode lit
125
+ ::s Š ::t Sh ::t-alt s ::lcode lit
126
+ ::s š ::t sh ::t-alt s ::lcode lit
127
+ ::s Ž ::t Z ::t-alt Zh ::lcode lit
128
+ ::s ž ::t z ::t-alt zh ::lcode lit
129
+
130
+ # International Greek (e.g. as used in chemical compounds)
131
+ ::s β ::t b
132
+ ::s Β ::t B
133
+ ::s ϐ ::t b
134
+
135
+ # Ancient Greek
136
+ ::s β ::t b ::lcode grc
137
+ ::s Β ::t B ::lcode grc
138
+ ::s γγ ::t ng ::lcode grc
139
+ ::s γκ ::t nk ::lcode grc
140
+ ::s γξ ::t nx ::lcode grc
141
+ ::s γχ ::t nch ::lcode grc
142
+ ::s ϱ ::t r ::lcode grc
143
+
144
+ # Pontic Greek
145
+ ::s β ::t v ::t-alt b ::lcode pnt
146
+ ::s Β ::t V ::t-alt B ::lcode pnt
147
+ ::s ϐ ::t v ::t-alt b ::lcode pnt
148
+
149
+ # Modern Greek (generally the default)
150
+ ::s β ::t v ::t-alt b ::lcode ell
151
+ ::s Β ::t V ::t-alt B ::lcode ell
152
+ ::s ϐ ::t v ::t-alt b ::lcode ell
153
+ ::s Ι ::t I
154
+ ::s ι ::t i
155
+ ::s ί ::t i
156
+ ::s ἶ ::t i
157
+ ::s Υ ::t Y
158
+ ::s υ ::t y
159
+ ::s Ρ ::t R
160
+ ::s ρ ::t r
161
+ ::s ϱ ::t r
162
+ ::s Χ ::t Ch ::t-alt Kh
163
+ ::s χ ::t ch ::t-alt kh
164
+ ::s φ ::t f ::t-alt ph
165
+ ::s Φ ::t F ::t-alt Ph
166
+ ::s Ντ ::t D
167
+ ::s ντ ::t nd ::t-alt d, nt
168
+ # ::s ντζ ::t ntz
169
+ ::s Μπ ::t B
170
+ ::s μπ ::t b ::use-only-at-start-of-word
171
+ ::s μπ ::t mb ::t-alt b, mp ::dont-use-at-start-of-word
172
+ ::s λμπ ::t lb
173
+ ::s νμπ ::t nb
174
+ ::s ρμπ ::t rb
175
+ ::s γγ ::t ng
176
+ ::s Γκ ::t G
177
+ ::s γκ ::t ng ::t-alt g ::dont-use-at-start-of-word
178
+ ::s γκ ::t g ::use-only-at-start-of-word
179
+ ::s γξ ::t nx ::lcode grc
180
+ ::s γχ ::t nch ::lcode grc
181
+ ::s ει ::t ei ::t-alt i
182
+ ::s Ει ::t Ei ::t-alt I
183
+ ::s ευ ::t eu ::t-alt ev ::comment donated by Constantine
184
+ ::s Ευ ::t Eu ::t-alt Ev ::comment donated by Constantine
185
+ ::s αυ ::t au ::t-alt av
186
+ ::s Αυ ::t Au ::t-alt Av
187
+ ::s ου ::t ou ::t-alt u
188
+ ::s Ου ::t Ou ::t-alt U
189
+ ::s ηυ ::t eu
190
+ ::s Ηυ ::t Eu
191
+ ::s υι ::t ui
192
+ ::s Υι ::t Ui
193
+ ::s ωυ ::t ou
194
+ ::s Ωυ ::t Ou
195
+ ::s ͺ ::t ::comment GREEK YPOGEGRAMMENI (U+037A)
196
+ ::s ϒ ::t Y ::comment GREEK UPSILON WITH HOOK SYMBOL (U+03D2)
197
+ ::s ϓ ::t Y ::comment GREEK UPSILON WITH ACUTE AND HOOK SYMBOL (U+03D3)
198
+ ::s ϔ ::t Y ::comment GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL (U+03D4)
199
+ ::s ι ::t ::comment GREEK PROSGEGRAMMENI (U+1FBE)
200
+ ::s ᾿ ::t ::comment GREEK PSILI (U+1FBF)
201
+ ::s ῀ ::t ::comment GREEK PERISPOMENI (U+1FC0)
202
+ ::s ` ::t ::comment GREEK VARIA (U+1FEF)
203
+ ::s ´ ::t ::comment GREEK OXIA (U+1FFD)
204
+
205
+ # Glagolitic
206
+ ::s Ⰿ ::t M ::comment GLAGOLITIC CAPITAL LETTER MYSLITE (U+2C0F)
207
+ ::s Ⱞ ::t M ::comment GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE (U+2C2E)
208
+ ::s ⰿ ::t m ::comment GLAGOLITIC SMALL LETTER MYSLITE (U+2C3F)
209
+ ::s ⱞ ::t m ::comment GLAGOLITIC SMALL LETTER LATINATE MYSLITE (U+2C5E)
210
+ ::s 𞀏 ::t m ::comment COMBINING GLAGOLITIC LETTER MYSLITE (U+1E00F)
211
+
212
+ # Cyrillic
213
+ ::s Г ::t G ::t-alt H ::comment Cyrillic capital ghe
214
+ ::s г ::t g ::t-alt h ::comment Cyrillic small ghe
215
+ ::s Е ::t E ::t-alt Ye ::comment Cyrillic capital ie
216
+ ::s е ::t e ::t-alt ye ::comment Cyrillic small ie
217
+ ::s Ё ::t E ::t-alt Yo
218
+ ::s ё ::t e ::t-alt yo
219
+ ::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
220
+ ::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
221
+ ::s Щ ::t Shch ::t-alt Sh
222
+ ::s щ ::t shch ::t-alt sh
223
+ ::s Ъ ::t ::comment Cyrillic capital hard sign
224
+ ::s ъ ::t ::comment Cyrillic small hard sign
225
+ ::s ᲆ ::t ::comment CYRILLIC SMALL LETTER TALL HARD SIGN
226
+ ::s Ы ::t Y ::comment Cyrillic capital yeru
227
+ ::s ы ::t y ::comment Cyrillic small yeru
228
+ ::s Ь ::t ::comment Cyrillic capital soft sign
229
+ ::s ь ::t ::comment Cyrillic small soft sign
230
+ ::s Ж ::t Zh ::comment Cyrillic capital letter zhe
231
+ ::s Ш ::t Sh ::comment Cyrillic capital letter sha
232
+ ::s Ч ::t Ch ::comment Cyrillic capital letter che
233
+ ::s Џ ::t Dzh ::comment Cyrillic capital letter dzhe
234
+ ::s Є ::t Ie ::comment Cyrillic capital letter ie
235
+ ::s Ю ::t Yu ::comment Cyrillic capital letter yu
236
+ ::s Я ::t Ya ::comment Cyrillic capital letter ya
237
+
238
+ ::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
239
+ ::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
240
+ ::s Ә ::t e ::comment Cyrillic capital schwa
241
+ ::s ә ::t e ::comment Cyrillic small schwa
242
+ ::s Ӏ ::t ' ::comment Cyrillic palochka
243
+ ::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
244
+ ::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
245
+ ::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
246
+ ::s ӕ ::t ae ::comment Cyrillic small ligature a ie
247
+ ::s ʹ ::t "'" ::comment modifier letter prime
248
+ ::s ʺ ::t '"' ::comment modifier letter double prime
249
+ ::s ий ::t iy ::dont-use-at-end-of-word
250
+ ::s ий ::t y ::use-only-at-end-of-word
251
+
252
+ ::s ᲈ ::t u ::comment CYRILLIC SMALL LETTER UNBLENDED UK ligature ou
253
+
254
+ # Russian
255
+ ::s Г ::t G ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter ghe
256
+ ::s г ::t g ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter ghe
257
+ ::s Й ::t Y ::t-alt I, J ::lcode rus ::comment Cyrillic capital letter short i
258
+ ::s й ::t y ::t-alt i, j ::lcode rus ::comment Cyrillic small letter short i
259
+ ::s Ц ::t Ts ::t-alt C ::lcode rus ::comment Cyrillic capital letter tse
260
+ ::s ц ::t ts ::t-alt c ::lcode rus ::comment Cyrillic small letter tse
261
+ ::s Щ ::t Shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter shcha
262
+ ::s щ ::t shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter shcha
263
+ ::s Ѣ ::t E ::t-alt Ie ::lcode rus ::comment archaic Cyrillic capital letter yat
264
+ ::s ѣ ::t e ::t-alt ie ::lcode rus ::comment archaic Cyrillic small letter yat
265
+ ::s Е ::t E ::t-alt Ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic capital ie
266
+ ::s Е ::t Ye ::t-alt E ::use-only-at-start-of-word ::lcode rus
267
+ ::s е ::t e ::t-alt ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic small ie
268
+ ::s е ::t ye ::t-alt e ::use-only-at-start-of-word ::lcode rus
269
+ ::s ае ::t aye ::lcode rus
270
+ ::s а́е ::t aye ::lcode rus
271
+ ::s ее ::t eye ::lcode rus
272
+ ::s е́е ::t eye ::lcode rus
273
+ ::s ие ::t iye ::lcode rus
274
+ ::s и́е ::t iye ::lcode rus
275
+ ::s ое ::t oye ::lcode rus
276
+ ::s о́е ::t oye ::lcode rus
277
+ ::s уе ::t uye ::lcode rus
278
+ ::s у́е ::t uye ::lcode rus
279
+ ::s ье ::t ye ::lcode rus
280
+ ::s ъе ::t ye ::lcode rus
281
+ ::s Ё ::t Yo ::t-alt E ::lcode rus ::comment Cyrillic capital io
282
+ ::s ё ::t yo ::t-alt e ::lcode rus
283
+ ::s аё ::t ayo ::lcode rus
284
+ ::s а́ё ::t ayo ::lcode rus
285
+ ::s её ::t eyo ::lcode rus
286
+ ::s е́ё ::t eyo ::lcode rus
287
+ ::s иё ::t iyo ::lcode rus
288
+ ::s и́ё ::t iyo ::lcode rus
289
+ ::s оё ::t oyo ::lcode rus
290
+ ::s о́ё ::t oyo ::lcode rus
291
+ ::s уё ::t uyo ::lcode rus
292
+ ::s у́ё ::t uyo ::lcode rus
293
+ ::s ьё ::t yo ::lcode rus
294
+ ::s ъё ::t yo ::lcode rus
295
+ ::s ий ::t y ::lcode rus
296
+
297
+ # Ukrainian
298
+ ::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
299
+ ::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
300
+ ::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
301
+ ::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
302
+ ::s Е ::t E ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital ie
303
+ ::s е ::t e ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small ie
304
+ ::s И ::t Y ::lcode ukr ::comment Ukrainian capital letter i
305
+ ::s и ::t y ::lcode ukr ::comment Ukrainian small letter i
306
+ ::s Ї ::t Yi ::lcode ukr ::comment Ukrainian capital letter yi
307
+ ::s ї ::t yi ::lcode ukr ::comment Ukrainian small letter yi
308
+ ::s Й ::t I ::t-alt Y ::lcode ukr ::comment Cyrillic capital letter short i
309
+ ::s й ::t i ::t-alt y ::lcode ukr ::comment Cyrillic small letter short i
310
+ ::s Ц ::t Ts ::t-alt C ::lcode ukr ::comment Cyrillic capital letter tse
311
+ ::s ц ::t ts ::t-alt c ::lcode ukr ::comment Cyrillic small letter tse
312
+ ::s Щ ::t Shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital letter shcha
313
+ ::s щ ::t shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small letter shcha
314
+ ::s Ѣ ::t E ::t-alt Ie ::lcode ukr ::comment archaic Cyrillic capital letter yat
315
+ ::s ѣ ::t e ::t-alt ie ::lcode ukr ::comment archaic Cyrillic small letter yat
316
+ ::s Иї ::t Yi ::lcode ukr ::comment avoid Yyi
317
+ ::s иї ::t yi ::lcode ukr ::comment avoid yyi
318
+ ::s ій ::t iy ::lcode ukr
319
+ ::s і́й ::t iy ::lcode ukr
320
+ ::s ий ::t y ::lcode ukr ::comment Зеленський/Zelensky
321
+
322
+ # Belarusian
323
+ ::s Г ::t H ::t-alt G ::lcode bel ::comment capital letter he
324
+ ::s г ::t h ::t-alt g ::lcode bel ::comment small letter he
325
+ ::s Ґ ::t G ::lcode bel ::comment capital letter ghe
326
+ ::s ґ ::t g ::lcode bel ::comment small letter ghe
327
+ ::s Й ::t J ::t-alt Y ::lcode bel ::comment Cyrillic capital letter short i
328
+ ::s й ::t j ::t-alt y ::lcode bel ::comment Cyrillic small letter short i
329
+ ::s Ц ::t Ts ::t-alt C ::lcode bel ::comment Cyrillic capital letter tse
330
+ ::s ц ::t ts ::t-alt c ::lcode bel ::comment Cyrillic small letter tse
331
+ ::s Щ ::t Shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic capital letter shcha
332
+ ::s щ ::t shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic small letter shcha
333
+ ::s Ѣ ::t E ::t-alt Ie ::lcode bel ::comment archaic Cyrillic capital letter yat
334
+ ::s ѣ ::t e ::t-alt ie ::lcode bel ::comment archaic Cyrillic small letter yat
335
+ ::s 'я ::t ya ::lcode bel
336
+ ::s ’я ::t ya ::lcode bel
337
+ ::s 'і ::t i ::lcode bel
338
+ ::s ’і ::t i ::lcode bel
339
+ ::s Ё ::t Yo ::t-alt E ::lcode bel ::comment Cyrillic capital io
340
+ ::s ё ::t yo ::t-alt e ::lcode bel
341
+ ::s ёў ::t you ::lcode bel
342
+ ::s ий ::t y ::lcode bel
343
+
344
+ # Serbian
345
+ ::s Г ::t G ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ghe
346
+ ::s г ::t g ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ghe
347
+ ::s Х ::t H ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ha
348
+ ::s х ::t h ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ha
349
+ ::s Е ::t E ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ie
350
+ ::s е ::t e ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ie
351
+ ::s Ђ ::t Dj ::lcode srp ::comment Cyrillic capital dje
352
+ ::s Љ ::t Lj ::lcode srp ::comment Cyrillic capital lje
353
+ ::s Ћ ::t Tsh ::lcode srp ::comment Cyrillic capital tshe
354
+ ::s Ж ::t Zh ::lcode srp ::comment Cyrillic capital zhe
355
+ ::s Ц ::t C ::t-alt Ts ::lcode srp ::comment Cyrillic capital tse
356
+ ::s ц ::t c ::t-alt ts ::lcode srp ::comment Cyrillic small tse
357
+ ::s Đ ::t Dj ::lcode srp ::comment Latin capital d with stroke
358
+ ::s đ ::t dj ::lcode srp ::comment Latin small d with stroke
359
+ ::s Ž ::t Zh ::lcode srp ::comment Latin capital z with caron
360
+ ::s ž ::t zh ::lcode srp ::comment Latin small z with caron
361
+ ::s Ć ::t Tsh ::lcode srp ::comment Latin capital c with acute
362
+ ::s ć ::t tsh ::lcode srp ::comment Latin small c with acute
363
+ ::s Č ::t Ch ::lcode srp ::comment Latin capital c with caron
364
+ ::s č ::t ch ::lcode srp ::comment Latin small c with caron
365
+ ::s Š ::t Sh ::lcode srp ::comment Latin capital s with caron
366
+ ::s š ::t sh ::lcode srp ::comment Latin small s with caron
367
+
368
+ ::s Г ::t G ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ghe
369
+ ::s г ::t g ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ghe
370
+ ::s Х ::t H ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ha
371
+ ::s х ::t h ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ha
372
+ ::s Ц ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter tse
373
+ ::s ц ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter tse
374
+ ::s Ч ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter che
375
+ ::s ч ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter che
376
+ ::s Џ ::t Dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter dzhe
377
+ ::s џ ::t dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter dzhe
378
+ ::s Е ::t E ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ie
379
+ ::s е ::t e ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ie
380
+ ::s Ш ::t S ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital sha
381
+ ::s ш ::t s ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small sha
382
+ ::s Ж ::t Z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital zhe
383
+ ::s ж ::t z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small zhe
384
+ ::s Љ ::t Lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital lje
385
+ ::s љ ::t lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small lje
386
+ ::s Њ ::t Nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital nje
387
+ ::s њ ::t nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small nje
388
+ ::s Ђ ::t Dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital dje
389
+ ::s ђ ::t dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small dje
390
+ ::s Ћ ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital tshe
391
+ ::s ћ ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small tshe
392
+ ::s Đ ::t Dj ::lcode srp2 ::comment Latin capital d with stroke
393
+ ::s đ ::t dj ::lcode srp2 ::comment Latin small d with stroke
394
+
395
+ # Montenegrin extension (controversial)
396
+ ::s З́ ::t Zj ::lcode srp ::comment Cyrillic capital zje
397
+ ::s з́ ::t zj ::lcode srp ::comment Cyrillic small zje
398
+ ::s С́ ::t Sj ::lcode srp ::comment Cyrillic capital sje
399
+ ::s с́ ::t sj ::lcode srp ::comment Cyrillic small sje
400
+ ::s Ź ::t Zj ::lcode srp ::comment Latin capital z with acute
401
+ ::s ź ::t zj ::lcode srp ::comment Latin small z with acute
402
+ ::s Ś ::t Sj ::lcode srp ::comment Latin capital s with acute
403
+ ::s ś ::t sj ::lcode srp ::comment Latin small s with acute
404
+
405
+ ::s З́ ::t Z ::lcode srp2 ::comment Cyrillic capital zje
406
+ ::s з́ ::t z ::lcode srp2 ::comment Cyrillic small zje
407
+ ::s С́ ::t S ::lcode srp2 ::comment Cyrillic capital sje
408
+ ::s с́ ::t s ::lcode srp2 ::comment Cyrillic small sje
409
+ ::s Ź ::t Z ::lcode srp2 ::comment Latin capital z with acute
410
+ ::s ź ::t z ::lcode srp2 ::comment Latin small z with acute
411
+ ::s Ś ::t S ::lcode srp2 ::comment Latin capital s with acute
412
+ ::s ś ::t s ::lcode srp2 ::comment Latin small s with acute
413
+
414
+ # Bulgarian
415
+ ::s Г ::t G ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ghe
416
+ ::s г ::t g ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ghe
417
+ ::s Х ::t H ::t-alt Kh ::lcode bul ::comment Cyrillic capital letter ha
418
+ ::s х ::t h ::t-alt kh ::lcode bul ::comment Cyrillic small letter ha
419
+ ::s Ц ::t C ::t-alt Ts ::lcode bul ::comment Cyrillic capital letter tse
420
+ ::s ц ::t c ::t-alt ts ::lcode bul ::comment Cyrillic small letter tse
421
+ ::s Щ ::t Sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital letter shcha
422
+ ::s щ ::t sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small letter shcha
423
+ ::s Е ::t E ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ie
424
+ ::s е ::t e ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ie
425
+ ::s Ж ::t Zh ::t-alt Z, J ::lcode bul ::comment Cyrillic capital zhe
426
+ ::s ж ::t zh ::t-alt z, j ::lcode bul ::comment Cyrillic small zhe
427
+ ::s Й ::t I ::t-alt Y, J ::lcode bul ::comment Cyrillic capital letter short i
428
+ ::s й ::t i ::t-alt y, j ::lcode bul ::comment Cyrillic small letter short i
429
+ ::s Ю ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Cyrillic capital letter yu
430
+ ::s ю ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Cyrillic small letter yu
431
+ ::s Ъ ::t U ::t-alt A ::lcode bul ::comment Cyrillic capital letter hard sign
432
+ ::s ъ ::t u ::t-alt a ::lcode bul ::comment Cyrillic small letter hard sign
433
+ ::s Ѣ ::t E ::t-alt Ie ::lcode bul ::comment archaic Cyrillic capital letter yat
434
+ ::s ѣ ::t e ::t-alt ie ::lcode bul ::comment archaic Cyrillic small letter yat
435
+ ::s Ѫ ::t U ::lcode bul ::comment archaic Cyrillic capital letter yus
436
+ ::s ѫ ::t u ::lcode bul ::comment archaic Cyrillic small letter yus
437
+ ::s ИЯ ::t IA ::lcode bul ::use-only-at-end-of-word
438
+ ::s ия ::t ia ::lcode bul ::use-only-at-end-of-word
439
+
440
+ ::s Ž ::t Zh ::lcode bul ::comment Latin capital z with caron
441
+ ::s ž ::t zh ::lcode bul ::comment Latin small z with caron
442
+ ::s Č ::t Ch ::lcode bul ::comment Latin capital c with caron
443
+ ::s č ::t ch ::lcode bul ::comment Latin small c with caron
444
+ ::s Š ::t Sh ::lcode bul ::comment Latin capital s with caron
445
+ ::s š ::t sh ::lcode bul ::comment Latin small s with caron
446
+ ::s Ŝ ::t Sht ::lcode bul ::comment Latin capital s with circumflex
447
+ ::s ŝ ::t sht ::lcode bul ::comment Latin small s with circumflex
448
+ ::s Û ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Latin capital u with circumflex
449
+ ::s û ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Latin small u with circumflex
450
+ ::s  ::t Ya ::t-alt _NONE_ ::lcode bul ::comment Latin capital a with circumflex
451
+ ::s â ::t ya ::t-alt _NONE_ ::lcode bul ::comment Latin small a with circumflex
452
+ ::s Ŭ ::t U ::t-alt A ::lcode bul ::comment Latin capital u with breve (for hard sign)
453
+ ::s ŭ ::t u ::t-alt a ::lcode bul ::comment Latin small u with breve (for hard sign)
454
+ ::s Ǎ ::t U ::t-alt A ::lcode bul ::comment Latin capital a with caron (for hard sign)
455
+ ::s ǎ ::t u ::t-alt a ::lcode bul ::comment Latin small a with caron (for hard sign)
456
+
457
+ # Macedonian
458
+ ::s Г ::t G ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ghe
459
+ ::s г ::t g ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ghe
460
+ ::s Х ::t H ::lcode mkd ::comment Cyrillic capital ha
461
+ ::s х ::t h ::lcode mkd ::comment Cyrillic small ha
462
+ ::s Ц ::t C ::t-alt Ts ::lcode mkd ::comment Cyrillic capital letter tse
463
+ ::s ц ::t c ::t-alt ts ::lcode mkd ::comment Cyrillic small letter tse
464
+ ::s Џ ::t Dzh ::t-alt Dj, Dz ::lcode mkd ::comment Cyrillic capital letter dzhe
465
+ ::s џ ::t dzh ::t-alt dj, dz ::lcode mkd ::comment Cyrillic small letter dzhe
466
+ ::s Е ::t E ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ie
467
+ ::s е ::t e ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ie
468
+ ::s Ž ::t Zh ::lcode mkd ::comment Latin capital z with caron
469
+ ::s ž ::t zh ::lcode mkd ::comment Latin small z with caron
470
+ ::s Č ::t Ch ::lcode mkd ::comment Latin capital c with caron
471
+ ::s č ::t ch ::lcode mkd ::comment Latin small c with caron
472
+ ::s Š ::t Sh ::lcode mkd ::comment Latin capital s with caron
473
+ ::s š ::t sh ::lcode mkd ::comment Latin small s with caron
474
+ ::s Ǵ ::t Gj ::lcode mkd
475
+ ::s ǵ ::t gj ::lcode mkd
476
+ ::s Đ ::t Gj ::lcode mkd
477
+ ::s đ ::t gj ::lcode mkd
478
+ ::s Ẑ ::t Dz ::lcode mkd
479
+ ::s ẑ ::t dz ::lcode mkd
480
+ ::s J̌ ::t J ::lcode mkd
481
+ ::s ǰ ::t j ::lcode mkd
482
+ ::s L̂ ::t Lj ::lcode mkd
483
+ ::s l̂ ::t lj ::lcode mkd
484
+ ::s N̂ ::t Nj ::lcode mkd
485
+ ::s n̂ ::t nj ::lcode mkd
486
+ ::s Ḱ ::t Kj ::lcode mkd
487
+ ::s ḱ ::t kj ::lcode mkd
488
+ ::s Ć ::t Kj ::lcode mkd
489
+ ::s ć ::t kj ::lcode mkd
490
+ ::s D̂ ::t Dzh ::lcode mkd
491
+ ::s d̂ ::t dzh ::lcode mkd
492
+
493
+ ::s Г ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ghe
494
+ ::s г ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ghe
495
+ ::s Х ::t H ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ha
496
+ ::s х ::t h ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ha
497
+ ::s Ц ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter tse
498
+ ::s ц ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter tse
499
+ ::s Ч ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter che
500
+ ::s ч ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter che
501
+ ::s Џ ::t D ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter dzhe
502
+ ::s џ ::t d ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter dzhe
503
+ ::s Е ::t E ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ie
504
+ ::s е ::t e ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ie
505
+ ::s Ш ::t S ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital sha
506
+ ::s ш ::t s ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small sha
507
+ ::s Ѓ ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital gje
508
+ ::s ѓ ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small gje
509
+ ::s Ж ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital zhe
510
+ ::s ж ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small zhe
511
+ ::s Ѕ ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital dze
512
+ ::s ѕ ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small dze
513
+ ::s Ќ ::t K ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital kje
514
+ ::s ќ ::t k ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small kje
515
+ ::s Љ ::t L ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital lje
516
+ ::s љ ::t l ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small lje
517
+ ::s Њ ::t N ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital nje
518
+ ::s њ ::t n ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small nje
519
+ ::s Ž ::t Z ::lcode mkd2 ::comment Latin capital z with caron
520
+ ::s ž ::t z ::lcode mkd2 ::comment Latin small z with caron
521
+ ::s Č ::t C ::lcode mkd2 ::comment Latin capital c with caron
522
+ ::s č ::t c ::lcode mkd2 ::comment Latin small c with caron
523
+ ::s Š ::t S ::lcode mkd2 ::comment Latin capital s with caron
524
+ ::s š ::t s ::lcode mkd2 ::comment Latin small s with caron
525
+ ::s Ǵ ::t G ::lcode mkd2
526
+ ::s ǵ ::t g ::lcode mkd2
527
+ ::s Đ ::t G ::lcode mkd2
528
+ ::s đ ::t g ::lcode mkd2
529
+ ::s Ẑ ::t D ::lcode mkd2
530
+ ::s ẑ ::t d ::lcode mkd2
531
+ ::s J̌ ::t J ::lcode mkd2
532
+ ::s ǰ ::t j ::lcode mkd2
533
+ ::s L̂ ::t L ::lcode mkd2
534
+ ::s l̂ ::t l ::lcode mkd2
535
+ ::s N̂ ::t N ::lcode mkd2
536
+ ::s n̂ ::t n ::lcode mkd2
537
+ ::s Ḱ ::t K ::lcode mkd2
538
+ ::s ḱ ::t k ::lcode mkd2
539
+ ::s Ć ::t K ::lcode mkd2
540
+ ::s ć ::t k ::lcode mkd2
541
+ ::s D̂ ::t D ::lcode mkd2
542
+ ::s d̂ ::t d ::lcode mkd2
543
+
544
+ # Kazakh
545
+ ::s Ә ::t A ::lcode kaz
546
+ ::s ә ::t a ::lcode kaz
547
+ ::s Г ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe
548
+ ::s г ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe
549
+ ::s Ғ ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe with stroke
550
+ ::s ғ ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe with stroke
551
+ ::s Е ::t E ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ie
552
+ ::s е ::t e ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ie
553
+ ::s Ё ::t Yo ::t-alt _NONE_ ::lcode kaz
554
+ ::s ё ::t yo ::t-alt _NONE_ ::lcode kaz
555
+ ::s Х ::t H ::t-alt X ::lcode kaz ::comment Cyrillic capital ha
556
+ ::s х ::t h ::t-alt x ::lcode kaz ::comment Cyrillic small ha
557
+ ::s Һ ::t H ::lcode kaz ::comment Cyrillic capital shha
558
+ ::s һ ::t h ::lcode kaz ::comment Cyrillic small shha
559
+ ::s Қ ::t Q ::t-alt K ::lcode kaz
560
+ ::s қ ::t q ::t-alt k ::lcode kaz
561
+ ::s Ц ::t Ts ::t-alt C ::lcode kaz ::comment Cyrillic capital letter tse
562
+ ::s ц ::t ts ::t-alt c ::lcode kaz ::comment Cyrillic small letter tse
563
+ ::s Щ ::t Sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital letter shcha
564
+ ::s щ ::t sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small letter shcha
565
+ ::s У ::t U ::t-alt Y ::lcode kaz
566
+ ::s у ::t u ::t-alt y ::lcode kaz
567
+ ::s уы ::t wy ::lcode kaz
568
+ ::s Ж ::t J ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital zhe
569
+ ::s ж ::t j ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small zhe
570
+ ::s Ю ::t Yw ::t-alt Yuw, Yiw ::lcode kaz ::comment Cyrillic capital letter yu
571
+ ::s ю ::t yw ::t-alt yuw, yiw ::lcode kaz ::comment Cyrillic small letter yu
572
+
573
+ # Kyrgyz
574
+ ::s Г ::t G ::t-alt _NONE_ ::lcode kir ::comment Cyrillic capital ghe
575
+ ::s г ::t g ::t-alt _NONE_ ::lcode kir ::comment Cyrillic small ghe
576
+ ::s Е ::t E ::t-alt Ye ::lcode kir ::comment Cyrillic capital ie
577
+ ::s е ::t e ::t-alt ye ::lcode kir ::comment Cyrillic small ie
578
+ ::s Ё ::t Yo ::t-alt _NONE_ ::lcode kir
579
+ ::s ё ::t yo ::t-alt _NONE_ ::lcode kir
580
+ ::s Х ::t Kh ::t-alt X, H ::lcode kir ::comment Cyrillic capital ha
581
+ ::s х ::t kh ::t-alt x, h ::lcode kir ::comment Cyrillic small ha
582
+ ::s Ж ::t Zh ::t-alt J ::lcode kir ::comment Cyrillic capital zhe
583
+ ::s ж ::t zh ::t-alt j ::lcode kir ::comment Cyrillic small zhe
584
+ ::s Й ::t Y ::t-alt I ::lcode kir ::comment Cyrillic capital letter short i
585
+ ::s й ::t y ::t-alt i ::lcode kir ::comment Cyrillic small letter short i
586
+ ::s Ц ::t Ts ::t-alt C ::lcode kir ::comment Cyrillic capital letter tse
587
+ ::s ц ::t ts ::t-alt c ::lcode kir ::comment Cyrillic small letter tse
588
+ ::s Ң ::t Ng ::lcode kir
589
+ ::s ң ::t ng ::lcode kir
590
+ ::s Ө ::t O ::t-alt Oe ::lcode kir
591
+ ::s ө ::t o ::t-alt oe ::lcode kir
592
+ ::s Ү ::t U ::t-alt Y, Ue ::lcode kir
593
+ ::s ү ::t u ::t-alt y, ue ::lcode kir
594
+ ::s Ы ::t I ::t-alt Y ::lcode kir
595
+ ::s ы ::t i ::t-alt y ::lcode kir
596
+ ::s йы ::t yi ::lcode kir
597
+ ::s ый ::t iy ::lcode kir
598
+
599
+ # Ossetian
600
+ ::s ийы ::t iy ::lcode oss
601
+
602
+ # Gothic
603
+ ::s 𐌴 ::t e ::comment Gothic letter aihvus
604
+ ::s 𐌹 ::t i ::comment Gothic letter eis
605
+ ::s 𐍇 ::t x ::comment Gothic letter iggws
606
+
607
+ # Runic
608
+ ::s ᛫ ::t " " ::comment Runic single punctuation, used as word separator
609
+ ::s ᛬ ::t . ::comment Runic multiple punctuation, used as sentence separator
610
+
611
+ # Ogham
612
+ ::s ᚁ ::t b ::comment Ogham letter Beith
613
+ ::s ᚂ ::t l ::comment Ogham letter Luis
614
+ ::s ᚃ ::t f ::comment Ogham letter Fearn
615
+ ::s ᚄ ::t s ::comment Ogham letter Sail
616
+ ::s ᚅ ::t n ::comment Ogham letter Nion
617
+ ::s ᚋ ::t m ::comment Ogham letter Muin
618
+ ::s ᚌ ::t g ::comment Ogham letter Gort
619
+ ::s ᚍ ::t v ::t-alt ng ::comment Ogham letter nGéadal
620
+ ::s ᚎ ::t z ::comment Ogham letter Straif
621
+ ::s ᚏ ::t r ::comment Ogham letter Ruis
622
+ ::s ᚆ ::t h ::t-alt j ::comment Ogham letter Uath
623
+ ::s ᚇ ::t d ::comment Ogham letter Dair
624
+ ::s ᚈ ::t t ::comment Ogham letter Tinne
625
+ ::s ᚉ ::t k ::comment Ogham letter Coll
626
+ ::s ᚊ ::t q ::t-alt kw ::comment Ogham letter Ceirt
627
+ ::s ᚐ ::t a ::comment Ogham letter Ailm
628
+ ::s ᚑ ::t o ::comment Ogham letter Onn
629
+ ::s ᚒ ::t u ::comment Ogham letter Úr
630
+ ::s ᚓ ::t e ::comment Ogham letter Eadhadh
631
+ ::s ᚔ ::t i ::comment Ogham letter Iodhadh
632
+ ::s ᚚ ::t p ::comment Ogham letter Peith
633
+ # Additional Ogham letters (outside standard alphabet)
634
+ ::s ᚕ ::t eo ::t-alt ea ::comment Ogham additional letter Éabhadh
635
+ ::s ᚖ ::t oi ::t-alt oe ::comment Ogham additional letter Ór
636
+ ::s ᚗ ::t ui ::t-alt ua ::comment Ogham additional letter Uilleann
637
+ ::s ᚘ ::t p ::t-alt io ::comment Ogham additional letter Ifín
638
+ ::s ᚙ ::t ch ::t-alt x, ai ::comment Ogham additional letter Eamhancholl
639
+ ::s   ::t " " ::comment Ogham space mark
640
+ ::s ᚛ ::t "" ::comment Ogham feather mark
641
+ ::s ᚜ ::t "" ::comment Ogham feather mark
642
+
643
+ # Georgian
644
+ ::s ა ::t a ::comment Georgian letter an
645
+ ::s ე ::t e ::comment Georgian letter en
646
+ ::s ი ::t i ::comment Georgian letter in
647
+ ::s ო ::t o ::comment Georgian letter on
648
+ ::s უ ::t u ::comment Georgian letter un
649
+ ::s ჱ ::t ey ::comment archaic Georgian letter he
650
+ ::s ჲ ::t i ::comment archaic Georgian letter hie
651
+ ::s ჳ ::t w ::comment archaic Georgian letter we
652
+ ::s ჴ ::t q ::comment archaic Georgian letter har
653
+ ::s ჵ ::t o ::comment archaic Georgian letter hoe
654
+ ::s ჶ ::t f ::comment Georgian letter fi (Greek phi)
655
+ ::s ჷ ::t e ::comment Georgian letter yn (schwa)
656
+ ::s ჸ ::t a ::comment Georgian letter elifi
657
+ ::s ჹ ::t g ::comment Georgian letter gan
658
+ ::s ჺ ::t ' ::comment Georgian letter ain
659
+ ::s ჼ ::t n ::comment Georgian letter nar
660
+ ::s ჽ ::t e ::comment Georgian letter aen
661
+ ::s ჾ ::t ::comment Georgian letter hard sign
662
+ ::s ჿ ::t w ::comment Georgian letter labial sign
663
+
664
+ ::s Ⴚ ::t TS ::comment GEORGIAN CAPITAL LETTER CAN
665
+ ::s ც ::t ts ::comment GEORGIAN LETTER CAN
666
+ ::s Ც ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CAN
667
+ ::s ⴚ ::t ts ::comment GEORGIAN SMALL LETTER CAN
668
+ ::s Ⴜ ::t TS ::comment GEORGIAN CAPITAL LETTER CIL
669
+ ::s წ ::t ts ::comment GEORGIAN LETTER CIL
670
+ ::s Წ ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CIL
671
+ ::s ⴜ ::t ts ::comment GEORGIAN SMALL LETTER CIL
672
+ ::s Ⴛ ::t DZ ::comment GEORGIAN CAPITAL LETTER JIL
673
+ ::s ძ ::t dz ::comment GEORGIAN LETTER JIL
674
+ ::s Ძ ::t DZ ::comment GEORGIAN MTAVRULI CAPITAL LETTER JIL
675
+ ::s ⴛ ::t dz ::comment GEORGIAN SMALL LETTER JIL
676
+ ::s Ⴟ ::t J ::comment GEORGIAN CAPITAL LETTER JHAN
677
+ ::s ჯ ::t j ::comment GEORGIAN LETTER JHAN
678
+ ::s Ჯ ::t J ::comment GEORGIAN MTAVRULI CAPITAL LETTER JHAN
679
+ ::s ⴟ ::t j ::comment GEORGIAN SMALL LETTER JHAN
680
+
681
+
682
+ ::s Ⴀ ::t A ::comment Georgian capital letter an
683
+ ::s Ⴄ ::t E ::comment Georgian capital letter en
684
+ ::s Ⴈ ::t I ::comment Georgian capital letter in
685
+ ::s Ⴍ ::t O ::comment Georgian capital letter on
686
+ ::s Ⴓ ::t U ::comment Georgian capital letter un
687
+ ::s Ⴡ ::t EY ::comment archaic Georgian capital letter he
688
+ ::s Ⴢ ::t I ::comment archaic Georgian capital letter hie
689
+ ::s Ⴣ ::t W ::comment archaic Georgian capital letter we
690
+ ::s Ⴤ ::t Q ::comment archaic Georgian capital letter har
691
+ ::s Ⴥ ::t O ::comment archaic Georgian capital letter hoe
692
+ ::s Ⴧ ::t E ::comment archaic Georgian capital letter yn (schwa)
693
+ ::s Ⴭ ::t E ::comment archaic Georgian capital letter aen
694
+
695
+ ::s Ა ::t A ::comment Georgian Mtavruli capital letter an
696
+ ::s Ე ::t E ::comment Georgian Mtavruli capital letter en
697
+ ::s Ი ::t I ::comment Georgian Mtavruli capital letter in
698
+ ::s Ო ::t O ::comment Georgian Mtavruli capital letter on
699
+ ::s Უ ::t U ::comment Georgian Mtavruli capital letter un
700
+ ::s Ჱ ::t EY ::comment archaic Georgian Mtavruli capital letter he
701
+ ::s Ჲ ::t I ::comment archaic Georgian Mtavruli capital letter hie
702
+ ::s Ჳ ::t W ::comment archaic Georgian Mtavruli capital letter we
703
+ ::s Ჴ ::t Q ::comment archaic Georgian Mtavruli capital letter har
704
+ ::s Ჵ ::t O ::comment archaic Georgian Mtavruli capital letter hoe
705
+ ::s Ჶ ::t F ::comment Georgian Mtavruli capital letter fi (Greek phi)
706
+ ::s Ჷ ::t E ::comment Georgian Mtavruli capital letter yn (schwa)
707
+ ::s Ჸ ::t A ::comment Georgian Mtavruli capital letter elifi
708
+ ::s Ჹ ::t G ::comment Georgian Mtavruli capital letter gan
709
+ ::s Ჺ ::t ' ::comment Georgian Mtavruli capital letter ain
710
+ ::s Ჽ ::t E ::comment Georgian Mtavruli capital letter aen
711
+ ::s Ჾ ::t ::comment Georgian Mtavruli capital letter hard sign
712
+ ::s Ჿ ::t W ::comment Georgian Mtavruli capital letter labial sign
713
+
714
+ ::s ⴀ ::t a ::comment Georgian small letter an
715
+ ::s ⴄ ::t e ::comment Georgian small letter en
716
+ ::s ⴈ ::t i ::comment Georgian small letter in
717
+ ::s ⴍ ::t o ::comment Georgian small letter on
718
+ ::s ⴓ ::t u ::comment Georgian small letter un
719
+ ::s ⴡ ::t ey ::comment archaic Georgian small letter he
720
+ ::s ⴢ ::t i ::comment archaic Georgian small letter hie
721
+ ::s ⴣ ::t w ::comment archaic Georgian small letter we
722
+ ::s ⴤ ::t q ::comment archaic Georgian small letter har
723
+ ::s ⴥ ::t o ::comment archaic Georgian small letter hoe
724
+ ::s ⴧ ::t e ::comment Georgian small letter yn (schwa)
725
+ ::s ⴭ ::t e ::comment Georgian small letter aen
726
+
727
+ # Armenian
728
+ ::s Ա ::t A ::comment Armenian capital letter ayb
729
+ ::s ա ::t a ::comment Armenian small letter ayb
730
+ ::s ՠ ::t a ::comment ARMENIAN SMALL LETTER TURNED AYB (CHECK)
731
+ ::s Ե ::t E ::comment Armenian capital letter ech ::dont-use-at-start-of-word
732
+ ::s ե ::t e ::comment Armenian small letter ech ::dont-use-at-start-of-word
733
+ ::s Ե ::t Ye ::comment Armenian capital letter ech ::use-only-at-start-of-word
734
+ ::s ե ::t ye ::comment Armenian small letter ech ::use-only-at-start-of-word
735
+ ::s Է ::t E ::comment Armenian capital letter eh
736
+ ::s է ::t e ::comment Armenian small letter eh
737
+ ::s Ը ::t E ::comment Armenian capital letter et
738
+ ::s ը ::t e ::comment Armenian small letter et
739
+ ::s Ի ::t I ::comment Armenian capital letter ini
740
+ ::s ի ::t i ::comment Armenian small letter ini
741
+ ::s Յ ::t Y ::comment Armenian capital letter yi
742
+ ::s յ ::t y ::comment Armenian small letter yi
743
+ ::s ֈ ::t y ::comment ARMENIAN SMALL LETTER YI WITH STROKE (CHECK)
744
+ ::s Ո ::t Vo ::comment Armenian capital letter vo ::use-only-at-start-of-word
745
+ ::s ո ::t vo ::comment Armenian small letter vo ::use-only-at-start-of-word
746
+ ::s Ո ::t O ::comment Armenian capital letter vo ::dont-use-at-start-of-word
747
+ ::s ո ::t o ::comment Armenian small letter vo ::dont-use-at-start-of-word
748
+ ::s Ւ ::t W ::comment Armenian capital letter yiwn
749
+ ::s ւ ::t w ::comment Armenian small letter yiwn
750
+ ::s Օ ::t O ::comment Armenian capital letter oh
751
+ ::s օ ::t o ::comment Armenian small letter oh
752
+ ::s Խ ::t Kh ::comment Armenian capital letter xeh
753
+ ::s խ ::t kh ::comment Armenian small letter xeh
754
+
755
+ ::s Ժ ::t Zh ::comment Armenian capital letter zhe
756
+ ::s Ղ ::t Gh ::comment Armenian capital letter ghad
757
+ ::s Ճ ::t Tch ::comment Armenian capital letter cheh
758
+ ::s ճ ::t tch ::comment Armenian small letter cheh
759
+ ::s Շ ::t Sh ::comment Armenian capital letter sha
760
+ ::s Չ ::t Ch ::comment Armenian capital letter cha
761
+ ::s Ջ ::t J ::comment Armenian capital letter jheh
762
+ ::s ջ ::t j ::comment Armenian small letter jheh
763
+ ::s Վ ::t V ::comment Armenian capital letter vew
764
+ ::s վ ::t v ::comment Armenian small letter vew
765
+ ::s Ձ ::t Dz ::comment Armenian capital letter ja
766
+ ::s ձ ::t dz ::comment Armenian small letter ja
767
+ ::s Ծ ::t Ts ::comment Armenian capital letter ca
768
+ ::s ծ ::t ts ::comment Armenian small letter ca
769
+ ::s Ք ::t K ::t-alt Q ::comment Armenian capital letter keh - sometimes romanized as K' or Q
770
+ ::s ք ::t k ::t-alt q ::comment Armenian small letter keh - sometimes romanized as k' or q
771
+
772
+ ::s են ::t en ::use-only-for-whole-word ::comment exception (auxiliary verb)
773
+ ::s եմ ::t em ::use-only-for-whole-word ::comment exception (auxiliary verb)
774
+ ::s ենք ::t enk ::use-only-for-whole-word ::comment exception (auxiliary verb)
775
+ ::s ես ::t es ::use-only-for-whole-word ::comment exception (auxiliary verb)
776
+ ::s եք ::t ek ::use-only-for-whole-word ::comment exception (auxiliary verb)
777
+
778
+ ::s և ::t ev ::comment Armenian small ligature ech yiwn
779
+ ::s ՈՒ ::t U ::comment Armenian capital vo+yiwn
780
+ ::s Ու ::t U ::comment Armenian capital/small vo+yiwn
781
+ ::s ու ::t u ::comment Armenian small vo+yiwn
782
+
783
+ ::s իւ ::t yu
784
+
785
+ ## Japanese
786
+ # Katakana
787
+ ::s シ ::t shi
788
+ ::s チ ::t chi
789
+ ::s フ ::t fu
790
+ ::s ジ ::t ji
791
+ ::s ヂ ::t ji
792
+ ::s ヅ ::t zu
793
+ ::s シャ ::t sha
794
+ ::s シュ ::t shu
795
+ ::s ショ ::t sho
796
+ ::s チャ ::t cha
797
+ ::s チェ ::t che
798
+ ::s チュ ::t chu
799
+ ::s チョ ::t cho
800
+ ::s ジャ ::t ja
801
+ ::s ジュ ::t ju
802
+ ::s ジョ ::t jo
803
+ ::s ジェ ::t je
804
+ ::s ヂャ ::t ja
805
+ ::s ヂュ ::t ju
806
+ ::s ヂョ ::t jo
807
+ ::s フェ ::t fe
808
+ ::s ヴェ ::t ve
809
+ ::s フィ ::t fi
810
+ ::s ウィ ::t wi
811
+ ::s ヴィ ::t vi
812
+ ::s ティ ::t ti
813
+ ::s ディ ::t di
814
+ ::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
815
+ ::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
816
+ ::s 𛅤 ::t i ::comment KATAKANA LETTER SMALL WI
817
+ ::s 𛅥 ::t e ::comment KATAKANA LETTER SMALL WE
818
+ ::s 𛅦 ::t o ::comment KATAKANA LETTER SMALL WO
819
+ # Hiragana
820
+ ::s し ::t shi
821
+ ::s ち ::t chi
822
+ ::s つ ::t tsu
823
+ ::s ふ ::t fu
824
+ ::s を ::t o
825
+ ::s じ ::t ji
826
+ ::s ぢ ::t ji
827
+ ::s づ ::t zu
828
+ ::s しゃ ::t sha
829
+ ::s しゅ ::t shu
830
+ ::s しょ ::t sho
831
+ ::s ちゃ ::t cha
832
+ ::s ちゅ ::t chu
833
+ ::s ちょ ::t cho
834
+ ::s じゃ ::t ja
835
+ ::s じゅ ::t ju
836
+ ::s じょ ::t jo
837
+ ::s ぢゃ ::t ja
838
+ ::s ぢゅ ::t ju
839
+ ::s ぢょ ::t jo
840
+ ::s 𛅐 ::t i ::comment HIRAGANA LETTER SMALL WI
841
+ ::s 𛅑 ::t e ::comment HIRAGANA LETTER SMALL WE
842
+ ::s 𛅒 ::t o ::comment HIRAGANA LETTER SMALL WO
843
+ ::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
844
+ ::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
845
+
846
+ ::s フ ::t fu ::t-alt f
847
+ ::s キ ::t ki ::t-alt k
848
+ ::s ク ::t ku ::t-alt k
849
+ ::s ラ ::t ra ::t-alt la
850
+ ::s リ ::t ri ::t-alt li
851
+ ::s ル ::t ru ::t-alt lu, l, r
852
+ ::s レ ::t re ::t-alt le
853
+ ::s ロ ::t ro ::t-alt lo
854
+ ::s ム ::t mu ::t-alt m ::example キム = Kim
855
+ ::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
856
+ ::s ス ::t su ::t-alt s
857
+ ::s ト ::t to ::t-alt t
858
+ ::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
859
+
860
+ ::s ㋿ ::t Reiwa ::comment SQUARE ERA NAME REIWA
861
+
862
+ # Chinese
863
+ ::s 邦 ::t bang ::t-alt bon, bum, bun, pon
864
+ ::s 鲍 ::t bao ::t-alt bow
865
+ ::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
866
+ ::s 贝 ::t bei ::t-alt ber
867
+ ::s 本 ::t ben ::t-alt bern, bon, bourn, burn
868
+ ::s 彼得 ::t bide ::t-alt peter, pet
869
+ ::s 伯 ::t bo ::t-alt ber
870
+ ::s 波 ::t bo ::t-alt po
871
+ ::s 布 ::t bu ::t-alt b
872
+ ::s 策 ::t ce ::t-alt tze, tzer
873
+ ::s 曾 ::t ceng ::t-alt tzen, zen
874
+ ::s 彻 ::t che ::t-alt tche
875
+ ::s 茨 ::t ci ::t-alt ts, tz, z
876
+ ::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
877
+ ::s 蒂 ::t di ::t-alt ti, tti
878
+ ::s 丁 ::t ding ::t-alt din, tin
879
+ ::s 顿 ::t dun ::t-alt ton
880
+ ::s 多 ::t duo ::t-alt do, dor, to
881
+ ::s 尔 ::t er ::t-alt l, le, ll, r
882
+ ::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
883
+ ::s 夫 ::t fu ::t-alt f, v, v
884
+ ::s 福 ::t fu ::t-alt faw, for, ford
885
+ ::s 哥 ::t ge ::t-alt go, co
886
+ ::s 戈 ::t ge ::t-alt go
887
+ ::s 各 ::t ge ::t-alt go, co
888
+ ::s 赫 ::t he ::t-alt ch, che, cher, ge
889
+ ::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
890
+ ::s 怀 ::t huai ::t-alt whi, wi, wy
891
+ ::s 惠 ::t hui ::t-alt wha, whea
892
+ ::s 基 ::t ji ::t-alt ki, chi
893
+ ::s 吉 ::t ji ::t-alt gi, gui
894
+ ::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
895
+ ::s 杰 ::t jie ::t-alt ger
896
+ ::s 金 ::t jin ::t-alt kin, gin
897
+ ::s 斤 ::t jin ::t-alt zin
898
+ ::s 康 ::t kang ::t-alt con, corn
899
+ ::s 考 ::t kao ::t-alt cow, cour
900
+ ::s 克 ::t ke ::t-alt k, che, cher
901
+ ::s 科 ::t ke ::t-alt ko
902
+ ::s 拉 ::t la ::t-alt ra ::example Tirana
903
+ ::s 朗 ::t lang ::t-alt lon, ron
904
+ ::s 赖 ::t lai ::t-alt ri
905
+ ::s 劳 ::t lao ::t-alt low
906
+ ::s 勒 ::t lei ::t-alt ler
907
+ ::s 伦 ::t lun ::t-alt lon, ran, ron
908
+ ::s 里 ::t li ::t-alt ri
909
+ ::s 利 ::t li ::t-alt ri ::example Ferrari
910
+ ::s 隆 ::t long ::t-alt lon, lum, lund
911
+ ::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
912
+ ::s 洛 ::t luo ::t-alt lo, low, ro
913
+ ::s 默 ::t mo ::t-alt mer
914
+ ::s 纳 ::t na ::t-alt ne, ner
915
+ ::s 珀 ::t po ::t-alt per
916
+ ::s 奇 ::t qi ::t-alt chi, dge, ge, tch
917
+ ::s 齐 ::t qi ::t-alt tsi, zi
918
+ ::s 乔 ::t qiao ::t-alt jo
919
+ ::s 青 ::t qing ::t-alt tsing
920
+ ::s 琼 ::t qiong ::t-alt jon, jum, jun
921
+ ::s 瑟 ::t se ::t-alt the
922
+ ::s 什 ::t shen ::t-alt sh
923
+ ::s 圣 ::t sheng ::t-alt san, sao, saint
924
+ ::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
925
+ ::s 索 ::t suo ::t-alt tho
926
+ ::s 特 ::t te ::t-alt t
927
+ ::s 翁 ::t weng ::t-alt on
928
+ ::s 沃 ::t wo ::t-alt ver, vo, war, wer
929
+ ::s 乌 ::t wu ::t-alt ou, u
930
+ ::s 希 ::t xi ::t-alt chi, hi, shi
931
+ ::s 西 ::t xi ::t-alt s, si
932
+ ::s 锡 ::t xi ::t-alt ci, si, thi, zi
933
+ ::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
934
+ ::s 香 ::t xiang ::t-alt chan, cham
935
+ ::s 歇 ::t xie ::t-alt she
936
+ ::s 谢 ::t xie ::t-alt che, she
937
+ ::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
938
+ ::s 欣 ::t xin ::t-alt hin, shin
939
+ ::s 休 ::t xiu ::t-alt hu, hue
940
+ ::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
941
+ ::s 许 ::t xu ::t-alt hue, schue
942
+ ::s 逊 ::t xun ::t-alt son
943
+ ::s 耶 ::t ye ::t-alt yer, ier
944
+ ::s 泽 ::t ze ::t-alt ser
945
+ ::s 扎 ::t zha ::t-alt za
946
+ ::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
947
+ ::s 治 ::t zhi ::t-alt ge ::example George
948
+
949
+ ## Numbers
950
+ # Chinese and Japanese numbers
951
+ ::s 零 ::num 0
952
+ ::s 〇 ::num 0
953
+ ::s 一 ::num 1
954
+ ::s 二 ::num 2
955
+ ::s 三 ::num 3
956
+ ::s 四 ::num 4
957
+ ::s 五 ::num 5
958
+ ::s 六 ::num 6
959
+ ::s 七 ::num 7
960
+ ::s 八 ::num 8
961
+ ::s 九 ::num 9
962
+ ::s 十 ::num 10
963
+ ::s 百 ::num 100
964
+ ::s 千 ::num 1000
965
+ ::s 万 ::num 10000
966
+ ::s 萬 ::num 10000
967
+ ::s 亿 ::num 100000000
968
+ ::s 億 ::num 100000000
969
+ ::s 兆 ::num 1000000000000
970
+ ::s 京 ::num 10000000000000000
971
+
972
+ # numbers in non-number words (to be extended)
973
+ ::s 一贯 ::t yiguan ::comment consistent
974
+
975
+ ::s 红十字会 ::t hongshizihui ::comment Red Cross
976
+
977
+ ::s 百度 ::t baidu ::comment Baidu (company)
978
+ ::s 百分 ::t baifen ::comment percent
979
+ ::s 百合 ::t baihe ::comment lily
980
+ ::s 百货 ::t baihuo ::comment general merchandise
981
+ ::s 百科 ::t baike ::comment encyclopedia
982
+ ::s 百老汇 ::t bailaohui
983
+ ::s 百灵 ::t bailing
984
+ ::s 百慕大 ::t baimuda
985
+ ::s 百日咳 ::t bairike
986
+ ::s 百色市 ::t baiseshi
987
+ ::s 百事可乐 ::t baishikele ::comment Pepsi Cola
988
+ ::s 百無 ::t baiwu
989
+ ::s 百香 ::t baixiang
990
+ ::s 百姓 ::t baixing
991
+ ::s 百叶 ::t baiye
992
+ ::s 百色 ::t bose
993
+ ::s 杨百翰 ::t yangbaihan ::comment Brigham Young
994
+
995
+ ::s 北京 ::t beijing
996
+ ::s 京都 ::t jingdu
997
+ ::s 东京 ::t dongjing
998
+ ::s 京胡 ::t jinghu
999
+ ::s 南京 ::t nanjing
1000
+ ::s 普京 ::t pujing ::comment Putin
1001
+ ::s 東京 ::t dongjing ::comment Tokyo
1002
+ ::s 京兆 ::t jingzhao
1003
+
1004
+ ::s ㎢ ::t km²
1005
+ ::s ㎥ ::t m³
1006
+ ::s ㎝ ::t cm
1007
+
1008
+ ## Indian
1009
+ # see mostly under UnicodeDataOverwrite.txt
1010
+
1011
+ # Malayalam
1012
+ ::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
1013
+
1014
+ # Tamil
1015
+ ::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
1016
+ ::s ஃப ::t f ::comment h+p=f
1017
+ ::s ஃஜ ::t z ::comment h+j=z
1018
+
1019
+ # Myanmar/Burmese
1020
+ # ::s ့ ::t ::comment dot below, denotes creaky tone
1021
+ # ::s း ::t ::comment visarga, denotes high tone
1022
+ ::s ၌ ::t -nai ::comment locative
1023
+ ::s ၍ ::t -jwe ::comment completed
1024
+ ::s ၎ ::t legau ::comment aforementioned
1025
+ ::s ၏ ::t -i ::comment genitive
1026
+
1027
+ # Lao
1028
+ ::s ັ ::t a ::comment vowel sign mai kan
1029
+ ::s ົ ::t o ::comment vowel sign mai kon
1030
+ ::s ູ ::t uu ::comment vowel sign uu
1031
+ ::s ຽ ::t y ::comment semivowel sign nyo
1032
+ ::s ຼ ::t l ::comment semivowel sign lo
1033
+ ::s ລ ::t l ::comment lo loot
1034
+ ::s ຣ ::t l ::comment lo ling
1035
+ ::s ໝ ::t m ::comment ho mo
1036
+ ::s ໜ ::t n ::comment ho no
1037
+ ::s ຢ ::t y ::comment yo
1038
+ ::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
1039
+ ::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
1040
+ ::s ຯ ::t ... ::comment Lao ellipsis
1041
+
1042
+ # Thai
1043
+ ::s ออ ::t o
1044
+ ::s อั ::t a
1045
+ ::s อิ ::t i
1046
+ ::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
1047
+
1048
+ # Khmer
1049
+ ::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
1050
+ ::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
1051
+ ::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
1052
+ ::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
1053
+ ::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
1054
+ ::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
1055
+
1056
+ ## Semitic languages
1057
+ # Arabic
1058
+ ::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
1059
+ ::s ء ::t ' ::comment hamza
1060
+ ::s ٔ ::t ' ::comment hamza above
1061
+ ::s ٕ ::t ' ::comment hamza below
1062
+ ::s ع ::t ' ::comment ain
1063
+ ::s آ ::t a ::comment alef madda
1064
+ ::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
1065
+ ::s إ ::t i ::comment alef with hamza below
1066
+ ::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
1067
+ ::s ة ::t a ::comment teh marbuta
1068
+ ::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
1069
+ ::s ي ::t y ::comment Arabic yeh
1070
+ ::s ى ::t a ::comment alef maksura
1071
+ ::s ﻯ ::t a ::comment alef maksura isolated form
1072
+ ::s ﻰ ::t a ::comment alef maksura final form
1073
+ ::s ﯨ ::t a ::comment Uighur Kazakh Kirghiz alef maksura initial form
1073
+ ::s ﯩ ::t a ::comment Uighur Kazakh Kirghiz alef maksura medial form
1075
+ ::s ٰ ::t a ::comment Arabic letter superscript alef
1076
+ ::s ـ ::t ::comment tatweel (filler)
1077
+ ::s َ ::t a ::comment fatha ("-a")
1078
+ ::s ُ ::t u ::comment damma ("-u")
1079
+ ::s ِ ::t i ::comment kasra ("-i")
1080
+ ::s ْ ::t ::comment sukun (no vowel)
1081
+ ::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
1082
+ ::s ً ::t ::comment fathatan ("-an")
1083
+ ::s اً ::t an ::comment alef + fathatan
1084
+ ::s ٌ ::t ::comment dammatan ("-un")
1085
+ ::s ٍ ::t ::comment kasratan ("-in")
1086
+ ::s ّ ::t ::comment shadda (consonant doubler)
1087
+ ::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
1088
+ ::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
1089
+ ::s ۾ ::t men ::comment Sindhi postposition men
1090
+ ::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
1091
+ ::s ﷴ ::t mohammad ::comment "Mohammad"
1092
+ ::s ﷸ ::t wasallam ::comment "and peace"
1093
+ ::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
1094
+
1095
+ ::s ࣓ ::t waw ::comment ARABIC SMALL LOW WAW
1096
+ ::s ࣔ ::t al-rub ::comment ARABIC SMALL HIGH WORD AR-RUB
1097
+ ::s ࣕ ::t s ::comment ARABIC SMALL HIGH SAD
1098
+ ::s ࣖ ::t ' ::comment ARABIC SMALL HIGH AIN
1099
+ ::s ࣗ ::t q ::comment ARABIC SMALL HIGH QAF
1100
+ ::s ࣘ ::t n ::comment ARABIC SMALL HIGH NOON WITH KASRA
1101
+ ::s ࣙ ::t n ::comment ARABIC SMALL LOW NOON WITH KASRA
1102
+ ::s ࣚ ::t al-thalatha ::comment ARABIC SMALL HIGH WORD ATH-THALATHA
1103
+ ::s ࣛ ::t al-sajda ::comment ARABIC SMALL HIGH WORD AS-SAJDA
1104
+ ::s ࣜ ::t al-nisf ::comment ARABIC SMALL HIGH WORD AN-NISF
1105
+ ::s ࣝ ::t sakta ::comment ARABIC SMALL HIGH WORD SAKTA
1106
+ ::s ࣞ ::t qif ::comment ARABIC SMALL HIGH WORD QIF
1107
+ ::s ࣟ ::t waqfa ::comment ARABIC SMALL HIGH WORD WAQFA
1108
+ ::s ࣠ ::t ::comment ARABIC SMALL HIGH FOOTNOTE MARKER (CHECK)
1109
+ ::s ࣡ ::t ::comment ARABIC SMALL HIGH SIGN SAFHA (CHECK)
1110
+ ::s ࣢ ::t ::comment ARABIC DISPUTED END OF AYAH (CHECK)
1111
+
1112
+ # Farsi
1113
+ ::s ی ::t i ::t-alt y ::comment Contributed by Nima
1114
+ ::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
1115
+ ::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
1116
+ ::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
1117
+ ::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
1118
+ ::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
1119
+ ::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
1120
+ ::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
1121
+ ::s عا ::t a ::lcode fas ::comment Contributed by Nima
1122
+ ::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
1123
+ ::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
1124
+ ::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
1125
+ ::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
1126
+ ::s ‌ ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
1127
+ ::s غ ::t gh ::t-alt g ::lcode fas
1128
+ ::s آئی ::t ai ::t-alt ae ::lcode fas
1129
+ ::s ائی ::t ai ::t-alt ae ::lcode fas
1130
+ ::s آئو ::t au ::t-alt ao ::lcode fas
1131
+ ::s ائو ::t au ::t-alt ao ::lcode fas
1132
+
1133
+ # Kashmiri (so far: educated guesses)
1134
+ ::s ٖ ::t a ::comment Arabic subscript alef U+0656
1135
+ ::s ٗ ::t u ::comment Arabic inverted damma U+0657
1136
+ ::s ۚ ::t j ::comment Arabic small high jeem U+06DA
1137
+ ::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
1138
+ ::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
1139
+
1140
+ # Pashto
1141
+ ::s ٙ ::t e ::comment Arabic zwarakay
1142
+ ::s ځ ::t z ::t-alt dz ::comment Pashto letter zim; Arabic letter "hah with hamza above"
1143
+ ::s څ ::t ts ::t-alt c ::comment Pashto letter tsim; Arabic letter "h with three dots above"
1144
+ ::s ګ ::t g ::comment Pashto letter gaf; Arabic letter "kaf with ring"
1145
+ ::s ڼ ::t n ::comment Arabic letter "noon with ring"
1146
+ ::s ږ ::t g ::t-alt z, zh, j ::comment pronunciation varies regionally
1147
+ ::s ښ ::t kh ::t-alt sh ::comment pronunciation varies regionally
1148
+ ::s ه ::t h ::t-alt a ::lcode pus
1149
+ ::s ۀ ::t e ::lcode pus ::comment Arabic letter "heh with yeh above"
1150
+ ::s و ::t w ::t-alt o, u ::lcode pus
1151
+ ::s ی ::t ay ::t-alt y ::lcode pus
1152
+ ::s وی ::t wy ::t-alt oy, uy ::lcode pus
1153
+ ::s ای ::t ay ::lcode pus
1154
+ ::s ۍ ::t ay ::lcode pus
1155
+ ::s ئ ::t ay ::t-alt y ::lcode pus
1156
+ ::s ژ ::t zh ::t-alt z ::lcode pus ::comment [ʒ]
1157
+ ::s ض ::t z ::t-alt d ::lcode pus
1158
+ ::s ث ::t s ::lcode pus ::t-alt th ::comment Arabic letter theh (unvoiced th/θ)
1159
+ ::s ذ ::t z ::lcode pus ::t-alt th ::comment Arabic letter thal (voiced th/ð)
1160
+
1161
+ # Hebrew
1162
+ ::s ב ::t v ::comment Hebrew letter bet ::t-alt b
1163
+ ::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
1164
+ ::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
1165
+ ::s פ ::t f ::comment Hebrew letter pe ::t-alt p
1166
+ ::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
1167
+ ::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
1168
+ ::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
1169
+ ::s ק ::t q ::t-alt k ::use-alt-in-pointed
1170
+ ::s וֹ ::t o
1171
+ ::s וּ ::t u
1172
+ ::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
1173
+ ::s י ::t y
1174
+ ::s יּ ::t y
1175
+ ::s יָּ ::t ya
1176
+ ::s ײ ::t yy ::comment Hebrew ligature Yiddish double Yod (CHECK)
1177
+ ::s ׯ ::t yyy ::comment HEBREW YOD TRIANGLE (CHECK)
1178
+ ::s ע ::t '
1179
+ ::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
1180
+ ::s ֵי ::t e
1181
+ ::s ִיּ ::t iy
1182
+ ::s ִיָּ ::t iya
1183
+ ::s ױ ::t oy
1184
+ ::s א ::t a ::t-alt '
1185
+ ::s אָ ::t a
1186
+ ::s ֹא ::t o
1187
+ ::s אַ ::t 'a
1188
+ ::s אֲ ::t 'a
1189
+ ::s אֶ ::t e
1190
+ ::s אֱ ::t e
1191
+ ::s פ ::t f
1192
+ ::s פּ ::t p
1193
+ ::s פַּ ::t pa
1194
+ ::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
1195
+ ::s שׁ ::t sh
1196
+ ::s שָׁ ::t sha
1197
+ ::s שָּׁ ::t sha ::comment ?
1198
+ ::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
1199
+ ::s שֶׁ ::t she
1200
+ ::s שִׁ ::t shi
1201
+ ::s שֻׁ ::t shu
1202
+ ::s שׂ ::t s
1203
+ ::s שָׂ ::t sa
1204
+ ::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
1205
+ ::s כּ ::t k
1206
+ ::s כֶּ ::t ke
1207
+ ::s כֹּ ::t ko
1208
+ ::s בּ ::t b
1209
+ ::s בַּ ::t ba
1210
+ ::s בָּ ::t ba
1211
+ ::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
1212
+ ::s בֶּ ::t be
1213
+ ::s תּ ::t t
1214
+ ::s תַּ ::t ta
1215
+ ::s תֵּ ::t te
1216
+ ::s תִּ ::t ti
1217
+ ::s דָּ ::t da
1218
+ ::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
1219
+ ::s גּ ::t g
1220
+ ::s לֵּ ::t le
1221
+ ::s ד׳ ::t dh
1222
+ ::s ג׳ ::t j
1223
+ ::s ת׳ ::t th
1224
+ ::s ז׳ ::t zh
1225
+ ::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
1226
+ ::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
1227
+ ::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
1228
+ ::s ַ ::t a ::comment Hebrew point patah
1229
+ ::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
1230
+ ::s ֳ ::t o ::comment Hebrew point hataf qamats
1231
+ ::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
1232
+ ::s ֶ ::t e ::comment Hebrew point segol
1233
+ ::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
1234
+ ::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
1235
+ ::s ֵ ::t e ::comment Hebrew point tsere
1236
+ ::s ִ ::t i ::comment Hebrew point hiriq
1237
+ ::s ֹ ::t o ::comment Hebrew point holam
1238
+ ::s ֻ ::t u ::comment Hebrew point qubuts
1239
+ # ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
1240
+
1241
+ # Yiddish
1242
+ ::s א ::t a ::lcode yid ::comment called "silent" alef
1243
+ ::s אי ::t y ::lcode yid
1244
+ ::s איי ::t ey ::lcode yid
1245
+ ::s או ::t u ::lcode yid
1246
+ ::s אוי ::t oy ::lcode yid
1247
+ ::s אַ ::t a ::lcode yid
1248
+ ::s אָ ::t o ::lcode yid
1249
+ ::s ב ::t b ::lcode yid
1250
+ ::s בֿ ::t v ::lcode yid
1251
+ ::s דזש ::t dzh ::lcode yid
1252
+ ::s ו ::t u ::lcode yid
1253
+ ::s וּ ::t u ::lcode yid
1254
+ ::s וֹ ::t o ::lcode yid
1255
+ ::s װ ::t v ::lcode yid
1256
+ ::s ווא ::t wa ::lcode yid
1257
+ ::s וואַ ::t wa ::lcode yid
1258
+ ::s ווע ::t we ::lcode yid
1259
+ ::s ווי ::t wi ::lcode yid
1260
+ ::s וואוי ::t wo ::lcode yid
1261
+ ::s וי ::t oy ::lcode yid
1262
+ ::s זש ::t zh ::lcode yid
1263
+ ::s ח ::t ch ::lcode yid
1264
+ ::s טש ::t tsh ::lcode yid
1265
+ ::s יִ ::t i ::lcode yid
1266
+ ::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
1267
+ ::s ײַ ::t ay ::lcode yid
1268
+ ::s כּ ::t k ::lcode yid
1269
+ ::s כ ::t ch ::lcode yid
1270
+ ::s ך ::t ch ::lcode yid
1271
+ ::s ע ::t e ::lcode yid
1272
+ ::s פּ ::t p ::lcode yid
1273
+ ::s פֿ ::t f ::lcode yid
1274
+ ::s ף ::t f ::lcode yid ::comment sometimes p
1275
+ ::s ק ::t k ::lcode yid
1276
+ ::s ת ::t s ::lcode yid
1277
+
1278
+ # Syriac/Aramaic (should be vetted by expert)
1279
+ ::s ܰ ::t a ::comment Syriac pthaha above
1280
+ ::s ܲ ::t a ::comment Syriac pthaha dotted
1281
+ ::s ܳ ::t aa ::comment Syriac zqapha above
1282
+ ::s ܴ ::t aa ::comment Syriac zqapha below
1283
+ ::s ܵ ::t aa ::comment Syriac zqapha dotted
1284
+ ::s ܶ ::t e ::comment Syriac rbasa above
1285
+ ::s ܷ ::t e ::comment Syriac rbasa below
1286
+ ::s ܿ ::t o ::comment Syriac rwaha
1287
+ ::s ܸ ::t e ::comment Syriac dotted zlama horizontal
1288
+ ::s ܹ ::t e ::comment Syriac dotted zlama angular
1289
+ ::s ܺ ::t i ::comment Syriac hbasa above
1290
+ ::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
1291
+ ::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
1292
+ ::s ܽ ::t o ::comment Syriac esasa above
1293
+ ::s ܾ ::t u ::comment Syriac esasa below
1294
+ ::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
1295
+
1296
+ ::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
1297
+ ::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
1298
+ ::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
1299
+ ::s ܒ̥ ::t v ::comment Syriac beth + ring-below
1300
+ ::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
1301
+ ::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
1302
+ ::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
1303
+ ::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
1304
+ ::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
1305
+ ::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
1306
+ ::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
1307
+ ::s ܦ̥ ::t f ::comment Syriac pe + ring-below
1308
+ ::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
1309
+ ::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
1310
+ ::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
1311
+
1312
+ ::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications; colon skewed left
1313
+ ::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
1314
+ ::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
1315
+
1316
+ # Uzbek
1317
+ ::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
1318
+ ::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
1319
+
1320
+ # Uyghur
1321
+ ::s ئا ::t a ::lcode uig
1322
+ ::s ە ::t e ::lcode uig
1323
+ ::s ئې ::t e ::lcode uig ::latinplus ë
1324
+ ::s ې ::t e ::lcode uig ::latinplus ë
1325
+ ::s ئە ::t e ::lcode uig
1326
+ ::s يە ::t e ::lcode uig
1327
+ ::s ئى ::t i ::lcode uig
1328
+ ::s ى ::t i ::lcode uig
1329
+ ::s ئو ::t o ::lcode uig
1330
+ ::s و ::t o ::lcode uig
1331
+ ::s ئۇ ::t u ::lcode uig
1332
+ ::s ۇ ::t u ::lcode uig
1333
+ ::s چ ::t ch ::t-alt q ::lcode uig
1334
+ ::s خ ::t x ::lcode uig
1335
+ ::s ژ ::t zh ::lcode uig
1336
+ ::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
1337
+ ::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
1338
+ ::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
1339
+ ::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
1340
+ ::s ۋ ::t w ::lcode uig
1341
+
1342
+ # Maldivian
1343
+ ::s ް ::t ::comment thaana sukun
1344
+ ::s ަ ::t a ::comment thaana abafili
1345
+ ::s ާ ::t aa ::comment thaana aabaafili
1346
+ ::s ި ::t i ::comment thaana ibifili
1347
+ ::s ީ ::t ee ::comment thaana eebeefili
1348
+ ::s ު ::t u ::comment thaana ubufili
1349
+ ::s ޫ ::t oo ::comment thaana ooboofili
1350
+ ::s ެ ::t e ::comment thaana ebefili
1351
+ ::s ޭ ::t ey ::comment thaana eybeyfili
1352
+ ::s ޮ ::t o ::comment thaana obofili
1353
+ ::s ޯ ::t oa ::comment thaana oaboafili
1354
+
1355
+ # Canadian syllabics (Inuktitut)
1356
+ ::s ᑊ ::t p ::comment syllable final
1357
+ ::s ᐟ ::t t ::comment syllable final
1358
+ ::s ᐠ ::t k ::comment syllable final
1359
+ ::s ᐨ ::t c ::comment syllable final
1360
+ ::s ᒼ ::t m ::comment syllable final
1361
+ ::s ᐣ ::t n ::comment syllable final
1362
+ ::s ᐢ ::t s ::comment syllable final
1363
+ ::s ᐧ ::t y ::comment syllable final
1364
+ ::s ᐤ ::t w ::comment syllable final
1365
+ ::s ᐦ ::t h ::comment syllable final
1366
+ ::s ᕽ ::t hk ::comment syllable final
1367
+ ::s ᓫ ::t l ::comment syllable final
1368
+ ::s ᕑ ::t r ::comment syllable final
1369
+
1370
+ # Mongolian
1371
+ ::s ᢅ ::t ::comment MONGOLIAN LETTER ALI GALI BALUDA (CHECK) indicates assimilation
1372
+ ::s ᢆ ::t ::comment MONGOLIAN LETTER ALI GALI THREE BALUDA (CHECK) indicates assimilation
1373
+
1374
+ # Tibetan
1375
+ ::s ྅ ::t ::comment TIBETAN MARK PALUTA (CHECK) indicates assimilation
1376
+
1377
+ ## Punctuation
1378
+ # delete
1379
+ ::s ¿ ::t "" ::comment inverted question mark
1380
+ ::s ¡ ::t "" ::comment inverted exclamation mark
1381
+ # decompose double-punctuation
1382
+ ::s ‼ ::t !!
1383
+ ::s ⁇ ::t ??
1384
+ ::s ⁉ ::t !?
1385
+ ::s ⁈ ::t ?!
1386
+ # preserve
1387
+ ::s ′ ::t ′
1388
+ ::s ∩ ::t ∩
1389
+ ::s ‡ ::t ‡
1390
+ # Cyrillic
1391
+ ::s ⁙ ::t . ::comment five dot punctuation
1392
+ # Amharic/Ethiopian
1393
+ ::s ። ::t .
1394
+ ::s ፣ ::t ,
1395
+ ::s ፤ ::t ;
1396
+ ::s ፥ ::t :
1397
+ ::s ፧ ::t ? ::comment Ethiopic question mark
1398
+ ::s ፡ ::t " " ::comment Ethiopic wordspace
1399
+ ::s ፦ ::t : ::comment Ethiopic preface colon
1400
+ # Ethiopic wordspace often appropriated for other purposes:
1401
+ ::s ፡፡ ::t .
1402
+ ::s ፡- ::t :
1403
+ ::s "፡ " ::t ", "
1404
+ ::s ቸ ::t cha ::comment Ethiopic syllable ca
1405
+ ::s ቹ ::t chu ::comment Ethiopic syllable cu
1406
+ ::s ቺ ::t chi ::comment Ethiopic syllable ci
1407
+ ::s ቻ ::t chaa ::comment Ethiopic syllable caa
1408
+ ::s ቼ ::t chee ::comment Ethiopic syllable cee
1409
+ ::s ች ::t che ::comment Ethiopic syllable ce
1410
+ ::s ቾ ::t cho ::comment Ethiopic syllable co
1411
+ ::s ሠ ::t sa ::comment Ethiopic syllable sza
1412
+ ::s ሡ ::t su ::comment Ethiopic syllable szu
1413
+ ::s ሢ ::t si ::comment Ethiopic syllable szi
1414
+ ::s ሣ ::t saa ::comment Ethiopic syllable szaa
1415
+ ::s ሤ ::t see ::comment Ethiopic syllable szee
1416
+ ::s ሥ ::t se ::comment Ethiopic syllable sze
1417
+ ::s ሦ ::t so ::comment Ethiopic syllable szo
1418
+ ::s ጠ ::t te ::comment Ethiopic syllable the with ejective 't'
1419
+ ::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
1420
+ ::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
1421
+ ::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
1422
+ ::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
1423
+ ::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
1424
+ ::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
1425
+
1426
+ # Devanagari (Hindi etc.)
1427
+ ::s । ::t . ::comment danda
1428
+ ::s ॥ ::t . ::comment double danda
1429
+ ::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
1430
+ ::s ॰ ::t . ::comment Devanagari abbreviation sign
1431
+ # Bengali
1432
+ ::s ৽ ::t . ::comment BENGALI ABBREVIATION SIGN
1433
+ ::s ৾ ::t ::comment BENGALI SANDHI MARK (CHECK)
1434
+ # Gurmukhi
1435
+ ::s ੶ ::t . ::comment GURMUKHI ABBREVIATION SIGN
1436
+ # Oriya/Odia (India)
1437
+ ::s ୤ ::t . ::comment danda (deprecated, should use Devanagari danda ।)
1438
+ ::s ୥ ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
1439
+ # Tibetan
1440
+ ::s ། ::t ,
1441
+ ::s །: ::t :
1442
+ ::s ༏ ::t ;
1443
+ ::s ༎ ::t .
1444
+ ::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
1445
+ ::s ༼ ::t ( ::comment Tibetan open roof punctuation
1446
+ ::s ༽ ::t ) ::comment Tibetan close roof punctuation
1447
+ ::s ༈ ::t "" ::comment Tibetan mark srbul shad
1448
+ ::s 【 ::t [ ::comment left black lenticular bracket
1449
+ ::s 】 ::t ] ::comment right black lenticular bracket
1450
+ ::s ༄ ::t "" ::comment Tibetan head mark
1451
+ ::s ༄༅ ::t "" ::comment Tibetan head mark
1452
+ ::s ༆ ::t "" ::comment Tibetan head mark
1453
+ # Myanmar/Burmese
1454
+ ::s ၊ ::t ,
1455
+ ::s ။ ::t .
1456
+ # Khmer
1457
+ ::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
1458
+ ::s ។ ::t . ::comment Khmer sign khan
1459
+ # Arabic
1460
+ ::s ، ::t ,
1461
+ ::s ؛ ::t ;
1462
+ ::s ٬ ::t ,
1463
+ ::s ۔ ::t .
1464
+ ::s ؟ ::t ?
1465
+ ::s ٪ ::t %
1466
+ ::s ٫ ::t , ::comment Arabic decimal separator
1467
+ ::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
1468
+ # Aramaic
1469
+ ::s ܀ ::t .
1470
+ ::s ܂ ::t .
1471
+ # Hebrew
1472
+ ::s ־ ::t - ::comment maqaf
1473
+ # Armenian
1474
+ ::s ։ ::t .
1475
+ ::s ՝ ::t , ::comment Armenian comma
1476
+ # Chinese
1477
+ ::s , ::t ", "
1478
+ ::s 、 ::t ", "
1479
+ ::s 。 ::t ". "
1480
+ ::s ! ::t "! "
1481
+ ::s ? ::t "? "
1482
+ ::s 「 ::t ' "'
1483
+ ::s 」 ::t '" '
1484
+ ::s 《 ::t ' "'
1485
+ ::s 》 ::t '" '
1486
+ ::s ( ::t " ("
1487
+ ::s ) ::t ") "
1488
+ ::s ; ::t ;
1489
+ ::s : ::t ": "
1490
+ ::s ︰ ::t ": "
1491
+ ::s - ::t -
1492
+ ::s / ::t /
1493
+ ::s = ::t =
1494
+ ::s ~ ::t ~
1495
+ ::s & ::t &
1496
+ ::s < ::t <
1497
+ ::s > ::t >
1498
+ ::s % ::t %
1499
+ ::s _ ::t _ ::comment FULLWIDTH LOW LINE (U+FF3F)
1500
+ ::s { ::t { ::comment FULLWIDTH LEFT CURLY BRACKET (U+FF5B)
1501
+ ::s } ::t } ::comment FULLWIDTH RIGHT CURLY BRACKET (U+FF5D)
1502
+ ::s   ::t " " ::comment ideographic space
1503
+ # Japanese
1504
+ ::s 『 ::t ' "'
1505
+ ::s 』 ::t '" '
1506
+ ::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
1507
+ # N'ko
1508
+ ::s ߽ ::t . ::comment NKO DANTAYALAN used to abbreviate units of measure
1509
+ # Medefaidrin
1510
+ ::s 𖺗 ::t , ::comment MEDEFAIDRIN COMMA
1511
+ ::s 𖺘 ::t . ::comment MEDEFAIDRIN FULL STOP
1512
+ # Khitan
1513
+ ::s 𖿤 ::t ::comment KHITAN SMALL SCRIPT FILLER
1514
+
1515
+ # Symbols
1516
+ ::s ∞ ::t ∞ ::comment infinity
1517
+ ::s ­ ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
1518
+ ::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
1519
+ ::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
1520
+ ::s ﹐ ::t , ::comment small comma; map to regular comma
1521
+ ::s ˚ ::t ° ::comment ring above; map to degree sign
1522
+ ::s ⇒ ::t ⇒ ::comment rightwards double arrow
1523
+ ::s † ::t † ::comment dagger
1524
+ ::s • ::t • ::comment bullet
1525
+ ::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
1526
+ ::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
1527
+ ::s ― ::t ― ::comment horizontal bar
1528
+ ::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
1529
+ ::s ″ ::t ″ ::comment double prime
1530
+ ::s ﴾ ::t ( ::comment ornate left parenthesis
1531
+ ::s ﴿ ::t ) ::comment ornate right parenthesis
1532
+ ::s 〔 ::t [ ::comment left tortoise shell bracket
1533
+ ::s 〕 ::t ] ::comment right tortoise shell bracket
1534
+ ::s ﹝ ::t ( ::comment small left tortoise shell bracket
1535
+ ::s ﹞ ::t ) ::comment small right tortoise shell bracket
1536
+ ::s ¦ ::t ¦ ::comment BROKEN BAR (U+00A6)
1537
+ ::s ¨ ::t ::comment DIAERESIS (U+00A8)
1538
+ ::s ¯ ::t ::comment MACRON (U+00AF)
1539
+ ::s ¸ ::t ::comment CEDILLA (U+00B8)
1540
+ ::s Ƿ ::t W ::comment LATIN CAPITAL LETTER WYNN (U+01F7)
1541
+ ::s ˘ ::t ::comment BREVE (U+02D8)
1542
+ ::s ˛ ::t ::comment OGONEK (U+02DB)
1543
+ ::s ˜ ::t ~ ::comment SMALL TILDE (U+02DC)
1544
+ ::s ̒ ::t ::comment COMBINING TURNED COMMA ABOVE (U+0312)
1545
+ ::s ̔ ::t ::comment COMBINING REVERSED COMMA ABOVE (U+0314)
1546
+ ::s ̜ ::t ::comment COMBINING LEFT HALF RING BELOW (U+031C)
1547
+ ::s ̧ ::t ::comment COMBINING CEDILLA (U+0327)
1548
+ ::s ̫ ::t ::comment COMBINING INVERTED DOUBLE ARCH BELOW (U+032B)
1549
+ ::s ̲ ::t ::comment COMBINING LOW LINE (U+0332)
1550
+ ::s ̳ ::t ::comment COMBINING DOUBLE LOW LINE (U+0333)
1551
+ ::s ̹ ::t ::comment COMBINING RIGHT HALF RING BELOW (U+0339)
1552
+ ::s ̺ ::t ::comment COMBINING INVERTED BRIDGE BELOW (U+033A)
1553
+ ::s ̿ ::t ::comment COMBINING DOUBLE OVERLINE (U+033F)
1554
+ ::s ͅ ::t ::comment COMBINING GREEK YPOGEGRAMMENI (U+0345)
1555
+ ::s ͑ ::t ::comment COMBINING LEFT HALF RING ABOVE (U+0351)
1556
+ ::s ͗ ::t ::comment COMBINING RIGHT HALF RING ABOVE (U+0357)
1557
+ ::s ͚ ::t ::comment COMBINING DOUBLE RING BELOW (U+035A)
1558
+ ::s ͜ ::t ::comment COMBINING DOUBLE BREVE BELOW (U+035C)
1559
+ ::s ͝ ::t ::comment COMBINING DOUBLE BREVE (U+035D)
1560
+ ::s ͞ ::t ::comment COMBINING DOUBLE MACRON (U+035E)
1561
+ ::s ͟ ::t ::comment COMBINING DOUBLE MACRON BELOW (U+035F)
1562
+ ::s ͠ ::t ::comment COMBINING DOUBLE TILDE (U+0360)
1563
+
1564
+ ::s ‐ ::t - ::comment HYPHEN (U+2010)
1565
+ ::s ‗ ::t ‗ ::comment DOUBLE LOW LINE (U+2017)
1566
+ ::s ‵ ::t ‵ ::comment REVERSED PRIME (U+2035)
1567
+ ::s ‶ ::t ‶ ::comment REVERSED DOUBLE PRIME (U+2036)
1568
+ ::s ‸ ::t ‸ ::comment CARET (U+2038)
1569
+ ::s ‽ ::t ?! ::comment INTERROBANG (U+203D)
1570
+ ::s ‾ ::t ‾ ::comment OVERLINE (U+203E)
1571
+ ::s ‿ ::t ‿ ::comment UNDERTIE (U+203F)
1572
+ ::s ⁂ ::t ⁂ ::comment ASTERISM (U+2042)
1573
+ ::s ⁎ ::t * ::comment LOW ASTERISK (U+204E)
1574
+ ::s ⁏ ::t ; ::comment REVERSED SEMICOLON (U+204F)
1575
+ ::s ⁔ ::t ⁔ ::comment INVERTED UNDERTIE (U+2054)
1576
+ ::s ⁝ ::t ⁝ ::comment TRICOLON (U+205D)
1577
+ ::s   ::t " " ::comment MEDIUM MATHEMATICAL SPACE (U+205F)
1578
+ ::s ₋ ::t - ::comment SUBSCRIPT MINUS (U+208B)
1579
+ ::s ⃩ ::t ::comment COMBINING WIDE BRIDGE ABOVE (U+20E9)
1580
+
1581
+ ::s ﹔ ::t ; ::comment SMALL SEMICOLON (U+FE54)
1582
+ ::s ﹕ ::t : ::comment SMALL COLON (U+FE55)
1583
+ ::s ﹛ ::t { ::comment SMALL LEFT CURLY BRACKET (U+FE5B)
1584
+ ::s ﹜ ::t } ::comment SMALL RIGHT CURLY BRACKET (U+FE5C)
1585
+ ::s ﹠ ::t & ::comment SMALL AMPERSAND (U+FE60)
1586
+ ::s ﹡ ::t * ::comment SMALL ASTERISK (U+FE61)
1587
+ ::s ﹣ ::t - ::comment SMALL HYPHEN-MINUS (U+FE63)
1588
+
1589
+ ::s ℈ ::t ℈ ::comment SCRUPLE (U+2108)
1590
+ ::s ℟ ::t ℟ ::comment RESPONSE (U+211F)
1591
+ ::s ℣ ::t ℣ ::comment VERSICLE (U+2123)
1592
+ ::s ℽ ::t ℽ ::comment DOUBLE-STRUCK SMALL GAMMA (U+213D)
1593
+ ::s ℾ ::t ℾ ::comment DOUBLE-STRUCK CAPITAL GAMMA (U+213E)
1594
+ ::s ⅋ ::t ⅋ ::comment TURNED AMPERSAND (U+214B)
1595
+ ::s ⅍ ::t A/S ::comment AKTIESELSKAB (U+214D)
1596
+
1597
+ ::s ⑃ ::t ⑃ ::comment OCR INVERTED FORK (U+2443)
1598
+ ::s ⑊ ::t \\ ::comment OCR DOUBLE BACKSLASH (U+244A)
1599
+ ::s ⟮ ::t ( ::comment MATHEMATICAL LEFT FLATTENED PARENTHESIS (U+27EE)
1600
+ ::s ⟯ ::t ) ::comment MATHEMATICAL RIGHT FLATTENED PARENTHESIS (U+27EF)
1601
+ ::s ⸨ ::t (( ::comment LEFT DOUBLE PARENTHESIS (U+2E28)
1602
+ ::s ⸩ ::t )) ::comment RIGHT DOUBLE PARENTHESIS (U+2E29)
1603
+
1604
+ # kavyka indicates alternative reading
1605
+ ::s ᷶ ::t ::comment COMBINING KAVYKA ABOVE RIGHT (U+1DF6)
1606
+ ::s ᷷ ::t ::comment COMBINING KAVYKA ABOVE LEFT (U+1DF7)
1607
+ ::s ⹅ ::t ::comment INVERTED LOW KAVYKA (U+2E45)
1608
+ ::s ⹆ ::t ::comment INVERTED LOW KAVYKA WITH KAVYKA ABOVE (U+2E46)
1609
+ ::s ⹇ ::t ::comment LOW KAVYKA (U+2E47)
1610
+ ::s ⹈ ::t ::comment LOW KAVYKA WITH DOT (U+2E48)
1611
+ ::s ꙾ ::t ::comment CYRILLIC KAVYKA (U+A67E)
1612
+
1613
+ # Braille
1614
+ ::s ⠁ ::t a
1615
+ ::s ⠃ ::t b
1616
+ ::s ⠉ ::t c
1617
+ ::s ⠙ ::t d
1618
+ ::s ⠑ ::t e
1619
+ ::s ⠋ ::t f
1620
+ ::s ⠛ ::t g
1621
+ ::s ⠓ ::t h
1622
+ ::s ⠊ ::t i
1623
+ ::s ⠚ ::t j
1624
+ ::s ⠅ ::t k
1625
+ ::s ⠇ ::t l
1626
+ ::s ⠍ ::t m
1627
+ ::s ⠝ ::t n
1628
+ ::s ⠕ ::t o
1629
+ ::s ⠏ ::t p
1630
+ ::s ⠟ ::t q
1631
+ ::s ⠗ ::t r
1632
+ ::s ⠎ ::t s
1633
+ ::s ⠞ ::t t
1634
+ ::s ⠥ ::t u
1635
+ ::s ⠧ ::t v
1636
+ ::s ⠺ ::t w
1637
+ ::s ⠭ ::t x
1638
+ ::s ⠽ ::t y
1639
+ ::s ⠵ ::t z
1640
+
1641
+ ::s ⠜ ::t ae
1642
+ ::s ⠪ ::t oe
1643
+ ::s ⠳ ::t ue
1644
+ ::s ⠷ ::t a ::comment à
1645
+ ::s ⠡ ::t a ::comment â
1646
+ ::s ⠿ ::t e ::comment é
1647
+ ::s ⠮ ::t e ::comment è
1648
+ ::s ⠣ ::t e ::comment ê
1649
+ ::s ⠫ ::t e ::comment ë
1650
+ ::s ⠩ ::t i ::comment î
1651
+ ::s ⠻ ::t i ::comment ï
1652
+ ::s ⠹ ::t o ::comment ô
1653
+ ::s ⠾ ::t u ::comment ù
1654
+ ::s ⠱ ::t u ::comment û
1655
+
1656
+ ::s ⠡ ::t au ::lcode deu
1657
+ ::s ⠌ ::t aeu ::lcode deu
1658
+ ::s ⠹ ::t ch ::lcode deu
1659
+ ::s ⠩ ::t ei ::lcode deu
1660
+ ::s ⠣ ::t eu ::lcode deu
1661
+ ::s ⠬ ::t ie ::lcode deu
1662
+ ::s ⠱ ::t sch ::lcode deu
1663
+ ::s ⠮ ::t ss ::lcode deu
1664
+ ::s ⠾ ::t st ::lcode deu
1665
+
1666
+ ::s ⠠⠠ ::t "" ::comment start of word all-caps mode
1667
+ # ::s ⠠⠁ ::t A
1668
+ # ::s ⠠⠃ ::t B
1669
+ # ::s ⠠⠉ ::t C
1670
+ # ::s ⠠⠙ ::t D
1671
+ # ::s ⠠⠑ ::t E
1672
+ # ::s ⠠⠋ ::t F
1673
+ # ::s ⠠⠛ ::t G
1674
+ # ::s ⠠⠓ ::t H
1675
+ # ::s ⠠⠊ ::t I
1676
+ # ::s ⠠⠚ ::t J
1677
+ # ::s ⠠⠅ ::t K
1678
+ # ::s ⠠⠇ ::t L
1679
+ # ::s ⠠⠍ ::t M
1680
+ # ::s ⠠⠝ ::t N
1681
+ # ::s ⠠⠕ ::t O
1682
+ # ::s ⠠⠏ ::t P
1683
+ # ::s ⠠⠟ ::t Q
1684
+ # ::s ⠠⠗ ::t R
1685
+ # ::s ⠠⠎ ::t S
1686
+ # ::s ⠠⠞ ::t T
1687
+ # ::s ⠠⠥ ::t U
1688
+ # ::s ⠠⠧ ::t V
1689
+ # ::s ⠠⠺ ::t W
1690
+ # ::s ⠠⠭ ::t X
1691
+ # ::s ⠠⠽ ::t Y
1692
+ # ::s ⠠⠵ ::t Z
1693
+
1694
+ ::s ⠼⠁ ::t 1
1695
+ ::s ⠼⠃ ::t 2
1696
+ ::s ⠼⠉ ::t 3
1697
+ ::s ⠼⠙ ::t 4
1698
+ ::s ⠼⠑ ::t 5
1699
+ ::s ⠼⠋ ::t 6
1700
+ ::s ⠼⠛ ::t 7
1701
+ ::s ⠼⠓ ::t 8
1702
+ ::s ⠼⠊ ::t 9
1703
+ ::s ⠼⠚ ::t 0
1704
+
1705
+ ::s ⠂ ::t ,
1706
+ ::s ⠆ ::t ;
1707
+ ::s ⠒ ::t :
1708
+ ::s ⠲ ::t .
1709
+ ::s ⠦ ::t ?
1710
+ ::s ⠖ ::t !
1711
+ ::s ⠄ ::t '
1712
+ ::s ⠤ ::t -
1713
+ ::s ⠨⠤ ::t _
1714
+
1715
+ ::s ⠀ ::t " " ::comment blank
1716
+ # ::s ⠐ ::t " " ::comment blank in numeric mode
1717
+ ::s ⠈ ::t "" ::comment accent
1718
+ # ::s ⠌ ::t / ::comment in numeric mode only
1719
+ # ::s ⠐ ::comment abbreviation sign
1720
+ # ::s ⠘ ::comment abbreviation sign
1721
+ # ::s ⠠ ::comment capital indicator
1722
+ ::s ⠨ ::t . ::comment decimal point; emphasis
1723
+ ::s ⠰ ::t "" ::comment letter indicator
1724
+ # ::s ⠴ ::t ”
1725
+ # ::s ⠶ ::t ()
1726
+ # ::s ⠸ ::comment abbreviation sign
1727
+ ::s ⠼ ::t "" ::comment number indicator
1728
+ ::s ⠘⠚ ::t ° ::word-external-punctuation
1729
+ ::s ⠘⠚⠠⠉ ::t °C
1730
+ ::s ⠘⠚⠉ ::t °C
1731
+ ::s ⠘⠚⠠⠋ ::t °F
1732
+ ::s ⠘⠚⠋ ::t °F
1733
+
1734
+ ::s ⠠⠶ ::t " ::word-external-punctuation
1735
+ ::s ⠘⠦ ::t “ ::word-external-punctuation
1736
+ ::s ⠘⠴ ::t ” ::word-external-punctuation
1737
+ ::s ⠄⠦ ::t ‘
1738
+ ::s ⠄⠴ ::t ’
1739
+ ::s ⠠⠴ ::t ’
1740
+ ::s ⠐⠣ ::t ( ::word-external-punctuation
1741
+ ::s ⠐⠜ ::t ) ::word-external-punctuation
1742
+ ::s ⠨⠣ ::t [ ::word-external-punctuation
1743
+ ::s ⠨⠜ ::t ] ::word-external-punctuation
1744
+ ::s ⠸⠣ ::t { ::word-external-punctuation
1745
+ ::s ⠸⠜ ::t } ::word-external-punctuation
1746
+ ::s ⠈⠣ ::t < ::word-external-punctuation
1747
+ ::s ⠈⠜ ::t > ::word-external-punctuation
1748
+ ::s ⠸⠌ ::t / ::word-external-punctuation
1749
+ ::s ⠸⠡ ::t \ ::word-external-punctuation
1750
+ ::s ⠠⠤ ::t – ::word-external-punctuation
1751
+ ::s ⠐⠠⠤ ::t — ::word-external-punctuation
1752
+ ::s ⠈⠯ ::t & ::word-external-punctuation
1753
+ ::s ⠐⠔ ::t * ::word-external-punctuation
1754
+ ::s ⠨⠦ ::t ∩ ::word-external-punctuation
1755
+ ::s ⠨⠴ ::t % ::word-external-punctuation
1756
+ ::s ⠐⠖ ::t + ::word-external-punctuation
1757
+ ::s ⠐⠤ ::t − ::word-external-punctuation
1758
+ ::s ⠐⠶ ::t = ::word-external-punctuation
1759
+ ::s ⠈⠎ ::t $ ::word-external-punctuation
1760
+ ::s ⠈⠉ ::t ¢ ::word-external-punctuation
1761
+ ::s ⠈⠇ ::t £ ::word-external-punctuation
1762
+ ::s ⠈⠽ ::t ¥ ::word-external-punctuation
1763
+ ::s ⠈⠁ ::t @ ::word-external-punctuation
1764
+ ::s ⠸⠹ ::t # ::word-external-punctuation
1765
+ ::s ⠸⠲ ::t • ::word-external-punctuation
1766
+ ::s ⠈⠢ ::t ^ ::word-external-punctuation
1767
+ ::s ⠈⠔ ::t ~ ::word-external-punctuation
1768
+ ::s ⠘⠉ ::t © ::word-external-punctuation
1769
+ ::s ⠐⠌ ::t ÷ ::word-external-punctuation
1770
+ ::s ⠐⠦ ::t × ::word-external-punctuation
1771
+ ::s ⠈⠠⠹ ::t † ::word-external-punctuation
1772
+ ::s ⠈⠠⠻ ::t ‡ ::word-external-punctuation
1773
+ ::s ⠘⠏ ::t ¶ ::word-external-punctuation
1774
+ ::s ⠘⠎ ::t § ::word-external-punctuation
1775
+ ::s ⠘⠗ ::t ® ::word-external-punctuation
1776
+ ::s ⠘⠞ ::t ™ ::word-external-punctuation
1777
+
1778
+ # English Braille
1779
+ ::s ⠁⠃ ::t about ::lcode eng ::use-only-for-whole-word
1780
+ ::s ⠁⠃⠧ ::t above ::lcode eng ::use-only-for-whole-word
1781
+ ::s ⠁⠉ ::t according ::lcode eng ::use-only-for-whole-word
1782
+ ::s ⠁⠉⠗ ::t across ::lcode eng ::use-only-for-whole-word
1783
+ ::s ⠁⠋ ::t after ::lcode eng ::use-only-for-whole-word
1784
+ ::s ⠁⠋⠝ ::t afternoon ::lcode eng ::use-only-for-whole-word
1785
+ ::s ⠁⠋⠺ ::t afterward ::lcode eng ::use-only-for-whole-word
1786
+ ::s ⠁⠛ ::t again ::lcode eng ::use-only-for-whole-word
1787
+ ::s ⠁⠛⠌ ::t against ::lcode eng ::use-only-for-whole-word
1788
+ ::s ⠠⠽ ::t ally ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
1789
+ ::s ⠁⠇⠍ ::t almost ::lcode eng ::use-only-for-whole-word
1790
+ ::s ⠁⠇⠗ ::t already ::lcode eng ::use-only-for-whole-word
1791
+ ::s ⠁⠇ ::t also ::lcode eng ::use-only-for-whole-word
1792
+ ::s ⠁⠇⠹ ::t although ::lcode eng ::use-only-for-whole-word
1793
+ ::s ⠁⠇⠞ ::t altogether ::lcode eng ::use-only-for-whole-word
1794
+ ::s ⠁⠇⠺ ::t always ::lcode eng ::use-only-for-whole-word
1795
+ ::s ⠨⠑ ::t ance ::lcode eng
1796
+ ::s ⠯ ::t and ::lcode eng
1797
+ ::s ⠜ ::t ar ::lcode eng
1798
+ ::s ⠵ ::t as ::lcode eng ::use-only-for-whole-word
1799
+ ::s ⠠⠝ ::t ation ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
1800
+ ::s ⠃ ::t b ::lcode eng
1801
+ ::s ⠆ ::t bb ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1802
+ ::s ⠆ ::t be ::lcode eng ::use-only-at-start-of-word
1803
+ ::s ⠆⠉ ::t because ::lcode eng ::use-only-for-whole-word
1804
+ ::s ⠆⠋ ::t before ::lcode eng ::use-only-for-whole-word
1805
+ ::s ⠆⠓ ::t behind ::lcode eng ::use-only-for-whole-word
1806
+ ::s ⠆⠇ ::t below ::lcode eng ::use-only-for-whole-word
1807
+ ::s ⠆⠝ ::t beneath ::lcode eng ::use-only-for-whole-word
1808
+ ::s ⠆⠎ ::t beside ::lcode eng ::use-only-for-whole-word
1809
+ ::s ⠆⠞ ::t between ::lcode eng ::use-only-for-whole-word
1810
+ ::s ⠆⠽ ::t beyond ::lcode eng ::use-only-for-whole-word
1811
+ ::s ⠃⠇ ::t blind ::lcode eng ::use-only-for-whole-word
1812
+ ::s ⠃⠗⠇ ::t Braille ::lcode eng ::use-only-for-whole-word
1813
+ ::s ⠃ ::t but ::lcode eng ::use-only-for-whole-word
1814
+ ::s ⠉ ::t c ::lcode eng
1815
+ ::s ⠉ ::t can ::lcode eng ::use-only-for-whole-word
1816
+ ::s ⠸⠉ ::t cannot ::lcode eng
1817
+ ::s ⠒ ::t cc ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1818
+ ::s ⠉⠧ ::t ceive ::lcode eng ::use-only-at-end-of-word
1819
+ ::s ⠉⠧⠙ ::t ceived ::lcode eng ::use-only-at-end-of-word
1820
+ ::s ⠉⠧⠎ ::t ceives ::lcode eng ::use-only-at-end-of-word
1821
+ ::s ⠉⠧⠛ ::t ceiving ::lcode eng
1822
+ ::s ⠡ ::t ch ::lcode eng
1823
+ ::s ⠐⠡ ::t character ::lcode eng
1824
+ ::s ⠡ ::t child ::lcode eng ::use-only-for-whole-word
1825
+ ::s ⠡⠝ ::t children ::lcode eng ::use-only-for-whole-word
1826
+ ::s ⠒ ::t con ::lcode eng ::use-only-at-start-of-word
1827
+ ::s ⠒ ::t : ::lcode eng ::use-only-at-end-of-word
1828
+ ::s ⠉⠙ ::t could ::lcode eng ::use-only-for-whole-word
1829
+ ::s ⠙ ::t d ::lcode eng
1830
+ ::s ⠙ ::t do ::lcode eng ::use-only-for-whole-word
1831
+ ::s ⠐⠙ ::t day ::lcode eng
1832
+ # ::s ⠲ ::t dd ::t-alt . ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word ::comment abolished; interferes with period in abbreviations such as U.S.
1833
+ ::s ⠙⠉⠇ ::t declare ::lcode eng
1834
+ ::s ⠙⠉⠇⠛ ::t declaring ::lcode eng
1835
+ ::s ⠲ ::t dis ::lcode eng ::use-only-at-start-of-word
1836
+ ::s ⠲ ::t . ::lcode eng ::dont-use-at-start-of-word
1837
+ ::s ⠑ ::t e ::lcode eng
1838
+ ::s ⠂ ::t ea ::lcode eng ::dont-use-at-end-of-word
1839
+ ::s ⠂ ::t , ::lcode eng ::use-only-at-end-of-word
1840
+ ::s ⠫ ::t ed ::lcode eng
1841
+ ::s ⠑⠊ ::t either ::lcode eng ::use-only-for-whole-word
1842
+ ::s ⠢ ::t en ::lcode eng
1843
+ ::s ⠰⠑ ::t ence ::lcode eng ::dont-use-at-start-of-word
1844
+ ::s ⠢ ::t enough ::lcode eng ::use-only-for-whole-word
1845
+ ::s ⠻ ::t er ::lcode eng
1846
+ ::s ⠐⠑ ::t ever ::lcode eng
1847
+ ::s ⠑ ::t every ::lcode eng ::use-only-for-whole-word
1848
+ ::s ⠋ ::t f ::lcode eng
1849
+ ::s ⠐⠋ ::t father ::lcode eng
1850
+ ::s ⠖ ::t ff ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1851
+ ::s ⠋⠌ ::t first ::lcode eng
1852
+ ::s ⠿ ::t for ::lcode eng
1853
+ ::s ⠋⠗ ::t friend ::lcode eng ::use-only-for-whole-word
1854
+ ::s ⠋⠗⠎ ::t friends ::lcode eng ::use-only-for-whole-word
1855
+ ::s ⠋ ::t from ::lcode eng ::use-only-for-whole-word
1856
+ ::s ⠰⠇ ::t ful ::lcode eng ::dont-use-at-start-of-word
1857
+ ::s ⠛ ::t g ::lcode eng
1858
+ ::s ⠶ ::t gg ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1859
+ ::s ⠣ ::t gh ::lcode eng
1860
+ ::s ⠛ ::t go ::lcode eng ::use-only-for-whole-word
1861
+ ::s ⠛⠙ ::t good ::lcode eng ::use-only-at-start-of-word
1862
+ ::s ⠛⠗⠞ ::t great ::lcode eng
1863
+ ::s ⠓ ::t h ::lcode eng
1864
+ ::s ⠸⠓ ::t had ::lcode eng
1865
+ ::s ⠓ ::t have ::lcode eng ::use-only-for-whole-word
1866
+ ::s ⠐⠓ ::t here ::lcode eng
1867
+ ::s ⠓⠻⠋ ::t herself ::lcode eng ::use-only-for-whole-word
1868
+ ::s ⠓⠍ ::t him ::lcode eng ::use-only-for-whole-word
1869
+ ::s ⠓⠍⠋ ::t himself ::lcode eng ::use-only-for-whole-word
1870
+ ::s ⠦ ::t ? ::lcode eng
1871
+ ::s ⠦ ::t his ::lcode eng ::use-only-for-whole-word
1872
+ ::s ⠊⠍⠍ ::t immediate ::lcode eng ::use-only-for-whole-word
1873
+ ::s ⠊⠍⠍⠇⠽ ::t immediately ::lcode eng ::use-only-for-whole-word
1874
+ ::s ⠔ ::t in ::lcode eng
1875
+ ::s ⠔⠒ ::t incon ::lcode eng ::use-only-at-start-of-word
1876
+ ::s ⠬ ::t ing ::lcode eng
1877
+ ::s ⠭ ::t it ::lcode eng ::use-only-for-whole-word
1878
+ ::s ⠭⠎ ::t its ::lcode eng ::use-only-for-whole-word
1879
+ ::s ⠭⠋ ::t itself ::lcode eng ::use-only-for-whole-word
1880
+ ::s ⠰⠽ ::t ity ::lcode eng ::dont-use-at-start-of-word
1881
+ ::s ⠚ ::t j ::lcode eng
1882
+ ::s ⠚ ::t just ::lcode eng ::use-only-for-whole-word
1883
+ ::s ⠅ ::t k ::lcode eng
1884
+ ::s ⠐⠅ ::t know ::lcode eng
1885
+ ::s ⠅ ::t knowledge ::lcode eng ::use-only-for-whole-word
1886
+ ::s ⠇ ::t l ::lcode eng
1887
+ ::s ⠨⠎ ::t less ::lcode eng ::dont-use-at-start-of-word
1888
+ ::s ⠇⠗ ::t letter ::lcode eng ::use-only-for-whole-word
1889
+ ::s ⠇⠗⠎ ::t letters ::lcode eng ::use-only-for-whole-word
1890
+ ::s ⠇ ::t like ::lcode eng ::use-only-for-whole-word
1891
+ ::s ⠇⠇ ::t little ::lcode eng ::use-only-for-whole-word
1892
+ ::s ⠐⠇ ::t lord ::lcode eng
1893
+ ::s ⠍ ::t m ::lcode eng
1894
+ ::s ⠸⠍ ::t many ::lcode eng
1895
+ ::s ⠰⠞ ::t ment ::lcode eng ::dont-use-at-start-of-word
1896
+ ::s ⠍ ::t more ::lcode eng ::use-only-for-whole-word
1897
+ ::s ⠐⠍ ::t mother ::lcode eng
1898
+ ::s ⠍⠡ ::t much ::lcode eng ::use-only-for-whole-word
1899
+ ::s ⠍⠌ ::t must ::lcode eng ::use-only-for-whole-word
1900
+ ::s ⠍⠽⠋ ::t myself ::lcode eng ::use-only-for-whole-word
1901
+ ::s ⠝ ::t n ::lcode eng
1902
+ ::s ⠐⠝ ::t name ::lcode eng
1903
+ ::s ⠝⠑⠉ ::t necessary ::lcode eng ::use-only-for-whole-word
1904
+ ::s ⠝⠑⠊ ::t neither ::lcode eng ::use-only-for-whole-word
1905
+ ::s ⠰⠎ ::t ness ::lcode eng ::dont-use-at-start-of-word
1906
+ ::s ⠝ ::t not ::lcode eng ::use-only-for-whole-word
1907
+ ::s ⠕⠄⠉ ::t o'clock ::lcode eng ::use-only-for-whole-word
1908
+ ::s ⠷ ::t of ::lcode eng
1909
+ ::s ⠐⠕ ::t one ::lcode eng
1910
+ ::s ⠰⠛ ::t ong ::lcode eng ::dont-use-at-start-of-word
1911
+ ::s ⠳ ::t ou ::lcode eng
1912
+ ::s ⠨⠙ ::t ound ::lcode eng
1913
+ ::s ⠨⠞ ::t ount ::lcode eng
1914
+ ::s ⠐⠳ ::t ought ::lcode eng
1915
+ ::s ⠳⠗⠧⠎ ::t ourselves ::lcode eng ::use-only-for-whole-word
1916
+ ::s ⠳ ::t out ::lcode eng ::use-only-for-whole-word
1917
+ ::s ⠪ ::t ow ::lcode eng
1918
+ ::s ⠏ ::t p ::lcode eng
1919
+ ::s ⠏⠙ ::t paid ::lcode eng ::use-only-for-whole-word
1920
+ ::s ⠐⠏ ::t part ::lcode eng
1921
+ ::s ⠏ ::t people ::lcode eng ::use-only-for-whole-word
1922
+ ::s ⠏⠻⠓ ::t perhaps ::lcode eng ::use-only-for-whole-word
1923
+ ::s ⠟ ::t q ::lcode eng
1924
+ ::s ⠐⠟ ::t question ::lcode eng
1925
+ ::s ⠟⠅ ::t quick ::lcode eng ::use-only-for-whole-word
1926
+ ::s ⠟⠅⠻ ::t quicker ::lcode eng ::use-only-for-whole-word
1927
+ ::s ⠟⠅⠑⠌ ::t quickest ::lcode eng ::use-only-for-whole-word
1928
+ ::s ⠟ ::t quite ::lcode eng ::use-only-for-whole-word
1929
+ ::s ⠗ ::t r ::lcode eng
1930
+ ::s ⠗ ::t rather ::lcode eng ::use-only-for-whole-word
1931
+ ::s ⠐⠗ ::t right ::lcode eng
1932
+ ::s ⠗⠚⠉ ::t rejoice ::lcode eng
1933
+ ::s ⠗⠚⠉⠛ ::t rejoicing ::lcode eng
1934
+ ::s ⠎ ::t s ::lcode eng
1935
+ ::s ⠎⠙ ::t said ::lcode eng ::use-only-for-whole-word
1936
+ ::s ⠩ ::t sh ::lcode eng
1937
+ ::s ⠩ ::t shall ::lcode eng ::use-only-for-whole-word
1938
+ ::s ⠩⠙ ::t should ::lcode eng ::use-only-for-whole-word
1939
+ ::s ⠨⠝ ::t sion ::lcode eng
1940
+ ::s ⠎ ::t so ::lcode eng ::use-only-for-whole-word
1941
+ ::s ⠐⠎ ::t some ::lcode eng
1942
+ ::s ⠸⠎ ::t spirit ::lcode eng
1943
+ ::s ⠌ ::t st ::lcode eng
1944
+ ::s ⠌ ::t still ::lcode eng ::use-only-for-whole-word
1945
+ ::s ⠎⠡ ::t such ::lcode eng ::use-only-for-whole-word
1946
+ ::s ⠞ ::t t ::lcode eng
1947
+ ::s ⠹ ::t th ::lcode eng
1948
+ ::s ⠞ ::t that ::lcode eng ::use-only-for-whole-word
1949
+ ::s ⠹ ::t this ::lcode eng ::use-only-for-whole-word
1950
+ ::s ⠮ ::t the ::lcode eng
1951
+ ::s ⠸⠮ ::t their ::lcode eng
1952
+ ::s ⠮⠍⠧⠎ ::t themselves ::lcode eng ::use-only-for-whole-word
1953
+ ::s ⠐⠮ ::t there ::lcode eng
1954
+ ::s ⠘⠮ ::t these ::lcode eng
1955
+ ::s ⠘⠹ ::t those ::lcode eng
1956
+ ::s ⠐⠹ ::t through ::lcode eng
1957
+ ::s ⠐⠞ ::t time ::lcode eng
1958
+ ::s ⠰⠝ ::t tion ::lcode eng ::dont-use-at-start-of-word
1959
+ ::s ⠖ ::t to ::lcode eng ::use-only-for-whole-word
1960
+ ::s ⠞⠙ ::t today ::lcode eng ::use-only-for-whole-word
1961
+ ::s ⠞⠛⠗ ::t together ::lcode eng ::use-only-for-whole-word
1962
+ ::s ⠞⠍ ::t tomorrow ::lcode eng ::use-only-for-whole-word
1963
+ ::s ⠞⠝ ::t tonight ::lcode eng ::use-only-for-whole-word
1964
+ ::s ⠥ ::t u ::lcode eng
1965
+ ::s ⠥⠝⠒ ::t uncon ::lcode eng ::use-only-at-start-of-word
1966
+ ::s ⠥ ::t us ::lcode eng ::use-only-for-whole-word
1967
+ ::s ⠠⠥⠲⠎⠲ ::t U.S. ::lcode eng
1968
+ ::s ⠐⠥ ::t under ::lcode eng
1969
+ ::s ⠘⠥ ::t upon ::lcode eng
1970
+ ::s ⠧ ::t v ::lcode eng
1971
+ ::s ⠧ ::t very ::lcode eng ::use-only-for-whole-word
1972
+ ::s ⠺ ::t w ::lcode eng
1973
+ ::s ⠴ ::t " ::lcode eng
1974
+ ::s ⠴ ::t was ::lcode eng ::use-only-for-whole-word
1975
+ ::s ⠶ ::t were ::lcode eng ::use-only-for-whole-word
1976
+ ::s ⠱ ::t wh ::lcode eng
1977
+ ::s ⠐⠱ ::t where ::lcode eng
1978
+ ::s ⠱ ::t which ::lcode eng ::use-only-for-whole-word
1979
+ ::s ⠘⠱ ::t whose ::lcode eng
1980
+ ::s ⠺ ::t will ::lcode eng ::use-only-for-whole-word
1981
+ ::s ⠾ ::t with ::lcode eng
1982
+ ::s ⠘⠺ ::t word ::lcode eng
1983
+ ::s ⠐⠺ ::t work ::lcode eng
1984
+ ::s ⠸⠺ ::t world ::lcode eng
1985
+ ::s ⠺⠙ ::t would ::lcode eng ::use-only-for-whole-word
1986
+ ::s ⠭ ::t x ::lcode eng
1987
+ ::s ⠽ ::t y ::lcode eng
1988
+ ::s ⠽ ::t you ::lcode eng ::use-only-for-whole-word
1989
+ ::s ⠽⠗ ::t your ::lcode eng ::use-only-for-whole-word
1990
+ ::s ⠽⠗⠎ ::t yours ::lcode eng ::use-only-for-whole-word
1991
+ ::s ⠽⠗⠋ ::t yourself ::lcode eng ::use-only-for-whole-word
1992
+ ::s ⠽⠗⠧⠎ ::t yourselves ::lcode eng ::use-only-for-whole-word
1993
+ ::s ⠐⠽ ::t young ::lcode eng
1994
+ ::s ⠵ ::t z ::lcode eng
1995
+ ::s ⠠⠴ ::t ’ ::lcode eng
1996
+
1997
+ ::preserve ::from U+2190 ::to U+21FF ::comment Arrows
1998
+ ::preserve ::from U+2200 ::to U+22FF ::comment Mathematical Operators
1999
+ ::preserve ::from U+2300 ::to U+23FF ::comment Miscellaneous Technical
2000
+ ::preserve ::from U+2500 ::to U+257F ::comment Box Drawing
2001
+ ::preserve ::from U+2580 ::to U+259F ::comment Block Elements
2002
+ ::preserve ::from U+25A0 ::to U+25FF ::comment Geometric Shapes
2003
+ ::preserve ::from U+2600 ::to U+26FF ::comment Miscellaneous Symbols
2004
+ ::preserve ::from U+27C0 ::to U+27ED ::comment Miscellaneous Mathematical Symbols-A
2005
+ ::preserve ::from U+27F0 ::to U+27FF ::comment Supplemental Arrows-A
2006
+ ::preserve ::from U+2900 ::to U+297F ::comment Supplemental Arrows-B
2007
+ ::preserve ::from U+2980 ::to U+29FF ::comment Miscellaneous Mathematical Symbols-B
2008
+ ::preserve ::from U+2A00 ::to U+2AFF ::comment Supplemental Mathematical Operators
2009
+ ::preserve ::from U+2B00 ::to U+2BFF ::comment Miscellaneous Symbols and Arrows
2010
+ ::preserve ::from U+2E00 ::to U+2E27 ::comment Supplemental Punctuation (excluding ⸨⸩)
2011
+ ::preserve ::from U+2E2A ::to U+2E7F ::comment Supplemental Punctuation (cont'd)
2012
+ ::preserve ::from U+18B00 ::to U+18CD5 ::comment Khitan Small Script
2013
+ ::preserve ::from U+1D100 ::to U+1D1FF ::comment Musical Symbols
2014
+ ::preserve ::from U+1D6A8 ::to U+1D7CB ::comment Mathematical Alphanumeric Symbols (Greek)
2015
+ ::preserve ::from U+1D800 ::to U+1DAAF ::comment Sutton SignWriting
2016
+ ::preserve ::from U+1F800 ::to U+1F8FF ::comment Supplemental Arrows-C
2017
+ ::preserve ::from U+1FA00 ::to U+1FA6F ::comment Chess Symbols
2018
+ ::preserve ::from U+1FB00 ::to U+1FBCF ::comment Symbols for Legacy Computing
2019
+ ::preserve ::from U+1FA70 ::to U+1FAFF ::comment Symbols and Pictographs Extended-A
uroman/data/romanization-table.v1.2.1.txt ADDED
@@ -0,0 +1,814 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 
2
+ ## European Latin extensions
3
+ # Vowels
4
+ ::s Ä ::t Ae
5
+ ::s Ö ::t Oe
6
+ ::s Ü ::t Ue
7
+ ::s Å ::t Aa
8
+ ::s Æ ::t Ae
9
+ ::s Ø ::t Oe
10
+ ::s Œ ::t Oe
11
+ ::s ä ::t ae
12
+ ::s ö ::t oe
13
+ ::s ü ::t ue
14
+ ::s å ::t aa
15
+ ::s æ ::t ae
16
+ ::s ø ::t oe
17
+ ::s œ ::t oe
18
+ # Consonants
19
+ ::s Ç ::t S
20
+ ::s ç ::t s
21
+ ::s Ç ::t Ch ::lcode tur
22
+ ::s ç ::t ch ::lcode tur
23
+ ::s Ş ::t Sh
24
+ ::s ş ::t sh
25
+ ::s Ș ::t Sh
26
+ ::s ș ::t sh
27
+ ::s ß ::t ss
28
+ ::s Ț ::t Ts
29
+ ::s ț ::t ts
30
+
31
+ # Miscellaneous
32
+ ::s ə ::t e
33
+
34
+ # English
35
+ ::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
36
+ ::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
37
+ ::s eight ::t eight ::t-alt eit ::example eight, weight
38
+ ::s Eight ::t Eight ::t-alt Eit ::example Eighteen
39
+ ::s ight ::t ight ::t-alt ait ::example Knight
40
+ ::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
41
+ ::s high ::t high ::t-alt hai ::example highlight
42
+ ::s High ::t High ::t-alt Hai ::example High School
43
+ ::s Isle ::t Isle ::t-alt Ail ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Isle
44
+ ::s Island ::t Island ::t-alt Ailand ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Island
45
+ ::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
46
+ ::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
47
+ ::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
48
+ ::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
49
+ ::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
50
+ ::s ph ::t ph ::t-alt f ::example alpha
51
+ ::s Ph ::t Ph ::t-alt F ::example Philip
52
+ ::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
53
+ ::s tion ::t tion ::t-alt shen ::example
54
+ ::s Sean ::t Sean ::t-alt Shawn ::use-only-at-start-of-word ::use-only-at-end-of-word
55
+ ::s ssion ::t ssion ::t-alt shen ::example Sessions
56
+ ::s St ::t St ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
57
+ ::s St. ::t St. ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
58
+ ::s Wr ::t Wr ::t-alt R ::example Wren
59
+ ::s wr ::t wr ::t-alt r ::example Cartwright
60
+ ::s x ::t x ::t-alt ks ::example Mexico
61
+ ::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
62
+
63
+ # French
64
+ ::s â ::t a ::t-alt as ::example pâte/paste, pastry
65
+ ::s ê ::t e ::t-alt es ::example fête/feast
66
+ ::s î ::t i ::t-alt is ::example île/isle
67
+ ::s ô ::t o ::t-alt os ::example côte/coast
68
+ ::s û ::t u ::t-alt us ::example août/August
69
+ ::s eaux ::t eaux ::t-alt o ::example Bordeaux
70
+ ::s eau ::t eau ::t-alt o ::example Chateau
71
+ ::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
72
+ ::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
73
+ ::s oux ::t oux ::t-alt u
74
+ ::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
75
+
76
+ # German
77
+ ::s Sch ::t Sch ::t-alt Sh
78
+ ::s sch ::t sch ::t-alt sh
79
+ ::s stein ::t stein ::t-alt shtain
80
+ ::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
81
+
82
+ # Dutch
83
+ ::s ij ::t ij ::t-alt ai
84
+ ::s Ij ::t Ij ::t-alt Ai
85
+
86
+ # Greek
87
+ ::s Ι ::t I
88
+ ::s ι ::t i
89
+ ::s ί ::t i
90
+ ::s ἶ ::t i
91
+ ::s Υ ::t Y
92
+ ::s υ ::t y
93
+ ::s Ρ ::t R
94
+ ::s ρ ::t r
95
+ ::s Ντ ::t D
96
+ ::s ντ ::t nd ::t-alt d
97
+ # ::s ντζ ::t ntz
98
+ ::s Μπ ::t B
99
+ ::s μπ ::t mb ::t-alt b
100
+ ::s γγ ::t ng
101
+ ::s γκ ::t ng ::t-alt g
102
+ ::s ει ::t ei ::t-alt i
103
+ ::s ου ::t ou ::t-alt u
104
+ ::s χ ::t ch ::t-alt kh
105
+
106
+ # Cyrillic
107
+ ::s Г ::t G ::t-alt H
108
+ ::s г ::t g ::t-alt h
109
+ ::s Е ::t E ::t-alt Ye
110
+ ::s е ::t e ::t-alt ye
111
+ ::s Ё ::t E ::t-alt Yo
112
+ ::s ё ::t e ::t-alt yo
113
+ ::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
114
+ ::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
115
+ ::s Щ ::t Shch ::t-alt Sh
116
+ ::s щ ::t shch ::t-alt sh
117
+ ::s Ъ ::t ::comment Cyrillic capital hard sign
118
+ ::s ъ ::t ::comment Cyrillic small hard sign
119
+ ::s Ы ::t Y ::comment Cyrillic capital yeru
120
+ ::s ы ::t y ::comment Cyrillic small yeru
121
+ ::s Ь ::t ::comment Cyrillic capital soft sign
122
+ ::s ь ::t ::comment Cyrillic small soft sign
123
+
124
+ ::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
125
+ ::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
126
+ ::s Ә ::t e ::comment Cyrillic capital schwa
127
+ ::s ә ::t e ::comment Cyrillic small schwa
128
+ ::s Ӏ ::t ' ::comment Cyrillic palochka
129
+ ::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
130
+ ::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
131
+ ::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
132
+ ::s ӕ ::t ae ::comment Cyrillic small ligature a ie
133
+ ::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
134
+ ::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
135
+ ::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
136
+ ::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
137
+
138
+ # Gothic
139
+ ::s 𐌴 ::t e ::comment Gothic letter aihvus
140
+ ::s 𐌹 ::t i ::comment Gothic letter eis
141
+ ::s 𐍇 ::t x ::comment Gothic letter iggws
142
+
143
+ # Georgian
144
+ ::s ა ::t a ::comment Georgian letter an
145
+ ::s ე ::t e ::comment Georgian letter en
146
+ ::s ი ::t i ::comment Georgian letter in
147
+ ::s ო ::t o ::comment Georgian letter on
148
+ ::s უ ::t u ::comment Georgian letter un
149
+
150
+ # Armenian
151
+ ::s Ա ::t a ::comment Armenian capital letter ayb
152
+ ::s ա ::t a ::comment Armenian small letter ayb
153
+ ::s Ե ::t e ::comment Armenian capital letter ech
154
+ ::s ե ::t e ::comment Armenian small letter ech
155
+ ::s և ::t ev ::comment Armenian small ligature ech yiwn
156
+ ::s Է ::t e ::comment Armenian capital letter eh
157
+ ::s է ::t e ::comment Armenian small letter eh
158
+ ::s Ի ::t i ::comment Armenian capital letter ini
159
+ ::s ի ::t i ::comment Armenian small letter ini
160
+ ::s Օ ::t o ::comment Armenian capital letter oh
161
+ ::s օ ::t o ::comment Armenian small letter oh
162
+
163
+ ## Japanese
164
+ # Katakana
165
+ ::s シ ::t shi
166
+ ::s チ ::t chi
167
+ ::s フ ::t fu
168
+ ::s ジ ::t ji
169
+ ::s ヂ ::t ji
170
+ ::s ヅ ::t zu
171
+ ::s シャ ::t sha
172
+ ::s シュ ::t shu
173
+ ::s ショ ::t sho
174
+ ::s チャ ::t cha
175
+ ::s チェ ::t che
176
+ ::s チュ ::t chu
177
+ ::s チョ ::t cho
178
+ ::s ジャ ::t ja
179
+ ::s ジュ ::t ju
180
+ ::s ジョ ::t jo
181
+ ::s ジェ ::t je
182
+ ::s ヂャ ::t ja
183
+ ::s ヂュ ::t ju
184
+ ::s ヂョ ::t jo
185
+ ::s フェ ::t fe
186
+ ::s ヴェ ::t ve
187
+ ::s フィ ::t fi
188
+ ::s ウィ ::t wi
189
+ ::s ヴィ ::t vi
190
+ ::s ティ ::t ti
191
+ ::s ディ ::t di
192
+ ::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
193
+ ::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
194
+ # Hiragana
195
+ ::s し ::t shi
196
+ ::s ち ::t chi
197
+ ::s つ ::t tsu
198
+ ::s ふ ::t fu
199
+ ::s を ::t o
200
+ ::s じ ::t ji
201
+ ::s ぢ ::t ji
202
+ ::s づ ::t zu
203
+ ::s しゃ ::t sha
204
+ ::s しゅ ::t shu
205
+ ::s しょ ::t sho
206
+ ::s ちゃ ::t cha
207
+ ::s ちゅ ::t chu
208
+ ::s ちょ ::t cho
209
+ ::s じゃ ::t ja
210
+ ::s じゅ ::t ju
211
+ ::s じょ ::t jo
212
+ ::s ぢゃ ::t ja
213
+ ::s ぢゅ ::t ju
214
+ ::s ぢょ ::t jo
215
+ ::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
216
+ ::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
217
+
218
+ ::s フ ::t fu ::t-alt f
219
+ ::s キ ::t ki ::t-alt k
220
+ ::s ク ::t ku ::t-alt k
221
+ ::s ラ ::t ra ::t-alt la
222
+ ::s リ ::t ri ::t-alt li
223
+ ::s ル ::t ru ::t-alt lu, l, r
224
+ ::s レ ::t re ::t-alt le
225
+ ::s ロ ::t ro ::t-alt lo
226
+ ::s ム ::t mu ::t-alt m ::example キム = Kim
227
+ ::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
228
+ ::s ス ::t su ::t-alt s
229
+ ::s ト ::t to ::t-alt t
230
+ ::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
231
+
232
+ # Chinese
233
+ ::s 邦 ::t bang ::t-alt bon, bum, bun, pon
234
+ ::s 鲍 ::t bao ::t-alt bow
235
+ ::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
236
+ ::s 贝 ::t bei ::t-alt ber
237
+ ::s 本 ::t ben ::t-alt bern, bon, bourn, burn
238
+ ::s 彼得 ::t bide ::t-alt peter, pet
239
+ ::s 伯 ::t bo ::t-alt ber
240
+ ::s 波 ::t bo ::t-alt po
241
+ ::s 布 ::t bu ::t-alt b
242
+ ::s 策 ::t ce ::t-alt tze, tzer
243
+ ::s 曾 ::t ceng ::t-alt tzen, zen
244
+ ::s 彻 ::t che ::t-alt tche
245
+ ::s 茨 ::t ci ::t-alt ts, tz, z
246
+ ::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
247
+ ::s 蒂 ::t di ::t-alt ti, tti
248
+ ::s 丁 ::t ding ::t-alt din, tin
249
+ ::s 顿 ::t dun ::t-alt ton
250
+ ::s 多 ::t duo ::t-alt do, dor, to
251
+ ::s 尔 ::t er ::t-alt l, le, ll, r
252
+ ::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
253
+ ::s 夫 ::t fu ::t-alt f, v, v
254
+ ::s 福 ::t fu ::t-alt faw, for, ford
255
+ ::s 哥 ::t ge ::t-alt go, co
256
+ ::s 戈 ::t ge ::t-alt go
257
+ ::s 各 ::t ge ::t-alt go, co
258
+ ::s 赫 ::t he ::t-alt ch, che, cher, ge
259
+ ::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
260
+ ::s 怀 ::t huai ::t-alt whi, wi, wy
261
+ ::s 惠 ::t hui ::t-alt wha, whea
262
+ ::s 基 ::t ji ::t-alt ki, chi
263
+ ::s 吉 ::t ji ::t-alt gi, gui
264
+ ::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
265
+ ::s 杰 ::t jie ::t-alt ger
266
+ ::s 金 ::t jin ::t-alt kin, gin
267
+ ::s 斤 ::t jin ::t-alt zin
268
+ ::s 康 ::t kang ::t-alt con, corn
269
+ ::s 考 ::t kao ::t-alt cow, cour
270
+ ::s 克 ::t ke ::t-alt k, che, cher
271
+ ::s 科 ::t ke ::t-alt ko
272
+ ::s 拉 ::t la ::t-alt ra ::example Tirana
273
+ ::s 朗 ::t lang ::t-alt lon, ron
274
+ ::s 赖 ::t lai ::t-alt ri
275
+ ::s 劳 ::t lao ::t-alt low
276
+ ::s 勒 ::t lei ::t-alt ler
277
+ ::s 伦 ::t lun ::t-alt lon, ran, ron
278
+ ::s 里 ::t li ::t-alt ri
279
+ ::s 利 ::t li ::t-alt ri ::example Ferrari
280
+ ::s 隆 ::t long ::t-alt lon, lum, lund
281
+ ::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
282
+ ::s 洛 ::t luo ::t-alt lo, low, ro
283
+ ::s 默 ::t mo ::t-alt mer
284
+ ::s 纳 ::t na ::t-alt ne, ner
285
+ ::s 珀 ::t po ::t-alt per
286
+ ::s 奇 ::t qi ::t-alt chi, dge, ge, tch
287
+ ::s 齐 ::t qi ::t-alt tsi, zi
288
+ ::s 乔 ::t qiao ::t-alt jo
289
+ ::s 青 ::t qing ::t-alt tsing
290
+ ::s 琼 ::t qiong ::t-alt jon, jum, jun
291
+ ::s 瑟 ::t se ::t-alt the
292
+ ::s 什 ::t shen ::t-alt sh
293
+ ::s 圣 ::t sheng ::t-alt san, sao, saint
294
+ ::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
295
+ ::s 索 ::t suo ::t-alt tho
296
+ ::s 特 ::t te ::t-alt t
297
+ ::s 翁 ::t weng ::t-alt on
298
+ ::s 沃 ::t wo ::t-alt ver, vo, war, wer
299
+ ::s 乌 ::t wu ::t-alt ou, u
300
+ ::s 希 ::t xi ::t-alt chi, hi, shi
301
+ ::s 西 ::t xi ::t-alt s, si
302
+ ::s 锡 ::t xi ::t-alt ci, si, thi, zi
303
+ ::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
304
+ ::s 香 ::t xiang ::t-alt chan, cham
305
+ ::s 歇 ::t xie ::t-alt she
306
+ ::s 谢 ::t xie ::t-alt che, she
307
+ ::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
308
+ ::s 欣 ::t xin ::t-alt hin, shin
309
+ ::s 休 ::t xiu ::t-alt hu, hue
310
+ ::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
311
+ ::s 许 ::t xu ::t-alt hue, schue
312
+ ::s 逊 ::t xun ::t-alt son
313
+ ::s 耶 ::t ye ::t-alt yer, ier
314
+ ::s 泽 ::t ze ::t-alt ser
315
+ ::s 扎 ::t zha ::t-alt za
316
+ ::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
317
+ ::s 治 ::t zhi ::t-alt ge ::example George
318
+
319
+ ## Numbers
320
+ # Chinese and Japanese numbers
321
+ ::s 零 ::num 0
322
+ ::s 〇 ::num 0
323
+ ::s 一 ::num 1
324
+ ::s 二 ::num 2
325
+ ::s 三 ::num 3
326
+ ::s 四 ::num 4
327
+ ::s 五 ::num 5
328
+ ::s 六 ::num 6
329
+ ::s 七 ::num 7
330
+ ::s 八 ::num 8
331
+ ::s 九 ::num 9
332
+ ::s 十 ::num 10
333
+ ::s 百 ::num 100
334
+ ::s 千 ::num 1000
335
+ ::s 万 ::num 10000
336
+ ::s 萬 ::num 10000
337
+ ::s 亿 ::num 100000000
338
+ ::s 億 ::num 100000000
339
+ ::s 兆 ::num 1000000000000
340
+ ::s 京 ::num 10000000000000000
341
+
342
+ ::s 北京 ::t beijing
343
+ ::s 京都 ::t jingdou
344
+ ::s 东京 ::t dongjing
345
+ ::s 京胡 ::t jinghu
346
+ ::s 南京 ::t nanjing
347
+ ::s 普京 ::t pujing ::comment Putin
348
+ ::s 東京 ::t dongjing ::comment Tokyo
349
+ ::s 京兆 ::t jingzhao
350
+
351
+ ::s ㎢ ::t km²
352
+ ::s ㎥ ::t m³
353
+ ::s ㎝ ::t cm
354
+
355
+ ## Indian
356
+ # see mostly under UnicodeDataOverwrite.txt
357
+
358
+ # Malayalam
359
+ ::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
360
+
361
+ # Tamil
362
+ ::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
363
+ ::s ஃப ::t f ::comment h+p=f
364
+ ::s ஃஜ ::t z ::comment h+j=z
365
+
366
+ # Myanmar/Burmese
367
+ # ::s ့ ::t ::comment dot below, denotes creaky tone
368
+ # ::s း ::t ::comment visarga, denotes high tone
369
+ ::s ၌ ::t -nai ::comment locative
370
+ ::s ၍ ::t -jwe ::comment completed
371
+ ::s ၎ ::t legau ::comment aforementioned
372
+ ::s ၏ ::t -i ::comment genitive
373
+
374
+ # Lao
375
+ ::s ັ ::t a ::comment vowel sign mai kan
376
+ ::s ົ ::t o ::comment vowel sign mai kon
377
+ ::s ູ ::t uu ::comment vowel sign uu
378
+ ::s ຽ ::t y ::comment semivowel sign nyo
379
+ ::s ຼ ::t l ::comment semivowel sign lo
380
+ ::s ລ ::t l ::comment lo loot
381
+ ::s ຣ ::t l ::comment lo ling
382
+ ::s ໝ ::t m ::comment ho mo
383
+ ::s ໜ ::t n ::comment ho no
384
+ ::s ຢ ::t y ::comment yo
385
+ ::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
386
+ ::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
387
+ ::s ຯ ::t ... ::comment Lao ellipsis
388
+
389
+ # Thai
390
+ ::s ออ ::t o
391
+ ::s อั ::t a
392
+ ::s อิ ::t i
393
+ ::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
394
+
395
+ # Khmer
396
+ ::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
397
+ ::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
398
+ ::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
399
+ ::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
400
+ ::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
401
+ ::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
402
+
403
+ ## Semitic languages
404
+ # Arabic
405
+ ::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
406
+ ::s ء ::t ' ::comment hamza
407
+ ::s ٔ ::t ' ::comment hamza above
408
+ ::s ٕ ::t ' ::comment hamza below
409
+ ::s ع ::t ' ::comment ain
410
+ ::s آ ::t a ::comment alef madda
411
+ ::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
412
+ ::s إ ::t i ::comment alef with hamza below
413
+ ::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
414
+ ::s ة ::t a ::comment teh marbuta
415
+ ::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
416
+ ::s ي ::t y ::comment Arabic yeh
417
+ ::s ى ::t a ::comment alef maksura
418
+ ::s ﻯ ::t a ::comment alef maksura isolated form
419
+ ::s ﻰ ::t a ::comment alef maksura final form
420
+ ::s ﯨ ::t a ::comment Uighur Kazach Kirghiz alef maksura initial form
421
+ ::s ﯩ ::t a ::comment Uighur Kazach Kirghiz alef maksura medial form
422
+ ::s ٰ ::t a ::comment Arabic letter superscript alef
423
+ ::s ـ ::t ::comment tatweel (filler)
424
+ ::s َ ::t a ::comment fatha ("-a")
425
+ ::s ُ ::t u ::comment damma ("-u")
426
+ ::s ِ ::t i ::comment kasra ("-i")
427
+ ::s ْ ::t ::comment sukun (no vowel)
428
+ ::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
429
+ ::s ً ::t ::comment fathatan ("-an")
430
+ ::s اً ::t an ::comment alef + fathatan
431
+ ::s ٌ ::t ::comment dammatan ("-un")
432
+ ::s ٍ ::t ::comment kasratan ("-in")
433
+ ::s ّ ::t ::comment shadda (consonant doubler)
434
+ ::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
435
+ ::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
436
+ ::s ۾ ::t men ::comment Sindhi postposition men
437
+ ::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
438
+ ::s ﷴ ::t mohammad ::comment "Mohammad"
439
+ ::s ﷸ ::t wasallam ::comment "and peace"
440
+ ::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
441
+
442
+ # Farsi
443
+ ::s ی ::t i ::t-alt y ::comment Contributed by Nima
444
+ ::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
445
+ ::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
446
+ ::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
447
+ ::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
448
+ ::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
449
+ ::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
450
+ ::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
451
+ ::s عا ::t a ::lcode fas ::comment Contributed by Nima
452
+ ::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
453
+ ::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
454
+ ::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
455
+ ::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
456
+ ::s ‌ ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
457
+ ::s غ ::t gh ::t-alt g ::lcode fas
458
+ ::s آئی ::t ai ::t-alt ae ::lcode fas
459
+ ::s ائی ::t ai ::t-alt ae ::lcode fas
460
+ ::s آئو ::t au ::t-alt ao ::lcode fas
461
+ ::s ائو ::t au ::t-alt ao ::lcode fas
462
+
463
+ # Kashmiri (so far: educated guesses)
464
+ ::s ٖ ::t a ::comment Arabic subscript alef U+0656
465
+ ::s ٗ ::t u ::comment Arabic inverted damma U+0657
466
+ ::s ۚ ::t j ::comment Arabic small high jeem U+06DA
467
+ ::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
468
+ ::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
469
+
470
+ # Pashto
471
+ ::s ٙ ::t e
472
+
473
+ # Hebrew
474
+ ::s ב ::t v ::comment Hebrew letter bet ::t-alt b
475
+ ::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
476
+ ::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
477
+ ::s פ ::t f ::comment Hebrew letter pe ::t-alt p
478
+ ::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
479
+ ::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
480
+ ::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
481
+ ::s ק ::t q ::t-alt k ::use-alt-in-pointed
482
+ ::s וֹ ::t o
483
+ ::s וּ ::t u
484
+ ::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
485
+ ::s י ::t y
486
+ ::s יּ ::t y
487
+ ::s יָּ ::t ya
488
+ ::s ע ::t '
489
+ ::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
490
+ ::s ֵי ::t e
491
+ ::s ִיּ ::t iy
492
+ ::s ִיָּ ::t iya
493
+ ::s ױ ::t oy
494
+ ::s א ::t a ::t-alt '
495
+ ::s אָ ::t a
496
+ ::s ֹא ::t o
497
+ ::s אַ ::t 'a
498
+ ::s אֲ ::t 'a
499
+ ::s אֶ ::t e
500
+ ::s אֱ ::t e
501
+ ::s פ ::t f
502
+ ::s פּ ::t p
503
+ ::s פַּ ::t pa
504
+ ::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
505
+ ::s שׁ ::t sh
506
+ ::s שָׁ ::t sha
507
+ ::s שָּׁ ::t sha ::comment ?
508
+ ::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
509
+ ::s שֶׁ ::t she
510
+ ::s שִׁ ::t shi
511
+ ::s שֻׁ ::t shu
512
+ ::s שׂ ::t s
513
+ ::s שָׂ ::t sa
514
+ ::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
515
+ ::s כּ ::t k
516
+ ::s כֶּ ::t ke
517
+ ::s כֹּ ::t ko
518
+ ::s בּ ::t b
519
+ ::s בַּ ::t ba
520
+ ::s בָּ ::t ba
521
+ ::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
522
+ ::s בֶּ ::t be
523
+ ::s תּ ::t t
524
+ ::s תַּ ::t ta
525
+ ::s תֵּ ::t te
526
+ ::s תִּ ::t ti
527
+ ::s דָּ ::t da
528
+ ::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
529
+ ::s גּ ::t g
530
+ ::s לֵּ ::t le
531
+ ::s ד׳ ::t dh
532
+ ::s ג׳ ::t j
533
+ ::s ת׳ ::t th
534
+ ::s ז׳ ::t zh
535
+ ::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
536
+ ::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
537
+ ::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
538
+ ::s ַ ::t a ::comment Hebrew point patah
539
+ ::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
540
+ ::s ֳ ::t o ::comment Hebrew point hataf qamats
541
+ ::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
542
+ ::s ֶ ::t e ::comment Hebrew point segol
543
+ ::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
544
+ ::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
545
+ ::s ֵ ::t e ::comment Hebrew point tsere
546
+ ::s ִ ::t i ::comment Hebrew point hiriq
547
+ ::s ֹ ::t o ::comment Hebrew point holam
548
+ ::s ֻ ::t u ::comment Hebrew point qubuts
549
+ # ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
550
+
551
+ # Yiddish
552
+ ::s א ::t a ::lcode yid ::comment called "silent" alef
553
+ ::s אי ::t y ::lcode yid
554
+ ::s איי ::t ey ::lcode yid
555
+ ::s או ::t u ::lcode yid
556
+ ::s אוי ::t oy ::lcode yid
557
+ ::s אַ ::t a ::lcode yid
558
+ ::s אָ ::t o ::lcode yid
559
+ ::s ב ::t b ::lcode yid
560
+ ::s בֿ ::t v ::lcode yid
561
+ ::s דזש ::t dzh ::lcode yid
562
+ ::s ו ::t u ::lcode yid
563
+ ::s וּ ::t u ::lcode yid
564
+ ::s וֹ ::t o ::lcode yid
565
+ ::s װ ::t v ::lcode yid
566
+ ::s ווא ::t wa ::lcode yid
567
+ ::s וואַ ::t wa ::lcode yid
568
+ ::s ווע ::t we ::lcode yid
569
+ ::s ווי ::t wi ::lcode yid
570
+ ::s וואוי ::t wo ::lcode yid
571
+ ::s וי ::t oy ::lcode yid
572
+ ::s זש ::t zh ::lcode yid
573
+ ::s ח ::t ch ::lcode yid
574
+ ::s טש ::t tsh ::lcode yid
575
+ ::s יִ::t i ::lcode yid
576
+ ::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
577
+ ::s ײַ ::t ay ::lcode yid
578
+ ::s כּ ::t k ::lcode yid
579
+ ::s כ ::t ch ::lcode yid
580
+ ::s ך ::t ch ::lcode yid
581
+ ::s ע ::t e ::lcode yid
582
+ ::s פּ ::t p ::lcode yid
583
+ ::s פֿ ::t f ::lcode yid
584
+ ::s ף ::t f ::lcode yid ::comment sometimes p
585
+ ::s ק ::t k ::lcode yid
586
+ ::s ת ::t s ::lcode yid
587
+
588
+ # Syriac/Aramaic (should be vetted by expert)
589
+ ::s ܰ ::t a ::comment Syriac pthaha above
590
+ ::s ܲ ::t a ::comment Syriac pthaha dotted
591
+ ::s ܳ ::t aa ::comment Syriac zqapha above
592
+ ::s ܴ ::t aa ::comment Syriac zqapha below
593
+ ::s ܵ ::t aa ::comment Syriac zqapha dotted
594
+ ::s ܶ ::t e ::comment Syriac rbasa above
595
+ ::s ܷ ::t e ::comment Syriac rbasa below
596
+ ::s ܿ ::t o ::comment Syriac rwaha
597
+ ::s ܸ ::t e ::comment Syriac dotted zlama horizontal
598
+ ::s ܹ ::t e ::comment Syriac dotted zlama angular
599
+ ::s ܺ ::t i ::comment Syriac hbasa above
600
+ ::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
601
+ ::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
602
+ ::s ܽ ::t o ::comment Syriac esasa above
603
+ ::s ܾ ::t u ::comment Syriac esasa below
604
+ ::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
605
+
606
+ ::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
607
+ ::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
608
+ ::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
609
+ ::s ܒ̥ ::t v ::comment Syriac beth + ring-below
610
+ ::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
611
+ ::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
612
+ ::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
613
+ ::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
614
+ ::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
615
+ ::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
616
+ ::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
617
+ ::s ܦ̥ ::t f ::comment Syriac pe + ring-below
618
+ ::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
619
+ ::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
620
+ ::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
621
+
622
+ ::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications
623
+ ::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
624
+ ::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
625
+
626
+ # Uzbek
627
+ ::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
628
+ ::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
629
+
630
+ # Uyghur
631
+ ::s ئا ::t a ::lcode uig
632
+ ::s ە ::t e ::lcode uig
633
+ ::s ئې ::t e ::lcode uig ::latinplus ë
634
+ ::s ې ::t e ::lcode uig ::latinplus ë
635
+ ::s ئە ::t e ::lcode uig
636
+ ::s يە ::t e ::lcode uig
637
+ ::s ئى ::t i ::lcode uig
638
+ ::s ى ::t i ::lcode uig
639
+ ::s ئو ::t o ::lcode uig
640
+ ::s و ::t o ::lcode uig
641
+ ::s ئۇ ::t u ::lcode uig
642
+ ::s ۇ ::t u ::lcode uig
643
+ ::s چ ::t ch ::t-alt q ::lcode uig
644
+ ::s خ ::t x ::lcode uig
645
+ ::s ژ ::t zh ::lcode uig
646
+ ::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
647
+ ::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
648
+ ::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
649
+ ::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
650
+ ::s ۋ ::t w ::lcode uig
651
+
652
+ # Maldivian
653
+ ::s ް ::t ::comment thaana sukun
654
+ ::s ަ ::t a ::comment thaana abafili
655
+ ::s ާ ::t aa ::comment thaana aabaafili
656
+ ::s ި ::t i ::comment thaana ibifili
657
+ ::s ީ ::t ee ::comment thaana eebeefili
658
+ ::s ު ::t u ::comment thaana ubufili
659
+ ::s ޫ ::t oo ::comment thaana ooboofili
660
+ ::s ެ ::t e ::comment thaana ebefili
661
+ ::s ޭ ::t ey ::comment thaana eybeyfili
662
+ ::s ޮ ::t o ::comment thaana obofili
663
+ ::s ޯ ::t oa ::comment thaana oaboafili
664
+
665
+ # Canadian syllabics (Inuktitut)
666
+ ::s ᑊ ::t p ::comment syllable final
667
+ ::s ᐟ ::t t ::comment syllable final
668
+ ::s ᐠ ::t k ::comment syllable final
669
+ ::s ᐨ ::t c ::comment syllable final
670
+ ::s ᒼ ::t m ::comment syllable final
671
+ ::s ᐣ ::t n ::comment syllable final
672
+ ::s ᐢ ::t s ::comment syllable final
673
+ ::s ᐧ ::t y ::comment syllable final
674
+ ::s ᐤ ::t w ::comment syllable final
675
+ ::s ᐦ ::t h ::comment syllable final
676
+ ::s ᕽ ::t hk ::comment syllable final
677
+ ::s ᓫ ::t l ::comment syllable final
678
+ ::s ᕑ ::t r ::comment syllable final
679
+
680
+ ## Punctuation
681
+ # delete
682
+ ::s ¿ ::t "" ::comment inverted question mark
683
+ ::s ¡ ::t "" ::comment inverted exclamation mark
684
+ # preserve
685
+ ::s ′ ::t ′
686
+ # Cyrillic
687
+ ::s ⁙ ::t . ::comment five dot punctuation
688
+ # Amharic/Ethiopian
689
+ ::s ። ::t .
690
+ ::s ፣ ::t ,
691
+ ::s ፤ ::t ;
692
+ ::s ፥ ::t :
693
+ ::s ፡ ::t " " ::comment Ethiopic wordspace
694
+ ::s ፦ ::t : ::comment Ethiopic preface colon
695
+ ::s ቸ ::t cha ::comment Ethiopic syllable ca
696
+ ::s ቹ ::t chu ::comment Ethiopic syllable cu
697
+ ::s ቺ ::t chi ::comment Ethiopic syllable ci
698
+ ::s ቻ ::t chaa ::comment Ethiopic syllable caa
699
+ ::s ቼ ::t chee ::comment Ethiopic syllable cee
700
+ ::s ች ::t che ::comment Ethiopic syllable ce
701
+ ::s ቾ ::t cho ::comment Ethiopic syllable co
702
+ ::s ሠ ::t sa ::comment Ethiopic syllable sza
703
+ ::s ሡ ::t su ::comment Ethiopic syllable szu
704
+ ::s ሢ ::t si ::comment Ethiopic syllable szi
705
+ ::s ሣ ::t saa ::comment Ethiopic syllable szaa
706
+ ::s ሤ ::t see ::comment Ethiopic syllable szee
707
+ ::s ሥ ::t se ::comment Ethiopic syllable sze
708
+ ::s ሦ ::t so ::comment Ethiopic syllable szo
709
+ ::s ጠ ::t te ::comment Ethiopic syllable the with ejective 't'
710
+ ::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
711
+ ::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
712
+ ::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
713
+ ::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
714
+ ::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
715
+ ::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
716
+
717
+ # Devanagari (Hindi etc.)
718
+ ::s । ::t . ::comment danda
719
+ ::s ॥ ::t . ::comment double danda
720
+ ::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
721
+ ::s ॰ ::t . ::comment Devanagari abbreviation sign
722
+ # Oriya/Odia (India)
723
+ ::s ୤ ::t . ::comment danda (deprecated, should use Devanagari danda ।)
724
+ ::s ୥ ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
725
+ # Tibetan
726
+ ::s ། ::t ,
727
+ ::s །: ::t :
728
+ ::s ༏ ::t ;
729
+ ::s ༎ ::t .
730
+ ::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
731
+ ::s ༼ ::t ( ::comment Tibetan open roof punctuation
732
+ ::s ༽ ::t ) ::comment Tibetan close roof punctuation
733
+ ::s ༈ ::t "" ::comment Tibetan mark srbul shad
734
+ ::s 【 ::t [ ::comment left black lenticular bracket
735
+ ::s 】 ::t ] ::comment right black lenticular bracket
736
+ ::s ༄ ::t "" ::comment Tibetan head mark
737
+ ::s ༄༅ ::t "" ::comment Tibetan head mark
738
+ ::s ༆ ::t "" ::comment Tibetan head mark
739
+ # Myanmar/Burmese
740
+ ::s ၊ ::t ,
741
+ ::s ။ ::t .
742
+ # Khmer
743
+ ::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
744
+ ::s ។ ::t . ::comment Khmer sign khan
745
+ # Arabic
746
+ ::s ، ::t ,
747
+ ::s ؛ ::t ;
748
+ ::s ٬ ::t ,
749
+ ::s ۔ ::t .
750
+ ::s ؟ ::t ?
751
+ ::s ٪ ::t %
752
+ ::s ٫ ::t , ::comment Arabic decimal separator
753
+ ::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
754
+ # Aramaic
755
+ ::s ܀ ::t .
756
+ ::s ܂ ::t .
757
+ # Hebrew
758
+ ::s ־ ::t - ::comment maqaf
759
+ # Armenian
760
+ ::s ։ ::t .
761
+ ::s ՝ ::t , ::comment Armenian comma
762
+ # Chinese
763
+ ::s , ::t ", "
764
+ ::s 、 ::t ", "
765
+ ::s 。 ::t ". "
766
+ ::s ! ::t "! "
767
+ ::s ? ::t "? "
768
+ ::s 「 ::t ' "'
769
+ ::s 」 ::t '" '
770
+ ::s 《 ::t ' "'
771
+ ::s 》 ::t '" '
772
+ ::s ( ::t " ("
773
+ ::s ) ::t ") "
774
+ ::s ; ::t ;
775
+ ::s : ::t ": "
776
+ ::s ︰ ::t ": "
777
+ ::s - ::t -
778
+ ::s / ::t /
779
+ ::s = ::t =
780
+ ::s ~ ::t ~
781
+ ::s & ::t &
782
+ ::s < ::t <
783
+ ::s > ::t >
784
+ ::s % ::t %
785
+ ::s   ::t " " ::comment ideographic space
786
+ # Japanese
787
+ ::s 『 ::t ' "'
788
+ ::s 』 ::t '" '
789
+ ::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
790
+
791
+ # Symbols
792
+ ::s ∞ ::t ∞ ::comment infinity
793
+ ::s ­ ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
794
+ ::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
795
+ ::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
796
+ ::s ﹐ ::t , ::comment small comma; map to regular comma
797
+ ::s ˚ ::t ° ::comment ring above; map to degree sign
798
+ ::s ⇒ ::t ⇒ ::comment rightwards double arrow
799
+ ::s † ::t † ::comment dagger
800
+ ::s • ::t • ::comment bullet
801
+ ::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
802
+ ::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
803
+ ::s ― ::t ― ::comment horizontal bar
804
+ ::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
805
+ ::s ″ ::t ″ ::comment double prime
806
+ ::s ﴾ ::t ( ::comment ornate left parenthesis
807
+ ::s ﴿ ::t ) ::comment ornate right parenthesis
808
+ ::s 〔 ::t [ ::comment left tortoise shell bracket
809
+ ::s 〕 ::t ] ::comment right tortoise shell bracket
810
+ ::s ﹝ ::t ( ::comment small left tortoise shell bracket
811
+ ::s ﹞ ::t ) ::comment small right tortoise shell bracket
812
+ ::s ♄ ::t ♄ ::comment Saturn
813
+ ::s ♆ ::t ♆ ::comment Neptune
814
+ ::s ♋ ::t ♋ ::comment Cancer
uroman/data/string-distance-cost-rules.txt ADDED
@@ -0,0 +1,896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # String distance
2
+
3
+ ::s1 a ::s2 ::cost 0.1
4
+ ::s1 b ::s2 ::cost 1
5
+ ::s1 b ::s2 ::cost 0.2 ::left1 /[aou]m$/ ::right1 [e] ::lc1 eng ::lc2 zho ::example Balcombe
6
+ ::s1 c ::s2 ::cost 1
7
+ ::s1 c ::s2 ::cost 0.2 ::left1 /[aeou]$/ ::right1 [cgkq] ::lc2 zho
8
+ ::s1 c ::s2 ::cost 0.5 ::left1 /[aeou][lnr]?$/ ::right1 [h] ::lc2 zho
9
+ ::s1 d ::s2 ::cost 1
10
+ ::s1 d ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]$/ ::right1 [-,$ ]
11
+ ::s1 d ::s2 ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [bcfgklmnpqrstvwxz]
12
+ ::s1 e ::s2 ::cost 0.1
13
+ ::s1 é ::s2 ::cost 0.1
14
+ ::s1 e ::s2 ::cost 0.02 ::lc2 fas
15
+ ::s1 e ::s2 ::cost 0.02 ::lc1 amh ::lc2 eng
16
+ ::s1 f ::s2 ::cost 1
17
+ ::s1 g ::s2 ::cost 1
18
+ ::s1 g ::s2 ::cost 0.4 ::right1 [bcdfghklmnpqrstvwxz] ::lc2 zho
19
+ ::s1 g ::s2 ::cost 0.2 ::right1 [k] ::lc2 zho
20
+ ::s1 h ::s2 ::cost 0.5
21
+ ::s1 h ::s2 ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [-,bcdfghklmnpqrstvwxz$ ]
22
+ ::s1 h ::s2 ::cost 0.2 ::left1 /[bdlnr]$/ ::right1 [-,$ aeiouy] ::example Delhi, Minh, Riyadh
23
+ ::s1 i ::s2 ::cost 0.1
24
+ ::s1 j ::s2 ::cost 0.5
25
+ ::s1 k ::s2 ::cost 1
26
+ ::s1 l ::s2 ::cost 1
27
+ ::s1 l ::s2 ::cost 0.3 ::left1 /eui$/ ::right1 [-,$ ] ::example Argenteuil
28
+ ::s1 l ::s2 ::cost 0.3 ::left1 /a$/ ::right1 [km] ::comment walk, palm
29
+ ::s1 l ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [bdfgkmpstvwz] ::lc2 zho
30
+ ::s1 m ::s2 ::cost 1
31
+ ::s1 n ::s2 ::cost 1
32
+ ::s1 n ::s2 ::cost 0.7 ::right1 [-,$ ]
33
+ ::s1 o ::s2 ::cost 0.1
34
+ ::s1 p ::s2 ::cost 1
35
+ ::s1 q ::s2 ::cost 1
36
+ ::s1 r ::s2 ::cost 1
37
+ ::s1 r ::s2 ::cost 0.5 ::left1 /[aou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ]
38
+ ::s1 r ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
39
+ ::s1 re ::s2 ::cost 0.4 ::left1 /[ou]$/ ::right1 [-,$ ] ::lc2 zho
40
+ ::s1 re ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
41
+ ::s1 rr ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
42
+ ::s1 s ::s2 ::cost 1
43
+ ::s1 s ::s2 ::cost 0.6 ::right1 [-,$ ]
44
+ ::s1 t ::s2 ::cost 1
45
+ ::s1 t ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]?$/ ::right1 [-,$ ]
46
+ ::s1 t ::s2 ::cost 0.6 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz]
47
+ ::s1 u ::s2 ::cost 0.1
48
+ ::s1 v ::s2 ::cost 1
49
+ ::s1 w ::s2 ::cost 1
50
+ ::s1 w ::s2 ::cost 0.4 ::lc1 eng ::right1 [i][c][hk][-,$ ] ::example Greenwich, Alnwick
51
+ ::s1 x ::s2 ::cost 1
52
+ ::s1 y ::s2 ::cost 0.3
53
+ ::s1 z ::s2 ::cost 1
54
+ ::s1 ı ::s2 ::cost 0.3
55
+ ::s1 0 ::s2 ::cost 1
56
+ ::s1 1 ::s2 ::cost 1
57
+ ::s1 2 ::s2 ::cost 1
58
+ ::s1 3 ::s2 ::cost 1
59
+ ::s1 4 ::s2 ::cost 1
60
+ ::s1 5 ::s2 ::cost 1
61
+ ::s1 6 ::s2 ::cost 1
62
+ ::s1 7 ::s2 ::cost 1
63
+ ::s1 8 ::s2 ::cost 1
64
+ ::s1 9 ::s2 ::cost 1
65
+ ::s1 ' ::s2 ::cost 0.1
66
+ ::s1 ` ::s2 ::cost 0.1
67
+ ::s1 ( ::s2 ::cost 0.1
68
+ ::s1 ) ::s2 ::cost 0.1
69
+ ::s1 , ::s2 ::cost 0.1
70
+ ::s1 ; ::s2 ::cost 0.1
71
+ ::s1 - ::s2 ::cost 0.1
72
+ ::s1 . ::s2 ::cost 0.1
73
+ ::s1 .. ::s2 ::cost 0.12
74
+ ::s1 ... ::s2 ::cost 0.14
75
+ ::s1 ? ::s2 ::cost 0.2
76
+ ::s1 ! ::s2 ::cost 0.2
77
+ ::s1 ‼ ::s2 ::cost 0.2
78
+ ::s1 ‼ ::s2 !! ::cost 0.02
79
+ ::s1 ‼ ::s2 ! ::cost 0.1
80
+ ::s1 / ::s2 ::cost 0.1
81
+ ::s1 : ::s2 ::cost 0.1
82
+ ::s1 ː ::s2 ::cost 0.1
83
+ ::s1 ː ::s2 : ::cost 0.1
84
+ ::s1 « ::s2 ::cost 0.1
85
+ ::s1 » ::s2 ::cost 0.1
86
+ ::s1 – ::s2 ::cost 0.1
87
+ ::s1 – ::s2 - ::cost 0.05
88
+ ::s1 — ::s2 ::cost 0.15
89
+ ::s1 — ::s2 - ::cost 0.1
90
+ ::s1 — ::s2 – ::cost 0.05
91
+ ::s1 ─ ::s2 ::cost 0.2
92
+ ::s1 ─ ::s2 - ::cost 0.15
93
+ ::s1 ─ ::s2 – ::cost 0.1
94
+ ::s1 ─ ::s2 — ::cost 0.05
95
+ ::s1 ’ ::s2 ::cost 0.1
96
+ ::s1 ʼ ::s2 ::cost 0.1
97
+ ::s1 " " ::s2 ::cost 0.1
98
+ ::s1 “ ::s2 ::cost 0.1
99
+ ::s1 ” ::s2 ::cost 0.1
100
+ ::s1 ″ ::s2 ::cost 0.1
101
+ ::s1 # ::s2 ::cost 0.3
102
+ ::s1 + ::s2 ::cost 0.3
103
+ ::s1 * ::s2 ::cost 0.3
104
+ ::s1 = ::s2 ::cost 0.3
105
+ ::s1 < ::s2 ::cost 0.3
106
+ ::s1 > ::s2 ::cost 0.3
107
+ ::s1 [ ::s2 ::cost 0.3
108
+ ::s1 ] ::s2 ::cost 0.3
109
+ ::s1 { ::s2 ::cost 0.3
110
+ ::s1 } ::s2 ::cost 0.3
111
+ ::s1 | ::s2 ::cost 0.3
112
+ ::s1 & ::s2 ::cost 0.3
113
+ ::s1 _ ::s2 ::cost 0.3
114
+ ::s1 • ::s2 ::cost 0.1
115
+ ::s1 · ::s2 ::cost 0.1
116
+ ::s1 ◦ ::s2 ::cost 0.1
117
+ ::s1 ° ::s2 ::cost 0.1
118
+ ::s1 … ::s2 ::cost 0.1
119
+ ::s1 … ::s2 ... ::cost 0
120
+ ::s1 @ ::s2 ::cost 0.3
121
+ ::s1 © ::s2 ::cost 0.3
122
+ ::s1 © ::s2 (c) ::cost 0.1
123
+
124
+
125
+ ::s1 a ::s2 aa ::cost 0.02
126
+ ::s1 a ::s2 aaa ::cost 0.03
127
+ ::s1 a ::s2 aaaa ::cost 0.03
128
+ ::s1 a ::s2 aaaaa ::cost 0.03
129
+ ::s1 a ::s2 aaaaaa ::cost 0.04
130
+ ::s1 a ::s2 aaaaaaa ::cost 0.04
131
+ ::s1 a ::s2 aaaaaaaa ::cost 0.04
132
+ ::s1 a ::s2 aaaaaaaaa ::cost 0.04
133
+ ::s1 a ::s2 aaaaaaaaaa ::cost 0.04
134
+ ::s1 a ::s2 aaaaaaaaaaa ::cost 0.04
135
+ ::s1 a ::s2 aaaaaaaaaaaa ::cost 0.04
136
+ ::s1 a ::s2 aaaaaaaaaaaaa ::cost 0.04
137
+ ::s1 a ::s2 aaaaaaaaaaaaaa ::cost 0.04
138
+ ::s1 a ::s2 aaaaaaaaaaaaaaa ::cost 0.04
139
+ ::s1 a ::s2 aaaaaaaaaaaaaaaa ::cost 0.04
140
+ ::s1 b ::s2 bb ::cost 0.02
141
+ ::s1 b ::s2 bbb ::cost 0.03
142
+ ::s1 b ::s2 bbbb ::cost 0.03
143
+ ::s1 b ::s2 bbbbb ::cost 0.03
144
+ ::s1 c ::s2 cc ::cost 0.02
145
+ ::s1 c ::s2 ccc ::cost 0.03
146
+ ::s1 c ::s2 cccc ::cost 0.03
147
+ ::s1 c ::s2 ccccc ::cost 0.03
148
+ ::s1 d ::s2 dd ::cost 0.02
149
+ ::s1 d ::s2 ddd ::cost 0.03
150
+ ::s1 d ::s2 dddd ::cost 0.03
151
+ ::s1 d ::s2 ddddd ::cost 0.03
152
+ ::s1 e ::s2 ee ::cost 0.02
153
+ ::s1 e ::s2 eee ::cost 0.03
154
+ ::s1 e ::s2 eeee ::cost 0.03
155
+ ::s1 e ::s2 eeeee ::cost 0.03
156
+ ::s1 e ::s2 eeeeee ::cost 0.04
157
+ ::s1 e ::s2 eeeeeee ::cost 0.04
158
+ ::s1 e ::s2 eeeeeeee ::cost 0.04
159
+ ::s1 e ::s2 eeeeeeeee ::cost 0.04
160
+ ::s1 e ::s2 eeeeeeeeee ::cost 0.04
161
+ ::s1 e ::s2 eeeeeeeeeee ::cost 0.04
162
+ ::s1 e ::s2 eeeeeeeeeeee ::cost 0.04
163
+ ::s1 e ::s2 eeeeeeeeeeeee ::cost 0.04
164
+ ::s1 e ::s2 eeeeeeeeeeeeee ::cost 0.04
165
+ ::s1 e ::s2 eeeeeeeeeeeeeee ::cost 0.04
166
+ ::s1 e ::s2 eeeeeeeeeeeeeeee ::cost 0.04
167
+ ::s1 f ::s2 ff ::cost 0.02
168
+ ::s1 f ::s2 fff ::cost 0.03
169
+ ::s1 f ::s2 ffff ::cost 0.03
170
+ ::s1 f ::s2 fffff ::cost 0.03
171
+ ::s1 g ::s2 gg ::cost 0.02
172
+ ::s1 g ::s2 ggg ::cost 0.03
173
+ ::s1 g ::s2 gggg ::cost 0.03
174
+ ::s1 g ::s2 ggggg ::cost 0.03
175
+ ::s1 h ::s2 hh ::cost 0.02
176
+ ::s1 h ::s2 hhh ::cost 0.03
177
+ ::s1 h ::s2 hhhh ::cost 0.03
178
+ ::s1 h ::s2 hhhhh ::cost 0.03
179
+ ::s1 i ::s2 ii ::cost 0.02
180
+ ::s1 i ::s2 iii ::cost 0.03
181
+ ::s1 i ::s2 iiii ::cost 0.03
182
+ ::s1 i ::s2 iiiii ::cost 0.03
183
+ ::s1 i ::s2 iiiiii ::cost 0.04
184
+ ::s1 i ::s2 iiiiiii ::cost 0.04
185
+ ::s1 i ::s2 iiiiiiii ::cost 0.04
186
+ ::s1 i ::s2 iiiiiiiii ::cost 0.04
187
+ ::s1 i ::s2 iiiiiiiiii ::cost 0.04
188
+ ::s1 i ::s2 iiiiiiiiiii ::cost 0.04
189
+ ::s1 i ::s2 iiiiiiiiiiii ::cost 0.04
190
+ ::s1 i ::s2 iiiiiiiiiiiii ::cost 0.04
191
+ ::s1 i ::s2 iiiiiiiiiiiiii ::cost 0.04
192
+ ::s1 i ::s2 iiiiiiiiiiiiiii ::cost 0.04
193
+ ::s1 i ::s2 iiiiiiiiiiiiiiii ::cost 0.04
194
+ ::s1 j ::s2 jj ::cost 0.02
195
+ ::s1 j ::s2 jjj ::cost 0.03
196
+ ::s1 j ::s2 jjjj ::cost 0.03
197
+ ::s1 j ::s2 jjjjj ::cost 0.03
198
+ ::s1 k ::s2 kk ::cost 0.02
199
+ ::s1 k ::s2 kkk ::cost 0.03
200
+ ::s1 k ::s2 kkkk ::cost 0.03
201
+ ::s1 k ::s2 kkkkk ::cost 0.03
202
+ ::s1 l ::s2 ll ::cost 0.02
203
+ ::s1 l ::s2 lll ::cost 0.03
204
+ ::s1 l ::s2 llll ::cost 0.03
205
+ ::s1 l ::s2 lllll ::cost 0.03
206
+ ::s1 m ::s2 mm ::cost 0.02
207
+ ::s1 m ::s2 mmm ::cost 0.03
208
+ ::s1 m ::s2 mmmm ::cost 0.03
209
+ ::s1 m ::s2 mmmmm ::cost 0.03
210
+ ::s1 n ::s2 nn ::cost 0.02
211
+ ::s1 n ::s2 nnn ::cost 0.03
212
+ ::s1 n ::s2 nnnn ::cost 0.03
213
+ ::s1 n ::s2 nnnnn ::cost 0.03
214
+ ::s1 o ::s2 oo ::cost 0.02
215
+ ::s1 o ::s2 ooo ::cost 0.03
216
+ ::s1 o ::s2 oooo ::cost 0.03
217
+ ::s1 o ::s2 ooooo ::cost 0.03
218
+ ::s1 o ::s2 oooooo ::cost 0.04
219
+ ::s1 o ::s2 ooooooo ::cost 0.04
220
+ ::s1 o ::s2 oooooooo ::cost 0.04
221
+ ::s1 o ::s2 ooooooooo ::cost 0.04
222
+ ::s1 o ::s2 oooooooooo ::cost 0.04
223
+ ::s1 o ::s2 ooooooooooo ::cost 0.04
224
+ ::s1 o ::s2 oooooooooooo ::cost 0.04
225
+ ::s1 o ::s2 ooooooooooooo ::cost 0.04
226
+ ::s1 o ::s2 oooooooooooooo ::cost 0.04
227
+ ::s1 o ::s2 ooooooooooooooo ::cost 0.04
228
+ ::s1 o ::s2 oooooooooooooooo ::cost 0.04
229
+ ::s1 p ::s2 pp ::cost 0.02
230
+ ::s1 p ::s2 ppp ::cost 0.03
231
+ ::s1 p ::s2 pppp ::cost 0.03
232
+ ::s1 p ::s2 ppppp ::cost 0.03
233
+ ::s1 q ::s2 qq ::cost 0.02
234
+ ::s1 q ::s2 qqq ::cost 0.03
235
+ ::s1 q ::s2 qqqq ::cost 0.03
236
+ ::s1 q ::s2 qqqqq ::cost 0.03
237
+ ::s1 r ::s2 rr ::cost 0.02
238
+ ::s1 r ::s2 rrr ::cost 0.03
239
+ ::s1 r ::s2 rrrr ::cost 0.03
240
+ ::s1 r ::s2 rrrrr ::cost 0.03
241
+ ::s1 s ::s2 ss ::cost 0.02
242
+ ::s1 s ::s2 sss ::cost 0.03
243
+ ::s1 s ::s2 ssss ::cost 0.03
244
+ ::s1 s ::s2 sssss ::cost 0.03
245
+ ::s1 t ::s2 tt ::cost 0.02
246
+ ::s1 t ::s2 ttt ::cost 0.03
247
+ ::s1 t ::s2 tttt ::cost 0.03
248
+ ::s1 t ::s2 ttttt ::cost 0.03
249
+ ::s1 u ::s2 uu ::cost 0.02
250
+ ::s1 u ::s2 uuu ::cost 0.03
251
+ ::s1 u ::s2 uuuu ::cost 0.03
252
+ ::s1 u ::s2 uuuuu ::cost 0.03
253
+ ::s1 u ::s2 uuuuuu ::cost 0.04
254
+ ::s1 u ::s2 uuuuuuu ::cost 0.04
255
+ ::s1 u ::s2 uuuuuuuu ::cost 0.04
256
+ ::s1 u ::s2 uuuuuuuuu ::cost 0.04
257
+ ::s1 u ::s2 uuuuuuuuuu ::cost 0.04
258
+ ::s1 u ::s2 uuuuuuuuuuu ::cost 0.04
259
+ ::s1 u ::s2 uuuuuuuuuuuu ::cost 0.04
260
+ ::s1 u ::s2 uuuuuuuuuuuuu ::cost 0.04
261
+ ::s1 u ::s2 uuuuuuuuuuuuuu ::cost 0.04
262
+ ::s1 u ::s2 uuuuuuuuuuuuuuu ::cost 0.04
263
+ ::s1 u ::s2 uuuuuuuuuuuuuuuu ::cost 0.04
264
+ ::s1 v ::s2 vv ::cost 0.02
265
+ ::s1 v ::s2 vvv ::cost 0.03
266
+ ::s1 v ::s2 vvvv ::cost 0.03
267
+ ::s1 v ::s2 vvvvv ::cost 0.03
268
+ ::s1 w ::s2 ww ::cost 0.02
269
+ ::s1 w ::s2 www ::cost 0.03
270
+ ::s1 w ::s2 wwww ::cost 0.03
271
+ ::s1 w ::s2 wwwww ::cost 0.03
272
+ ::s1 x ::s2 xx ::cost 0.02
273
+ ::s1 x ::s2 xxx ::cost 0.03
274
+ ::s1 x ::s2 xxxx ::cost 0.03
275
+ ::s1 x ::s2 xxxxx ::cost 0.03
276
+ ::s1 y ::s2 yy ::cost 0.02
277
+ ::s1 y ::s2 yyy ::cost 0.03
278
+ ::s1 y ::s2 yyyy ::cost 0.03
279
+ ::s1 y ::s2 yyyyy ::cost 0.03
280
+ ::s1 z ::s2 zz ::cost 0.02
281
+ ::s1 z ::s2 zzz ::cost 0.03
282
+ ::s1 z ::s2 zzzz ::cost 0.03
283
+ ::s1 z ::s2 zzzzz ::cost 0.03
284
+ ::s1 " " ::s2 " " ::cost 0
285
+ ::s1 . ::s2 ::left1 /\./ ::left2 /\./ ::cost 0.02
286
+ ::s1 … ::s2 ::left1 /…/ ::left2 /…/ ::cost 0.01
287
+ ::s1 _ ::s2 ::left1 /_/ ::left2 /_/ ::cost 0.01
288
+ ::s1 = ::s2 ::left1 /=/ ::left2 /=/ ::cost 0.01
289
+ ::s1 ! ::s2 ::left1 /!/ ::left2 /!/ ::cost 0.02
290
+ ::s1 ? ::s2 ::left1 /\?/ ::left2 /\?/ ::cost 0.02
291
+ ::s1 aa ::s2 aː ::cost 0.02
292
+ ::s1 ee ::s2 eː ::cost 0.02
293
+ ::s1 ii ::s2 iː ::cost 0.02
294
+ ::s1 oo ::s2 oː ::cost 0.02
295
+ ::s1 uu ::s2 uː ::cost 0.02
296
+
297
+ ::s1 a ::s2 e ::cost 0.1
298
+ ::s1 au ::s2 o ::cost 0.1 ::lc1 eng
299
+ ::s1 aw ::s2 o ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
300
+ ::s1 aw ::s2 o ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
301
+ ::s1 aw ::s2 a ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
302
+ ::s1 ay ::s2 i ::cost 0.02 ::lc1 fas ::lc2 eng
303
+ ::s1 aye ::s2 ae ::cost 0.05 ::lc1 fas
304
+ ::s1 é ::s2 e ::cost 0.05
305
+ ::s1 e ::s2 i ::cost 0.15
306
+ ::s1 e ::s2 i ::cost 0.1 ::lc1 uig ::lc2 uig
307
+ ::s1 e ::s2 y ::cost 0.15
308
+ ::s1 ew ::s2 u ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
309
+ ::s1 ew ::s2 u ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
310
+ ::s1 ew ::s2 u ::cost 0.3 ::right1 [aei][lgnrst] ::lc1 eng
311
+ ::s1 ew ::s2 e ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
312
+ ::s1 i ::s2 a ::cost 0.1 ::right1 [-,$ ] ::lc1 fas
313
+ ::s1 i ::s2 ea ::cost 0.03 ::lc2 eng
314
+ ::s1 i ::s2 ee ::cost 0.03 ::lc2 eng
315
+ ::s1 i ::s2 ei ::cost 0.05 ::lc2 eng
316
+ ::s1 i ::s2 ie ::cost 0.03 ::lc2 eng
317
+ ::s1 i ::s2 ı ::cost 0.05
318
+ ::s1 i ::s2 e ::cost 0.1 ::lc2 eng
319
+ ::s1 i ::s2 y ::cost 0.15
320
+ ::s1 i ::s2 y ::cost 0.1 ::right2 [-,bcdfghklmnpqrstvwxz$ ]
321
+ ::s1 ie ::s2 ei ::cost 0.15
322
+ ::s1 ie ::s2 y ::cost 0.15
323
+ ::s1 ij ::s2 ai ::cost 0.15
324
+ ::s1 o ::s2 u ::cost 0.1
325
+ ::s1 oo ::s2 u ::cost 0.1
326
+ ::s1 ow ::s2 au ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
327
+ ::s1 ow ::s2 o ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
328
+ ::s1 ow ::s2 o ::cost 0.2 ::lc1 eng ::lc2 zho ::right1 [e]
329
+ ::s1 ow ::s2 o ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [iy]
330
+ ::s1 u ::s2 a ::cost 0.1 ::lc1 eng ::right1 [-,bcdfghklmnpqrstvwxz][bcdfghklmnpqrstvwxz$ ]
331
+ ::s1 u ::s2 ou ::cost 0.05
332
+ ::s1 u ::s2 yu ::cost 0.05 ::left1 /^(.*[- ])?$/
333
+ ::s1 yeo ::s2 eo ::cost 0.1 ::lc1 fas
334
+
335
+ # Amharic
336
+ ::s1 a ::s2 e ::cost 0.05 ::lc1 amh
337
+ ::s1 aa ::s2 o ::cost 0.15 ::lc1 amh
338
+ ::s1 aawe ::s2 au ::cost 0.05 ::lc1 amh
339
+ ::s1 aawe ::s2 ao ::cost 0.1 ::lc1 amh
340
+ ::s1 aawe ::s2 ou ::cost 0.1 ::lc1 amh
341
+ ::s1 aawo ::s2 ao ::cost 0.05 ::lc1 amh
342
+ ::s1 aaye ::s2 ai ::cost 0.05 ::lc1 amh
343
+ ::s1 aaye ::s2 i ::cost 0.1 ::lc1 amh
344
+ ::s1 aaye ::s2 ei ::cost 0.1 ::lc1 amh
345
+ ::s1 awe ::s2 au ::cost 0.05 ::lc1 amh
346
+ ::s1 awe ::s2 ao ::cost 0.1 ::lc1 amh
347
+ ::s1 awe ::s2 ou ::cost 0.1 ::lc1 amh
348
+ ::s1 ee ::s2 ai ::cost 0.1 ::lc1 amh
349
+ ::s1 eewo ::s2 eo ::cost 0.05 ::lc1 amh
350
+ ::s1 eeyaa ::s2 ea ::cost 0.1 ::lc1 amh
351
+ ::s1 eeye ::s2 ai ::cost 0.1 ::lc1 amh
352
+ ::s1 ewee ::s2 ue ::cost 0.1 ::lc1 amh
353
+ ::s1 gwaa ::s2 gua ::cost 0.05 ::lc1 amh
354
+ ::s1 iya ::s2 ie ::cost 0.05 ::lc1 amh
355
+ ::s1 iyaa ::s2 ia ::cost 0.05 ::lc1 amh
356
+ ::s1 iyo ::s2 io ::cost 0.05 ::lc1 amh
357
+ ::s1 kxaa ::s2 kha ::cost 0.05 ::lc1 amh
358
+ ::s1 liyaa ::s2 llia ::cost 0.05 ::lc1 amh
359
+ ::s1 qaa ::s2 cca ::cost 0.05 ::lc1 amh
360
+ ::s1 uwaa ::s2 ua ::cost 0.05 ::lc1 amh
361
+ ::s1 uwee ::s2 ue ::cost 0.05 ::lc1 amh
362
+ ::s1 uwi ::s2 oui ::cost 0.05 ::lc1 amh
363
+ ::s1 uwi ::s2 ui ::cost 0.05 ::lc1 amh
364
+ ::s1 xaaye ::s2 hai ::cost 0.1 ::lc1 amh
365
+ ::s1 xwaa ::s2 jua ::cost 0.1 ::lc1 amh
366
+ ::s1 ziyaa ::s2 sia ::cost 0.05 ::lc1 amh
367
+ ::s1 w ::s2 ::cost 0.3 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
368
+ ::s1 y ::s2 ::cost 0.1 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
369
+ # abbreviations
370
+ ::s1 ee. ::s2 a ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
371
+ ::s1 si. ::s2 c ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
372
+ ::s1 di. ::s2 d ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
373
+ ::s1 eefe. ::s2 f ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
374
+ ::s1 are. ::s2 r ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
375
+
376
+ # Arabic
377
+ ::s1 ::s2 a ::cost 0.02 ::lc1 ara
378
+ ::s1 ::s2 e ::cost 0.02 ::lc1 ara
379
+ ::s1 ::s2 i ::cost 0.05 ::lc1 ara
380
+ ::s1 ::s2 o ::cost 0.05 ::lc1 ara
381
+ ::s1 ::s2 p ::cost 0.15 ::lc1 ara ::left2 /m$/ ::right2 [dfgklmnpqrstvwz]
382
+ ::s1 ::s2 u ::cost 0.05 ::lc1 ara
383
+ ::s1 y ::s2 a ::cost 0.15 ::lc1 ara
384
+ ::s1 y ::s2 e ::cost 0.05 ::lc1 ara
385
+ ::s1 y ::s2 ea ::cost 0.02 ::lc1 ara
386
+ ::s1 y ::s2 ee ::cost 0.02 ::lc1 ara
387
+ ::s1 y ::s2 i ::cost 0.02 ::lc1 ara
388
+ ::s1 y ::s2 ie ::cost 0.02 ::lc1 ara
389
+ ::s1 b ::s2 p ::cost 0.02 ::lc1 ara
390
+ ::s1 b ::s2 pp ::cost 0.03 ::lc1 ara
391
+ ::s1 f ::s2 v ::cost 0.02 ::lc1 ara
392
+ ::s1 fyl ::s2 ville ::right2 [-,$ ] ::cost 0.05 ::lc1 ara
393
+ ::s1 gh ::s2 g ::right2 [abcdfgklmnopqrstuvwz] ::cost 0.05 ::lc1 ara
394
+ ::s1 ghz ::s2 gs ::cost 0.05 ::lc1 ara
395
+ ::s1 j ::s2 g ::cost 0.2 ::lc1 ara
396
+ ::s1 kh ::s2 g ::cost 0.3 ::lc1 ara ::right2 [eiy]
397
+ ::s1 q ::s2 g ::cost 0.2 ::lc1 ara ::right2 [arouz]
398
+ ::s1 q ::s2 gg ::cost 0.2 ::lc1 ara ::right2 [arouz]
399
+ ::s1 th ::s2 z ::cost 0.4 ::lc1 ara ::right2 [aou] ::comment Spanish
400
+ ::s1 " (" ::s2 ", " ::cost 0.02 ::lc1 ara
401
+ ::s1 ) ::s2 ::right2 [-,$ ] ::cost 0.02 ::lc1 ara
402
+
403
+ # Bengali
404
+ ::s1 aoyaa ::s2 wa ::cost 0.1 ::lc1 ben
405
+ ::s1 aoye ::s2 way ::cost 0.1 ::lc1 ben
406
+ ::s1 bhaa ::s2 ve ::cost 0.1 ::lc1 ben
407
+ ::s1 bh ::s2 v ::cost 0.2 ::lc1 ben
408
+ ::s1 bh ::s2 w ::cost 0.2 ::lc1 ben
409
+ ::s1 b ::s2 v ::cost 0.3 ::lc1 ben
410
+ ::s1 b ::s2 w ::cost 0.3 ::lc1 ben
411
+ ::s1 dda ::s2 rh ::right2 [-,$ ] ::cost 0.2 ::lc1 ben
412
+ ::s1 dd ::s2 r ::cost 0.4 ::lc1 ben
413
+ ::s1 gk ::s2 k ::cost 0.05 ::lc1 ben
414
+ ::s1 h ::s2 g ::right2 [eiy] ::cost 0.4 ::lc1 ben
415
+ ::s1 h ::s2 j ::cost 0.4 ::lc1 ben
416
+ ::s1 hoyaai ::s2 whi ::cost 0.05 ::lc1 ben
417
+ ::s1 j ::s2 z ::cost 0.1 ::lc1 ben
418
+ ::s1 j ::s2 s ::cost 0.3 ::lc1 ben
419
+ ::s1 myaaka ::s2 mc ::cost 0.1 ::lc1 ben
420
+ ::s1 myaaka ::s2 mac ::cost 0.1 ::lc1 ben
421
+ ::s1 oyaa ::s2 wa ::cost 0.02 ::lc1 ben
422
+ ::s1 oyaa ::s2 wo ::cost 0.1 ::lc1 ben
423
+ ::s1 oyena ::s2 owen ::cost 0.1 ::lc1 ben
424
+ ::s1 ph ::s2 v ::cost 0.1 ::lc1 ben
425
+ ::s1 phana ::s2 von ::cost 0.1 ::lc1 ben
426
+ ::s1 rhio ::s2 gio ::cost 0.2 ::lc1 ben
427
+ ::s1 sh ::s2 s ::cost 0.4 ::lc1 ben
428
+ ::s1 ss ::s2 sh ::left1 /[k]$/ ::cost 0.15 ::lc1 ben
429
+ ::s1 ss ::s2 sh ::cost 0.3 ::lc1 ben
430
+ ::s1 o ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
431
+ ::s1 oye ::s2 we ::cost 0.2 ::lc1 ben
432
+ ::s1 tta ::s2 tho ::cost 0.3 ::lc1 ben
433
+ ::s1 tthaa ::s2 ta ::cost 0.3 ::lc1 ben
434
+ ::s1 u ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
434
+ ::s1 u ::s2 woo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
435
+ ::s1 u ::s2 wu ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
436
+ ::s1 ui ::s2 wi ::cost 0.02 ::lc1 ben ::left1 /^(.*[-, ])?$/
438
+ ::s1 yaa ::s2 wa ::cost 0.3 ::lc1 ben
439
+ ::s1 ye ::s2 we ::cost 0.3 ::lc1 ben
440
+
441
+ # Russian
442
+ ::s1 ::s2 os ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
443
+ ::s1 ::s2 us ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
444
+ ::s1 av ::s2 au ::cost 0.05 ::lc1 rus
445
+ ::s1 ch ::s2 cz ::cost 0.1 ::lc1 rus ::comment Polish
446
+ ::s1 chch ::s2 cci ::right2 [aou] ::cost 0.1 ::lc1 rus
447
+ ::s1 chch ::s2 cc ::right2 [eiy] ::cost 0.1 ::lc1 rus
448
+ ::s1 chzh ::s2 zh ::cost 0.1 ::lc1 rus
449
+ ::s1 dz ::s2 zz ::cost 0.1 ::lc1 rus ::right2 [aeiouy]
450
+ ::s1 dz ::s2 j ::cost 0.3 ::lc1 rus ::right2 [aeiouy] ::comment Japanese
451
+ ::s1 dzh ::s2 g ::cost 0.05 ::lc1 rus ::right2 [eiy]
452
+ ::s1 dzh ::s2 gg ::cost 0.05 ::lc1 rus ::right2 [eiy]
453
+ ::s1 dzh ::s2 j ::cost 0.05 ::lc1 rus
454
+ ::s1 ev ::s2 eu ::cost 0.1 ::lc1 rus
455
+ ::s1 f ::s2 th ::cost 0.6 ::lc1 rus
456
+ ::s1 ievye ::s2 iaceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
457
+ ::s1 ii ::s2 ius ::cost 0.2 ::right1 [-,$ ] ::lc1 rus
458
+ ::s1 i ::s2 j ::cost 0.2 ::lc1 rus
459
+ ::s1 naya ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
460
+ ::s1 nyi ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
461
+ ::s1 ovye ::s2 aceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
462
+ ::s1 shsh ::s2 sh ::cost 0 ::lc1 rus
463
+ ::s1 skaya ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
464
+ ::s1 skaya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
465
+ ::s1 skii ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
466
+ ::s1 skii ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
467
+ ::s1 tsian ::s2 tian ::cost 0.05 ::lc1 rus
468
+ ::s1 tsion ::s2 tion ::cost 0.05 ::lc1 rus
469
+ ::s1 ts ::s2 c ::cost 0.3 ::lc1 rus
470
+ ::s1 ts ::s2 c ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
471
+ ::s1 tsz ::s2 z ::cost 0.1 ::lc1 rus
472
+ ::s1 itsa ::s2 ica ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
473
+ ::s1 etski ::s2 ecky ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
474
+ ::s1 tsiya ::s2 tion ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
475
+ ::s1 tsi ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
476
+ ::s1 tsy ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
477
+ ::s1 tszi ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
478
+ ::s1 tszy ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
479
+ ::s1 u ::s2 w ::right2 [aeio] ::cost 0.05 ::lc1 rus
480
+ ::s1 u ::s2 w ::cost 0.2 ::lc1 rus
481
+ ::s1 uo ::s2 wa ::cost 0.2 ::lc1 rus ::right2 [lnrst]
482
+ ::s1 v ::s2 u ::cost 0.05 ::lc1 rus ::left1 /[bcdfghjklmnpqrstvwxz]$/ ::right1 [aeiou]
483
+ ::s1 gva ::s2 gua ::cost 0.02 ::lc1 rus
484
+ ::s1 gvi ::s2 gui ::cost 0.02 ::lc1 rus
485
+ ::s1 x ::s2 sh ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,aouct$ ] ::lc1 rus
486
+ ::s1 y ::s2 s ::cost 0.4 ::right2 [-,$ ] ::lc1 rus
487
+ ::s1 zh ::s2 rz ::cost 0.1 ::lc1 rus ::comment Polish rz
488
+
489
+ # Russian case endings
490
+ ::s1 em ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
491
+ ::s1 ey ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
492
+ ::s1 om ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
493
+ ::s1 oy ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
494
+ ::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
495
+ ::s1 y ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
496
+ ::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
497
+ ::s1 ye ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
498
+ ::s1 yem ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
499
+ ::s1 ym ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
500
+ ::s1 ymi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
501
+ ::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
502
+ ::s1 ii ::s2 iya ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
503
+ ::s1 ii ::s2 iye ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
504
+
505
+ ::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
506
+ ::s1 ami ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
507
+ ::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
508
+ ::s1 ev ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
509
+ ::s1 eri ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
510
+ ::s1 eryu ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
511
+ ::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
512
+ ::s1 ov ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
513
+ ::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
514
+ ::s1 ykh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
515
+
516
+ # Ukrainian case endings
517
+ ::s1 eyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
518
+ ::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
519
+ ::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
520
+ ::s1 yi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
521
+ ::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
522
+
523
+ ::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
524
+ ::s1 amy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
525
+ ::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
526
+ ::s1 evy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
527
+ ::s1 iv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
528
+ ::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
529
+ ::s1 ovy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
530
+ ::s1 yam ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
531
+ ::s1 yamy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
532
+ ::s1 yiv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
533
+ ::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
534
+ ::s1 yakh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
535
+
536
+ # Uyghur
537
+ ::s1 aw ::s2 ao ::cost 0.05 ::lc1 uig
538
+ ::s1 aw ::s2 au ::cost 0.05 ::lc1 uig
539
+ ::s1 gwi ::s2 gui ::cost 0.05 ::lc1 uig
540
+ ::s1 iye ::s2 ia ::cost 0.05 ::lc1 uig
541
+ ::s1 istan ::s2 ia ::cost 0.1 ::right1 [-,$ ] ::lc1 uig
542
+ ::s1 j ::s2 c ::cost 0.4 ::lc1 uig
543
+ ::s1 q ::s2 h ::cost 0.2 ::lc1 uig
544
+ ::s1 sey ::s2 cai ::cost 0.2 ::lc1 uig
545
+ ::s1 sh ::s2 x ::cost 0.2 ::lc1 uig
546
+
547
+ ::s1 b ::s2 p ::cost 0.3
548
+ ::s1 b ::s2 v ::cost 0.5 ::left2 /^(.*[- ])?$/
549
+ ::s1 b ::s2 v ::cost 0.7
550
+ ::s1 c ::s2 ch ::cost 0.25 ::right1 [eiy]
551
+ ::s1 c ::s2 ck ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
552
+ ::s1 c ::s2 k ::cost 0.4
553
+ ::s1 c ::s2 k ::cost 0.05 ::left1 /^(.* )?ma?$/ ::comment MacIntyre
554
+ ::s1 c ::s2 k ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
555
+ ::s1 c ::s2 kk ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
556
+ ::s1 c ::s2 s ::cost 0.7
557
+ ::s1 c ::s2 s ::cost 0.1 ::right1 [eiy]
558
+ ::s1 c ::s2 ts ::cost 0.15 ::right1 [eiy]
559
+ ::s1 c ::s2 z ::cost 0.3
560
+ ::s1 ch ::s2 ck ::cost 0.2
561
+ ::s1 ch ::s2 g ::cost 0.3 ::right1 [eiy] ::right2 [eiy]
562
+ ::s1 ch ::s2 k ::cost 0.2
563
+ ::s1 ch ::s2 kk ::cost 0.2
564
+ ::s1 ch ::s2 sh ::cost 0.3
565
+ ::s1 ch ::s2 sh ::cost 0.2 ::left1 /[eiy]$/ ::right1 [$ ]
566
+ ::s1 ch ::s2 tch ::cost 0.1
567
+ ::s1 ch ::s2 tsh ::cost 0.1
568
+ ::s1 ch ::s2 z ::cost 0.5
569
+ ::s1 ck ::s2 kk ::cost 0.02
570
+ ::s1 cz ::s2 ch ::cost 0.2 ::left1 /i$/
571
+ ::s1 d ::s2 t ::cost 0.3
572
+ ::s1 de ::s2 dre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
573
+ ::s1 dg ::s2 j ::cost 0.6 ::lc1 eng ::comment Cambridge
574
+ ::s1 dg ::s2 j ::cost 0.3 ::right1 [eiy] ::lc1 eng
575
+ ::s1 dg ::s2 j ::cost 0.1 ::right1 [eiy] ::lc1 eng ::lc2 fas, jpn
576
+ ::s1 dt ::s2 d ::cost 0.3
577
+ ::s1 dt ::s2 t ::cost 0.03
578
+ ::s1 dt ::s2 tt ::cost 0.03
579
+ ::s1 f ::s2 p ::cost 0.8
580
+ ::s1 f ::s2 ph ::cost 0.01
581
+ ::s1 ff ::s2 ph ::cost 0.02
582
+ ::s1 f ::s2 pf ::cost 0.1
583
+ ::s1 f ::s2 v ::cost 0.3
584
+ ::s1 f ::s2 v ::cost 0.1 ::right1 [-,$ ]
585
+ ::s1 ef ::s2 ev ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
586
+ ::s1 f ::s2 w ::cost 0.3
587
+ ::s1 g ::s2 j ::cost 0.6
588
+ ::s1 g ::s2 j ::cost 0.3 ::right1 [eiy]
589
+ ::s1 g ::s2 j ::cost 0.1 ::right1 [eiy] ::lc2 amh, ara, fas, jpn, som
590
+ ::s1 g ::s2 k ::cost 0.3
591
+ ::s1 g ::s2 gh ::cost 0.3
592
+ ::s1 g ::s2 ch ::cost 0.4 ::left1 /[eiy]$/ ::right1 [-,$ ] ::comment German: Ludwig, Braunschweig
593
+ ::s1 gh ::s2 f ::cost 0.2 ::lc1 eng ::comment laughter
594
+ ::s1 gh ::s2 "" ::cost 0.2 ::lc1 eng ::comment daughter
595
+ ::s1 gh ::s2 g ::cost 0.2 ::lc1 eng ::comment Afghanistan
596
+ ::s1 gl ::s2 l ::cost 0.2 ::lc1 eng ::right1 [i]
597
+ ::s1 gn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
598
+ ::s1 gn ::s2 n ::cost 0.2 ::lc1 eng
599
+ ::s1 gz ::s2 ks ::cost 0.2
600
+ ::s1 h ::s2 e ::cost 0.4 ::lc1 fas
601
+ ::s1 ise ::s2 ize ::cost 0.1
602
+ ::s1 j ::s2 y ::cost 0.2
603
+ ::s1 j ::s2 dj ::cost 0.2
604
+ ::s1 j ::s2 h ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Jose
605
+ ::s1 j ::s2 hh ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Tardajos
606
+ ::s1 j ::s2 zh ::cost 0.2
607
+ ::s1 k ::s2 cc ::cost 0.02 ::right2 [aour]
608
+ ::s1 k ::s2 cc ::cost 0.3
609
+ ::s1 k ::s2 cch ::cost 0.15
610
+ ::s1 k ::s2 ck ::cost 0.02
611
+ ::s1 k ::s2 cq ::cost 0.05
612
+ ::s1 k ::s2 cqu ::cost 0.05
613
+ ::s1 k ::s2 cque ::cost 0.1
614
+ ::s1 k ::s2 cque ::cost 0.05 ::right2 [-,$ ]
615
+ ::s1 k ::s2 cques ::cost 0.05 ::right2 [-,$ ]
616
+ ::s1 k ::s2 q ::cost 0.05
617
+ ::s1 k ::s2 qu ::cost 0.05
618
+ ::s1 k ::s2 que ::cost 0.1
619
+ ::s1 k ::s2 que ::cost 0.05 ::right2 [-,$ ]
620
+ ::s1 k ::s2 ques ::cost 0.1 ::right2 [-,$ ]
621
+ ::s1 kh ::s2 j ::cost 0.2
622
+ ::s1 kh ::s2 q ::cost 0.2
623
+ ::s1 kh ::s2 k ::cost 0.25 ::right1 [aeiouy]
624
+ ::s1 kh ::s2 k ::cost 0.1 ::right1 [aeiouys] ::lc2 amh
625
+ ::s1 kn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
626
+ ::s1 kj ::s2 sh ::cost 0.2 ::comment Swedish
627
+ ::s1 l ::s2 r ::cost 0.1 ::lc1 zho
628
+ ::s1 aib ::s2 alb ::cost 0.1 ::lc1 zho
629
+ ::s1 al ::s2 ::cost 0.5 ::left1 /^(.* )?$/
630
+ ::s1 al- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
631
+ ::s1 el ::s2 ::cost 0.5 ::left1 /^(.* )?$/
632
+ ::s1 el- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
633
+ ::s1 ll ::s2 y ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::comment Guillermo, Guillaume
634
+ ::s1 mb ::s2 m ::cost 0.2 ::right1 [-,bcdfghklmnpqstvwxz$ ] ::lc1 eng ::comment bomb
635
+ ::s1 n ::s2 m ::cost 0.5 ::left1 /[aeiou]$/ ::left2 /[aeiou]$/ ::right1 [bcdfghklmnpqrstvwxz$ ] ::right2 [-,bcdfghklmnpqrstvwxz$ ]
636
+ ::s1 ng ::s2 n ::cost 0.1 ::left1 /[aeiou]$/ ::lc1 zho
637
+ ::s1 ng ::s2 m ::cost 0.25 ::left1 /[aeiou]$/ ::lc1 zho
638
+ ::s1 ng ::s2 n ::cost 0.1 ::left2 /[aeiou]$/ ::lc2 ara, ben, rus, zho
639
+ ::s1 nm ::s2 m ::cost 0.25 ::lc1 zho
640
+ ::s1 pn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
641
+ ::s1 ph ::s2 p ::cost 0.3 ::lc1 amh
642
+ ::s1 q ::s2 c ::cost 0.15
643
+ ::s1 q ::s2 ch ::cost 0.2 ::right2 [eiy]
644
+ ::s1 q ::s2 ck ::cost 0.2
645
+ ::s1 q ::s2 kk ::cost 0.2
646
+ ::s1 q ::s2 gh ::cost 0.2 ::lc1 fas ::right2 [aeiouy]
647
+ ::s1 qi ::s2 ch ::cost 0.2 ::lc1 zho ::right1 [aeou]
648
+ ::s1 qi ::s2 cci ::cost 0.1 ::lc1 zho
649
+ ::s1 qi ::s2 chi ::cost 0.1 ::lc1 zho
650
+ ::s1 qi ::s2 tch ::cost 0.2 ::lc1 zho ::right1 [aeou]
651
+ ::s1 qi ::s2 ts ::cost 0.4 ::lc1 zho ::right1 [aeou]
652
+ ::s1 qi ::s2 tsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
653
+ ::s1 qi ::s2 tzsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
654
+ ::s1 qi ::s2 czy ::cost 0.2 ::lc1 zho
655
+ ::s1 qu ::s2 kw ::cost 0.15
656
+ ::s1 qu ::s2 kv ::cost 0.15
657
+ ::s1 e ::s2 er ::cost 0.25 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::lc1 zho
658
+ ::s1 re ::s2 er ::cost 0.1
659
+ ::s1 rh ::s2 r ::cost 0.05 ::left1 /^(.*[- ])?$/ ::example Rhine
660
+ ::s1 s ::s2 sh ::cost 0.03 ::right2 [aeiou] ::lc2 amh
661
+ ::s1 s ::s2 sz ::cost 0.3 ::lc2 eng ::example Liszt (Hungarian)
662
+ ::s1 s ::s2 ts ::cost 0.4 ::lc1 amh, zho
663
+ ::s1 s ::s2 z ::cost 0.4
664
+ ::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::lc1 eng
665
+ ::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy][bdglmnrvw]?$/ ::right1 [-,$ ] ::lc1 eng
666
+ ::s1 s ::s2 z ::cost 0.2 ::lc2 fas
667
+ ::s1 sc ::s2 s ::cost 0.2 ::right1 [i] ::example Nascimento
668
+ ::s1 sci ::s2 sh ::cost 0.2 ::example Brescia
669
+ ::s1 sch ::s2 sh ::cost 0.1
670
+ ::s1 sh ::s2 sz ::cost 0.2 ::example Mariusz (Polish) ::lc2 eng
671
+ ::s1 si ::s2 j ::cost 0.1 ::right2 [a] ::lc1 eng
672
+ ::s1 ss ::s2 z ::cost 0.5
673
+ # ::s1 smith ::s2 mith ::cost 0.75 ::lc2 zho ::comment weird, but several different Xinhua examples
674
+ ::s1 tch ::s2 c ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,e$ ]
675
+ ::s1 te ::s2 tre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
676
+ ::s1 th ::s2 t ::cost 0.2 ::lc2 amh, fas, uig
677
+ ::s1 th ::s2 s ::cost 0.4 ::lc2 zho
678
+ ::s1 th ::s2 sth ::cost 0.4 ::lc1 zho
679
+ ::s1 th ::s2 ths ::cost 0.4 ::lc1 zho
680
+ ::s1 th ::s2 z ::cost 0.3 ::lc2 amh ::right2 [-,$ aeot]
681
+ ::s1 v ::s2 w ::cost 0.02
682
+ ::s1 v ::s2 wh ::cost 0.02 ::left1 /^(.* )?$/
683
+ ::s1 vv ::s2 w ::cost 0.02
684
+ ::s1 w ::s2 u ::cost 0.1 ::lc2 uig
685
+ ::s1 wa ::s2 ua ::cost 0.05
686
+ ::s1 wh ::s2 w ::cost 0.05 ::left1 /^(.* )?$/
687
+ ::s1 wr ::s2 r ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
688
+ ::s1 x ::s2 ks ::cost 0.05
689
+ ::s1 x ::s2 s ::cost 0.2 ::left1 /^(.* )?$/
690
+ ::s1 x ::s2 sh ::cost 0.2 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
691
+ ::s1 x ::s2 z ::cost 0.2 ::left1 /^(.* )?$/ ::right1 [aeiouy]
692
+ ::s1 x ::s2 h ::cost 0.3 ::lc1 uig
693
+ ::s1 x ::s2 h ::cost 0.05 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
694
+ ::s1 x ::s2 kh ::cost 0.1 ::lc1 uig
695
+ ::s1 xi ::s2 sch ::cost 0.2 ::right1 [aeou] ::lc1 zho
696
+ ::s1 xi ::s2 sh ::cost 0.2 ::right1 [aeou] ::lc1 zho
697
+ ::s1 xi ::s2 ch ::cost 0.4 ::right1 [aeou] ::lc1 zho
698
+ ::s1 xi ::s2 sci ::cost 0.4 ::right1 [aeou] ::lc1 zho
699
+ ::s1 xi ::s2 s ::cost 0.6 ::right1 [aeou] ::lc1 zho
700
+ ::s1 z ::s2 dz ::cost 0.1 ::left1 /^(.*[ aeiouy])?[lnr]?$/
701
+ ::s1 z ::s2 ts ::cost 0.15
702
+ ::s1 z ::s2 tz ::cost 0.15
703
+ ::s1 zh ::s2 g ::cost 0.2 ::right2 [eiy]
704
+ ::s1 zh ::s2 g ::cost 0.1 ::right2 [eiy] ::lc2 amh
705
+ ::s1 zz ::s2 ts ::cost 0.15
706
+ ::s1 zz ::s2 tz ::cost 0.1
707
+
708
+ # Oromo
709
+ ::s1 nb ::s2 mb ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
710
+ ::s1 np ::s2 mp ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
711
+ ::s1 ph ::s2 p ::cost 0.3 ::lc1 orm ::lc2 orm
712
+
713
+ # Tigrinya
714
+ ::s1 aaye ::s2 a ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
715
+ ::s1 aaye ::s2 i ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
716
+
717
+ # Somali
718
+ ::s1 ay ::s2 ey ::cost 0.1 ::lc1 som ::lc2 som
719
+ ::s1 ay ::s2 eey ::cost 0.15 ::lc1 som ::lc2 som
720
+ ::s1 aha ::s2 ihii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
721
+ ::s1 aha ::s2 ihi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
722
+ ::s1 aha ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
723
+ ::s1 ihii ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
724
+ ::s1 ihi ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
725
+ ::s1 ha ::s2 hii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
726
+ ::s1 ha ::s2 hi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
727
+ ::s1 ha ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
728
+ ::s1 hii ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
729
+ ::s1 hi ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
730
+ ::s1 aka ::s2 ikii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
731
+ ::s1 aka ::s2 iki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
732
+ ::s1 aka ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
733
+ ::s1 ikii ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
734
+ ::s1 iki ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
735
+ ::s1 ka ::s2 kii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
736
+ ::s1 ka ::s2 ki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
737
+ ::s1 ka ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
738
+ ::s1 kii ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
739
+ ::s1 ki ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
740
+ ::s1 aga ::s2 ugu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
741
+ ::s1 ga ::s2 gu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
742
+ ::s1 ata ::s2 itii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
743
+ ::s1 ata ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
744
+ ::s1 ata ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
745
+ ::s1 itii ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
746
+ ::s1 iti ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
747
+ ::s1 ta ::s2 tii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
748
+ ::s1 ta ::s2 ti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
749
+ ::s1 ta ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
750
+ ::s1 tii ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
751
+ ::s1 ti ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
752
+ ::s1 ata ::s2 ete ::cost 0.15 ::lc1 som ::lc2 som
753
+ ::s1 ata ::s2 iti ::cost 0.2 ::lc1 som ::lc2 som
754
+ ::s1 ete ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som
755
+ ::s1 g ::s2 k ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
756
+ ::s1 g ::s2 k ::cost 0.25 ::lc1 som ::lc2 som
757
+ ::s1 g ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
758
+ ::s1 gh ::s2 kh ::cost 0.1 ::lc1 som ::lc2 som
759
+ ::s1 gh ::s2 k ::cost 0.2 ::lc1 som ::lc2 som
760
+ ::s1 g ::s2 q ::cost 0.25 ::lc1 som ::lc2 som
761
+ ::s1 g ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::right1 [aou] ::right2 [aou]
762
+ ::s1 ga ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::left1 /^(.*[aeiou])?$/ ::left2 /^(.*[aeiou])?$/ ::right1 [bcdfghklmnpqrstvwxz] ::right2 [bcdfghklmnpqrstvwxz]
763
+ ::s1 g ::s2 j ::cost 0.25 ::lc1 som ::lc2 som
764
+ ::s1 g ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right1 [ei] ::right2 [ei]
765
+ ::s1 gi ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right2 [ei]
766
+ ::s1 n ::s2 m ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
767
+ ::s1 n ::s2 mm ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
768
+ ::s1 n ::s2 m ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
769
+ ::s1 n ::s2 mm ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
770
+ ::s1 ii ::s2 a ::cost 0.15 ::lc1 som ::lc2 som
771
+ ::s1 y ::s2 dj ::cost 0.2 ::lc2 som
772
+ ::s1 ca ::s2 a ::cost 0.15 ::left1 /^(.*[-, ])?$/ ::lc1 som
773
+ ::s1 c ::s2 ::cost 0.25 ::left1 /^(.*[-, ])?$/ ::lc1 som
774
+ ::s1 x ::s2 h ::cost 0.25 ::lc1 som
775
+ ::s1 x ::s2 h ::cost 0.05 ::lc1 som ::left1 /^(.* )?$/ ::right1 [aeiou]
776
+ ::s1 x ::s2 h ::cost 0.1 ::lc1 som ::left1 /[aeiou]$/
777
+ ::s1 b ::s2 p ::cost 0.1 ::lc1 som
778
+ ::s1 majm ::s2 mahm ::cost 0.1 ::lc1 som
779
+ ::s1 chalim ::s2 halim ::cost 0.1 ::lc1 som ::lc2 som
780
+ ::s1 chalim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
781
+ ::s1 chalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
782
+ ::s1 halim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
783
+ ::s1 halim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
784
+ ::s1 jalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
785
+ ::s1 dh ::s2 r ::cost 0.25 ::lc1 som ::lc2 som ::left1 /[aeiou]$/
786
+ ::s1 j ::s2 ch ::cost 0.25 ::lc1 som ::lc2 som
787
+ ::s1 j ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
788
+ ::s1 ch ::s2 sh ::cost 0.2 ::lc1 som ::lc2 som
789
+
790
+ # French
791
+ ::s1 aud ::s2 o ::cost 0.3 ::right1 [-,$ ] ::lc1 eng, fra
792
+ ::s1 aux ::s2 o ::cost 0.05 ::right1 [-,$ ]
793
+ ::s1 eaux ::s2 o ::cost 0.05 ::right1 [-,$ ]
794
+ ::s1 eux ::s2 o ::cost 0.05 ::right1 [-,$ ]
795
+ ::s1 eux ::s2 e ::cost 0.15 ::right1 [-,$ ]
796
+
797
+ ::s1 - ::s2 " " ::cost 0.1
798
+ ::s1 : ::s2 , ::cost 0.1 ::lc1 amh
799
+
800
+ # mini dictionary Amharic-English
801
+ ::s1 dabube ::s2 south ::cost 0 ::lc1 amh ::lc2 eng
802
+ ::s1 daseete ::s2 island ::cost 0 ::lc1 amh ::lc2 eng
803
+ ::s1 daseetoche ::s2 islands ::cost 0 ::lc1 amh ::lc2 eng
804
+ ::s1 kaaweneti ::s2 county ::cost 0 ::lc1 amh ::lc2 eng
805
+ ::s1 katamaa ::s2 city ::cost 0 ::lc1 amh ::lc2 eng
806
+ ::s1 kelele ::s2 region ::cost 0 ::lc1 amh ::lc2 eng
807
+ ::s1 meseraaqe ::s2 east ::cost 0 ::lc1 amh ::lc2 eng
808
+ ::s1 sameene ::s2 north ::cost 0 ::lc1 amh ::lc2 eng
809
+ ::s1 setaadiyame ::s2 stadium ::cost 0 ::lc1 amh ::lc2 eng
810
+ ::s1 waneze ::s2 river ::cost 0 ::lc1 amh ::lc2 eng
811
+
812
+ # mini dictionary Arabic-English
813
+ ::s1 " " ::s2 " of " ::cost 0 ::lc1 ara ::lc2 eng
814
+ ::s1 " alawl" ::s2 " i" ::cost 0 ::lc1 ara ::lc2 eng ::right2 [-,$ ]
815
+
816
+ # mini dictionary Bengali-English
817
+ ::s1 anychala ::s2 zone ::cost 0 ::lc1 ben ::lc2 eng
818
+ ::s1 pradesha ::s2 province ::cost 0 ::lc1 ben ::lc2 eng
819
+ ::s1 saamraajya ::s2 empire ::cost 0 ::lc1 ben ::lc2 eng
820
+ ::s1 upajelaa ::s2 upazila ::cost 0 ::lc1 ben ::lc2 eng
821
+ ::s1 uttara ::s2 north ::cost 0 ::lc1 ben ::lc2 eng
822
+ ::s1 "dya " ::s2 "the " ::left1 /^(.*[-, ])?$/ ::cost 0.2 ::lc1 ben ::lc2 eng
823
+ ::s1 " aba " ::s2 " of " ::cost 0 ::lc1 ben ::lc2 eng
824
+
825
+ # mini dictionary Russian-English
826
+ ::s1 akademiya ::s2 academy ::cost 0 ::lc1 rus ::lc2 eng
827
+ ::s1 eparkhiya ::s2 diocese ::cost 0 ::lc1 rus ::lc2 eng
828
+ ::s1 gorod ::s2 city ::cost 0 ::lc1 rus ::lc2 eng
829
+ ::s1 gosudarstvennyi ::s2 state ::cost 0 ::lc1 rus ::lc2 eng
830
+ ::s1 gubernator ::s2 governor ::cost 0 ::lc1 rus ::lc2 eng
831
+ ::s1 guberniya ::s2 governorate ::cost 0 ::lc1 rus ::lc2 eng
832
+ ::s1 imperator ::s2 emperor ::cost 0 ::lc1 rus ::lc2 eng
833
+ ::s1 komitet ::s2 committee ::cost 0 ::lc1 rus ::lc2 eng
834
+ ::s1 korolevstvo ::s2 kingdom ::cost 0 ::lc1 rus ::lc2 eng
835
+ ::s1 koroli ::s2 king ::cost 0 ::lc1 rus ::lc2 eng
836
+ ::s1 mezhdunarodnaya ::s2 international ::cost 0 ::lc1 rus ::lc2 eng
837
+ ::s1 natsionalnyi ::s2 national ::cost 0 ::lc1 rus ::lc2 eng
838
+ ::s1 novyi ::s2 new ::cost 0 ::lc1 rus ::lc2 eng
839
+ ::s1 oblast ::s2 province ::cost 0 ::lc1 rus ::lc2 eng
840
+ ::s1 oblast ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
841
+ ::s1 obshchestvo ::s2 society ::cost 0 ::lc1 rus ::lc2 eng
842
+ ::s1 okrug ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
843
+ ::s1 okrug ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
844
+ ::s1 ostrova ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
845
+ ::s1 partiya ::s2 party ::cost 0 ::lc1 rus ::lc2 eng
846
+ ::s1 raion ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
847
+ ::s1 respublika ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
848
+ ::s1 respublik ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
849
+ ::s1 sbornaya ::s2 team ::cost 0 ::lc1 rus ::lc2 eng
850
+ ::s1 severnaya ::s2 north ::cost 0 ::lc1 rus ::lc2 eng
851
+ ::s1 sovet ::s2 council ::cost 0 ::lc1 rus ::lc2 eng
852
+ ::s1 soyuz ::s2 alliance ::cost 0 ::lc1 rus ::lc2 eng
853
+ ::s1 soyuz ::s2 association ::cost 0 ::lc1 rus ::lc2 eng
854
+ ::s1 soyuz ::s2 league ::cost 0 ::lc1 rus ::lc2 eng
855
+ ::s1 soyuz ::s2 union ::cost 0 ::lc1 rus ::lc2 eng
856
+ ::s1 svyataya ::s2 saint ::cost 0 ::lc1 rus ::lc2 eng
857
+ ::s1 svobodnyi ::s2 free ::cost 0 ::lc1 rus ::lc2 eng
858
+ ::s1 tserkov ::s2 church ::cost 0 ::lc1 rus ::lc2 eng
859
+ ::s1 uezd ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
860
+ ::s1 universitet ::s2 university ::cost 0 ::lc1 rus ::lc2 eng
861
+ ::s1 vostochnaya ::s2 east ::cost 0 ::lc1 rus ::lc2 eng
862
+ ::s1 vostochnaya ::s2 eastern ::cost 0 ::lc1 rus ::lc2 eng
863
+ ::s1 yuzhnaya ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
864
+ ::s1 yuzhnaya ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
865
+ ::s1 yuzhnoi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
866
+ ::s1 yuzhnoi ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
867
+ ::s1 yuzhnyi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
868
+ # English words that are often dropped in the Russian name
869
+ ::s1 ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
870
+ ::s1 ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
871
+ ::s1 ::s2 pope ::cost 0 ::lc1 rus ::lc2 eng
872
+ ::s1 ::s2 river ::cost 0 ::lc1 rus ::lc2 eng
873
+ ::s1 ::s2 "the " ::cost 0 ::lc1 rus ::lc2 eng ::left2 /^(.*[- ])?$/
874
+ ::s1 " " ::s2 " of " ::cost 0 ::lc1 rus ::lc2 eng
875
+
876
+
877
+ # mini dictionary Uyghur-English
878
+ ::s1 aptonom ::s2 autonomous ::cost 0 ::lc1 uig ::lc2 eng
879
+ ::s1 aralliri ::s2 islands ::cost 0 ::lc1 uig ::lc2 eng
880
+ ::s1 aralliri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
881
+ ::s1 arili ::s2 island ::cost 0 ::lc1 uig ::lc2 eng
882
+ ::s1 arili ::s2 ::cost 0 ::lc1 uig ::lc2 eng
883
+ ::s1 nahiyisi ::s2 county ::cost 0 ::lc1 uig ::lc2 eng
884
+ ::s1 oelkisi ::s2 province ::cost 0 ::lc1 uig ::lc2 eng
885
+ ::s1 oelkisi ::s2 ::cost 0 ::lc1 uig ::lc2 eng
886
+ ::s1 ottura ::s2 central ::cost 0 ::lc1 uig ::lc2 eng
887
+ ::s1 rayoni ::s2 region ::cost 0 ::lc1 uig ::lc2 eng
888
+ ::s1 shehiri ::s2 city ::cost 0 ::lc1 uig ::lc2 eng
889
+ ::s1 shehiri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
890
+ ::s1 shitati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
891
+ ::s1 shitati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
892
+ ::s1 shtati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
893
+ ::s1 shtati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
894
+ ::s1 uniwersiteti ::s2 university ::cost 0 ::lc1 uig ::lc2 eng
895
+ ::s1 yengi ::s2 new ::cost 0 ::lc1 uig ::lc2 eng
896
+
uroman/lib/JSON.pm ADDED
@@ -0,0 +1,2317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package JSON;
2
+
3
+
4
+ use strict;
5
+ use Carp ();
6
+ use base qw(Exporter);
7
+ @JSON::EXPORT = qw(from_json to_json jsonToObj objToJson encode_json decode_json);
8
+
9
+ BEGIN {
10
+ $JSON::VERSION = '2.90';
11
+ $JSON::DEBUG = 0 unless (defined $JSON::DEBUG);
12
+ $JSON::DEBUG = $ENV{ PERL_JSON_DEBUG } if exists $ENV{ PERL_JSON_DEBUG };
13
+ }
14
+
15
+ my $Module_XS = 'JSON::XS';
16
+ my $Module_PP = 'JSON::PP';
17
+ my $Module_bp = 'JSON::backportPP'; # included in JSON distribution
18
+ my $PP_Version = '2.27203';
19
+ my $XS_Version = '2.34';
20
+
21
+
22
+ # XS and PP common methods
23
+
24
+ my @PublicMethods = qw/
25
+ ascii latin1 utf8 pretty indent space_before space_after relaxed canonical allow_nonref
26
+ allow_blessed convert_blessed filter_json_object filter_json_single_key_object
27
+ shrink max_depth max_size encode decode decode_prefix allow_unknown
28
+ /;
29
+
30
+ my @Properties = qw/
31
+ ascii latin1 utf8 indent space_before space_after relaxed canonical allow_nonref
32
+ allow_blessed convert_blessed shrink max_depth max_size allow_unknown
33
+ /;
34
+
35
+ my @XSOnlyMethods = qw/allow_tags/; # JSON::XS specific
36
+
37
+ my @PPOnlyMethods = qw/
38
+ indent_length sort_by
39
+ allow_singlequote allow_bignum loose allow_barekey escape_slash as_nonblessed
40
+ /; # JSON::PP specific
41
+
42
+
43
+ # option flags for _load_xs and _load_pp ($_INSTALL_ONLY is passed by _set_for_pp)
44
+ my $_INSTALL_DONT_DIE = 1; # When _load_xs fails to load XS, don't die.
45
+ my $_INSTALL_ONLY = 2; # Don't call _set_methods()
46
+ my $_ALLOW_UNSUPPORTED = 0;
47
+ my $_UNIV_CONV_BLESSED = 0;
48
+ my $_USSING_bpPP = 0;
49
+
50
+
51
+ # Check the environment variable to decide worker module.
52
+
53
+ unless ($JSON::Backend) {
54
+ $JSON::DEBUG and Carp::carp("Check used worker module...");
55
+
56
+ my $backend = exists $ENV{PERL_JSON_BACKEND} ? $ENV{PERL_JSON_BACKEND} : 1;
57
+
58
+ if ($backend eq '1' or $backend =~ /JSON::XS\s*,\s*JSON::PP/) {
59
+ _load_xs($_INSTALL_DONT_DIE) or _load_pp();
60
+ }
61
+ elsif ($backend eq '0' or $backend eq 'JSON::PP') {
62
+ _load_pp();
63
+ }
64
+ elsif ($backend eq '2' or $backend eq 'JSON::XS') {
65
+ _load_xs();
66
+ }
67
+ elsif ($backend eq 'JSON::backportPP') {
68
+ $_USSING_bpPP = 1;
69
+ _load_pp();
70
+ }
71
+ else {
72
+ Carp::croak "The value of environmental variable 'PERL_JSON_BACKEND' is invalid.";
73
+ }
74
+ }
75
+
76
+
77
+ sub import {
78
+ my $pkg = shift;
79
+ my @what_to_export;
80
+ my $no_export;
81
+
82
+ for my $tag (@_) {
83
+ if ($tag eq '-support_by_pp') {
84
+ if (!$_ALLOW_UNSUPPORTED++) {
85
+ JSON::Backend::XS
86
+ ->support_by_pp(@PPOnlyMethods) if ($JSON::Backend eq $Module_XS);
87
+ }
88
+ next;
89
+ }
90
+ elsif ($tag eq '-no_export') {
91
+ $no_export++, next;
92
+ }
93
+ elsif ( $tag eq '-convert_blessed_universally' ) {
94
+ eval q|
95
+ require B;
96
+ *UNIVERSAL::TO_JSON = sub {
97
+ my $b_obj = B::svref_2object( $_[0] );
98
+ return $b_obj->isa('B::HV') ? { %{ $_[0] } }
99
+ : $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
100
+ : undef
101
+ ;
102
+ }
103
+ | if ( !$_UNIV_CONV_BLESSED++ );
104
+ next;
105
+ }
106
+ push @what_to_export, $tag;
107
+ }
108
+
109
+ return if ($no_export);
110
+
111
+ __PACKAGE__->export_to_level(1, $pkg, @what_to_export);
112
+ }
113
+
114
+
115
+ # OBSOLETED
116
+
117
+ sub jsonToObj {
118
+ my $alternative = 'from_json';
119
+ if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) {
120
+ shift @_; $alternative = 'decode';
121
+ }
122
+ Carp::carp "'jsonToObj' will be obsoleted. Please use '$alternative' instead.";
123
+ return JSON::from_json(@_);
124
+ };
125
+
126
+ sub objToJson {
127
+ my $alternative = 'to_json';
128
+ if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) {
129
+ shift @_; $alternative = 'encode';
130
+ }
131
+ Carp::carp "'objToJson' will be obsoleted. Please use '$alternative' instead.";
132
+ JSON::to_json(@_);
133
+ };
134
+
135
+
136
+ # INTERFACES
137
+
138
+ sub to_json ($@) {
139
+ if (
140
+ ref($_[0]) eq 'JSON'
141
+ or (@_ > 2 and $_[0] eq 'JSON')
142
+ ) {
143
+ Carp::croak "to_json should not be called as a method.";
144
+ }
145
+ my $json = JSON->new;
146
+
147
+ if (@_ == 2 and ref $_[1] eq 'HASH') {
148
+ my $opt = $_[1];
149
+ for my $method (keys %$opt) {
150
+ $json->$method( $opt->{$method} );
151
+ }
152
+ }
153
+
154
+ $json->encode($_[0]);
155
+ }
156
+
157
+
158
+ sub from_json ($@) {
159
+ if ( ref($_[0]) eq 'JSON' or $_[0] eq 'JSON' ) {
160
+ Carp::croak "from_json should not be called as a method.";
161
+ }
162
+ my $json = JSON->new;
163
+
164
+ if (@_ == 2 and ref $_[1] eq 'HASH') {
165
+ my $opt = $_[1];
166
+ for my $method (keys %$opt) {
167
+ $json->$method( $opt->{$method} );
168
+ }
169
+ }
170
+
171
+ return $json->decode( $_[0] );
172
+ }
173
+
174
+
175
+
176
+ sub true { $JSON::true }
177
+
178
+ sub false { $JSON::false }
179
+
180
+ sub null { undef; }
181
+
182
+
183
+ sub require_xs_version { $XS_Version; }
184
+
185
+ sub backend {
186
+ my $proto = shift;
187
+ $JSON::Backend;
188
+ }
189
+
190
+ #*module = *backend;
191
+
192
+
193
+ sub is_xs {
194
+ return $_[0]->backend eq $Module_XS;
195
+ }
196
+
197
+
198
+ sub is_pp {
199
+ return not $_[0]->is_xs;
200
+ }
201
+
202
+
203
+ sub pureperl_only_methods { @PPOnlyMethods; }
204
+
205
+
206
+ sub property {
207
+ my ($self, $name, $value) = @_;
208
+
209
+ if (@_ == 1) {
210
+ my %props;
211
+ for $name (@Properties) {
212
+ my $method = 'get_' . $name;
213
+ if ($name eq 'max_size') {
214
+ my $value = $self->$method();
215
+ $props{$name} = $value == 1 ? 0 : $value;
216
+ next;
217
+ }
218
+ $props{$name} = $self->$method();
219
+ }
220
+ return \%props;
221
+ }
222
+ elsif (@_ > 3) {
223
+ Carp::croak('property() can take only the option within 2 arguments.');
224
+ }
225
+ elsif (@_ == 2) {
226
+ if ( my $method = $self->can('get_' . $name) ) {
227
+ if ($name eq 'max_size') {
228
+ my $value = $self->$method();
229
+ return $value == 1 ? 0 : $value;
230
+ }
231
+ $self->$method();
232
+ }
233
+ }
234
+ else {
235
+ $self->$name($value);
236
+ }
237
+
238
+ }
239
+
240
+
241
+
242
+ # INTERNAL
243
+
244
+ sub _load_xs {
245
+ my $opt = shift;
246
+
247
+ $JSON::DEBUG and Carp::carp "Load $Module_XS.";
248
+
249
+ # if called after install module, overload is disabled.... why?
250
+ JSON::Boolean::_overrride_overload($Module_XS);
251
+ JSON::Boolean::_overrride_overload($Module_PP);
252
+
253
+ eval qq|
254
+ use $Module_XS $XS_Version ();
255
+ |;
256
+
257
+ if ($@) {
258
+ if (defined $opt and $opt & $_INSTALL_DONT_DIE) {
259
+ $JSON::DEBUG and Carp::carp "Can't load $Module_XS...($@)";
260
+ return 0;
261
+ }
262
+ Carp::croak $@;
263
+ }
264
+
265
+ unless (defined $opt and $opt & $_INSTALL_ONLY) {
266
+ _set_module( $JSON::Backend = $Module_XS );
267
+ my $data = join("", <DATA>); # this code is from Jcode 2.xx.
268
+ close(DATA);
269
+ eval $data;
270
+ JSON::Backend::XS->init;
271
+ }
272
+
273
+ return 1;
274
+ };
275
+
276
+
277
+ sub _load_pp {
278
+ my $opt = shift;
279
+ my $backend = $_USSING_bpPP ? $Module_bp : $Module_PP;
280
+
281
+ $JSON::DEBUG and Carp::carp "Load $backend.";
282
+
283
+ # if called after install module, overload is disabled.... why?
284
+ JSON::Boolean::_overrride_overload($Module_XS);
285
+ JSON::Boolean::_overrride_overload($backend);
286
+
287
+ if ( $_USSING_bpPP ) {
288
+ eval qq| require $backend |;
289
+ }
290
+ else {
291
+ eval qq| use $backend $PP_Version () |;
292
+ }
293
+
294
+ if ($@) {
295
+ if ( $backend eq $Module_PP ) {
296
+ $JSON::DEBUG and Carp::carp "Can't load $Module_PP ($@), so try to load $Module_bp";
297
+ $_USSING_bpPP++;
298
+ $backend = $Module_bp;
299
+ JSON::Boolean::_overrride_overload($backend);
300
+ local $^W; # if PP installed but invalid version, backportPP redefines methods.
301
+ eval qq| require $Module_bp |;
302
+ }
303
+ Carp::croak $@ if $@;
304
+ }
305
+
306
+ unless (defined $opt and $opt & $_INSTALL_ONLY) {
307
+ _set_module( $JSON::Backend = $Module_PP ); # even if backportPP, set $Backend with 'JSON::PP'
308
+ JSON::Backend::PP->init;
309
+ }
310
+ };
311
+
312
+
313
+ sub _set_module {
314
+ return if defined $JSON::true;
315
+
316
+ my $module = shift;
317
+
318
+ local $^W;
319
+ no strict qw(refs);
320
+
321
+ $JSON::true = ${"$module\::true"};
322
+ $JSON::false = ${"$module\::false"};
323
+
324
+ push @JSON::ISA, $module;
325
+ if ( JSON->is_xs and JSON->backend->VERSION < 3 ) {
326
+ eval 'package JSON::PP::Boolean';
327
+ push @{"$module\::Boolean::ISA"}, qw(JSON::PP::Boolean);
328
+ }
329
+
330
+ *{"JSON::is_bool"} = \&{"$module\::is_bool"};
331
+
332
+ for my $method ($module eq $Module_XS ? @PPOnlyMethods : @XSOnlyMethods) {
333
+ *{"JSON::$method"} = sub {
334
+ Carp::carp("$method is not supported in $module.");
335
+ $_[0];
336
+ };
337
+ }
338
+
339
+ return 1;
340
+ }
341
+
342
+
343
+
344
+ #
345
+ # JSON Boolean
346
+ #
347
+
348
+ package JSON::Boolean;
349
+
350
+ my %Installed;
351
+
352
+ sub _overrride_overload {
353
+ return; # this function is currently disabled.
354
+ return if ($Installed{ $_[0] }++);
355
+
356
+ my $boolean = $_[0] . '::Boolean';
357
+
358
+ eval sprintf(q|
359
+ package %s;
360
+ use overload (
361
+ '""' => sub { ${$_[0]} == 1 ? 'true' : 'false' },
362
+ 'eq' => sub {
363
+ my ($obj, $op) = ref ($_[0]) ? ($_[0], $_[1]) : ($_[1], $_[0]);
364
+ if ($op eq 'true' or $op eq 'false') {
365
+ return "$obj" eq 'true' ? 'true' eq $op : 'false' eq $op;
366
+ }
367
+ else {
368
+ return $obj ? 1 == $op : 0 == $op;
369
+ }
370
+ },
371
+ );
372
+ |, $boolean);
373
+
374
+ if ($@) { Carp::croak $@; }
375
+
376
+ if ( exists $INC{'JSON/XS.pm'} and $boolean eq 'JSON::XS::Boolean' ) {
377
+ local $^W;
378
+ my $true = do { bless \(my $dummy = 1), $boolean };
379
+ my $false = do { bless \(my $dummy = 0), $boolean };
380
+ *JSON::XS::true = sub () { $true };
381
+ *JSON::XS::false = sub () { $false };
382
+ }
383
+ elsif ( exists $INC{'JSON/PP.pm'} and $boolean eq 'JSON::PP::Boolean' ) {
384
+ local $^W;
385
+ my $true = do { bless \(my $dummy = 1), $boolean };
386
+ my $false = do { bless \(my $dummy = 0), $boolean };
387
+ *JSON::PP::true = sub { $true };
388
+ *JSON::PP::false = sub { $false };
389
+ }
390
+
391
+ return 1;
392
+ }
393
+
394
+
395
+ #
396
+ # Helper classes for Backend Module (PP)
397
+ #
398
+
399
+ package JSON::Backend::PP;
400
+
401
+ sub init {
402
+ local $^W;
403
+ no strict qw(refs); # this routine may be called after JSON::Backend::XS init was called.
404
+ *{"JSON::decode_json"} = \&{"JSON::PP::decode_json"};
405
+ *{"JSON::encode_json"} = \&{"JSON::PP::encode_json"};
406
+ *{"JSON::PP::is_xs"} = sub { 0 };
407
+ *{"JSON::PP::is_pp"} = sub { 1 };
408
+ return 1;
409
+ }
410
+
411
+ #
412
+ # To save memory, the below lines are read only when XS backend is used.
413
+ #
414
+
415
+ package JSON;
416
+
417
+ 1;
418
+ __DATA__
419
+
420
+
421
+ #
422
+ # Helper classes for Backend Module (XS)
423
+ #
424
+
425
+ package JSON::Backend::XS;
426
+
427
+ use constant INDENT_LENGTH_FLAG => 15 << 12;
428
+
429
+ use constant UNSUPPORTED_ENCODE_FLAG => {
430
+ ESCAPE_SLASH => 0x00000010,
431
+ ALLOW_BIGNUM => 0x00000020,
432
+ AS_NONBLESSED => 0x00000040,
433
+ EXPANDED => 0x10000000, # for developer's
434
+ };
435
+
436
+ use constant UNSUPPORTED_DECODE_FLAG => {
437
+ LOOSE => 0x00000001,
438
+ ALLOW_BIGNUM => 0x00000002,
439
+ ALLOW_BAREKEY => 0x00000004,
440
+ ALLOW_SINGLEQUOTE => 0x00000008,
441
+ EXPANDED => 0x20000000, # for developer's
442
+ };
443
+
444
+
445
+ sub init {
446
+ local $^W;
447
+ no strict qw(refs);
448
+ *{"JSON::decode_json"} = \&{"JSON::XS::decode_json"};
449
+ *{"JSON::encode_json"} = \&{"JSON::XS::encode_json"};
450
+ *{"JSON::XS::is_xs"} = sub { 1 };
451
+ *{"JSON::XS::is_pp"} = sub { 0 };
452
+ return 1;
453
+ }
454
+
455
+
456
+ sub support_by_pp {
457
+ my ($class, @methods) = @_;
458
+
459
+ local $^W;
460
+ no strict qw(refs);
461
+
462
+ my $JSON_XS_encode_orignal = \&JSON::XS::encode;
463
+ my $JSON_XS_decode_orignal = \&JSON::XS::decode;
464
+ my $JSON_XS_incr_parse_orignal = \&JSON::XS::incr_parse;
465
+
466
+ *JSON::XS::decode = \&JSON::Backend::XS::Supportable::_decode;
467
+ *JSON::XS::encode = \&JSON::Backend::XS::Supportable::_encode;
468
+ *JSON::XS::incr_parse = \&JSON::Backend::XS::Supportable::_incr_parse;
469
+
470
+ *{JSON::XS::_original_decode} = $JSON_XS_decode_orignal;
471
+ *{JSON::XS::_original_encode} = $JSON_XS_encode_orignal;
472
+ *{JSON::XS::_original_incr_parse} = $JSON_XS_incr_parse_orignal;
473
+
474
+ push @JSON::Backend::XS::Supportable::ISA, 'JSON';
475
+
476
+ my $pkg = 'JSON::Backend::XS::Supportable';
477
+
478
+ *{JSON::new} = sub {
479
+ my $proto = JSON::XS->new; $$proto = 0;
480
+ bless $proto, $pkg;
481
+ };
482
+
483
+
484
+ for my $method (@methods) {
485
+ my $flag = uc($method);
486
+ my $type |= (UNSUPPORTED_ENCODE_FLAG->{$flag} || 0);
487
+ $type |= (UNSUPPORTED_DECODE_FLAG->{$flag} || 0);
488
+
489
+ next unless($type);
490
+
491
+ $pkg->_make_unsupported_method($method => $type);
492
+ }
493
+
494
+ # push @{"JSON::XS::Boolean::ISA"}, qw(JSON::PP::Boolean);
495
+ # push @{"JSON::PP::Boolean::ISA"}, qw(JSON::Boolean);
496
+
497
+ $JSON::DEBUG and Carp::carp("set -support_by_pp mode.");
498
+
499
+ return 1;
500
+ }
501
+
502
+
503
+
504
+
505
+ #
506
+ # Helper classes for XS
507
+ #
508
+
509
+ package JSON::Backend::XS::Supportable;
510
+
511
+ $Carp::Internal{'JSON::Backend::XS::Supportable'} = 1;
512
+
513
+ sub _make_unsupported_method {
514
+ my ($pkg, $method, $type) = @_;
515
+
516
+ local $^W;
517
+ no strict qw(refs);
518
+
519
+ *{"$pkg\::$method"} = sub {
520
+ local $^W;
521
+ if (defined $_[1] ? $_[1] : 1) {
522
+ ${$_[0]} |= $type;
523
+ }
524
+ else {
525
+ ${$_[0]} &= ~$type;
526
+ }
527
+ $_[0];
528
+ };
529
+
530
+ *{"$pkg\::get_$method"} = sub {
531
+ ${$_[0]} & $type ? 1 : '';
532
+ };
533
+
534
+ }
535
+
536
+
537
+ sub _set_for_pp {
538
+ JSON::_load_pp( $_INSTALL_ONLY );
539
+
540
+ my $type = shift;
541
+ my $pp = JSON::PP->new;
542
+ my $prop = $_[0]->property;
543
+
544
+ for my $name (keys %$prop) {
545
+ $pp->$name( $prop->{$name} ? $prop->{$name} : 0 );
546
+ }
547
+
548
+ my $unsupported = $type eq 'encode' ? JSON::Backend::XS::UNSUPPORTED_ENCODE_FLAG
549
+ : JSON::Backend::XS::UNSUPPORTED_DECODE_FLAG;
550
+ my $flags = ${$_[0]} || 0;
551
+
552
+ for my $name (keys %$unsupported) {
553
+ next if ($name eq 'EXPANDED'); # for developer's
554
+ my $enable = ($flags & $unsupported->{$name}) ? 1 : 0;
555
+ my $method = lc $name;
556
+ $pp->$method($enable);
557
+ }
558
+
559
+ $pp->indent_length( $_[0]->get_indent_length );
560
+
561
+ return $pp;
562
+ }
563
+
564
+ sub _encode { # using with PP encode
565
+ if (${$_[0]}) {
566
+ _set_for_pp('encode' => @_)->encode($_[1]);
567
+ }
568
+ else {
569
+ $_[0]->_original_encode( $_[1] );
570
+ }
571
+ }
572
+
573
+
574
+ sub _decode { # if unsupported-flag is set, use PP
575
+ if (${$_[0]}) {
576
+ _set_for_pp('decode' => @_)->decode($_[1]);
577
+ }
578
+ else {
579
+ $_[0]->_original_decode( $_[1] );
580
+ }
581
+ }
582
+
583
+
584
+ sub decode_prefix { # if unsupported-flag is set, use PP
585
+ _set_for_pp('decode' => @_)->decode_prefix($_[1]);
586
+ }
587
+
588
+
589
+ sub _incr_parse {
590
+ if (${$_[0]}) {
591
+ _set_for_pp('decode' => @_)->incr_parse($_[1]);
592
+ }
593
+ else {
594
+ $_[0]->_original_incr_parse( $_[1] );
595
+ }
596
+ }
597
+
598
+
599
+ sub get_indent_length {
600
+ ${$_[0]} << 4 >> 16;
601
+ }
602
+
603
+
604
+ sub indent_length { # store an indent length (0..15) in the object's flag word; returns the object itself
605
+ my $length = $_[1];
606
+
607
+ if (!defined $length or $length > 15 or $length < 0) { # undefined or out of range: warn and leave the object unchanged
608
+ Carp::carp "The acceptable range of indent_length() is 0 to 15.";
609
+ }
610
+ else {
611
+ local $^W; # clear $^W (the -w warning switch) for the rest of this block
612
+ $length <<= 12; # move the value up to bit 12 of the flag word
613
+ ${$_[0]} &= ~ JSON::Backend::XS::INDENT_LENGTH_FLAG; # clear the previously stored length
614
+ ${$_[0]} |= $length;
615
+ *JSON::XS::encode = \&JSON::Backend::XS::Supportable::_encode; # from now on JSON::XS::encode resolves to this package's _encode
616
+ }
617
+
618
+ $_[0]; # returned in both branches, so the call can be chained
619
+ }
620
+
621
+
622
+ 1;
623
+ __END__
624
+
625
+ =head1 NAME
626
+
627
+ JSON - JSON (JavaScript Object Notation) encoder/decoder
628
+
629
+ =head1 SYNOPSIS
630
+
631
+ use JSON; # imports encode_json, decode_json, to_json and from_json.
632
+
633
+ # simple and fast interfaces (expect/generate UTF-8)
634
+
635
+ $utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
636
+ $perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
637
+
638
+ # OO-interface
639
+
640
+ $json = JSON->new->allow_nonref;
641
+
642
+ $json_text = $json->encode( $perl_scalar );
643
+ $perl_scalar = $json->decode( $json_text );
644
+
645
+ $pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
646
+
647
+ # If you want to use PP only support features, call with '-support_by_pp'
648
+ # When a feature unsupported by XS is enabled, PP (de|en)code is used instead of the XS ones.
649
+
650
+ use JSON -support_by_pp;
651
+
652
+ # option-acceptable interfaces (expect/generate UNICODE by default)
653
+
654
+ $json_text = to_json( $perl_scalar, { ascii => 1, pretty => 1 } );
655
+ $perl_scalar = from_json( $json_text, { utf8 => 1 } );
656
+
657
+ # Between (en|de)code_json and (to|from)_json, if you want to write
658
+ # a code which communicates to an outer world (encoded in UTF-8),
659
+ # recommend to use (en|de)code_json.
660
+
661
+ =head1 VERSION
662
+
663
+ 2.90
664
+
665
+ This version is compatible with JSON::XS B<2.34> and later.
666
+ (Not yet compatible with JSON::XS B<3.0x>.)
667
+
668
+
669
+ =head1 NOTE
670
+
671
+ JSON::PP was earlier included in the C<JSON> distribution, but
672
+ has since Perl 5.14 been a core module. For this reason,
673
+ L<JSON::PP> was removed from the JSON distribution and can now
674
+ be found also in the Perl5 repository at
675
+
676
+ =over
677
+
678
+ =item * L<http://perl5.git.perl.org/perl.git>
679
+
680
+ =back
681
+
682
+ (The newest JSON::PP version still exists in CPAN.)
683
+
684
+ Instead, the C<JSON> distribution will include JSON::backportPP
685
+ for backwards compatibility. JSON.pm should thus work as it did
686
+ before.
687
+
688
+ =head1 DESCRIPTION
689
+
690
+ *************************** CAUTION **************************************
691
+ * *
692
+ * INCOMPATIBLE CHANGE (JSON::XS version 2.90) *
693
+ * *
694
+ * JSON.pm had patched JSON::XS::Boolean and JSON::PP::Boolean internally *
695
+ * on loading time for making these modules inherit JSON::Boolean. *
696
+ * But since JSON::XS v3.0 it use Types::Serialiser as boolean class. *
697
+ * Then now JSON.pm breaks boolean class overload features and *
698
+ * -support_by_pp if JSON::XS v3.0 or later is installed. *
699
+ * *
700
+ * JSON::true and JSON::false returned JSON::Boolean objects. *
701
+ * For workaround, they return JSON::PP::Boolean objects in this version. *
702
+ * *
703
+ * isa_ok(JSON::true, 'JSON::PP::Boolean'); *
704
+ * *
705
+ * And it discards a feature: *
706
+ * *
707
+ * ok(JSON::true eq 'true'); *
708
+ * *
709
+ * In other word, JSON::PP::Boolean overload numeric only. *
710
+ * *
711
+ * ok( JSON::true == 1 ); *
712
+ * *
713
+ **************************************************************************
714
+
715
+ ************************** CAUTION ********************************
716
+ * This is 'JSON module version 2' and there are many differences *
717
+ * to version 1.xx *
718
+ * Please check your applications using old version. *
719
+ * See to 'INCOMPATIBLE CHANGES TO OLD VERSION' *
720
+ *******************************************************************
721
+
722
+ JSON (JavaScript Object Notation) is a simple data format.
723
+ See to L<http://www.json.org/> and C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>).
724
+
725
+ This module converts Perl data structures to JSON and vice versa using either
726
+ L<JSON::XS> or L<JSON::PP>.
727
+
728
+ JSON::XS is the fastest and most proper JSON module on CPAN which must be
729
+ compiled and installed in your environment.
730
+ JSON::PP is a pure-Perl module which is bundled in this distribution and
731
+ has a strong compatibility to JSON::XS.
732
+
733
+ This module tries to use JSON::XS by default and, if that fails, uses JSON::PP instead.
734
+ So its features completely depend on JSON::XS or JSON::PP.
735
+
736
+ See to L<BACKEND MODULE DECISION>.
737
+
738
+ To distinguish the module name 'JSON' and the format type JSON,
739
+ the former is quoted by CE<lt>E<gt> (its results vary with your using media),
740
+ and the latter is left just as it is.
741
+
742
+ Module name : C<JSON>
743
+
744
+ Format type : JSON
745
+
746
+ =head2 FEATURES
747
+
748
+ =over
749
+
750
+ =item * correct unicode handling
751
+
752
+ This module (i.e. backend modules) knows how to handle Unicode, documents
753
+ how and when it does so, and even documents what "correct" means.
754
+
755
+ Even though there are limitations, this feature is available since Perl version 5.6.
756
+
757
+ JSON::XS requires Perl 5.8.2 (but works correctly in 5.8.8 or later), so in older versions
758
+ C<JSON> should call JSON::PP as the backend which can be used since Perl 5.005.
759
+
760
+ With Perl 5.8.x JSON::PP works, but from 5.8.0 to 5.8.2, because of a Perl side problem,
761
+ JSON::PP works slower in the versions. And in 5.005, the Unicode handling is not available.
762
+ See to L<JSON::PP/UNICODE HANDLING ON PERLS> for more information.
763
+
764
+ See also to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>
765
+ and L<JSON::XS/ENCODING/CODESET_FLAG_NOTES>.
766
+
767
+
768
+ =item * round-trip integrity
769
+
770
+ When you serialise a perl data structure using only data types supported
771
+ by JSON and Perl, the deserialised data structure is identical on the Perl
772
+ level. (e.g. the string "2.0" doesn't suddenly become "2" just because
773
+ it looks like a number). There I<are> minor exceptions to this, read the
774
+ L</MAPPING> section below to learn about those.
775
+
776
+
777
+ =item * strict checking of JSON correctness
778
+
779
+ There is no guessing, no generating of illegal JSON texts by default,
780
+ and only JSON is accepted as input by default (the latter is a security
781
+ feature).
782
+
783
+ See to L<JSON::XS/FEATURES> and L<JSON::PP/FEATURES>.
784
+
785
+ =item * fast
786
+
787
+ This module returns a JSON::XS object itself if available.
788
+ Compared to other JSON modules and other serialisers such as Storable,
789
+ JSON::XS usually compares favorably in terms of speed, too.
790
+
791
+ If not available, C<JSON> returns a JSON::PP object instead of JSON::XS and
792
+ it is very slow as pure-Perl.
793
+
794
+ =item * simple to use
795
+
796
+ This module has both a simple functional interface as well as an
797
+ object oriented interface.
798
+
799
+ =item * reasonably versatile output formats
800
+
801
+ You can choose between the most compact guaranteed-single-line format possible
802
+ (nice for simple line-based protocols), a pure-ASCII format (for when your transport
803
+ is not 8-bit clean, still supports the whole Unicode range), or a pretty-printed
804
+ format (for when you want to read that stuff). Or you can combine those features
805
+ in whatever way you like.
806
+
807
+ =back
808
+
809
+ =head1 FUNCTIONAL INTERFACE
810
+
811
+ Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
812
+ C<to_json> and C<from_json> are additional functions.
813
+
814
+ =head2 encode_json
815
+
816
+ $json_text = encode_json $perl_scalar
817
+
818
+ Converts the given Perl data structure to a UTF-8 encoded, binary string.
819
+
820
+ This function call is functionally identical to:
821
+
822
+ $json_text = JSON->new->utf8->encode($perl_scalar)
823
+
824
+ =head2 decode_json
825
+
826
+ $perl_scalar = decode_json $json_text
827
+
828
+ The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
829
+ to parse that as an UTF-8 encoded JSON text, returning the resulting
830
+ reference.
831
+
832
+ This function call is functionally identical to:
833
+
834
+ $perl_scalar = JSON->new->utf8->decode($json_text)
835
+
836
+
837
+ =head2 to_json
838
+
839
+ $json_text = to_json($perl_scalar)
840
+
841
+ Converts the given Perl data structure to a json string.
842
+
843
+ This function call is functionally identical to:
844
+
845
+ $json_text = JSON->new->encode($perl_scalar)
846
+
847
+ Takes a hash reference as the second argument.
848
+
849
+ $json_text = to_json($perl_scalar, $flag_hashref)
850
+
851
+ So,
852
+
853
+ $json_text = to_json($perl_scalar, {utf8 => 1, pretty => 1})
854
+
855
+ equivalent to:
856
+
857
+ $json_text = JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
858
+
859
+ If you want to write a modern perl code which communicates to outer world,
860
+ you should use C<encode_json> (supposed that JSON data are encoded in UTF-8).
861
+
862
+ =head2 from_json
863
+
864
+ $perl_scalar = from_json($json_text)
865
+
866
+ The opposite of C<to_json>: expects a json string and tries
867
+ to parse it, returning the resulting reference.
868
+
869
+ This function call is functionally identical to:
870
+
871
+ $perl_scalar = JSON->new->decode($json_text)
872
+
873
+ Takes a hash reference as the second argument.
874
+
875
+ $perl_scalar = from_json($json_text, $flag_hashref)
876
+
877
+ So,
878
+
879
+ $perl_scalar = from_json($json_text, {utf8 => 1})
880
+
881
+ equivalent to:
882
+
883
+ $perl_scalar = JSON->new->utf8(1)->decode($json_text)
884
+
885
+ If you want to write a modern perl code which communicates to outer world,
886
+ you should use C<decode_json> (supposed that JSON data are encoded in UTF-8).
887
+
888
+ =head2 JSON::is_bool
889
+
890
+ $is_boolean = JSON::is_bool($scalar)
891
+
892
+ Returns true if the passed scalar represents either JSON::true or
893
+ JSON::false, two constants that act like C<1> and C<0> respectively
894
+ and are also used to represent JSON C<true> and C<false> in Perl strings.
895
+
896
+ =head2 JSON::true
897
+
898
+ Returns JSON true value which is blessed object.
899
+ It C<isa> JSON::PP::Boolean object.
900
+
901
+ =head2 JSON::false
902
+
903
+ Returns JSON false value which is blessed object.
904
+ It C<isa> JSON::PP::Boolean object.
905
+
906
+ =head2 JSON::null
907
+
908
+ Returns C<undef>.
909
+
910
+ See L<MAPPING>, below, for more information on how JSON values are mapped to
911
+ Perl.
912
+
913
+ =head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
914
+
915
+ This section supposes that your perl version is 5.8 or later.
916
+
917
+ If you know a JSON text from an outer world - a network, a file content, and so on,
918
+ is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
919
+ with C<utf8> enable. And the decoded result will contain UNICODE characters.
920
+
921
+ # from network
922
+ my $json = JSON->new->utf8;
923
+ my $json_text = CGI->new->param( 'json_data' );
924
+ my $perl_scalar = $json->decode( $json_text );
925
+
926
+ # from file content
927
+ local $/;
928
+ open( my $fh, '<', 'json.data' );
929
+ $json_text = <$fh>;
930
+ $perl_scalar = decode_json( $json_text );
931
+
932
+ If an outer data is not encoded in UTF-8, firstly you should C<decode> it.
933
+
934
+ use Encode;
935
+ local $/;
936
+ open( my $fh, '<', 'json.data' );
937
+ my $encoding = 'cp932';
938
+ my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
939
+
940
+ # or you can write the below code.
941
+ #
942
+ # open( my $fh, "<:encoding($encoding)", 'json.data' );
943
+ # $unicode_json_text = <$fh>;
944
+
945
+ In this case, C<$unicode_json_text> is of course UNICODE string.
946
+ So you B<cannot> use C<decode_json> nor C<JSON> module object with C<utf8> enable.
947
+ Instead of them, you use C<JSON> module object with C<utf8> disable or C<from_json>.
948
+
949
+ $perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
950
+ # or
951
+ $perl_scalar = from_json( $unicode_json_text );
952
+
953
+ Or C<encode 'utf8'> and C<decode_json>:
954
+
955
+ $perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
956
+ # this way is not efficient.
957
+
958
+ And now, you want to convert your C<$perl_scalar> into JSON data and
959
+ send it to an outer world - a network or a file content, and so on.
960
+
961
+ Your data usually contains UNICODE strings and you want the converted data to be encoded
962
+ in UTF-8, you should use C<encode_json> or C<JSON> module object with C<utf8> enable.
963
+
964
+ print encode_json( $perl_scalar ); # to a network? file? or display?
965
+ # or
966
+ print $json->utf8->encode( $perl_scalar );
967
+
968
+ If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
969
+ for some reason, then its characters are regarded as B<latin1> for perl
970
+ (because it does not concern with your $encoding).
971
+ You B<cannot> use C<encode_json> nor C<JSON> module object with C<utf8> enable.
972
+ Instead of them, you use C<JSON> module object with C<utf8> disable or C<to_json>.
973
+ Note that the resulted text is a UNICODE string but no problem to print it.
974
+
975
+ # $perl_scalar contains $encoding encoded string values
976
+ $unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
977
+ # or
978
+ $unicode_json_text = to_json( $perl_scalar );
979
+ # $unicode_json_text consists of characters less than 0x100
980
+ print $unicode_json_text;
981
+
982
+ Or C<decode $encoding> all string values and C<encode_json>:
983
+
984
+ $perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
985
+ # ... do it to each string values, then encode_json
986
+ $json_text = encode_json( $perl_scalar );
987
+
988
+ This method is a proper way but probably not efficient.
989
+
990
+ See to L<Encode>, L<perluniintro>.
991
+
992
+
993
+ =head1 COMMON OBJECT-ORIENTED INTERFACE
994
+
995
+ =head2 new
996
+
997
+ $json = JSON->new
998
+
999
+ Returns a new C<JSON> object inherited from either JSON::XS or JSON::PP
1000
+ that can be used to de/encode JSON strings.
1001
+
1002
+ All boolean flags described below are by default I<disabled>.
1003
+
1004
+ The mutators for flags all return the JSON object again and thus calls can
1005
+ be chained:
1006
+
1007
+ my $json = JSON->new->utf8->space_after->encode({a => [1,2]})
1008
+ => {"a": [1, 2]}
1009
+
1010
+ =head2 ascii
1011
+
1012
+ $json = $json->ascii([$enable])
1013
+
1014
+ $enabled = $json->get_ascii
1015
+
1016
+ If $enable is true (or missing), then the encode method will not generate characters outside
1017
+ the code range 0..127. Any Unicode characters outside that range will be escaped using either
1018
+ a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
1019
+
1020
+ If $enable is false, then the encode method will not escape Unicode characters unless
1021
+ required by the JSON syntax or other flags. This results in a faster and more compact format.
1022
+
1023
+ This feature depends on the used Perl version and environment.
1024
+
1025
+ See to L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
1026
+
1027
+ JSON->new->ascii(1)->encode([chr 0x10401])
1028
+ => ["\ud801\udc01"]
1029
+
1030
+ =head2 latin1
1031
+
1032
+ $json = $json->latin1([$enable])
1033
+
1034
+ $enabled = $json->get_latin1
1035
+
1036
+ If $enable is true (or missing), then the encode method will encode the resulting JSON
1037
+ text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
1038
+
1039
+ If $enable is false, then the encode method will not escape Unicode characters
1040
+ unless required by the JSON syntax or other flags.
1041
+
1042
+ JSON->new->latin1->encode (["\x{89}\x{abc}"])
1043
+ => ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
1044
+
1045
+ =head2 utf8
1046
+
1047
+ $json = $json->utf8([$enable])
1048
+
1049
+ $enabled = $json->get_utf8
1050
+
1051
+ If $enable is true (or missing), then the encode method will encode the JSON result
1052
+ into UTF-8, as required by many protocols, while the decode method expects to be handed
1053
+ an UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
1054
+ characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
1055
+
1056
+ In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
1057
+ encoding families, as described in RFC4627.
1058
+
1059
+ If $enable is false, then the encode method will return the JSON string as a (non-encoded)
1060
+ Unicode string, while decode expects thus a Unicode string. Any decoding or encoding
1061
+ (e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
1062
+
1063
+
1064
+ Example, output UTF-16BE-encoded JSON:
1065
+
1066
+ use Encode;
1067
+ $jsontext = encode "UTF-16BE", JSON::XS->new->encode ($object);
1068
+
1069
+ Example, decode UTF-32LE-encoded JSON:
1070
+
1071
+ use Encode;
1072
+ $object = JSON::XS->new->decode (decode "UTF-32LE", $jsontext);
1073
+
1074
+ See to L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
1075
+
1076
+
1077
+ =head2 pretty
1078
+
1079
+ $json = $json->pretty([$enable])
1080
+
1081
+ This enables (or disables) all of the C<indent>, C<space_before> and
1082
+ C<space_after> (and in the future possibly more) flags in one call to
1083
+ generate the most readable (or most compact) form possible.
1084
+
1085
+ Equivalent to:
1086
+
1087
+ $json->indent->space_before->space_after
1088
+
1089
+ The indent space length is three and JSON::XS cannot change the indent
1090
+ space length.
1091
+
1092
+ =head2 indent
1093
+
1094
+ $json = $json->indent([$enable])
1095
+
1096
+ $enabled = $json->get_indent
1097
+
1098
+ If C<$enable> is true (or missing), then the C<encode> method will use a multiline
1099
+ format as output, putting every array member or object/hash key-value pair
1100
+ into its own line, indenting them properly.
1101
+
1102
+ If C<$enable> is false, no newlines or indenting will be produced, and the
1103
+ resulting JSON text is guaranteed not to contain any C<newlines>.
1104
+
1105
+ This setting has no effect when decoding JSON texts.
1106
+
1107
+ The indent space length is three.
1108
+ With JSON::PP, you can also access C<indent_length> to change indent space length.
1109
+
1110
+
1111
+ =head2 space_before
1112
+
1113
+ $json = $json->space_before([$enable])
1114
+
1115
+ $enabled = $json->get_space_before
1116
+
1117
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1118
+ optional space before the C<:> separating keys from values in JSON objects.
1119
+
1120
+ If C<$enable> is false, then the C<encode> method will not add any extra
1121
+ space at those places.
1122
+
1123
+ This setting has no effect when decoding JSON texts.
1124
+
1125
+ Example, space_before enabled, space_after and indent disabled:
1126
+
1127
+ {"key" :"value"}
1128
+
1129
+
1130
+ =head2 space_after
1131
+
1132
+ $json = $json->space_after([$enable])
1133
+
1134
+ $enabled = $json->get_space_after
1135
+
1136
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1137
+ optional space after the C<:> separating keys from values in JSON objects
1138
+ and extra whitespace after the C<,> separating key-value pairs and array
1139
+ members.
1140
+
1141
+ If C<$enable> is false, then the C<encode> method will not add any extra
1142
+ space at those places.
1143
+
1144
+ This setting has no effect when decoding JSON texts.
1145
+
1146
+ Example, space_before and indent disabled, space_after enabled:
1147
+
1148
+ {"key": "value"}
1149
+
1150
+
1151
+ =head2 relaxed
1152
+
1153
+ $json = $json->relaxed([$enable])
1154
+
1155
+ $enabled = $json->get_relaxed
1156
+
1157
+ If C<$enable> is true (or missing), then C<decode> will accept some
1158
+ extensions to normal JSON syntax (see below). C<encode> will not be
1159
+ affected in any way. I<Be aware that this option makes you accept invalid
1160
+ JSON texts as if they were valid!>. I suggest only to use this option to
1161
+ parse application-specific files written by humans (configuration files,
1162
+ resource files etc.)
1163
+
1164
+ If C<$enable> is false (the default), then C<decode> will only accept
1165
+ valid JSON texts.
1166
+
1167
+ Currently accepted extensions are:
1168
+
1169
+ =over 4
1170
+
1171
+ =item * list items can have an end-comma
1172
+
1173
+ JSON I<separates> array elements and key-value pairs with commas. This
1174
+ can be annoying if you write JSON texts manually and want to be able to
1175
+ quickly append elements, so this extension accepts comma at the end of
1176
+ such items not just between them:
1177
+
1178
+ [
1179
+ 1,
1180
+ 2, <- this comma not normally allowed
1181
+ ]
1182
+ {
1183
+ "k1": "v1",
1184
+ "k2": "v2", <- this comma not normally allowed
1185
+ }
1186
+
1187
+ =item * shell-style '#'-comments
1188
+
1189
+ Whenever JSON allows whitespace, shell-style comments are additionally
1190
+ allowed. They are terminated by the first carriage-return or line-feed
1191
+ character, after which more white-space and comments are allowed.
1192
+
1193
+ [
1194
+ 1, # this comment not allowed in JSON
1195
+ # neither this one...
1196
+ ]
1197
+
1198
+ =back
1199
+
1200
+
1201
+ =head2 canonical
1202
+
1203
+ $json = $json->canonical([$enable])
1204
+
1205
+ $enabled = $json->get_canonical
1206
+
1207
+ If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
1208
+ by sorting their keys. This is adding a comparatively high overhead.
1209
+
1210
+ If C<$enable> is false, then the C<encode> method will output key-value
1211
+ pairs in the order Perl stores them (which will likely change between runs
1212
+ of the same script).
1213
+
1214
+ This option is useful if you want the same data structure to be encoded as
1215
+ the same JSON text (given the same overall settings). If it is disabled,
1216
+ the same hash might be encoded differently even if contains the same data,
1217
+ as key-value pairs have no inherent ordering in Perl.
1218
+
1219
+ This setting has no effect when decoding JSON texts.
1220
+
1221
+ =head2 allow_nonref
1222
+
1223
+ $json = $json->allow_nonref([$enable])
1224
+
1225
+ $enabled = $json->get_allow_nonref
1226
+
1227
+ If C<$enable> is true (or missing), then the C<encode> method can convert a
1228
+ non-reference into its corresponding string, number or null JSON value,
1229
+ which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
1230
+ values instead of croaking.
1231
+
1232
+ If C<$enable> is false, then the C<encode> method will croak if it isn't
1233
+ passed an arrayref or hashref, as JSON texts must either be an object
1234
+ or array. Likewise, C<decode> will croak if given something that is not a
1235
+ JSON object or array.
1236
+
1237
+ JSON->new->allow_nonref->encode ("Hello, World!")
1238
+ => "Hello, World!"
1239
+
1240
+ =head2 allow_unknown
1241
+
1242
+ $json = $json->allow_unknown ([$enable])
1243
+
1244
+ $enabled = $json->get_allow_unknown
1245
+
1246
+ If $enable is true (or missing), then "encode" will *not* throw an
1247
+ exception when it encounters values it cannot represent in JSON (for
1248
+ example, filehandles) but instead will encode a JSON "null" value.
1249
+ Note that blessed objects are not included here and are handled
1250
+ separately by C<allow_blessed>.
1251
+
1252
+ If $enable is false (the default), then "encode" will throw an
1253
+ exception when it encounters anything it cannot encode as JSON.
1254
+
1255
+ This option does not affect "decode" in any way, and it is
1256
+ recommended to leave it off unless you know your communications
1257
+ partner.
1258
+
1259
+ =head2 allow_blessed
1260
+
1261
+ $json = $json->allow_blessed([$enable])
1262
+
1263
+ $enabled = $json->get_allow_blessed
1264
+
1265
+ If C<$enable> is true (or missing), then the C<encode> method will not
1266
+ barf when it encounters a blessed reference. Instead, the value of the
1267
+ B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
1268
+ disabled or no C<TO_JSON> method found) or a representation of the
1269
+ object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
1270
+ encoded. Has no effect on C<decode>.
1271
+
1272
+ If C<$enable> is false (the default), then C<encode> will throw an
1273
+ exception when it encounters a blessed object.
1274
+
1275
+
1276
+ =head2 convert_blessed
1277
+
1278
+ $json = $json->convert_blessed([$enable])
1279
+
1280
+ $enabled = $json->get_convert_blessed
1281
+
1282
+ If C<$enable> is true (or missing), then C<encode>, upon encountering a
1283
+ blessed object, will check for the availability of the C<TO_JSON> method
1284
+ on the object's class. If found, it will be called in scalar context
1285
+ and the resulting scalar will be encoded instead of the object. If no
1286
+ C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
1287
+ to do.
1288
+
1289
+ The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
1290
+ returns other blessed objects, those will be handled in the same
1291
+ way. C<TO_JSON> must take care of not causing an endless recursion cycle
1292
+ (== crash) in this case. The name of C<TO_JSON> was chosen because other
1293
+ methods called by the Perl core (== not by the user of the object) are
1294
+ usually in upper case letters and to avoid collisions with the C<to_json>
1295
+ function or method.
1296
+
1297
+ This setting does not yet influence C<decode> in any way.
1298
+
1299
+ If C<$enable> is false, then the C<allow_blessed> setting will decide what
1300
+ to do when a blessed object is found.
1301
+
1302
+ =over
1303
+
1304
+ =item convert_blessed_universally mode
1305
+
1306
+ If use C<JSON> with C<-convert_blessed_universally>, the C<UNIVERSAL::TO_JSON>
1307
+ subroutine is defined as the below code:
1308
+
1309
+ *UNIVERSAL::TO_JSON = sub {
1310
+ my $b_obj = B::svref_2object( $_[0] );
1311
+ return $b_obj->isa('B::HV') ? { %{ $_[0] } }
1312
+ : $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
1313
+ : undef
1314
+ ;
1315
+ }
1316
+
1317
+ This will cause that C<encode> method converts simple blessed objects into
1318
+ JSON objects as non-blessed object.
1319
+
1320
+ use JSON -convert_blessed_universally;
1321
+ $json->allow_blessed->convert_blessed->encode( $blessed_object )
1322
+
1323
+ This feature is experimental and may be removed in the future.
1324
+
1325
+ =back
1326
+
1327
+ =head2 filter_json_object
1328
+
1329
+ $json = $json->filter_json_object([$coderef])
1330
+
1331
+ When C<$coderef> is specified, it will be called from C<decode> each
1332
+ time it decodes a JSON object. The only argument passed to the coderef
1333
+ is a reference to the newly-created hash. If the code reference returns
1334
+ a single scalar (which need not be a reference), this value
1335
+ (i.e. a copy of that scalar to avoid aliasing) is inserted into the
1336
+ deserialised data structure. If it returns an empty list
1337
+ (NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
1338
+ hash will be inserted. This setting can slow down decoding considerably.
1339
+
1340
+ When C<$coderef> is omitted or undefined, any existing callback will
1341
+ be removed and C<decode> will not change the deserialised hash in any
1342
+ way.
1343
+
1344
+ Example, convert all JSON objects into the integer 5:
1345
+
1346
+ my $js = JSON->new->filter_json_object (sub { 5 });
1347
+ # returns [5]
1348
+ $js->decode ('[{}]'); # the given subroutine takes a hash reference.
1349
+ # throw an exception because allow_nonref is not enabled
1350
+ # so a lone 5 is not allowed.
1351
+ $js->decode ('{"a":1, "b":2}');
1352
+
1353
+
1354
+ =head2 filter_json_single_key_object
1355
+
1356
+ $json = $json->filter_json_single_key_object($key [=> $coderef])
1357
+
1358
+ Works remotely similar to C<filter_json_object>, but is only called for
1359
+ JSON objects having a single key named C<$key>.
1360
+
1361
+ This C<$coderef> is called before the one specified via
1362
+ C<filter_json_object>, if any. It gets passed the single value in the JSON
1363
+ object. If it returns a single value, it will be inserted into the data
1364
+ structure. If it returns nothing (not even C<undef> but the empty list),
1365
+ the callback from C<filter_json_object> will be called next, as if no
1366
+ single-key callback were specified.
1367
+
1368
+ If C<$coderef> is omitted or undefined, the corresponding callback will be
1369
+ disabled. There can only ever be one callback for a given key.
1370
+
1371
+ As this callback gets called less often than the C<filter_json_object>
1372
+ one, decoding speed will not usually suffer as much. Therefore, single-key
1373
+ objects make excellent targets to serialise Perl objects into, especially
1374
+ as single-key JSON objects are as close to the type-tagged value concept
1375
+ as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
1376
+ support this in any way, so you need to make sure your data never looks
1377
+ like a serialised Perl hash.
1378
+
1379
+ Typical names for the single object key are C<__class_whatever__>, or
1380
+ C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
1381
+ things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
1382
+ with real hashes.
1383
+
1384
+ Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
1385
+ into the corresponding C<< $WIDGET{<id>} >> object:
1386
+
1387
+ # return whatever is in $WIDGET{5}:
1388
+ JSON
1389
+ ->new
1390
+ ->filter_json_single_key_object (__widget__ => sub {
1391
+ $WIDGET{ $_[0] }
1392
+ })
1393
+ ->decode ('{"__widget__": 5}')
1394
+
1395
+ # this can be used with a TO_JSON method in some "widget" class
1396
+ # for serialisation to json:
1397
+ sub WidgetBase::TO_JSON {
1398
+ my ($self) = @_;
1399
+
1400
+ unless ($self->{id}) {
1401
+ $self->{id} = ..get..some..id..;
1402
+ $WIDGET{$self->{id}} = $self;
1403
+ }
1404
+
1405
+ { __widget__ => $self->{id} }
1406
+ }
1407
+
1408
+
1409
+ =head2 shrink
1410
+
1411
+ $json = $json->shrink([$enable])
1412
+
1413
+ $enabled = $json->get_shrink
1414
+
1415
+ With JSON::XS, this flag resizes strings generated by either
1416
+ C<encode> or C<decode> to their minimum size possible. This can save
1417
+ memory when your JSON texts are either very very long or you have many
1418
+ short strings. It will also try to downgrade any strings to octet-form
1419
+ if possible: perl stores strings internally either in an encoding called
1420
+ UTF-X or in octet-form. The latter cannot store everything but uses less
1421
+ space in general (and some buggy Perl or C code might even rely on that
1422
+ internal representation being used).
1423
+
1424
+ With JSON::PP, it is noop about resizing strings but tries
1425
+ C<utf8::downgrade> to the returned string by C<encode>. See to L<utf8>.
1426
+
1427
+ See to L<JSON::XS/OBJECT-ORIENTED INTERFACE> and L<JSON::PP/METHODS>.
1428
+
1429
+ =head2 max_depth
1430
+
1431
+ $json = $json->max_depth([$maximum_nesting_depth])
1432
+
1433
+ $max_depth = $json->get_max_depth
1434
+
1435
+ Sets the maximum nesting level (default C<512>) accepted while encoding
1436
+ or decoding. If a higher nesting level is detected in JSON text or a Perl
1437
+ data structure, then the encoder and decoder will stop and croak at that
1438
+ point.
1439
+
1440
+ Nesting level is defined by number of hash- or arrayrefs that the encoder
1441
+ needs to traverse to reach a given point or the number of C<{> or C<[>
1442
+ characters without their matching closing parenthesis crossed to reach a
1443
+ given character in a string.
1444
+
1445
+ If no argument is given, the highest possible setting will be used, which
1446
+ is rarely useful.
1447
+
1448
+ Note that nesting is implemented by recursion in C. The default value has
1449
+ been chosen to be as large as typical operating systems allow without
1450
+ crashing. (JSON::XS)
1451
+
1452
+ With JSON::PP as the backend, when a large value (100 or more) was set and
1453
+ it de/encodes a deep nested object/text, it may raise a warning
1454
+ 'Deep recursion on subroutine' at the perl runtime phase.
1455
+
1456
+ See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
1457
+
1458
+ =head2 max_size
1459
+
1460
+ $json = $json->max_size([$maximum_string_size])
1461
+
1462
+ $max_size = $json->get_max_size
1463
+
1464
+ Set the maximum length a JSON text may have (in bytes) where decoding is
1465
+ being attempted. The default is C<0>, meaning no limit. When C<decode>
1466
+ is called on a string that is longer than this many bytes, it will not
1467
+ attempt to decode the string but throw an exception. This setting has no
1468
+ effect on C<encode> (yet).
1469
+
1470
+ If no argument is given, the limit check will be deactivated (same as when
1471
+ C<0> is specified).
1472
+
1473
+ See L<JSON::XS/SECURITY CONSIDERATIONS>, below, for more info on why this is useful.
1474
+
1475
+ =head2 encode
1476
+
1477
+ $json_text = $json->encode($perl_scalar)
1478
+
1479
+ Converts the given Perl data structure (a simple scalar or a reference
1480
+ to a hash or array) to its JSON representation. Simple scalars will be
1481
+ converted into JSON string or number sequences, while references to arrays
1482
+ become JSON arrays and references to hashes become JSON objects. Undefined
1483
+ Perl values (e.g. C<undef>) become JSON C<null> values.
1484
+ References to the integers C<0> and C<1> are converted into C<false> and C<true>.
1485
+
1486
+ =head2 decode
1487
+
1488
+ $perl_scalar = $json->decode($json_text)
1489
+
1490
+ The opposite of C<encode>: expects a JSON text and tries to parse it,
1491
+ returning the resulting simple scalar or reference. Croaks on error.
1492
+
1493
+ JSON numbers and strings become simple Perl scalars. JSON arrays become
1494
+ Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
1495
+ C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
1496
+ C<null> becomes C<undef>.
1497
+
1498
+ =head2 decode_prefix
1499
+
1500
+ ($perl_scalar, $characters) = $json->decode_prefix($json_text)
1501
+
1502
+ This works like the C<decode> method, but instead of raising an exception
1503
+ when there is trailing garbage after the first JSON object, it will
1504
+ silently stop parsing there and return the number of characters consumed
1505
+ so far.
1506
+
1507
+ JSON->new->decode_prefix ("[1] the tail")
1508
+ => ([1], 3)
1509
+
1510
+ See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
1511
+
1512
+ =head2 property
1513
+
1514
+ $boolean = $json->property($property_name)
1515
+
1516
+ Returns a boolean value for one of the properties described above.
1517
+
1518
+ The available properties are C<ascii>, C<latin1>, C<utf8>,
1519
+ C<indent>, C<space_before>, C<space_after>, C<relaxed>, C<canonical>,
1520
+ C<allow_nonref>, C<allow_unknown>, C<allow_blessed>, C<convert_blessed>,
1521
+ C<shrink>, C<max_depth> and C<max_size>.
1522
+
1523
+ $boolean = $json->property('utf8');
1524
+ => 0
1525
+ $json->utf8;
1526
+ $boolean = $json->property('utf8');
1527
+ => 1
1528
+
1529
+ Sets the property with a given boolean value.
1530
+
1531
+ $json = $json->property($property_name => $boolean);
1532
+
1533
+ With no argument, it returns all the above properties as a hash reference.
1534
+
1535
+ $flag_hashref = $json->property();
1536
+
1537
+ =head1 INCREMENTAL PARSING
1538
+
1539
+ Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
1540
+
1541
+ In some cases, there is the need for incremental parsing of JSON texts.
1542
+ This module does allow you to parse a JSON stream incrementally.
1543
+ It does so by accumulating text until it has a full JSON object, which
1544
+ it then can decode. This process is similar to using C<decode_prefix>
1545
+ to see if a full JSON object is available, but is much more efficient
1546
+ (and can be implemented with a minimum of method calls).
1547
+
1548
+ The backend module will only attempt to parse the JSON text once it is sure it
1549
+ has enough text to get a decisive result, using a very simple but
1550
+ truly incremental parser. This means that it sometimes won't stop as
1551
+ early as the full parser, for example, it doesn't detect parenthesis
1552
+ mismatches. The only thing it guarantees is that it starts decoding as
1553
+ soon as a syntactically valid JSON text has been seen. This means you need
1554
+ to set resource limits (e.g. C<max_size>) to ensure the parser will stop
1555
+ parsing in the presence of syntax errors.
1556
+
1557
+ The following methods implement this incremental parser.
1558
+
1559
+ =head2 incr_parse
1560
+
1561
+ $json->incr_parse( [$string] ) # void context
1562
+
1563
+ $obj_or_undef = $json->incr_parse( [$string] ) # scalar context
1564
+
1565
+ @obj_or_empty = $json->incr_parse( [$string] ) # list context
1566
+
1567
+ This is the central parsing function. It can both append new text and
1568
+ extract objects from the stream accumulated so far (both of these
1569
+ functions are optional).
1570
+
1571
+ If C<$string> is given, then this string is appended to the already
1572
+ existing JSON fragment stored in the C<$json> object.
1573
+
1574
+ After that, if the function is called in void context, it will simply
1575
+ return without doing anything further. This can be used to add more text
1576
+ in as many chunks as you want.
1577
+
1578
+ If the method is called in scalar context, then it will try to extract
1579
+ exactly I<one> JSON object. If that is successful, it will return this
1580
+ object, otherwise it will return C<undef>. If there is a parse error,
1581
+ this method will croak just as C<decode> would do (one can then use
1582
+ C<incr_skip> to skip the erroneous part). This is the most common way of
1583
+ using the method.
1584
+
1585
+ And finally, in list context, it will try to extract as many objects
1586
+ from the stream as it can find and return them, or the empty list
1587
+ otherwise. For this to work, there must be no separators between the JSON
1588
+ objects or arrays, instead they must be concatenated back-to-back. If
1589
+ an error occurs, an exception will be raised as in the scalar context
1590
+ case. Note that in this case, any previously-parsed JSON texts will be
1591
+ lost.
1592
+
1593
+ Example: Parse some JSON arrays/objects in a given string and return them.
1594
+
1595
+ my @objs = JSON->new->incr_parse ("[5][7][1,2]");
1596
+
1597
+ =head2 incr_text
1598
+
1599
+ $lvalue_string = $json->incr_text
1600
+
1601
+ This method returns the currently stored JSON fragment as an lvalue, that
1602
+ is, you can manipulate it. This I<only> works when a preceding call to
1603
+ C<incr_parse> in I<scalar context> successfully returned an object. Under
1604
+ all other circumstances you must not call this function (I mean it.
1605
+ although in simple tests it might actually work, it I<will> fail under
1606
+ real world conditions). As a special exception, you can also call this
1607
+ method before having parsed anything.
1608
+
1609
+ This function is useful in two cases: a) finding the trailing text after a
1610
+ JSON object or b) parsing multiple JSON objects separated by non-JSON text
1611
+ (such as commas).
1612
+
1613
+ $json->incr_text =~ s/\s*,\s*//;
1614
+
1615
+ In Perl 5.005, C<lvalue> attribute is not available.
1616
+ You must write codes like the below:
1617
+
1618
+ $string = $json->incr_text;
1619
+ $string =~ s/\s*,\s*//;
1620
+ $json->incr_text( $string );
1621
+
1622
+ =head2 incr_skip
1623
+
1624
+ $json->incr_skip
1625
+
1626
+ This will reset the state of the incremental parser and will remove the
1627
+ parsed text from the input buffer. This is useful after C<incr_parse>
1628
+ died, in which case the input buffer and incremental parser state is left
1629
+ unchanged, to skip the text parsed so far and to reset the parse state.
1630
+
1631
+ =head2 incr_reset
1632
+
1633
+ $json->incr_reset
1634
+
1635
+ This completely resets the incremental parser, that is, after this call,
1636
+ it will be as if the parser had never parsed anything.
1637
+
1638
+ This is useful if you want to repeatedly parse JSON objects and want to
1639
+ ignore any trailing data, which means you have to reset the parser after
1640
+ each successful decode.
1641
+
1642
+ See to L<JSON::XS/INCREMENTAL PARSING> for examples.
1643
+
1644
+
1645
+ =head1 JSON::PP SUPPORT METHODS
1646
+
1647
+ The below methods are JSON::PP own methods, so when C<JSON> works
1648
+ with JSON::PP (i.e. the created object is a JSON::PP object), available.
1649
+ See to L<JSON::PP/JSON::PP OWN METHODS> in detail.
1650
+
1651
+ If you use C<JSON> with additional C<-support_by_pp>, some methods
1652
+ are available even with JSON::XS. See to L<USE PP FEATURES EVEN THOUGH XS BACKEND>.
1653
+
1654
+ BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::XS' }
1655
+
1656
+ use JSON -support_by_pp;
1657
+
1658
+ my $json = JSON->new;
1659
+ $json->allow_nonref->escape_slash->encode("/");
1660
+
1661
+ # functional interfaces too.
1662
+ print to_json(["/"], {escape_slash => 1});
1663
+ print from_json('["foo"]', {utf8 => 1});
1664
+
1665
+ If you do not want to export any functions but still want C<-support_by_pp>,
1666
+ use C<-no_export>.
1667
+
1668
+ use JSON -support_by_pp, -no_export;
1669
+ # functional interfaces are not exported.
1670
+
1671
+ =head2 allow_singlequote
1672
+
1673
+ $json = $json->allow_singlequote([$enable])
1674
+
1675
+ If C<$enable> is true (or missing), then C<decode> will accept
1676
+ any JSON strings quoted by single quotations that are invalid JSON
1677
+ format.
1678
+
1679
+ $json->allow_singlequote->decode(qq|{"foo":'bar'}|);
1680
+ $json->allow_singlequote->decode(qq|{'foo':"bar"}|);
1681
+ $json->allow_singlequote->decode(qq|{'foo':'bar'}|);
1682
+
1683
+ As same as the C<relaxed> option, this option may be used to parse
1684
+ application-specific files written by humans.
1685
+
1686
+ =head2 allow_barekey
1687
+
1688
+ $json = $json->allow_barekey([$enable])
1689
+
1690
+ If C<$enable> is true (or missing), then C<decode> will accept
1691
+ bare keys of JSON object that are invalid JSON format.
1692
+
1693
+ As same as the C<relaxed> option, this option may be used to parse
1694
+ application-specific files written by humans.
1695
+
1696
+ $json->allow_barekey->decode('{foo:"bar"}');
1697
+
1698
+ =head2 allow_bignum
1699
+
1700
+ $json = $json->allow_bignum([$enable])
1701
+
1702
+ If C<$enable> is true (or missing), then C<decode> will convert
1703
+ the big integer Perl cannot handle as integer into a L<Math::BigInt>
1704
+ object and convert a floating number (any) into a L<Math::BigFloat>.
1705
+
1706
+ On the contrary, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
1707
+ objects into JSON numbers with C<allow_blessed> enabled.
1708
+
1709
+ $json->allow_nonref->allow_blessed->allow_bignum;
1710
+ $bigfloat = $json->decode('2.000000000000000000000000001');
1711
+ print $json->encode($bigfloat);
1712
+ # => 2.000000000000000000000000001
1713
+
1714
+ See to L<MAPPING> about the conversion of JSON number.
1715
+
1716
+ =head2 loose
1717
+
1718
+ $json = $json->loose([$enable])
1719
+
1720
+ The unescaped [\x00-\x1f\x22\x2f\x5c] strings are invalid in JSON strings
1721
+ and the module doesn't allow C<decode> to accept them (except for \x2f).
1722
+ If C<$enable> is true (or missing), then C<decode> will accept these
1723
+ unescaped strings.
1724
+
1725
+ $json->loose->decode(qq|["abc
1726
+ def"]|);
1727
+
1728
+ See to L<JSON::PP/JSON::PP OWN METHODS>.
1729
+
1730
+ =head2 escape_slash
1731
+
1732
+ $json = $json->escape_slash([$enable])
1733
+
1734
+ According to JSON Grammar, I<slash> (U+002F) is escaped. But by default
1735
+ JSON backend modules encode strings without escaping slash.
1736
+
1737
+ If C<$enable> is true (or missing), then C<encode> will escape slashes.
1738
+
1739
+ =head2 indent_length
1740
+
1741
+ $json = $json->indent_length($length)
1742
+
1743
+ With JSON::XS, the indent space length is 3 and cannot be changed.
1744
+ With JSON::PP, it sets the indent space length with the given $length.
1745
+ The default is 3. The acceptable range is 0 to 15.
1746
+
1747
+ =head2 sort_by
1748
+
1749
+ $json = $json->sort_by($function_name)
1750
+ $json = $json->sort_by($subroutine_ref)
1751
+
1752
+ If $function_name or $subroutine_ref is set, its sort routine is used.
1753
+
1754
+ $js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
1755
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
1756
+
1757
+ $js = $pc->sort_by('own_sort')->encode($obj);
1758
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
1759
+
1760
+ sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
1761
+
1762
+ As the sorting routine runs in the JSON::PP scope, the given
1763
+ subroutine name and the special variables C<$a>, C<$b> will begin
1764
+ with 'JSON::PP::'.
1765
+
1766
+ If $integer is set, then the effect is same as C<canonical> on.
1767
+
1768
+ See to L<JSON::PP/JSON::PP OWN METHODS>.
1769
+
1770
+ =head1 MAPPING
1771
+
1772
+ This section is copied from JSON::XS and modified to C<JSON>.
1773
+ JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
1774
+
1775
+ See to L<JSON::XS/MAPPING>.
1776
+
1777
+ =head2 JSON -> PERL
1778
+
1779
+ =over 4
1780
+
1781
+ =item object
1782
+
1783
+ A JSON object becomes a reference to a hash in Perl. No ordering of object
1784
+ keys is preserved (JSON does not preserve object key ordering itself).
1785
+
1786
+ =item array
1787
+
1788
+ A JSON array becomes a reference to an array in Perl.
1789
+
1790
+ =item string
1791
+
1792
+ A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
1793
+ are represented by the same codepoints in the Perl string, so no manual
1794
+ decoding is necessary.
1795
+
1796
+ =item number
1797
+
1798
+ A JSON number becomes either an integer, numeric (floating point) or
1799
+ string scalar in perl, depending on its range and any fractional parts. On
1800
+ the Perl level, there is no difference between those as Perl handles all
1801
+ the conversion details, but an integer may take slightly less memory and
1802
+ might represent more values exactly than floating point numbers.
1803
+
1804
+ If the number consists of digits only, C<JSON> will try to represent
1805
+ it as an integer value. If that fails, it will try to represent it as
1806
+ a numeric (floating point) value if that is possible without loss of
1807
+ precision. Otherwise it will preserve the number as a string value (in
1808
+ which case you lose roundtripping ability, as the JSON number will be
1809
+ re-encoded to a JSON string).
1810
+
1811
+ Numbers containing a fractional or exponential part will always be
1812
+ represented as numeric (floating point) values, possibly at a loss of
1813
+ precision (in which case you might lose perfect roundtripping ability, but
1814
+ the JSON number will still be re-encoded as a JSON number).
1815
+
1816
+ Note that precision is not accuracy - binary floating point values cannot
1817
+ represent most decimal fractions exactly, and when converting from and to
1818
+ floating point, C<JSON> only guarantees precision up to but not including
1819
+ the least significant bit.
1820
+
1821
+ If the backend is JSON::PP and C<allow_bignum> is enabled, the big integers
1822
+ and the numeric can be optionally converted into L<Math::BigInt> and
1823
+ L<Math::BigFloat> objects.
1824
+
1825
+ =item true, false
1826
+
1827
+ These JSON atoms become C<JSON::true> and C<JSON::false>,
1828
+ respectively. They are overloaded to act almost exactly like the numbers
1829
+ C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
1830
+ the C<JSON::is_bool> function.
1831
+
1832
+ print JSON::true + 1;
1833
+ => 2
1834
+
1835
+ ok(JSON::true eq '1');
1836
+ ok(JSON::true == 1);
1837
+
1838
+ C<JSON> will install these missing overloading features to the backend modules.
1839
+
1840
+
1841
+ =item null
1842
+
1843
+ A JSON null atom becomes C<undef> in Perl.
1844
+
1845
+ C<JSON::null> returns C<undef>.
1846
+
1847
+ =back
1848
+
1849
+
1850
+ =head2 PERL -> JSON
1851
+
1852
+ The mapping from Perl to JSON is slightly more difficult, as Perl is a
1853
+ truly typeless language, so we can only guess which JSON type is meant by
1854
+ a Perl value.
1855
+
1856
+ =over 4
1857
+
1858
+ =item hash references
1859
+
1860
+ Perl hash references become JSON objects. As there is no inherent ordering
1861
+ in hash keys (or JSON objects), they will usually be encoded in a
1862
+ pseudo-random order that can change between runs of the same program but
1863
+ stays generally the same within a single run of a program. C<JSON>
1864
+ can optionally sort the hash keys (determined by the I<canonical> flag), so
1865
+ the same data structure will serialise to the same JSON text (given same
1866
+ settings and version of JSON::XS), but this incurs a runtime overhead
1867
+ and is only rarely useful, e.g. when you want to compare some JSON text
1868
+ against another for equality.
1869
+
1870
+ In future, the ordered object feature will be added to JSON::PP using C<tie> mechanism.
1871
+
1872
+
1873
+ =item array references
1874
+
1875
+ Perl array references become JSON arrays.
1876
+
1877
+ =item other references
1878
+
1879
+ Other unblessed references are generally not allowed and will cause an
1880
+ exception to be thrown, except for references to the integers C<0> and
1881
+ C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
1882
+ also use C<JSON::false> and C<JSON::true> to improve readability.
1883
+
1884
+ to_json [\0,JSON::true] # yields [false,true]
1885
+
1886
+ =item JSON::true, JSON::false, JSON::null
1887
+
1888
+ These special values become JSON true and JSON false values,
1889
+ respectively. You can also use C<\1> and C<\0> directly if you want.
1890
+
1891
+ JSON::null returns C<undef>.
1892
+
1893
+ =item blessed objects
1894
+
1895
+ Blessed objects are not directly representable in JSON. See the
1896
+ C<allow_blessed> and C<convert_blessed> methods on various options on
1897
+ how to deal with this: basically, you can choose between throwing an
1898
+ exception, encoding the reference as if it weren't blessed, or provide
1899
+ your own serialiser method.
1900
+
1901
+ With C<convert_blessed_universally> mode, C<encode> converts blessed
1902
+ hash references or blessed array references (contains other blessed references)
1903
+ into JSON members and arrays.
1904
+
1905
+ use JSON -convert_blessed_universally;
1906
+ JSON->new->allow_blessed->convert_blessed->encode( $blessed_object );
1907
+
1908
+ See to L<convert_blessed>.
1909
+
1910
+ =item simple scalars
1911
+
1912
+ Simple Perl scalars (any scalar that is not a reference) are the most
1913
+ difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
1914
+ JSON C<null> values, scalars that have last been used in a string context
1915
+ before encoding as JSON strings, and anything else as number value:
1916
+
1917
+ # dump as number
1918
+ encode_json [2] # yields [2]
1919
+ encode_json [-3.0e17] # yields [-3e+17]
1920
+ my $value = 5; encode_json [$value] # yields [5]
1921
+
1922
+ # used as string, so dump as string
1923
+ print $value;
1924
+ encode_json [$value] # yields ["5"]
1925
+
1926
+ # undef becomes null
1927
+ encode_json [undef] # yields [null]
1928
+
1929
+ You can force the type to be a string by stringifying it:
1930
+
1931
+ my $x = 3.1; # some variable containing a number
1932
+ "$x"; # stringified
1933
+ $x .= ""; # another, more awkward way to stringify
1934
+ print $x; # perl does it for you, too, quite often
1935
+
1936
+ You can force the type to be a number by numifying it:
1937
+
1938
+ my $x = "3"; # some variable containing a string
1939
+ $x += 0; # numify it, ensuring it will be dumped as a number
1940
+ $x *= 1; # same thing, the choice is yours.
1941
+
1942
+ You can not currently force the type in other, less obscure, ways.
1943
+
1944
+ Note that numerical precision has the same meaning as under Perl (so
1945
+ binary to decimal conversion follows the same rules as in Perl, which
1946
+ can differ to other languages). Also, your perl interpreter might expose
1947
+ extensions to the floating point numbers of your platform, such as
1948
+ infinities or NaN's - these cannot be represented in JSON, and it is an
1949
+ error to pass those in.
1950
+
1951
+ =item Big Number
1952
+
1953
+ If the backend is JSON::PP and C<allow_bignum> is enabled,
1954
+ C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
1955
+ objects into JSON numbers.
1956
+
1957
+
1958
+ =back
1959
+
1960
+ =head1 JSON and ECMAscript
1961
+
1962
+ See to L<JSON::XS/JSON and ECMAscript>.
1963
+
1964
+ =head1 JSON and YAML
1965
+
1966
+ JSON is not a subset of YAML.
1967
+ See to L<JSON::XS/JSON and YAML>.
1968
+
1969
+
1970
+ =head1 BACKEND MODULE DECISION
1971
+
1972
+ When you use C<JSON>, C<JSON> tries to C<use> JSON::XS. If this call fails, it will
1973
+ C<use> JSON::PP. The required JSON::XS version is I<2.2> or later.
1974
+
1975
+ The C<JSON> constructor method returns an object inherited from the backend module,
1976
+ and JSON::XS object is a blessed scalar reference while JSON::PP is a blessed hash
1977
+ reference.
1978
+
1979
+ So, your program should not depend on the backend module, especially
1980
+ returned objects should not be modified.
1981
+
1982
+ my $json = JSON->new; # XS or PP?
1983
+ $json->{stash} = 'this is xs object'; # this code may raise an error!
1984
+
1985
+ To check the backend module, there are some methods - C<backend>, C<is_pp> and C<is_xs>.
1986
+
1987
+ JSON->backend; # 'JSON::XS' or 'JSON::PP'
1988
+
1989
+ JSON->backend->is_pp; # 0 or 1
1990
+
1991
+ JSON->backend->is_xs; # 1 or 0
1992
+
1993
+ $json->is_xs; # 1 or 0
1994
+
1995
+ $json->is_pp; # 0 or 1
1996
+
1997
+
1998
+ If you set an environment variable C<PERL_JSON_BACKEND>, the calling action will be changed.
1999
+
2000
+ =over
2001
+
2002
+ =item PERL_JSON_BACKEND = 0 or PERL_JSON_BACKEND = 'JSON::PP'
2003
+
2004
+ Always use JSON::PP
2005
+
2006
+ =item PERL_JSON_BACKEND = 1 or PERL_JSON_BACKEND = 'JSON::XS,JSON::PP'
2007
+
2008
+ (The default) Use compiled JSON::XS if it is properly compiled & installed,
2009
+ otherwise use JSON::PP.
2010
+
2011
+ =item PERL_JSON_BACKEND = 2 or PERL_JSON_BACKEND = 'JSON::XS'
2012
+
2013
+ Always use compiled JSON::XS, die if it isn't properly compiled & installed.
2014
+
2015
+ =item PERL_JSON_BACKEND = 'JSON::backportPP'
2016
+
2017
+ Always use JSON::backportPP.
2018
+ JSON::backportPP is JSON::PP back port module.
2019
+ C<JSON> includes JSON::backportPP instead of JSON::PP.
2020
+
2021
+ =back
2022
+
2023
+ These ideas come from L<DBI::PurePerl> mechanism.
2024
+
2025
+ example:
2026
+
2027
+ BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::PP' }
2028
+ use JSON; # always uses JSON::PP
2029
+
2030
+ In future, it may be able to specify another module.
2031
+
2032
+ =head1 USE PP FEATURES EVEN THOUGH XS BACKEND
2033
+
2034
+ Many methods are available with either JSON::XS or JSON::PP and
2035
+ when the backend module is JSON::XS, if any JSON::PP specific (i.e. JSON::XS unsupported)
2036
+ method is called, it will C<warn> and be noop.
2037
+
2038
+ But if you C<use> C<JSON> passing the optional string C<-support_by_pp>,
2039
+ it makes a part of those unsupported methods available.
2040
+ This feature is achieved by using JSON::PP in C<de/encode>.
2041
+
2042
+ BEGIN { $ENV{PERL_JSON_BACKEND} = 2 } # with JSON::XS
2043
+ use JSON -support_by_pp;
2044
+ my $json = JSON->new;
2045
+ $json->allow_nonref->escape_slash->encode("/");
2046
+
2047
+ At this time, the returned object is a C<JSON::Backend::XS::Supportable>
2048
+ object (re-blessed XS object), and by checking JSON::XS unsupported flags
2049
+ in de/encoding, can support some unsupported methods - C<loose>, C<allow_bignum>,
2050
+ C<allow_barekey>, C<allow_singlequote>, C<escape_slash> and C<indent_length>.
2051
+
2052
+ When none of the unsupported methods are enabled, C<XS de/encode> will be
2053
+ used as is. The switch is achieved by changing the symbolic tables.
2054
+
2055
+ C<-support_by_pp> is effective only when the backend module is JSON::XS
2056
+ and it makes the de/encoding speed down a bit.
2057
+
2058
+ See to L<JSON::PP SUPPORT METHODS>.
2059
+
2060
+ =head1 INCOMPATIBLE CHANGES TO OLD VERSION
2061
+
2062
+ There are big incompatibilities between the new version (2.00) and the old (1.xx).
2063
+ If you use old C<JSON> 1.xx in your code, please check it.
2064
+
2065
+ See to L<Transition ways from 1.xx to 2.xx.>
2066
+
2067
+ =over
2068
+
2069
+ =item jsonToObj and objToJson are obsoleted.
2070
+
2071
+ Non Perl-style name C<jsonToObj> and C<objToJson> are obsoleted
2072
+ (but not yet deleted from the source).
2073
+ If you use these functions in your code, please replace them
2074
+ with C<from_json> and C<to_json>.
2075
+
2076
+
2077
+ =item Global variables are no longer available.
2078
+
2079
+ C<JSON> class variables - C<$JSON::AUTOCONVERT>, C<$JSON::BareKey>, etc...
2080
+ - are not available any longer.
2081
+ Instead, various features can be used through object methods.
2082
+
2083
+
2084
+ =item Package JSON::Converter and JSON::Parser are deleted.
2085
+
2086
+ Now C<JSON> bundles with JSON::PP which can handle JSON more properly than them.
2087
+
2088
+ =item Package JSON::NotString is deleted.
2089
+
2090
+ There was C<JSON::NotString> class which represents JSON value C<true>, C<false>, C<null>
2091
+ and numbers. It was deleted and replaced by C<JSON::Boolean>.
2092
+
2093
+ C<JSON::Boolean> represents C<true> and C<false>.
2094
+
2095
+ C<JSON::Boolean> does not represent C<null>.
2096
+
2097
+ C<JSON::null> returns C<undef>.
2098
+
2099
+ C<JSON> makes L<JSON::XS::Boolean> and L<JSON::PP::Boolean> is-a relation
2100
+ to L<JSON::Boolean>.
2101
+
2102
+ =item function JSON::Number is obsoleted.
2103
+
2104
+ C<JSON::Number> is now needless because JSON::XS and JSON::PP have
2105
+ round-trip integrity.
2106
+
2107
+ =item JSONRPC modules are deleted.
2108
+
2109
+ Perl implementation of JSON-RPC protocol - C<JSONRPC>, C<JSONRPC::Transport::HTTP>
2110
+ and C<Apache::JSONRPC> are deleted in this distribution.
2111
+ Instead of them, there is L<JSON::RPC> which supports JSON-RPC protocol version 1.1.
2112
+
2113
+ =back
2114
+
2115
+ =head2 Transition ways from 1.xx to 2.xx.
2116
+
2117
+ You should set C<-support_by_pp> mode first, because
2118
+ it is always successful for the below codes even with JSON::XS.
2119
+
2120
+ use JSON -support_by_pp;
2121
+
2122
+ =over
2123
+
2124
+ =item Exported jsonToObj (simple)
2125
+
2126
+ from_json($json_text);
2127
+
2128
+ =item Exported objToJson (simple)
2129
+
2130
+ to_json($perl_scalar);
2131
+
2132
+ =item Exported jsonToObj (advanced)
2133
+
2134
+ $flags = {allow_barekey => 1, allow_singlequote => 1};
2135
+ from_json($json_text, $flags);
2136
+
2137
+ equivalent to:
2138
+
2139
+ $JSON::BareKey = 1;
2140
+ $JSON::QuotApos = 1;
2141
+ jsonToObj($json_text);
2142
+
2143
+ =item Exported objToJson (advanced)
2144
+
2145
+ $flags = {allow_blessed => 1, allow_barekey => 1};
2146
+ to_json($perl_scalar, $flags);
2147
+
2148
+ equivalent to:
2149
+
2150
+ $JSON::BareKey = 1;
2151
+ objToJson($perl_scalar);
2152
+
2153
+ =item jsonToObj as object method
2154
+
2155
+ $json->decode($json_text);
2156
+
2157
+ =item objToJson as object method
2158
+
2159
+ $json->encode($perl_scalar);
2160
+
2161
+ =item new method with parameters
2162
+
2163
+ The C<new> method in 2.x no longer takes any parameters.
2164
+ You can set parameters instead:
2165
+
2166
+ $json = JSON->new->pretty;
2167
+
2168
+ =item $JSON::Pretty, $JSON::Indent, $JSON::Delimiter
2169
+
2170
+ If C<indent> is enabled, that means C<$JSON::Pretty> flag set. And
2171
+ C<$JSON::Delimiter> was substituted by C<space_before> and C<space_after>.
2172
+ In conclusion:
2173
+
2174
+ $json->indent->space_before->space_after;
2175
+
2176
+ Equivalent to:
2177
+
2178
+ $json->pretty;
2179
+
2180
+ To change indent length, use C<indent_length>.
2181
+
2182
+ (Only with JSON::PP, if C<-support_by_pp> is not used.)
2183
+
2184
+ $json->pretty->indent_length(2)->encode($perl_scalar);
2185
+
2186
+ =item $JSON::BareKey
2187
+
2188
+ (Only with JSON::PP, if C<-support_by_pp> is not used.)
2189
+
2190
+ $json->allow_barekey->decode($json_text)
2191
+
2192
+ =item $JSON::ConvBlessed
2193
+
2194
+ use C<-convert_blessed_universally>. See to L<convert_blessed>.
2195
+
2196
+ =item $JSON::QuotApos
2197
+
2198
+ (Only with JSON::PP, if C<-support_by_pp> is not used.)
2199
+
2200
+ $json->allow_singlequote->decode($json_text)
2201
+
2202
+ =item $JSON::SingleQuote
2203
+
2204
+ Disable. C<JSON> does not make such an invalid JSON string any longer.
2205
+
2206
+ =item $JSON::KeySort
2207
+
2208
+ $json->canonical->encode($perl_scalar)
2209
+
2210
+ This is the ascii sort.
2211
+
2212
+ If you want to use with your own sort routine, check the C<sort_by> method.
2213
+
2214
+ (Only with JSON::PP, even if C<-support_by_pp> is used currently.)
2215
+
2216
+ $json->sort_by($sort_routine_ref)->encode($perl_scalar)
2217
+
2218
+ $json->sort_by(sub { $JSON::PP::a <=> $JSON::PP::b })->encode($perl_scalar)
2219
+
2220
+ Can't access C<$a> and C<$b> but C<$JSON::PP::a> and C<$JSON::PP::b>.
2221
+
2222
+ =item $JSON::SkipInvalid
2223
+
2224
+ $json->allow_unknown
2225
+
2226
+ =item $JSON::AUTOCONVERT
2227
+
2228
+ Needless. C<JSON> backend modules have the round-trip integrity.
2229
+
2230
+ =item $JSON::UTF8
2231
+
2232
+ Needless because C<JSON> (JSON::XS/JSON::PP) sets
2233
+ the UTF8 flag on properly.
2234
+
2235
+ # With UTF8-flagged strings
2236
+
2237
+ $json->allow_nonref;
2238
+ $str = chr(1000); # UTF8-flagged
2239
+
2240
+ $json_text = $json->utf8(0)->encode($str);
2241
+ utf8::is_utf8($json_text);
2242
+ # true
2243
+ $json_text = $json->utf8(1)->encode($str);
2244
+ utf8::is_utf8($json_text);
2245
+ # false
2246
+
2247
+ $str = '"' . chr(1000) . '"'; # UTF8-flagged
2248
+
2249
+ $perl_scalar = $json->utf8(0)->decode($str);
2250
+ utf8::is_utf8($perl_scalar);
2251
+ # true
2252
+ $perl_scalar = $json->utf8(1)->decode($str);
2253
+ # died because of 'Wide character in subroutine'
2254
+
2255
+ See to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
2256
+
2257
+ =item $JSON::UnMapping
2258
+
2259
+ Disable. See to L<MAPPING>.
2260
+
2261
+ =item $JSON::SelfConvert
2262
+
2263
+ This option was deleted.
2264
+ Instead of it, if a given blessed object has the C<TO_JSON> method,
2265
+ C<TO_JSON> will be executed with C<convert_blessed>.
2266
+
2267
+ $json->convert_blessed->encode($blessed_hashref_or_arrayref)
2268
+ # if need, call allow_blessed
2269
+
2270
+ Note that it was C<toJson> in old version, but now not C<toJson> but C<TO_JSON>.
2271
+
2272
+ =back
2273
+
2274
+ =head1 TODO
2275
+
2276
+ =over
2277
+
2278
+ =item example programs
2279
+
2280
+ =back
2281
+
2282
+ =head1 THREADS
2283
+
2284
+ No test with JSON::PP. If with JSON::XS, See to L<JSON::XS/THREADS>.
2285
+
2286
+
2287
+ =head1 BUGS
2288
+
2289
+ Please report bugs relevant to C<JSON> to E<lt>makamaka[at]cpan.orgE<gt>.
2290
+
2291
+
2292
+ =head1 SEE ALSO
2293
+
2294
+ Most of the document is copied and modified from JSON::XS doc.
2295
+
2296
+ L<JSON::XS>, L<JSON::PP>
2297
+
2298
+ C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>)
2299
+
2300
+ =head1 AUTHOR
2301
+
2302
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
2303
+
2304
+ JSON::XS was written by Marc Lehmann <schmorp[at]schmorp.de>
2305
+
2306
+ The release of this new version owes to the courtesy of Marc Lehmann.
2307
+
2308
+
2309
+ =head1 COPYRIGHT AND LICENSE
2310
+
2311
+ Copyright 2005-2013 by Makamaka Hannyaharamitu
2312
+
2313
+ This library is free software; you can redistribute it and/or modify
2314
+ it under the same terms as Perl itself.
2315
+
2316
+ =cut
2317
+
uroman/lib/JSON/backportPP.pm ADDED
@@ -0,0 +1,2806 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package # This is JSON::backportPP
2
+ JSON::PP;
3
+
4
+ # JSON-2.0
5
+
6
+ use 5.005;
7
+ use strict;
8
+ use base qw(Exporter);
9
+ use overload ();
10
+
11
+ use Carp ();
12
+ use B ();
13
+ #use Devel::Peek;
14
+
15
+ use vars qw($VERSION);
16
+ $VERSION = '2.27204';
17
+
18
+ @JSON::PP::EXPORT = qw(encode_json decode_json from_json to_json);
19
+
20
+ # Index access was tried here instead of hash access, for speed,
21
+ # but it did not turn out faster than expected, so this will be changed.
22
+
23
# Slot numbers inside the per-object {PROPS} array; every boolean option
# occupies one slot. Kept as individual "use constant" statements because
# the module still declares "use 5.005".
use constant P_ASCII             => 0;
use constant P_LATIN1            => 1;
use constant P_UTF8              => 2;
use constant P_INDENT            => 3;
use constant P_CANONICAL         => 4;
use constant P_SPACE_BEFORE      => 5;
use constant P_SPACE_AFTER       => 6;
use constant P_ALLOW_NONREF      => 7;
use constant P_SHRINK            => 8;
use constant P_ALLOW_BLESSED     => 9;
use constant P_CONVERT_BLESSED   => 10;
use constant P_RELAXED           => 11;

use constant P_LOOSE             => 12;
use constant P_ALLOW_BIGNUM      => 13;
use constant P_ALLOW_BAREKEY     => 14;
use constant P_ALLOW_SINGLEQUOTE => 15;
use constant P_ESCAPE_SLASH      => 16;
use constant P_AS_NONBLESSED     => 17;

use constant P_ALLOW_UNKNOWN     => 18;

# True (1) when running on a perl older than 5.8.
use constant OLD_PERL            => $] < 5.008 ? 1 : 0;
46
+
47
BEGIN {
    # Boolean options whose names are shared with JSON::XS.
    my @xs_compati_bit_properties = qw(
        latin1 ascii utf8 indent canonical space_before space_after allow_nonref shrink
        allow_blessed convert_blessed relaxed allow_unknown
    );
    # Boolean options that exist only in this pure-Perl implementation.
    my @pp_bit_properties = qw(
        allow_singlequote allow_bignum loose
        allow_barekey escape_slash as_nonblessed
    );

    # Perl version check: on perls older than 5.8 a compatibility helper is
    # loaded (the helper module sets @JSON::PP::_properties).
    if ($] < 5.008) {
        my $helper = $] >= 5.006 ? 'JSON::backportPP::Compat5006' : 'JSON::backportPP::Compat5005';
        eval qq| require $helper |;
        Carp::croak $@ if $@;
    }

    # For every option generate a chainable setter "NAME" (argument defaults
    # to 1) and a getter "get_NAME" that yields 1 or ''. The slot is
    # addressed through the matching P_* constant name.
    foreach my $option (@xs_compati_bit_properties, @pp_bit_properties) {
        my $slot = 'P_' . uc($option);

        eval qq/
            sub $option {
                my \$enable = defined \$_[1] ? \$_[1] : 1;
                \$_[0]->{PROPS}->[$slot] = \$enable ? 1 : 0;
                \$_[0];
            }

            sub get_$option {
                \$_[0]->{PROPS}->[$slot] ? 1 : '';
            }
        /;
    }

}
89
+
90
+
91
+
92
+ # Functions
93
+
94
# Lookup set of option names on the encoding side (every key maps to 1).
my %encode_allow_method = map { ($_ => 1) } qw(
    utf8 pretty allow_nonref latin1 self_encode escape_slash
    allow_blessed convert_blessed indent indent_length allow_bignum
    as_nonblessed
);

# Lookup set of option names on the decoding side (every key maps to 1).
my %decode_allow_method = map { ($_ => 1) } qw(
    utf8 allow_nonref loose allow_singlequote allow_bignum
    allow_barekey max_size relaxed
);
102
+
103
+
104
+ my $JSON; # cache
105
+
106
# Functional interface: encode the argument with a shared encoder object.
# The encoder is created on first use with the utf8 option switched on and
# is cached in $JSON (the same cache decode_json uses).
sub encode_json ($) { # encode
    $JSON = __PACKAGE__->new->utf8 unless $JSON;
    $JSON->encode(@_);
}
109
+
110
+
111
# Functional interface: decode the argument with a shared decoder object.
# The object is created on first use with the utf8 option switched on and
# is cached in $JSON (the same cache encode_json uses).
sub decode_json { # decode
    $JSON = __PACKAGE__->new->utf8 unless $JSON;
    $JSON->decode(@_);
}
114
+
115
+ # Obsoleted
116
+
117
# Obsolete entry point: always dies, telling the caller to use encode_json.
sub to_json($) {
    my $message = "JSON::PP::to_json has been renamed to encode_json.";
    Carp::croak($message);
}
120
+
121
+
122
# Obsolete entry point: always dies, telling the caller to use decode_json.
sub from_json($) {
    my $message = "JSON::PP::from_json has been renamed to decode_json.";
    Carp::croak($message);
}
125
+
126
+
127
+ # Methods
128
+
129
# Constructor. Returns a hash-based object blessed into the invoking class
# and pre-filled with the default settings below.
sub new {
    my ($class) = @_;

    my %self = (
        max_depth     => 512,
        max_size      => 0,
        indent        => 0,
        FLAGS         => 0,
        # stored as a code ref; reports an encode error when invoked
        fallback      => sub { encode_error('Invalid value. JSON can only reference.') },
        indent_length => 3,
    );

    return bless \%self, $class;
}
142
+
143
+
144
# Method: hand the first argument to PP_encode_json and return its result.
sub encode {
    my $self = $_[0];
    return $self->PP_encode_json($_[1]);
}
147
+
148
+
149
# Method: decode the given text via PP_decode_json with no option bits set.
sub decode {
    my $self = $_[0];
    return $self->PP_decode_json($_[1], 0);
}
152
+
153
+
154
# Method: decode via PP_decode_json with the decode_prefix bit (0x1) set.
sub decode_prefix {
    my $self = $_[0];
    return $self->PP_decode_json($_[1], 1);
}
157
+
158
+
159
+ # accessor
160
+
161
+
162
+ # pretty printing
163
+
164
# Switch indent, space_before and space_after on or off together.
# With no argument (or undef) the options are switched on. Returns the object.
sub pretty {
    my ($self, $v) = @_;
    my $on = defined $v ? $v : 1;

    unless ($on) {
        $self->indent(0)->space_before(0)->space_after(0);
        return $self;
    }

    # indent_length(3) for JSON::XS compatibility
    $self->indent(1)->indent_length(3)->space_before(1)->space_after(1);
    return $self;
}
177
+
178
+ # etc
179
+
180
# Set the maximum nesting depth; undef or no argument selects 0x80000000.
# Returns the object for chaining.
sub max_depth {
    my ($self, $limit) = @_;
    $self->{max_depth} = defined $limit ? $limit : 0x80000000;
    return $self;
}
185
+
186
+
187
# Return the stored max_depth setting.
sub get_max_depth { return $_[0]->{max_depth} }
188
+
189
+
190
# Set the maximum accepted text size; undef or no argument stores 0.
# Returns the object for chaining.
sub max_size {
    my ($self, $limit) = @_;
    $self->{max_size} = defined $limit ? $limit : 0;
    return $self;
}
195
+
196
+
197
# Return the stored max_size setting.
sub get_max_size { return $_[0]->{max_size} }
198
+
199
+
200
# Install (or, with undef / no argument, clear to 0) the object filter
# callback and recompute F_HOOK, which is 1 while any filter is stored.
# Returns the object for chaining.
sub filter_json_object {
    my ($self, $callback) = @_;

    $self->{cb_object} = defined $callback ? $callback : 0;

    my $any_filter = $self->{cb_object} || $self->{cb_sk_object};
    $self->{F_HOOK} = $any_filter ? 1 : 0;

    return $self;
}
205
+
206
# Install or remove the filter callback for single-key objects with key KEY.
#
#   $json->filter_json_single_key_object($key => $callback);  # install
#   $json->filter_json_single_key_object($key);               # remove
#
# A defined callback is stored under $key. An undef or missing callback
# removes any callback stored for $key, and drops the whole table once it is
# empty so that F_HOOK goes back to 0 when no filter is left. Previously an
# undef callback was stored as a value and F_HOOK stayed switched on.
# Called with no key at all, only F_HOOK is recomputed.
# Returns the object for chaining.
sub filter_json_single_key_object {
    my ($self, $key, $callback) = @_;

    if (@_ > 1) {
        if (defined $callback) {
            $self->{cb_sk_object}->{$key} = $callback;
        }
        elsif ($self->{cb_sk_object}) {
            delete $self->{cb_sk_object}->{$key};
            delete $self->{cb_sk_object} unless %{ $self->{cb_sk_object} };
        }
    }

    $self->{F_HOOK} = ($self->{cb_object} or $self->{cb_sk_object}) ? 1 : 0;
    $self;
}
213
+
214
# Set the number of spaces per indentation level. Values outside 0..15 (or
# undef) are rejected with a warning and leave the setting untouched.
# Returns the object for chaining.
sub indent_length {
    my ($self, $width) = @_;

    my $acceptable = (defined $width and $width <= 15 and $width >= 0);

    if ($acceptable) {
        $self->{indent_length} = $width;
    }
    else {
        Carp::carp "The acceptable range of indent_length() is 0 to 15.";
    }

    return $self;
}
223
+
224
# Return the stored indent_length setting.
sub get_indent_length {
    my ($self) = @_;
    return $self->{indent_length};
}
227
+
228
# Store the key-sorting setting; undef or no argument stores 1.
# Returns the object for chaining.
sub sort_by {
    my ($self, $sorter) = @_;
    $self->{sort_by} = defined $sorter ? $sorter : 1;
    return $self;
}
232
+
233
# Obsolete alias for allow_bignum().
# Emits a deprecation warning, then forwards to allow_bignum() so that the
# option is actually applied and the object is returned for chaining.
# Previously this only warned and returned carp's result, which broke
# chained calls such as JSON::PP->new->allow_bigint->utf8.
sub allow_bigint {
    my $self = shift;
    Carp::carp("allow_bigint() is obsoleted. use allow_bignum() instead.");
    return $self->allow_bignum(@_);
}
236
+
237
+ ###############################
238
+
239
+ ###
240
+ ### Perl => JSON
241
+ ###
242
+
243
+
244
+ { # Convert
245
+
246
+ my $max_depth;
247
+ my $indent;
248
+ my $ascii;
249
+ my $latin1;
250
+ my $utf8;
251
+ my $space_before;
252
+ my $space_after;
253
+ my $canonical;
254
+ my $allow_blessed;
255
+ my $convert_blessed;
256
+
257
+ my $indent_length;
258
+ my $escape_slash;
259
+ my $bignum;
260
+ my $as_nonblessed;
261
+
262
+ my $depth;
263
+ my $indent_count;
264
+ my $keysort;
265
+
266
+
267
# Encode $obj to JSON text.
# Loads the object's settings into the file-scoped encoder variables, resets
# the depth/indent counters, encodes, and post-processes the result string.
# Dies (via encode_error) for a non-reference value unless allow_nonref is set.
sub PP_encode_json {
    my ($self, $obj) = @_;

    ($indent_count, $depth) = (0, 0);

    my $props = $self->{PROPS};

    ($ascii, $latin1, $utf8, $indent, $canonical, $space_before, $space_after, $allow_blessed,
        $convert_blessed, $escape_slash, $bignum, $as_nonblessed)
      = @{$props}[P_ASCII .. P_SPACE_AFTER, P_ALLOW_BLESSED, P_CONVERT_BLESSED,
                  P_ESCAPE_SLASH, P_ALLOW_BIGNUM, P_AS_NONBLESSED];

    $max_depth     = $self->{max_depth};
    $indent_length = $self->{indent_length};

    # Key ordering: an explicit sort_by setting wins over canonical.
    # sort_by may be a code ref, a non-numeric string (used as given), or any
    # other true value, which selects a plain string comparison.
    my $sort_by = $self->{sort_by};
    if ($sort_by) {
        $keysort = ref($sort_by) eq 'CODE' ? $sort_by
                 : $sort_by =~ /\D+/       ? $sort_by
                 :                           sub { $a cmp $b };
    }
    else {
        $keysort = $canonical ? sub { $a cmp $b } : undef;
    }

    if (!ref $obj and !$props->[ P_ALLOW_NONREF ]) {
        encode_error("hash- or arrayref expected (not a simple scalar, use allow_nonref to allow this)");
    }

    my $json = $self->object_to_json($obj);

    $json .= "\n" if $indent; # JSON::XS 2.26 compatible

    utf8::upgrade($json) unless ($ascii or $latin1 or $utf8);
    utf8::downgrade($json, 1) if $props->[ P_SHRINK ];

    return $json;
}
308
+
309
+
310
# Encode one value of any kind by dispatching on its reference type.
# Hashes and arrays go to their own encoders; blessed objects are handled
# according to the convert_blessed / allow_bignum / allow_blessed settings;
# everything else is passed to value_to_json.
sub object_to_json {
    my ($self, $obj) = @_;
    my $type = ref($obj);

    return $self->hash_to_json($obj)  if $type eq 'HASH';
    return $self->array_to_json($obj) if $type eq 'ARRAY';

    # non-references and unblessed references of any other kind
    return $self->value_to_json($obj) unless ($type and blessed($obj));

    # --- blessed object from here on ---

    return $self->value_to_json($obj) if ( $obj->isa('JSON::PP::Boolean') );

    if ( $convert_blessed and $obj->can('TO_JSON') ) {
        my $result = $obj->TO_JSON();

        # TO_JSON handing back the very same object would recurse forever
        if ( defined $result and ref( $result ) and refaddr( $obj ) eq refaddr( $result ) ) {
            encode_error( sprintf(
                "%s::TO_JSON method returned same object as was passed instead of a new one",
                ref $obj
            ) );
        }

        return $self->object_to_json( $result );
    }

    return "$obj" if ( $bignum and _is_bignum($obj) );
    return $self->blessed_to_json($obj) if ($allow_blessed and $as_nonblessed); # will be removed.

    unless ($allow_blessed) {
        encode_error( sprintf("encountered object '%s', but neither allow_blessed "
            . "nor convert_blessed settings are enabled", $obj)
        );
    }

    return 'null';
}
356
+
357
+
358
# Encode a hash reference as a JSON object.
# Keys come from _sort(); nesting depth is checked against max_depth.
sub hash_to_json {
    my ($self, $obj) = @_;

    encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)")
        if (++$depth > $max_depth);

    my ($pre, $post) = $indent ? $self->_up_indent() : ('', '');
    my $del = ($space_before ? ' ' : '') . ':' . ($space_after ? ' ' : '');

    my @members;
    foreach my $key ( _sort( $obj ) ) {
        utf8::decode($key) if OLD_PERL; # key for Perl 5.6 / be optimized

        my $name  = string_to_json( $self, $key );
        my $value = $self->object_to_json( $obj->{$key} ) || $self->value_to_json( $obj->{$key} );

        push @members, $name . $del . $value;
    }

    --$depth;
    $self->_down_indent() if ($indent);

    return '{}' unless @members;
    return '{' . $pre . join( ",$pre", @members ) . $post . '}';
}
380
+
381
+
382
# Encode an array reference as a JSON array.
# Nesting depth is checked against max_depth.
sub array_to_json {
    my ($self, $obj) = @_;

    encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)")
        if (++$depth > $max_depth);

    my ($pre, $post) = $indent ? $self->_up_indent() : ('', '');

    my @elements;
    foreach my $item (@$obj) {
        my $encoded = $self->object_to_json($item) || $self->value_to_json($item);
        push @elements, $encoded;
    }

    --$depth;
    $self->_down_indent() if ($indent);

    return '[]' unless @elements;
    return '[' . $pre . join( ",$pre", @elements ) . $post . ']';
}
400
+
401
+
402
# Encode a single scalar (or an unsupported reference) as a JSON value.
#
#   undef                          -> null
#   number without string flag     -> emitted as is
#   plain string                   -> quoted via string_to_json
#   JSON::PP::Boolean object       -> true / false
#   other blessed reference        -> its stringification, encoded again
#   reference to a defined scalar  -> true for '1', false for '0'
#   anything else                  -> null when allow_unknown is set,
#                                     otherwise an encode error
#
# The former trailing "else" branch (calling $self->{fallback}) could never
# run, because a value either has no ref type or has one; it was removed.
sub value_to_json {
    my ($self, $value) = @_;

    return 'null' if(!defined $value);

    my $b_obj = B::svref_2object(\$value);  # for round trip problem
    my $flags = $b_obj->FLAGS;

    return $value # as is
      if $flags & ( B::SVp_IOK | B::SVp_NOK ) and !( $flags & B::SVp_POK ); # SvTYPE is IV or NV?

    my $type = ref($value);

    return string_to_json($self, $value) if (!$type);

    if ( blessed($value) and $value->isa('JSON::PP::Boolean') ) {
        return $$value == 1 ? 'true' : 'false';
    }

    # overload::StrVal() looks like "Class=TYPE(0x...)" for blessed
    # references; those are stringified and encoded as strings.
    if ((overload::StrVal($value) =~ /=(\w+)/)[0]) {
        return $self->value_to_json("$value");
    }

    if ($type eq 'SCALAR' and defined $$value) {
        return   $$value eq '1' ? 'true'
               : $$value eq '0' ? 'false'
               : $self->{PROPS}->[ P_ALLOW_UNKNOWN ] ? 'null'
               : encode_error("cannot encode reference to scalar");
    }

    if ( $self->{PROPS}->[ P_ALLOW_UNKNOWN ] ) {
        return 'null';
    }

    if ( $type eq 'SCALAR' or $type eq 'REF' ) {
        encode_error("cannot encode reference to scalar");
    }
    else {
        encode_error("encountered $value, but JSON can only represent references to arrays or hashes");
    }
}
453
+
454
+
455
# Character => JSON escape sequence, looked up by string_to_json.
my %esc = (
    "\n" => '\n',
    "\r" => '\r',
    "\t" => '\t',
    "\f" => '\f',
    "\b" => '\b',
    "\"" => '\"',
    "\\" => '\\\\',
    "\'" => '\\\'',
);
465
+
466
+
467
# Turn a Perl string into a double-quoted JSON string literal.
# Escapes quote, backslash and the common control characters through %esc,
# optionally escapes '/', writes remaining control characters as \u00XX,
# then applies the ascii / latin1 / utf8 output settings in that order.
sub string_to_json {
    my ($self, $str) = @_;

    $str =~ s/([\x22\x5c\n\r\t\f\b])/$esc{$1}/g;
    $str =~ s/\//\\\//g if ($escape_slash);
    $str =~ s/([\x00-\x08\x0b\x0e-\x1f])/'\\u00' . unpack('H2', $1)/eg;

    $str = JSON_PP_encode_ascii($str)  if $ascii;
    $str = JSON_PP_encode_latin1($str) if $latin1;
    utf8::encode($str)                 if $utf8;

    return '"' . $str . '"';
}
488
+
489
+
490
# Encode a blessed reference by its underlying type: hash- and array-based
# objects are encoded like plain hashes/arrays, anything else becomes null.
sub blessed_to_json {
    my ($self, $obj) = @_;
    my $kind = reftype($obj) || '';

    return $self->hash_to_json($obj)  if $kind eq 'HASH';
    return $self->array_to_json($obj) if $kind eq 'ARRAY';
    return 'null';
}
502
+
503
+
504
# Abort encoding: die with the given message, reported from the caller's
# perspective by Carp::croak.
sub encode_error {
    my ($message) = @_;
    Carp::croak "$message";
}
508
+
509
+
510
# Return the keys of the given hash ref, ordered by $keysort when one is
# set and in Perl's native hash order otherwise.
sub _sort {
    my @names = keys %{$_[0]};
    return @names unless defined $keysort;
    return ( sort $keysort @names );
}
513
+
514
+
515
# Enter one indentation level.
# Returns ($pre, $post): a newline plus the indentation of the new level,
# and a newline plus the indentation of the level being left.
sub _up_indent {
    my $self = shift;
    my $unit = ' ' x $indent_length;

    my $post = "\n" . $unit x $indent_count;   # level before entering
    $indent_count++;
    my $pre  = "\n" . $unit x $indent_count;   # level after entering

    return ($pre, $post);
}
529
+
530
+
531
# Leave one indentation level.
sub _down_indent { return $indent_count--; }
532
+
533
+
534
# Return a hash ref holding the encoder's current depth and indent_count.
sub PP_encode_box {
    my %box = (
        depth        => $depth,
        indent_count => $indent_count,
    );
    return \%box;
}
540
+
541
+ } # Convert
542
+
543
+
544
# Return the string with every code point above 127 replaced by a \uXXXX
# escape; code points above 0xFFFF become a surrogate pair of escapes.
sub _encode_ascii {
    my @pieces;

    foreach my $code (unpack('U*', $_[0])) {
        if ($code <= 127) {
            push @pieces, chr($code);
        }
        elsif ($code <= 65535) {
            push @pieces, sprintf('\u%04x', $code);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', _encode_surrogates($code));
        }
    }

    return join('', @pieces);
}
554
+
555
+
556
# Return the string with every code point above 255 replaced by a \uXXXX
# escape; code points above 0xFFFF become a surrogate pair of escapes.
sub _encode_latin1 {
    my @pieces;

    foreach my $code (unpack('U*', $_[0])) {
        if ($code <= 255) {
            push @pieces, chr($code);
        }
        elsif ($code <= 65535) {
            push @pieces, sprintf('\u%04x', $code);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', _encode_surrogates($code));
        }
    }

    return join('', @pieces);
}
566
+
567
+
568
# Split a code point above 0xFFFF into (high, low) UTF-16 surrogate values.
# The high value is not truncated here; callers format it with %x.
sub _encode_surrogates { # from perlunicode
    my $offset = $_[0] - 0x10000;
    my $high   = $offset / 0x400 + 0xD800;
    my $low    = $offset % 0x400 + 0xDC00;
    return ($high, $low);
}
572
+
573
+
574
# True when the object is a Math::BigInt or a Math::BigFloat (or a subclass).
sub _is_bignum {
    my ($obj) = @_;
    return 1 if $obj->isa('Math::BigInt');
    return $obj->isa('Math::BigFloat');
}
577
+
578
+
579
+
580
+ #
581
+ # JSON => Perl
582
+ #
583
+
584
+ my $max_intsize;
585
+
586
BEGIN {
    # Probe at compile time how long an all-ones integer literal may get
    # before this perl stringifies it in exponent notation, and remember the
    # last length that still printed as a plain integer.
    my $literal = '1111';

    foreach my $digits (5 .. 64) {
        $literal .= '1';
        my $evaluated = eval qq| $literal |;

        next unless $evaluated =~ /[eE]/;

        $max_intsize = $digits - 1;
        last;
    }
}
597
+
598
+ { # PARSE
599
+
600
# Character following a backslash in JSON text => character it stands for.
my %escapes = ( #  by Jeremy Muhlich <jmuhlich [at] bitflood.org>
    b    => "\x8",
    t    => "\x9",
    n    => "\xA",
    f    => "\xC",
    r    => "\xD",
    '\\' => '\\',
    '"'  => '"',
    '/'  => '/',
);
610
+
611
+ my $text; # json data
612
+ my $at; # offset
613
+ my $ch; # 1 character (the current one)
614
+ my $len; # text length (changed according to UTF8 or NON UTF8)
615
+ # INTERNAL
616
+ my $depth; # nest counter
617
+ my $encoding; # json text encoding
618
+ my $is_valid_utf8; # temp variable
619
+ my $utf8_len; # utf8 byte length
620
+ # FLAGS
621
+ my $utf8; # must be utf8
622
+ my $max_depth; # max nest number of objects and arrays
623
+ my $max_size;
624
+ my $relaxed;
625
+ my $cb_object;
626
+ my $cb_sk_object;
627
+
628
+ my $F_HOOK;
629
+
630
+ my $allow_bigint; # using Math::BigInt
631
+ my $singlequote; # loosely quoting
632
+ my $loose; #
633
+ my $allow_barekey; # bareKey
634
+
635
+ # $opt flag
636
+ # 0x00000001 .... decode_prefix
637
+ # 0x10000000 .... incr_parse
638
+
639
# Decode JSON text into a Perl value.
#
# Arguments: ($self, $text, $opt). $opt is a bit mask:
#   0x00000001  decode_prefix: return ($value, $consumed_length) and
#               tolerate trailing text
#   0x10000000  incr_parse: return undef quietly when no value was parsed
#
# Loads the object's settings into the file-scoped parser variables, then
# parses exactly one value. Dies (via decode_error / croak) on malformed
# input, on a non-reference top-level value without allow_nonref, on text
# exceeding max_size, and on trailing non-whitespace text.
#
# Fix: the trailing-garbage test now uses "defined $ch". The old truth test
# treated a trailing "0" as "nothing left", so text such as '[1]0' was
# accepted without error.
sub PP_decode_json {
    my ($self, $opt); # $opt is an effective flag during this decode_json.

    ($self, $text, $opt) = @_;

    ($at, $ch, $depth) = (0, '', 0);

    if ( !defined $text or ref $text ) {
        decode_error("malformed JSON string, neither array, object, number, string or atom");
    }

    my $idx = $self->{PROPS};

    ($utf8, $relaxed, $loose, $allow_bigint, $allow_barekey, $singlequote)
        = @{$idx}[P_UTF8, P_RELAXED, P_LOOSE .. P_ALLOW_SINGLEQUOTE];

    if ( $utf8 ) {
        # with utf8 set the input must be octets
        utf8::downgrade( $text, 1 ) or Carp::croak("Wide character in subroutine entry");
    }
    else {
        utf8::upgrade( $text );
    }

    $len = length $text;

    ($max_depth, $max_size, $cb_object, $cb_sk_object, $F_HOOK)
         = @{$self}{qw/max_depth max_size cb_object cb_sk_object F_HOOK/};

    if ($max_size > 1) {
        use bytes;
        my $bytes = length $text;
        decode_error(
            sprintf("attempted decode of JSON text of %s bytes size, but max_size is set to %s"
                , $bytes, $max_size), 1
        ) if ($bytes > $max_size);
    }

    # Currently no effect
    # should use regexp
    my @octets = unpack('C4', $text);
    $encoding =   ( $octets[0] and  $octets[1]) ? 'UTF-8'
                : (!$octets[0] and  $octets[1]) ? 'UTF-16BE'
                : (!$octets[0] and !$octets[1]) ? 'UTF-32BE'
                : ( $octets[2]                ) ? 'UTF-16LE'
                : (!$octets[2]                ) ? 'UTF-32LE'
                : 'unknown';

    white(); # remove head white space

    my $valid_start = defined $ch; # Is there a first character for JSON structure?

    my $result = value();

    return undef if ( !$result && ( $opt & 0x10000000 ) ); # for incr_parse

    decode_error("malformed JSON string, neither array, object, number, string or atom") unless $valid_start;

    if ( !$idx->[ P_ALLOW_NONREF ] and !ref $result ) {
        decode_error(
            'JSON text must be an object or array (but found number, string, true, false or null,'
            . ' use allow_nonref to allow this)', 1);
    }

    Carp::croak('something wrong.') if $len < $at; # we won't arrive here.

    my $consumed = defined $ch ? $at - 1 : $at; # consumed JSON text length

    white(); # remove tail white space

    # Any character still pending is trailing garbage, including "0".
    if ( defined $ch ) {
        return ( $result, $consumed ) if ($opt & 0x00000001); # all right if decode_prefix
        decode_error("garbage after JSON object");
    }

    ( $opt & 0x00000001 ) ? ( $result, $consumed ) : $result;
}
715
+
716
+
717
# Advance the parser by one character: store the character at offset $at in
# $ch and bump $at. At the end of the text $ch becomes undef.
# Returns the new value of $ch.
sub next_chr {
    if ($at < $len) {
        return $ch = substr($text, $at++, 1);
    }
    return $ch = undef;
}
721
+
722
+
723
# Parse one JSON value, choosing the sub-parser from the current character.
# Returns nothing when the input is exhausted.
sub value {
    white();

    return unless defined $ch;

    if    ($ch eq '{')                                   { return object(); }
    elsif ($ch eq '[')                                   { return array();  }
    elsif ($ch eq '"' or ($singlequote and $ch eq "'"))  { return string(); }
    elsif ($ch =~ /[0-9]/ or $ch eq '-')                 { return number(); }

    return word();
}
732
+
733
# Parse a quoted JSON string starting at the current character and return
# its decoded value. Handles backslash escapes, \uXXXX escapes including
# UTF-16 surrogate pairs, and (with the utf8 setting) validation of raw
# multi-byte characters. Single-quoted strings are accepted only when the
# allow_singlequote setting is on. Every error is reported via decode_error.
sub string {
    my $s = '';     # basically UTF8 flag on
    my $utf16;      # high surrogate still waiting for its low surrogate
    my $is_utf8;    # set once $s holds octets that need utf8::decode

    ($is_valid_utf8, $utf8_len) = ('', 0);

    if ($ch eq '"' or ($singlequote and $ch eq "'")) {
        my $quote = $ch;

        OUTER: while ( defined(next_chr()) ) {

            # closing quote: the string is complete
            if ($ch eq $quote) {
                next_chr();

                decode_error("missing low surrogate character in surrogate pair") if ($utf16);

                utf8::decode($s) if ($is_utf8);

                return $s;
            }

            if ($ch eq '\\') {
                next_chr();

                # simple one-character escape
                if (exists $escapes{$ch}) {
                    $s .= $escapes{$ch};
                    next OUTER;
                }

                # unknown escape: only tolerated in loose mode
                if ($ch ne 'u') {
                    unless ($loose) {
                        $at -= 2;
                        decode_error('illegal backslash escape sequence in string');
                    }
                    $s .= $ch;
                    next OUTER;
                }

                # UNICODE handling: collect the four hex digits of \uXXXX
                my $u = '';

                for (1..4) {
                    $ch = next_chr();
                    last OUTER if ($ch !~ /[0-9a-fA-F]/);
                    $u .= $ch;
                }

                if ($u =~ /^[dD][89abAB][0-9a-fA-F]{2}/) {
                    # U+D800 - U+DBFF: UTF-16 high surrogate, keep for later
                    $utf16 = $u;
                }
                elsif ($u =~ /^[dD][c-fC-F][0-9a-fA-F]{2}/) {
                    # U+DC00 - U+DFFF: UTF-16 low surrogate, completes a pair
                    decode_error("missing high surrogate character in surrogate pair")
                        unless (defined $utf16);

                    $is_utf8 = 1;
                    $s .= JSON_PP_decode_surrogates($utf16, $u) || next;
                    $utf16 = undef;
                }
                else {
                    decode_error("surrogate pair expected") if (defined $utf16);

                    my $code = hex( $u );
                    if ( $code > 127 ) {
                        $is_utf8 = 1;
                        $s .= JSON_PP_decode_unicode($u) || next;
                    }
                    else {
                        $s .= chr $code;
                    }
                }

                next OUTER;
            }

            # ordinary character
            if ( ord($ch) > 127 ) {
                if ( $utf8 ) {
                    # is_valid_utf8 returns the whole multi-byte sequence
                    if ( $ch = is_valid_utf8($ch) ) {
                        $at += $utf8_len - 1;
                    }
                    else {
                        $at -= 1;
                        decode_error("malformed UTF-8 character in JSON string");
                    }
                }
                else {
                    utf8::encode( $ch );
                }

                $is_utf8 = 1;
            }

            if (!$loose and $ch =~ /[\x00-\x1f\x22\x5c]/) { # '/' ok
                $at--;
                decode_error('invalid character encountered while parsing JSON string');
            }

            $s .= $ch;
        }
    }

    decode_error("unexpected end of string while parsing JSON string");
}
841
+
842
+
843
# Skip everything that does not belong to a value: characters that compare
# less than or equal to a space, // line comments, /* */ block comments and,
# with the relaxed setting, # comments running to the end of the line.
# Stops at the first other character or at the end of the text.
sub white {
    while ( defined $ch ) {

        if ($ch le ' ') {
            next_chr();
            next;
        }

        if ($ch eq '/') {
            next_chr();

            if (defined $ch and $ch eq '/') {
                # line comment: consume up to the line terminator
                1 while (defined(next_chr()) and $ch ne "\n" and $ch ne "\r");
            }
            elsif (defined $ch and $ch eq '*') {
                # block comment: consume up to the closing "*/"
                next_chr();
                while (1) {
                    decode_error("Unterminated comment") unless (defined $ch);

                    if ($ch ne '*') {
                        next_chr();
                        next;
                    }

                    if (defined(next_chr()) and $ch eq '/') {
                        next_chr();
                        last;
                    }
                }
            }
            else {
                $at--;
                decode_error("malformed JSON string, neither array, object, number, string or atom");
            }

            next;
        }

        if ($relaxed and $ch eq '#') { # correctly?
            pos($text) = $at;
            $text =~ /\G([^\n]*(?:\r\n|\r|\n|$))/g;
            $at = pos($text);
            next_chr();
            next;
        }

        last;
    }
}
891
+
892
+
893
# Parse a JSON array starting at '[' and return it as an array reference.
# An existing array ref may be passed in to be filled instead of a new one.
# With the relaxed setting a trailing comma before ']' is accepted.
sub array {
    my $list = $_[0] || []; # you can use this code to use another array ref object.

    decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)')
        if (++$depth > $max_depth);

    next_chr();
    white();

    # empty array
    if (defined $ch and $ch eq ']') {
        --$depth;
        next_chr();
        return $list;
    }

    while (defined($ch)) {
        push @$list, value();

        white();

        last if (!defined $ch);

        if ($ch eq ']') {
            --$depth;
            next_chr();
            return $list;
        }

        last if ($ch ne ',');

        next_chr();
        white();

        # trailing comma
        if ($relaxed and $ch eq ']') {
            --$depth;
            next_chr();
            return $list;
        }
    }

    decode_error(", or ] expected while parsing array");
}
941
+
942
+
943
# Parse a JSON object starting at '{' and return it as a hash reference, or
# as whatever _json_object_hook() yields when filter hooks are installed.
# An existing hash ref may be passed in to be filled instead of a new one.
# With the relaxed setting a trailing comma before '}' is accepted; with
# allow_barekey, unquoted keys are read by bareKey().
sub object {
    my $hash = $_[0] || {}; # you can use this code to use another hash ref object.
    my $key;

    decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)')
        if (++$depth > $max_depth);
    next_chr();
    white();

    # empty object
    if (defined $ch and $ch eq '}') {
        --$depth;
        next_chr();
        return $F_HOOK ? _json_object_hook($hash) : $hash;
    }

    while (defined $ch) {
        $key = ($allow_barekey and $ch ne '"' and $ch ne "'") ? bareKey() : string();
        white();

        if (!defined $ch or $ch ne ':') {
            $at--;
            decode_error("':' expected");
        }

        next_chr();
        $hash->{$key} = value();
        white();

        last if (!defined $ch);

        if ($ch eq '}') {
            --$depth;
            next_chr();
            return $F_HOOK ? _json_object_hook($hash) : $hash;
        }

        last if ($ch ne ',');

        next_chr();
        white();

        # trailing comma
        if ($relaxed and $ch eq '}') {
            --$depth;
            next_chr();
            return $F_HOOK ? _json_object_hook($hash) : $hash;
        }
    }

    $at--;
    decode_error(", or } expected while parsing object/hash");
}
1008
+
1009
+
1010
# Read an unquoted object key: the run of characters from the current
# position that are not in the excluded set below. Returns undef when not
# even one character qualifies.
#
# The loop checks that $ch is defined before matching it, so reaching the
# end of the text no longer matches undef against the pattern (which raised
# an "uninitialized value" warning under -w).
sub bareKey { # doesn't strictly follow Standard ECMA-262 3rd Edition
    my $key;
    while (defined $ch and $ch =~ /[^\x00-\x23\x25-\x2F\x3A-\x40\x5B-\x5E\x60\x7B-\x7F]/) {
        $key .= $ch;
        next_chr();
    }
    return $key;
}
1018
+
1019
+
1020
# Parse one of the literals true, false or null at the current position.
# Returns $JSON::PP::true, $JSON::PP::false or undef; anything else is
# reported through decode_error with a message chosen by the first letter.
sub word {
    my $literal = substr($text, $at - 1, 4);

    if ($literal eq 'true' or $literal eq 'null') {
        $at += 3;
        next_chr();
        return $literal eq 'true' ? $JSON::PP::true : undef;
    }

    if ($literal eq 'fals') {
        $at += 3;
        if (substr($text, $at, 1) eq 'e') {
            $at++;
            next_chr();
            return $JSON::PP::false;
        }
    }

    $at--; # for decode_error report

    decode_error("'null' expected")  if ($literal =~ /^n/);
    decode_error("'true' expected")  if ($literal =~ /^t/);
    decode_error("'false' expected") if ($literal =~ /^f/);
    decode_error("malformed JSON string, neither array, object, number, string or atom");
}
1049
+
1050
+
1051
# Parse a JSON number at the current position and return its value.
#
# Grammar accepted: [-] int [. digits] [(e|E) [+|-] digits], where int is
# either a single 0 or a digit sequence not starting with 0.
#
# Returns a Perl number, except that
#   * integers longer than $max_intsize digits are returned as a
#     Math::BigInt (with allow_bignum) or as a string, and
#   * with allow_bignum every other number is returned as a Math::BigFloat.
#
# Fixes over the previous implementation:
#   * a leading zero followed by ANY digit is rejected; before, only octal
#     digits were caught, so "08" and "09" decoded to 8 and 9;
#   * the check also runs after a minus sign, so "-01" is rejected;
#   * digits are matched with [0-9] instead of \d, which on upgraded text
#     also matched non-ASCII Unicode digits;
#   * the hex/octal conversion code that followed an unconditional
#     decode_error could never run and was removed.
sub number {
    my $n = '';

    if ($ch eq '-') {
        $n = '-';
        next_chr();
        if (!defined $ch or $ch !~ /[0-9]/) {
            decode_error("malformed number (no digits after initial minus)");
        }
    }

    # According to RFC4627, hex or oct digits are invalid.
    if ($ch eq '0') {
        my $peek = substr($text, $at, 1);   # character right after the zero
        if ($peek =~ /^[0-9xX]/) {
            decode_error("malformed number (leading zero must not be followed by another digit)");
        }
    }

    # integer part
    while (defined $ch and $ch =~ /[0-9]/) {
        $n .= $ch;
        next_chr();
    }

    # fraction
    if (defined $ch and $ch eq '.') {
        $n .= '.';

        next_chr();
        if (!defined $ch or $ch !~ /[0-9]/) {
            decode_error("malformed number (no digits after decimal point)");
        }
        else {
            $n .= $ch;
        }

        while (defined(next_chr()) and $ch =~ /[0-9]/) {
            $n .= $ch;
        }
    }

    # exponent
    if (defined $ch and ($ch eq 'e' or $ch eq 'E')) {
        $n .= $ch;
        next_chr();

        if (defined($ch) and ($ch eq '+' or $ch eq '-')) {
            $n .= $ch;
            next_chr();
            if (!defined $ch or $ch !~ /[0-9]/) {
                decode_error("malformed number (no digits after exp sign)");
            }
            $n .= $ch;
        }
        elsif (defined($ch) and $ch =~ /[0-9]/) {
            $n .= $ch;
        }
        else {
            decode_error("malformed number (no digits after exp sign)");
        }

        while (defined(next_chr()) and $ch =~ /[0-9]/) {
            $n .= $ch;
        }
    }

    my $v = $n;

    if ($v !~ /[.eE]/ and length($v) > $max_intsize) {
        if ($allow_bigint) { # from Adam Sussman
            require Math::BigInt;
            return Math::BigInt->new($v);
        }
        else {
            return "$v";
        }
    }
    elsif ($allow_bigint) {
        require Math::BigFloat;
        return Math::BigFloat->new($v);
    }

    return 0+$v;
}
1153
+
1154
+
1155
# Check whether the bytes starting at $at - 1 form one well-formed UTF-8
# sequence.
#
# $_[0] is the lead byte; it determines the expected sequence length, which
# is stored in $utf8_len as a side effect.  Returns the sequence itself when
# it is well formed, '' when it is not, and nothing at all when the lead
# byte cannot start a sequence.
sub is_valid_utf8 {
    my $lead = $_[0];

    if ($lead =~ /[\x00-\x7F]/) {
        $utf8_len = 1;
    }
    elsif ($lead =~ /[\xC2-\xDF]/) {
        $utf8_len = 2;
    }
    elsif ($lead =~ /[\xE0-\xEF]/) {
        $utf8_len = 3;
    }
    elsif ($lead =~ /[\xF0-\xF4]/) {
        $utf8_len = 4;
    }
    else {
        $utf8_len = 0;
    }

    return unless $utf8_len;

    my $candidate = substr($text, $at - 1, $utf8_len);

    # The alternatives exclude overlong forms, surrogates and code points
    # above U+10FFFF.
    return '' unless $candidate =~ /^(?:
         [\x00-\x7F]
        |[\xC2-\xDF][\x80-\xBF]
        |[\xE0][\xA0-\xBF][\x80-\xBF]
        |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
        |[\xED][\x80-\x9F][\x80-\xBF]
        |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
        |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
    )$/x;

    return $candidate;
}
1180
+
1181
+
1182
# Croak with a decoding error message.
#
#   $error   the message text
#   $no_rep  when true, croak with $error only; otherwise the character
#            offset $at and a printable excerpt of the remaining input
#            (cut off after roughly 20 characters) are appended
sub decode_error {
    my ($error, $no_rep) = @_;

    my $rest = defined $text ? substr($text, $at) : '';

    # unpack template used to obtain code points on the running perl
    my $template;
    if ($] >= 5.008) {
        $template = 'U*';
    }
    elsif ($] < 5.006) {
        $template = 'C*';
    }
    elsif (utf8::is_utf8( $rest )) { # 5.6
        $template = 'U*';
    }
    else {
        $template = 'C*';
    }

    # code points that have a dedicated printable form
    my %escape = (
        0x07 => '\a',
        0x09 => '\t',
        0x0a => '\n',
        0x0d => '\r',
        0x0c => '\f',
        0x5c => '\\\\',
    );

    my $shown = '';

    for my $c ( unpack( $template, $rest ) ) { # emulate pv_uni_display() ?
        if (exists $escape{$c}) {
            $shown .= $escape{$c};
        }
        elsif ($c < 0x20 or $c >= 0x80) {
            # remaining control characters and everything non-ASCII
            $shown .= sprintf('\x{%x}', $c);
        }
        else {
            $shown .= chr($c);
        }

        if ( length($shown) >= 20 ) {
            $shown .= '...';
            last;
        }
    }

    $shown = '(end of string)' unless length $shown;

    Carp::croak (
        $no_rep ? "$error" : "$error, at character offset $at (before \"$shown\")"
    );

}
1219
+
1220
+
1221
# Run the user-supplied object filters on a freshly decoded hash.
#
#   $_[0]  reference to the decoded hash
#
# If the hash has exactly one key and $cb_sk_object holds a code reference
# for that key, the callback is called with the single value; when it
# returns exactly one value, that value replaces the hash.
# Otherwise $cb_object (if set) is called with the hash reference; again the
# result is used only when exactly one value comes back.
# In every other case the original hash reference is returned.
sub _json_object_hook {
    my $o  = $_[0];
    my @ks = keys %{$o};

    if ( $cb_sk_object and @ks == 1 and exists $cb_sk_object->{ $ks[0] } and ref $cb_sk_object->{ $ks[0] } ) {
        my @val = $cb_sk_object->{ $ks[0] }->( $o->{$ks[0]} );
        if (@val == 1) {
            return $val[0];
        }
    }

    # Declared separately from the conditional assignment: the behaviour of
    # "my ... if ..." is undefined according to perlsyn.
    my @val;
    @val = $cb_object->($o) if ($cb_object);

    return @val == 1 ? $val[0] : $o;
}
1240
+
1241
+
1242
# Return a hash reference holding the current values of the decoder's
# state variables.
sub PP_decode_box {
    my %snapshot = (
        text          => $text,
        at            => $at,
        ch            => $ch,
        len           => $len,
        depth         => $depth,
        encoding      => $encoding,
        is_valid_utf8 => $is_valid_utf8,
    );

    return \%snapshot;
}
1253
+
1254
+ } # PARSE
1255
+
1256
+
1257
# Combine a UTF-16 surrogate pair into one character and return it as
# UTF-8 encoded bytes (formula from perlunicode).
#
#   $_[0]  high surrogate as a hex string
#   $_[1]  low surrogate as a hex string
sub _decode_surrogates { # from perlunicode
    my $high = hex($_[0]) - 0xD800;
    my $low  = hex($_[1]) - 0xDC00;

    my $un = pack('U*', 0x10000 + $high * 0x400 + $low);
    utf8::encode( $un );

    return $un;
}
1263
+
1264
+
1265
# Turn a code point given as a hex string into the UTF-8 encoded bytes of
# that character.
sub _decode_unicode {
    my $code_point = hex shift;

    my $un = pack('U', $code_point);
    utf8::encode( $un );

    return $un;
}
1270
+
1271
+ #
1272
+ # Setup for various Perl versions (the code from JSON::PP58)
1273
+ #
1274
+
1275
BEGIN {

    # Perls that lack utf8::is_utf8 get Encode's implementation instead.
    if ( !defined &utf8::is_utf8 ) {
        require Encode;
        *utf8::is_utf8 = *Encode::is_utf8;
    }

    # On 5.8 and later the generic helpers of this file are installed
    # under the JSON_PP_* names.
    if ( $] >= 5.008 ) {
        *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
        *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
        *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
        *JSON::PP::JSON_PP_decode_unicode    = \&_decode_unicode;
    }

    if ($] >= 5.008 and $] < 5.008003) { # join() in 5.8.0 - 5.8.2 is broken.
        package # hide from PAUSE
            JSON::PP;
        require subs;
        subs->import('join');
        eval q|
            sub join {
                return '' if (@_ < 2);
                my $j   = shift;
                my $str = shift;
                for (@_) { $str .= $j . $_; }
                return $str;
            }
        |;
    }


    # The three methods below forward to the JSON::PP::IncrParser object
    # kept in the coder's _incr_parser slot, creating it on first use.

    sub JSON::PP::incr_parse {
        local $Carp::CarpLevel = 1;
        my $parser = ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new );
        # all arguments, coder included, are handed on; the caller's
        # context reaches the parser through the return
        return $parser->incr_parse( @_ );
    }


    sub JSON::PP::incr_skip {
        my $parser = ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new );
        return $parser->incr_skip;
    }


    sub JSON::PP::incr_reset {
        my $parser = ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new );
        return $parser->incr_reset;
    }

    # lvalue accessor for the parser's text buffer; only compiled on
    # 5.6 and later
    eval q{
        sub JSON::PP::incr_text : lvalue {
            $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;

            if ( $_[0]->{_incr_parser}->{incr_parsing} ) {
                Carp::croak("incr_text can not be called when the incremental parser already started parsing");
            }
            $_[0]->{_incr_parser}->{incr_text};
        }
    } if ( $] >= 5.006 );

} # Setup for various Perl versions (the code from JSON::PP58)
1333
+
1334
+
1335
+ ###############################
1336
+ # Utilities
1337
+ #
1338
+
1339
# Install JSON::PP::blessed, ::reftype and ::refaddr.  Scalar::Util's
# versions are aliased when that module can be loaded; otherwise
# pure-Perl replacements are defined.
BEGIN {
    eval 'require Scalar::Util';
    if ($@) { # This code is from Scalar::Util.
        # warn $@;
        eval 'sub UNIVERSAL::a_sub_not_likely_to_be_here { ref($_[0]) }';

        # A method call only succeeds on a blessed reference, so the
        # helper method doubles as the blessedness test.
        *JSON::PP::blessed = sub {
            local($@, $SIG{__DIE__}, $SIG{__WARN__});
            ref($_[0]) ? eval { $_[0]->a_sub_not_likely_to_be_here } : undef;
        };

        # B object class => reference type name
        my %tmap = qw(
            B::NULL   SCALAR

            B::HV     HASH
            B::AV     ARRAY
            B::CV     CODE
            B::IO     IO
            B::GV     GLOB
            B::REGEXP REGEXP
        );

        *JSON::PP::reftype = sub {
            my $r = shift;

            return undef unless length(ref($r));

            my $t = ref(B::svref_2object($r));

            return $tmap{$t} if exists $tmap{$t};
            return 'REF'     if length(ref($$r));
            return 'SCALAR';
        };

        *JSON::PP::refaddr = sub {
            return undef unless length(ref($_[0]));

            my $addr;
            if(defined(my $pkg = blessed($_[0]))) {
                # stringify while blessed into a scratch class, then
                # restore the original class
                $addr .= bless $_[0], 'Scalar::Util::Fake';
                bless $_[0], $pkg;
            }
            else {
                $addr .= $_[0]
            }

            $addr =~ /0x(\w+)/;
            local $^W;
            #no warnings 'portable';
            hex($1);
        }
    }
    else {
        *JSON::PP::blessed = \&Scalar::Util::blessed;
        *JSON::PP::reftype = \&Scalar::Util::reftype;
        *JSON::PP::refaddr = \&Scalar::Util::refaddr;
    }
}
1393
+
1394
+
1395
+ # shamelessly copied and modified from JSON::XS code.
1396
+
1397
# Give JSON::PP::Boolean its numeric overloading, unless the real JSON::PP
# has already been loaded.
if ( !$INC{'JSON/PP.pm'} ) {
    eval q|
        package
            JSON::PP::Boolean;

        use overload (
            "0+"     => sub { ${$_[0]} },
            "++"     => sub { $_[0] = ${$_[0]} + 1 },
            "--"     => sub { $_[0] = ${$_[0]} - 1 },
            fallback => 1,
        );
    |;
}

# The two boolean singletons: blessed references to the scalars 1 and 0.
$JSON::PP::true  = do { my $one  = 1; bless \$one,  "JSON::PP::Boolean" };
$JSON::PP::false = do { my $zero = 0; bless \$zero, "JSON::PP::Boolean" };

# True when the argument is defined and isa JSON::PP::Boolean.
sub is_bool {
    return defined($_[0]) && UNIVERSAL::isa($_[0], "JSON::PP::Boolean");
}

# Accessors for the singletons; null() is plain undef.
sub true  { return $JSON::PP::true }
sub false { return $JSON::PP::false }
sub null  { return undef; }
1419
+
1420
+ ###############################
1421
+
1422
+ ###############################
1423
+
1424
package # hide from PAUSE
    JSON::PP::IncrParser;

# Incremental front end for the decoder: text is collected in a buffer and
# scanned just far enough (string boundaries and bracket nesting) to find
# the end of the next complete JSON text, which is then handed to the
# coder's PP_decode_json.

use strict;

use constant INCR_M_WS   => 0; # initial whitespace skipping
use constant INCR_M_STR  => 1; # inside string
use constant INCR_M_BS   => 2; # inside backslash
use constant INCR_M_JSON => 3; # outside anything, count nesting
use constant INCR_M_C0   => 4;
use constant INCR_M_C1   => 5;

use vars qw($VERSION);
$VERSION = '1.01';

my $unpack_format = $] < 5.006 ? 'C*' : 'U*';

# Create a parser with an empty buffer.
#
# State kept in the object:
#   incr_text     buffered, not yet decoded text
#   incr_p        scan position inside incr_text
#   incr_c        length of the chunk last handed to the decoder; this is
#                 what incr_skip removes after a failed decode
#   incr_nest     current bracket nesting depth
#   incr_mode     one of the INCR_M_* scanner states
#   incr_parsing  true while a parse is in progress
sub new {
    my ( $class ) = @_;

    bless {
        incr_nest    => 0,
        incr_text    => undef,
        incr_parsing => 0,
        incr_p       => 0,
    }, $class;
}


# Append $text (if defined) to the buffer and, unless called in void
# context, try to extract a decoded value.
#
# Scalar context: returns the decoded value, or undef when no complete
# JSON text is available yet (false decoded values are also reported as
# undef).
# List context: returns whatever one scan pass produced.
sub incr_parse {
    my ( $self, $coder, $text ) = @_;

    $self->{incr_text} = '' unless ( defined $self->{incr_text} );

    if ( defined $text ) {
        if ( utf8::is_utf8( $text ) and !utf8::is_utf8( $self->{incr_text} ) ) {
            utf8::upgrade( $self->{incr_text} ) ;
            utf8::decode( $self->{incr_text} ) ;
        }
        $self->{incr_text} .= $text;
    }


    # fetched from the coder but not enforced anywhere in this sub
    my $max_size = $coder->get_max_size;

    if ( defined wantarray ) {

        $self->{incr_mode} = INCR_M_WS unless defined $self->{incr_mode};

        if ( wantarray ) {
            my @ret;

            $self->{incr_parsing} = 1;

            # NOTE(review): incr_p never exceeds the buffer length, so the
            # "until" condition appears to hold after the first pass and
            # only one JSON text is extracted per call.  Left unchanged on
            # purpose: callers may rely on one-object-per-call behaviour.
            do {
                push @ret, $self->_incr_parse( $coder, $self->{incr_text} );

                unless ( !$self->{incr_nest} and $self->{incr_mode} == INCR_M_JSON ) {
                    $self->{incr_mode} = INCR_M_WS if $self->{incr_mode} != INCR_M_STR;
                }

            } until ( length $self->{incr_text} >= $self->{incr_p} );

            $self->{incr_parsing} = 0;

            return @ret;
        }
        else { # in scalar context
            $self->{incr_parsing} = 1;
            my $obj = $self->_incr_parse( $coder, $self->{incr_text} );
            $self->{incr_parsing} = 0 if defined $obj; # pointed by Martin J. Evans
            return $obj ? $obj : undef; # $obj is an empty string, parsing was completed.
        }

    }

}


# Scan $text from the saved position for the end of the next JSON text.
#
# Returns nothing while the text is still incomplete (inside a string or
# inside an open array/object), '' when there is nothing to decode or the
# decoded value is false, and the decoded value otherwise.  Decoding
# errors croak from PP_decode_json; incr_c then tells incr_skip how much
# of the buffer to discard.
sub _incr_parse {
    my ( $self, $coder, $text, $skip ) = @_;
    my $p = $self->{incr_p};
    my $restore = $p;

    my $len = length $text;

    if ( $self->{incr_mode} == INCR_M_WS ) {
        while ( $len > $p ) {
            my $s = substr( $text, $p, 1 );
            # Written as a block: "$p++ and next" does not loop when $p
            # is still 0, because the post-increment yields a false value.
            if ( 0x20 >= unpack($unpack_format, $s) ) {
                $p++;
                next;
            }
            $self->{incr_mode} = INCR_M_JSON;
            last;
        }
    }

    while ( $len > $p ) {
        my $s = substr( $text, $p++, 1 );

        if ( $s eq '"' ) {
            # A quote is escaped only when it is preceded by an odd number
            # of backslashes ("\\" followed by a quote ends the string).
            # The index is kept non-negative so that substr() cannot wrap
            # around to the end of the buffer.
            my $backslashes = 0;
            my $i = $p - 2;
            while ( $i >= 0 and substr( $text, $i, 1 ) eq '\\' ) {
                $backslashes++;
                $i--;
            }
            next if $backslashes % 2;

            if ( $self->{incr_mode} != INCR_M_STR ) {
                $self->{incr_mode} = INCR_M_STR;
            }
            else {
                $self->{incr_mode} = INCR_M_JSON;
                unless ( $self->{incr_nest} ) {
                    last; # a top-level string is complete
                }
            }
        }

        if ( $self->{incr_mode} == INCR_M_JSON ) {

            if ( $s eq '[' or $s eq '{' ) {
                if ( ++$self->{incr_nest} > $coder->get_max_depth ) {
                    Carp::croak('json text or perl structure exceeds maximum nesting level (max_depth set too low?)');
                }
            }
            elsif ( $s eq ']' or $s eq '}' ) {
                last if ( --$self->{incr_nest} <= 0 );
            }
            elsif ( $s eq '#' ) {
                # skip a shell-style comment up to the end of the line
                while ( $len > $p ) {
                    last if substr( $text, $p++, 1 ) eq "\n";
                }
            }

        }

    }

    $self->{incr_p} = $p;

    # Incomplete input: wait for more text while a string is still open,
    # at any nesting depth, or while an array/object is still open.
    return if ( $self->{incr_mode} == INCR_M_STR );
    return if ( $self->{incr_nest} > 0 );

    return '' unless ( length substr( $self->{incr_text}, 0, $p ) );

    local $Carp::CarpLevel = 2;

    $self->{incr_p} = $restore;
    $self->{incr_c} = $p;

    my ( $obj, $tail ) = $coder->PP_decode_json( substr( $self->{incr_text}, 0, $p ), 0x10000001 );

    $self->{incr_text} = substr( $self->{incr_text}, $p );
    $self->{incr_p} = 0;
    # The chunk is gone from the buffer; a later incr_skip must not remove
    # the same number of characters again.
    $self->{incr_c} = 0;

    return $obj || '';
}


# Return the buffered text.  Croaks while a parse is in progress.
sub incr_text {
    if ( $_[0]->{incr_parsing} ) {
        Carp::croak("incr_text can not be called when the incremental parser already started parsing");
    }
    $_[0]->{incr_text};
}


# Drop the chunk that was last handed to the decoder (normally the one
# that made it croak) and reset the scanner state so that parsing can
# continue with the remaining text.
sub incr_skip {
    my $self  = shift;

    my $buffer   = defined $self->{incr_text} ? $self->{incr_text} : '';
    my $consumed = defined $self->{incr_c}    ? $self->{incr_c}    : 0;
    $consumed = length($buffer) if $consumed > length($buffer);

    $self->{incr_text}    = substr( $buffer, $consumed );
    $self->{incr_c}       = 0;
    $self->{incr_mode}    = INCR_M_WS;
    $self->{incr_nest}    = 0;
    $self->{incr_parsing} = 0;
    $self->{incr_p}       = 0;
}


# Forget the buffered text and return to the initial state.
sub incr_reset {
    my $self = shift;
    $self->{incr_text}    = undef;
    $self->{incr_p}       = 0;
    $self->{incr_c}       = 0;
    $self->{incr_mode}    = 0;
    $self->{incr_nest}    = 0;
    $self->{incr_parsing} = 0;
}
1603
+
1604
+ ###############################
1605
+
1606
+
1607
+ 1;
1608
+ __END__
1609
+ =pod
1610
+
1611
+ =head1 NAME
1612
+
1613
+ JSON::PP - JSON::XS compatible pure-Perl module.
1614
+
1615
+ =head1 SYNOPSIS
1616
+
1617
+ use JSON::PP;
1618
+
1619
+ # exported functions, they croak on error
1620
+ # and expect/generate UTF-8
1621
+
1622
+ $utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
1623
+ $perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
1624
+
1625
+ # OO-interface
1626
+
1627
+ $coder = JSON::PP->new->ascii->pretty->allow_nonref;
1628
+
1629
+ $json_text = $json->encode( $perl_scalar );
1630
+ $perl_scalar = $json->decode( $json_text );
1631
+
1632
+ $pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
1633
+
1634
+ # Note that JSON version 2.0 and above will automatically use
1635
+ # JSON::XS or JSON::PP, so you should be able to just:
1636
+
1637
+ use JSON;
1638
+
1639
+
1640
+ =head1 VERSION
1641
+
1642
+ 2.27200
1643
+
1644
+ L<JSON::XS> 2.27 (~2.30) compatible.
1645
+
1646
+ =head1 DESCRIPTION
1647
+
1648
+ This module is a L<JSON::XS>-compatible pure-Perl module.
1649
+ (Perl 5.8 or later is recommended)
1650
+
1651
+ JSON::XS is the fastest and most proper JSON module on CPAN.
1652
+ It is written by Marc Lehmann in C, so must be compiled and
1653
+ installed in the used environment.
1654
+
1655
+ JSON::PP is a pure-Perl module and has compatibility to JSON::XS.
1656
+
1657
+
1658
+ =head2 FEATURES
1659
+
1660
+ =over
1661
+
1662
+ =item * correct unicode handling
1663
+
1664
+ This module knows how to handle Unicode (depending on Perl version).
1665
+
1666
+ See L<JSON::XS/A FEW NOTES ON UNICODE AND PERL> and
1667
+ L<UNICODE HANDLING ON PERLS>.
1668
+
1669
+
1670
+ =item * round-trip integrity
1671
+
1672
+ When you serialise a perl data structure using only data types
1673
+ supported by JSON and Perl, the deserialised data structure is
1674
+ identical on the Perl level. (e.g. the string "2.0" doesn't suddenly
1675
+ become "2" just because it looks like a number). There I<are> minor
1676
+ exceptions to this, read the MAPPING section below to learn about
1677
+ those.
1678
+
1679
+
1680
+ =item * strict checking of JSON correctness
1681
+
1682
+ There is no guessing, no generating of illegal JSON texts by default,
1683
+ and only JSON is accepted as input by default (the latter is a
1684
+ security feature). But when some options are set, loose checking
1685
+ features are available.
1686
+
1687
+ =back
1688
+
1689
+ =head1 FUNCTIONAL INTERFACE
1690
+
1691
+ Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
1692
+
1693
+ =head2 encode_json
1694
+
1695
+ $json_text = encode_json $perl_scalar
1696
+
1697
+ Converts the given Perl data structure to a UTF-8 encoded, binary string.
1698
+
1699
+ This function call is functionally identical to:
1700
+
1701
+ $json_text = JSON::PP->new->utf8->encode($perl_scalar)
1702
+
1703
+ =head2 decode_json
1704
+
1705
+ $perl_scalar = decode_json $json_text
1706
+
1707
+ The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
1708
+ to parse that as an UTF-8 encoded JSON text, returning the resulting
1709
+ reference.
1710
+
1711
+ This function call is functionally identical to:
1712
+
1713
+ $perl_scalar = JSON::PP->new->utf8->decode($json_text)
1714
+
1715
+ =head2 JSON::PP::is_bool
1716
+
1717
+ $is_boolean = JSON::PP::is_bool($scalar)
1718
+
1719
+ Returns true if the passed scalar represents either JSON::PP::true or
1720
+ JSON::PP::false, two constants that act like C<1> and C<0> respectively
1721
+ and are also used to represent JSON C<true> and C<false> in Perl strings.
1722
+
1723
+ =head2 JSON::PP::true
1724
+
1725
+ Returns JSON true value which is blessed object.
1726
+ It C<isa> JSON::PP::Boolean object.
1727
+
1728
+ =head2 JSON::PP::false
1729
+
1730
+ Returns JSON false value which is blessed object.
1731
+ It C<isa> JSON::PP::Boolean object.
1732
+
1733
+ =head2 JSON::PP::null
1734
+
1735
+ Returns C<undef>.
1736
+
1737
+ See L<MAPPING>, below, for more information on how JSON values are mapped to
1738
+ Perl.
1739
+
1740
+
1741
+ =head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
1742
+
1743
+ This section supposes that your perl version is 5.8 or later.
1744
+
1745
+ If you know a JSON text from an outer world - a network, a file content, and so on,
1746
+ is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
1747
+ with C<utf8> enable. And the decoded result will contain UNICODE characters.
1748
+
1749
+ # from network
1750
+ my $json = JSON::PP->new->utf8;
1751
+ my $json_text = CGI->new->param( 'json_data' );
1752
+ my $perl_scalar = $json->decode( $json_text );
1753
+
1754
+ # from file content
1755
+ local $/;
1756
+ open( my $fh, '<', 'json.data' );
1757
+ $json_text = <$fh>;
1758
+ $perl_scalar = decode_json( $json_text );
1759
+
1760
+ If the external data is not encoded in UTF-8, you should first C<decode> it.
1761
+
1762
+ use Encode;
1763
+ local $/;
1764
+ open( my $fh, '<', 'json.data' );
1765
+ my $encoding = 'cp932';
1766
+ my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
1767
+
1768
+ # or you can write the below code.
1769
+ #
1770
+ # open( my $fh, "<:encoding($encoding)", 'json.data' );
1771
+ # $unicode_json_text = <$fh>;
1772
+
1773
+ In this case, C<$unicode_json_text> is of course UNICODE string.
1774
+ So you B<cannot> use C<decode_json> nor C<JSON> module object with C<utf8> enable.
1775
+ Instead of them, you use C<JSON> module object with C<utf8> disable.
1776
+
1777
+ $perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
1778
+
1779
+ Or C<encode 'utf8'> and C<decode_json>:
1780
+
1781
+ $perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
1782
+ # this way is not efficient.
1783
+
1784
+ And now, you want to convert your C<$perl_scalar> into JSON data and
1785
+ send it to an outer world - a network or a file content, and so on.
1786
+
1787
+ Your data usually contains UNICODE strings and you want the converted data to be encoded
1788
+ in UTF-8, you should use C<encode_json> or C<JSON> module object with C<utf8> enable.
1789
+
1790
+ print encode_json( $perl_scalar ); # to a network? file? or display?
1791
+ # or
1792
+ print $json->utf8->encode( $perl_scalar );
1793
+
1794
+ If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
1795
+ for some reason, then its characters are regarded as B<latin1> for perl
1796
+ (because it does not concern with your $encoding).
1797
+ You B<cannot> use C<encode_json> nor C<JSON> module object with C<utf8> enable.
1798
+ Instead of them, you use C<JSON> module object with C<utf8> disable.
1799
+ Note that the resulting text is a UNICODE string, but it can be printed without problems.
1800
+
1801
+ # $perl_scalar contains $encoding encoded string values
1802
+ $unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
1803
+ # $unicode_json_text consists of characters less than 0x100
1804
+ print $unicode_json_text;
1805
+
1806
+ Or C<decode $encoding> all string values and C<encode_json>:
1807
+
1808
+ $perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
1809
+ # ... do it to each string values, then encode_json
1810
+ $json_text = encode_json( $perl_scalar );
1811
+
1812
+ This method is a proper way but probably not efficient.
1813
+
1814
+ See L<Encode> and L<perluniintro>.
1815
+
1816
+
1817
+ =head1 METHODS
1818
+
1819
+ For details, see L<JSON> or L<JSON::XS>.
1820
+
1821
+ =head2 new
1822
+
1823
+ $json = JSON::PP->new
1824
+
1825
+ Returns a new JSON::PP object that can be used to de/encode JSON
1826
+ strings.
1827
+
1828
+ All boolean flags described below are by default I<disabled>.
1829
+
1830
+ The mutators for flags all return the JSON object again and thus calls can
1831
+ be chained:
1832
+
1833
+ my $json = JSON::PP->new->utf8->space_after->encode({a => [1,2]})
1834
+ => {"a": [1, 2]}
1835
+
1836
+ =head2 ascii
1837
+
1838
+ $json = $json->ascii([$enable])
1839
+
1840
+ $enabled = $json->get_ascii
1841
+
1842
+ If $enable is true (or missing), then the encode method will not generate characters outside
1843
+ the code range 0..127. Any Unicode characters outside that range will be escaped using either
1844
+ a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
1845
+ (See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>).
1846
+
1847
+ In Perl 5.005, there is no character having high value (more than 255).
1848
+ See to L<UNICODE HANDLING ON PERLS>.
1849
+
1850
+ If $enable is false, then the encode method will not escape Unicode characters unless
1851
+ required by the JSON syntax or other flags. This results in a faster and more compact format.
1852
+
1853
+ JSON::PP->new->ascii(1)->encode([chr 0x10401])
1854
+ => ["\ud801\udc01"]
1855
+
1856
+ =head2 latin1
1857
+
1858
+ $json = $json->latin1([$enable])
1859
+
1860
+ $enabled = $json->get_latin1
1861
+
1862
+ If $enable is true (or missing), then the encode method will encode the resulting JSON
1863
+ text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
1864
+
1865
+ If $enable is false, then the encode method will not escape Unicode characters
1866
+ unless required by the JSON syntax or other flags.
1867
+
1868
+ JSON::PP->new->latin1->encode (["\x{89}\x{abc}"])
1869
+ => ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
1870
+
1871
+ See to L<UNICODE HANDLING ON PERLS>.
1872
+
1873
+ =head2 utf8
1874
+
1875
+ $json = $json->utf8([$enable])
1876
+
1877
+ $enabled = $json->get_utf8
1878
+
1879
+ If $enable is true (or missing), then the encode method will encode the JSON result
1880
+ into UTF-8, as required by many protocols, while the decode method expects to be handed
1881
+ an UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
1882
+ characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
1883
+
1884
+ (In Perl 5.005, any character outside the range 0..255 does not exist.
1885
+ See to L<UNICODE HANDLING ON PERLS>.)
1886
+
1887
+ In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
1888
+ encoding families, as described in RFC4627.
1889
+
1890
+ If $enable is false, then the encode method will return the JSON string as a (non-encoded)
1891
+ Unicode string, while decode expects thus a Unicode string. Any decoding or encoding
1892
+ (e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
1893
+
1894
+ Example, output UTF-16BE-encoded JSON:
1895
+
1896
+ use Encode;
1897
+ $jsontext = encode "UTF-16BE", JSON::PP->new->encode ($object);
1898
+
1899
+ Example, decode UTF-32LE-encoded JSON:
1900
+
1901
+ use Encode;
1902
+ $object = JSON::PP->new->decode (decode "UTF-32LE", $jsontext);
1903
+
1904
+
1905
+ =head2 pretty
1906
+
1907
+ $json = $json->pretty([$enable])
1908
+
1909
+ This enables (or disables) all of the C<indent>, C<space_before> and
1910
+ C<space_after> flags in one call to generate the most readable
1911
+ (or most compact) form possible.
1912
+
1913
+ Equivalent to:
1914
+
1915
+ $json->indent->space_before->space_after
1916
+
1917
+ =head2 indent
1918
+
1919
+ $json = $json->indent([$enable])
1920
+
1921
+ $enabled = $json->get_indent
1922
+
1923
+ The default indent space length is three.
1924
+ You can use C<indent_length> to change the length.
1925
+
1926
+ =head2 space_before
1927
+
1928
+ $json = $json->space_before([$enable])
1929
+
1930
+ $enabled = $json->get_space_before
1931
+
1932
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1933
+ optional space before the C<:> separating keys from values in JSON objects.
1934
+
1935
+ If C<$enable> is false, then the C<encode> method will not add any extra
1936
+ space at those places.
1937
+
1938
+ This setting has no effect when decoding JSON texts.
1939
+
1940
+ Example, space_before enabled, space_after and indent disabled:
1941
+
1942
+ {"key" :"value"}
1943
+
1944
+ =head2 space_after
1945
+
1946
+ $json = $json->space_after([$enable])
1947
+
1948
+ $enabled = $json->get_space_after
1949
+
1950
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1951
+ optional space after the C<:> separating keys from values in JSON objects
1952
+ and extra whitespace after the C<,> separating key-value pairs and array
1953
+ members.
1954
+
1955
+ If C<$enable> is false, then the C<encode> method will not add any extra
1956
+ space at those places.
1957
+
1958
+ This setting has no effect when decoding JSON texts.
1959
+
1960
+ Example, space_before and indent disabled, space_after enabled:
1961
+
1962
+ {"key": "value"}
1963
+
1964
+ =head2 relaxed
1965
+
1966
+ $json = $json->relaxed([$enable])
1967
+
1968
+ $enabled = $json->get_relaxed
1969
+
1970
+ If C<$enable> is true (or missing), then C<decode> will accept some
1971
+ extensions to normal JSON syntax (see below). C<encode> will not be
1972
+ affected in any way. I<Be aware that this option makes you accept invalid
1973
+ JSON texts as if they were valid!>. I suggest only to use this option to
1974
+ parse application-specific files written by humans (configuration files,
1975
+ resource files etc.)
1976
+
1977
+ If C<$enable> is false (the default), then C<decode> will only accept
1978
+ valid JSON texts.
1979
+
1980
+ Currently accepted extensions are:
1981
+
1982
+ =over 4
1983
+
1984
+ =item * list items can have an end-comma
1985
+
1986
+ JSON I<separates> array elements and key-value pairs with commas. This
1987
+ can be annoying if you write JSON texts manually and want to be able to
1988
+ quickly append elements, so this extension accepts comma at the end of
1989
+ such items not just between them:
1990
+
1991
+ [
1992
+ 1,
1993
+ 2, <- this comma not normally allowed
1994
+ ]
1995
+ {
1996
+ "k1": "v1",
1997
+ "k2": "v2", <- this comma not normally allowed
1998
+ }
1999
+
2000
+ =item * shell-style '#'-comments
2001
+
2002
+ Whenever JSON allows whitespace, shell-style comments are additionally
2003
+ allowed. They are terminated by the first carriage-return or line-feed
2004
+ character, after which more white-space and comments are allowed.
2005
+
2006
+ [
2007
+ 1, # this comment not allowed in JSON
2008
+ # neither this one...
2009
+ ]
2010
+
2011
+ =back
2012
+
2013
+ =head2 canonical
2014
+
2015
+ $json = $json->canonical([$enable])
2016
+
2017
+ $enabled = $json->get_canonical
2018
+
2019
+ If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
2020
+ by sorting their keys. This is adding a comparatively high overhead.
2021
+
2022
+ If C<$enable> is false, then the C<encode> method will output key-value
2023
+ pairs in the order Perl stores them (which will likely change between runs
2024
+ of the same script).
2025
+
2026
+ This option is useful if you want the same data structure to be encoded as
2027
+ the same JSON text (given the same overall settings). If it is disabled,
2028
+ the same hash might be encoded differently even if it contains the same data,
2029
+ as key-value pairs have no inherent ordering in Perl.
2030
+
2031
+ This setting has no effect when decoding JSON texts.
2032
+
2033
+ If you want your own sorting routine, you can give a code reference
2034
+ or a subroutine name to C<sort_by>. See C<JSON::PP OWN METHODS>.
2035
+
2036
+ =head2 allow_nonref
2037
+
2038
+ $json = $json->allow_nonref([$enable])
2039
+
2040
+ $enabled = $json->get_allow_nonref
2041
+
2042
+ If C<$enable> is true (or missing), then the C<encode> method can convert a
2043
+ non-reference into its corresponding string, number or null JSON value,
2044
+ which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
2045
+ values instead of croaking.
2046
+
2047
+ If C<$enable> is false, then the C<encode> method will croak if it isn't
2048
+ passed an arrayref or hashref, as JSON texts must either be an object
2049
+ or array. Likewise, C<decode> will croak if given something that is not a
2050
+ JSON object or array.
2051
+
2052
+ JSON::PP->new->allow_nonref->encode ("Hello, World!")
2053
+ => "Hello, World!"
2054
+
2055
+ =head2 allow_unknown
2056
+
2057
+ $json = $json->allow_unknown ([$enable])
2058
+
2059
+ $enabled = $json->get_allow_unknown
2060
+
2061
+ If $enable is true (or missing), then "encode" will *not* throw an
2062
+ exception when it encounters values it cannot represent in JSON (for
2063
+ example, filehandles) but instead will encode a JSON "null" value.
2064
+ Note that blessed objects are not included here and are handled
2065
+ separately by C<allow_blessed>.
2066
+
2067
+ If $enable is false (the default), then "encode" will throw an
2068
+ exception when it encounters anything it cannot encode as JSON.
2069
+
2070
+ This option does not affect "decode" in any way, and it is
2071
+ recommended to leave it off unless you know your communications
2072
+ partner.
2073
+
2074
+ =head2 allow_blessed
2075
+
2076
+ $json = $json->allow_blessed([$enable])
2077
+
2078
+ $enabled = $json->get_allow_blessed
2079
+
2080
+ If C<$enable> is true (or missing), then the C<encode> method will not
2081
+ barf when it encounters a blessed reference. Instead, the value of the
2082
+ B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
2083
+ disabled or no C<TO_JSON> method found) or a representation of the
2084
+ object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
2085
+ encoded. Has no effect on C<decode>.
2086
+
2087
+ If C<$enable> is false (the default), then C<encode> will throw an
2088
+ exception when it encounters a blessed object.
2089
+
2090
+ =head2 convert_blessed
2091
+
2092
+ $json = $json->convert_blessed([$enable])
2093
+
2094
+ $enabled = $json->get_convert_blessed
2095
+
2096
+ If C<$enable> is true (or missing), then C<encode>, upon encountering a
2097
+ blessed object, will check for the availability of the C<TO_JSON> method
2098
+ on the object's class. If found, it will be called in scalar context
2099
+ and the resulting scalar will be encoded instead of the object. If no
2100
+ C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
2101
+ to do.
2102
+
2103
+ The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
2104
+ returns other blessed objects, those will be handled in the same
2105
+ way. C<TO_JSON> must take care of not causing an endless recursion cycle
2106
+ (== crash) in this case. The name of C<TO_JSON> was chosen because other
2107
+ methods called by the Perl core (== not by the user of the object) are
2108
+ usually in upper case letters and to avoid collisions with the C<to_json>
2109
+ function or method.
2110
+
2111
+ This setting does not yet influence C<decode> in any way.
2112
+
2113
+ If C<$enable> is false, then the C<allow_blessed> setting will decide what
2114
+ to do when a blessed object is found.
2115
+
2116
+ =head2 filter_json_object
2117
+
2118
+ $json = $json->filter_json_object([$coderef])
2119
+
2120
+ When C<$coderef> is specified, it will be called from C<decode> each
2121
+ time it decodes a JSON object. The only argument passed to the coderef
2122
+ is a reference to the newly-created hash. If the code reference returns
2123
+ a single scalar (which need not be a reference), this value
2124
+ (i.e. a copy of that scalar to avoid aliasing) is inserted into the
2125
+ deserialised data structure. If it returns an empty list
2126
+ (NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
2127
+ hash will be inserted. This setting can slow down decoding considerably.
2128
+
2129
+ When C<$coderef> is omitted or undefined, any existing callback will
2130
+ be removed and C<decode> will not change the deserialised hash in any
2131
+ way.
2132
+
2133
+ Example, convert all JSON objects into the integer 5:
2134
+
2135
+ my $js = JSON::PP->new->filter_json_object (sub { 5 });
2136
+ # returns [5]
2137
+ $js->decode ('[{}]'); # the given subroutine takes a hash reference.
2138
+ # throw an exception because allow_nonref is not enabled
2139
+ # so a lone 5 is not allowed.
2140
+ $js->decode ('{"a":1, "b":2}');
2141
+
2142
+ =head2 filter_json_single_key_object
2143
+
2144
+ $json = $json->filter_json_single_key_object($key [=> $coderef])
2145
+
2146
+ Works remotely similar to C<filter_json_object>, but is only called for
2147
+ JSON objects having a single key named C<$key>.
2148
+
2149
+ This C<$coderef> is called before the one specified via
2150
+ C<filter_json_object>, if any. It gets passed the single value in the JSON
2151
+ object. If it returns a single value, it will be inserted into the data
2152
+ structure. If it returns nothing (not even C<undef> but the empty list),
2153
+ the callback from C<filter_json_object> will be called next, as if no
2154
+ single-key callback were specified.
2155
+
2156
+ If C<$coderef> is omitted or undefined, the corresponding callback will be
2157
+ disabled. There can only ever be one callback for a given key.
2158
+
2159
+ As this callback gets called less often than the C<filter_json_object>
2160
+ one, decoding speed will not usually suffer as much. Therefore, single-key
2161
+ objects make excellent targets to serialise Perl objects into, especially
2162
+ as single-key JSON objects are as close to the type-tagged value concept
2163
+ as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
2164
+ support this in any way, so you need to make sure your data never looks
2165
+ like a serialised Perl hash.
2166
+
2167
+ Typical names for the single object key are C<__class_whatever__>, or
2168
+ C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
2169
+ things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
2170
+ with real hashes.
2171
+
2172
+ Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
2173
+ into the corresponding C<< $WIDGET{<id>} >> object:
2174
+
2175
+ # return whatever is in $WIDGET{5}:
2176
+ JSON::PP
2177
+ ->new
2178
+ ->filter_json_single_key_object (__widget__ => sub {
2179
+ $WIDGET{ $_[0] }
2180
+ })
2181
+ ->decode ('{"__widget__": 5}')
2182
+
2183
+ # this can be used with a TO_JSON method in some "widget" class
2184
+ # for serialisation to json:
2185
+ sub WidgetBase::TO_JSON {
2186
+ my ($self) = @_;
2187
+
2188
+ unless ($self->{id}) {
2189
+ $self->{id} = ..get..some..id..;
2190
+ $WIDGET{$self->{id}} = $self;
2191
+ }
2192
+
2193
+ { __widget__ => $self->{id} }
2194
+ }
2195
+
2196
+ =head2 shrink
2197
+
2198
+ $json = $json->shrink([$enable])
2199
+
2200
+ $enabled = $json->get_shrink
2201
+
2202
+ In JSON::XS, this flag resizes strings generated by either
2203
+ C<encode> or C<decode> to their minimum size possible.
2204
+ It will also try to downgrade any strings to octet-form if possible.
2205
+
2206
+ In JSON::PP, this flag does not resize strings, but C<encode> tries to
2206
+ C<utf8::downgrade> the string it returns.
2207
+ See L<utf8>.
2209
+
2210
+ See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
2211
+
2212
+ =head2 max_depth
2213
+
2214
+ $json = $json->max_depth([$maximum_nesting_depth])
2215
+
2216
+ $max_depth = $json->get_max_depth
2217
+
2218
+ Sets the maximum nesting level (default C<512>) accepted while encoding
2219
+ or decoding. If a higher nesting level is detected in JSON text or a Perl
2220
+ data structure, then the encoder and decoder will stop and croak at that
2221
+ point.
2222
+
2223
+ Nesting level is defined by number of hash- or arrayrefs that the encoder
2224
+ needs to traverse to reach a given point or the number of C<{> or C<[>
2225
+ characters without their matching closing parenthesis crossed to reach a
2226
+ given character in a string.
2227
+
2228
+ If no argument is given, the highest possible setting will be used, which
2229
+ is rarely useful.
2230
+
2231
+ See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
2232
+
2233
+ When a large value (100 or more) was set and it de/encodes a deep nested object/text,
2234
+ it may raise a warning 'Deep recursion on subroutine' at the perl runtime phase.
2235
+
2236
+ =head2 max_size
2237
+
2238
+ $json = $json->max_size([$maximum_string_size])
2239
+
2240
+ $max_size = $json->get_max_size
2241
+
2242
+ Set the maximum length a JSON text may have (in bytes) where decoding is
2243
+ being attempted. The default is C<0>, meaning no limit. When C<decode>
2244
+ is called on a string that is longer than this many bytes, it will not
2245
+ attempt to decode the string but throw an exception. This setting has no
2246
+ effect on C<encode> (yet).
2247
+
2248
+ If no argument is given, the limit check will be deactivated (same as when
2249
+ C<0> is specified).
2250
+
2251
+ See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
2252
+
2253
+ =head2 encode
2254
+
2255
+ $json_text = $json->encode($perl_scalar)
2256
+
2257
+ Converts the given Perl data structure (a simple scalar or a reference
2258
+ to a hash or array) to its JSON representation. Simple scalars will be
2259
+ converted into JSON string or number sequences, while references to arrays
2260
+ become JSON arrays and references to hashes become JSON objects. Undefined
2261
+ Perl values (e.g. C<undef>) become JSON C<null> values.
2262
+ References to the integers C<0> and C<1> are converted into C<false> and C<true>, respectively.
2263
+
2264
+ =head2 decode
2265
+
2266
+ $perl_scalar = $json->decode($json_text)
2267
+
2268
+ The opposite of C<encode>: expects a JSON text and tries to parse it,
2269
+ returning the resulting simple scalar or reference. Croaks on error.
2270
+
2271
+ JSON numbers and strings become simple Perl scalars. JSON arrays become
2272
+ Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
2273
+ C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
2274
+ C<null> becomes C<undef>.
2275
+
2276
+ =head2 decode_prefix
2277
+
2278
+ ($perl_scalar, $characters) = $json->decode_prefix($json_text)
2279
+
2280
+ This works like the C<decode> method, but instead of raising an exception
2281
+ when there is trailing garbage after the first JSON object, it will
2282
+ silently stop parsing there and return the number of characters consumed
2283
+ so far.
2284
+
2285
+ JSON->new->decode_prefix ("[1] the tail")
2286
+ => ([1], 3)
2287
+
2288
+ =head1 INCREMENTAL PARSING
2289
+
2290
+ Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
2291
+
2292
+ In some cases, there is the need for incremental parsing of JSON texts.
2293
+ This module does allow you to parse a JSON stream incrementally.
2294
+ It does so by accumulating text until it has a full JSON object, which
2295
+ it then can decode. This process is similar to using C<decode_prefix>
2296
+ to see if a full JSON object is available, but is much more efficient
2297
+ (and can be implemented with a minimum of method calls).
2298
+
2299
+ This module will only attempt to parse the JSON text once it is sure it
2300
+ has enough text to get a decisive result, using a very simple but
2301
+ truly incremental parser. This means that it sometimes won't stop as
2302
+ early as the full parser, for example, it doesn't detect parenthesis
2303
+ mismatches. The only thing it guarantees is that it starts decoding as
2304
+ soon as a syntactically valid JSON text has been seen. This means you need
2305
+ to set resource limits (e.g. C<max_size>) to ensure the parser will stop
2306
+ parsing in the presence of syntax errors.
2307
+
2308
+ The following methods implement this incremental parser.
2309
+
2310
+ =head2 incr_parse
2311
+
2312
+ $json->incr_parse( [$string] ) # void context
2313
+
2314
+ $obj_or_undef = $json->incr_parse( [$string] ) # scalar context
2315
+
2316
+ @obj_or_empty = $json->incr_parse( [$string] ) # list context
2317
+
2318
+ This is the central parsing function. It can both append new text and
2319
+ extract objects from the stream accumulated so far (both of these
2320
+ functions are optional).
2321
+
2322
+ If C<$string> is given, then this string is appended to the already
2323
+ existing JSON fragment stored in the C<$json> object.
2324
+
2325
+ After that, if the function is called in void context, it will simply
2326
+ return without doing anything further. This can be used to add more text
2327
+ in as many chunks as you want.
2328
+
2329
+ If the method is called in scalar context, then it will try to extract
2330
+ exactly I<one> JSON object. If that is successful, it will return this
2331
+ object, otherwise it will return C<undef>. If there is a parse error,
2332
+ this method will croak just as C<decode> would do (one can then use
2333
+ C<incr_skip> to skip the erroneous part). This is the most common way of
2334
+ using the method.
2335
+
2336
+ And finally, in list context, it will try to extract as many objects
2337
+ from the stream as it can find and return them, or the empty list
2338
+ otherwise. For this to work, there must be no separators between the JSON
2339
+ objects or arrays, instead they must be concatenated back-to-back. If
2340
+ an error occurs, an exception will be raised as in the scalar context
2341
+ case. Note that in this case, any previously-parsed JSON texts will be
2342
+ lost.
2343
+
2344
+ Example: Parse some JSON arrays/objects in a given string and return them.
2345
+
2346
+ my @objs = JSON->new->incr_parse ("[5][7][1,2]");
2347
+
2348
+ =head2 incr_text
2349
+
2350
+ $lvalue_string = $json->incr_text
2351
+
2352
+ This method returns the currently stored JSON fragment as an lvalue, that
2353
+ is, you can manipulate it. This I<only> works when a preceding call to
2354
+ C<incr_parse> in I<scalar context> successfully returned an object. Under
2355
+ all other circumstances you must not call this function (I mean it.
2356
+ although in simple tests it might actually work, it I<will> fail under
2357
+ real world conditions). As a special exception, you can also call this
2358
+ method before having parsed anything.
2359
+
2360
+ This function is useful in two cases: a) finding the trailing text after a
2361
+ JSON object or b) parsing multiple JSON objects separated by non-JSON text
2362
+ (such as commas).
2363
+
2364
+ $json->incr_text =~ s/\s*,\s*//;
2365
+
2366
+ In Perl 5.005, C<lvalue> attribute is not available.
2367
+ You must write codes like the below:
2368
+
2369
+ $string = $json->incr_text;
2370
+ $string =~ s/\s*,\s*//;
2371
+ $json->incr_text( $string );
2372
+
2373
+ =head2 incr_skip
2374
+
2375
+ $json->incr_skip
2376
+
2377
+ This will reset the state of the incremental parser and will remove the
2378
+ parsed text from the input buffer. This is useful after C<incr_parse>
2379
+ died, in which case the input buffer and incremental parser state is left
2380
+ unchanged, to skip the text parsed so far and to reset the parse state.
2381
+
2382
+ =head2 incr_reset
2383
+
2384
+ $json->incr_reset
2385
+
2386
+ This completely resets the incremental parser, that is, after this call,
2387
+ it will be as if the parser had never parsed anything.
2388
+
2389
+ This is useful if you want to repeatedly parse JSON objects and want to
2390
+ ignore any trailing data, which means you have to reset the parser after
2391
+ each successful decode.
2392
+
2393
+ See to L<JSON::XS/INCREMENTAL PARSING> for examples.
2394
+
2395
+
2396
+ =head1 JSON::PP OWN METHODS
2397
+
2398
+ =head2 allow_singlequote
2399
+
2400
+ $json = $json->allow_singlequote([$enable])
2401
+
2402
+ If C<$enable> is true (or missing), then C<decode> will accept
2403
+ JSON strings quoted by single quotations that are invalid JSON
2404
+ format.
2405
+
2406
+ $json->allow_singlequote->decode(q|{"foo":'bar'}|);
2406
+ $json->allow_singlequote->decode(q|{'foo':"bar"}|);
2407
+ $json->allow_singlequote->decode(q|{'foo':'bar'}|);
2409
+
2410
+ As same as the C<relaxed> option, this option may be used to parse
2411
+ application-specific files written by humans.
2412
+
2413
+
2414
+ =head2 allow_barekey
2415
+
2416
+ $json = $json->allow_barekey([$enable])
2417
+
2418
+ If C<$enable> is true (or missing), then C<decode> will accept
2419
+ bare keys of JSON object that are invalid JSON format.
2420
+
2421
+ As same as the C<relaxed> option, this option may be used to parse
2422
+ application-specific files written by humans.
2423
+
2424
+ $json->allow_barekey->decode('{foo:"bar"}');
2425
+
2426
+ =head2 allow_bignum
2427
+
2428
+ $json = $json->allow_bignum([$enable])
2429
+
2430
+ If C<$enable> is true (or missing), then C<decode> will convert
2431
+ the big integer Perl cannot handle as integer into a L<Math::BigInt>
2432
+ object and convert a floating number (any) into a L<Math::BigFloat>.
2433
+
2434
+ On the contrary, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
2435
+ objects into JSON numbers when C<allow_blessed> is enabled.
2436
+
2437
+ $json->allow_nonref->allow_blessed->allow_bignum;
2438
+ $bigfloat = $json->decode('2.000000000000000000000000001');
2439
+ print $json->encode($bigfloat);
2440
+ # => 2.000000000000000000000000001
2441
+
2442
+ See to L<JSON::XS/MAPPING> about the normal conversion of JSON number.
2443
+
2444
+ =head2 loose
2445
+
2446
+ $json = $json->loose([$enable])
2447
+
2448
+ The unescaped [\x00-\x1f\x22\x2f\x5c] strings are invalid in JSON strings
2449
+ and the module refuses to C<decode> them (except for \x2f).
2450
+ If C<$enable> is true (or missing), then C<decode> will accept these
2451
+ unescaped strings.
2452
+
2453
+ $json->loose->decode(qq|["abc
2454
+ def"]|);
2455
+
2456
+ See L<JSON::XS/SECURITY CONSIDERATIONS>.
2457
+
2458
+ =head2 escape_slash
2459
+
2460
+ $json = $json->escape_slash([$enable])
2461
+
2462
+ According to the JSON grammar, I<slash> (U+002F) may be escaped. By default,
2462
+ JSON::PP (like JSON::XS) encodes strings without escaping slashes.
2464
+
2465
+ If C<$enable> is true (or missing), then C<encode> will escape slashes.
2466
+
2467
+ =head2 indent_length
2468
+
2469
+ $json = $json->indent_length($length)
2470
+
2471
+ JSON::XS indent space length is 3 and cannot be changed.
2472
+ JSON::PP set the indent space length with the given $length.
2473
+ The default is 3. The acceptable range is 0 to 15.
2474
+
2475
+ =head2 sort_by
2476
+
2477
+ $json = $json->sort_by($function_name)
2478
+ $json = $json->sort_by($subroutine_ref)
2479
+
2480
+ If $function_name or $subroutine_ref is set, that sort routine is used
2481
+ in encoding JSON objects.
2482
+
2483
+ $js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
2484
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
2485
+
2486
+ $js = $pc->sort_by('own_sort')->encode($obj);
2487
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
2488
+
2489
+ sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
2490
+
2491
+ As the sorting routine runs in the JSON::PP scope, the given
2492
+ subroutine name and the special variables C<$a>, C<$b> will begin
2493
+ 'JSON::PP::'.
2494
+
2495
+ If $integer is set, then the effect is the same as turning C<canonical> on.
2496
+
2497
+ =head1 INTERNAL
2498
+
2499
+ For developers.
2500
+
2501
+ =over
2502
+
2503
+ =item PP_encode_box
2504
+
2505
+ Returns
2506
+
2507
+ {
2508
+ depth => $depth,
2509
+ indent_count => $indent_count,
2510
+ }
2511
+
2512
+
2513
+ =item PP_decode_box
2514
+
2515
+ Returns
2516
+
2517
+ {
2518
+ text => $text,
2519
+ at => $at,
2520
+ ch => $ch,
2521
+ len => $len,
2522
+ depth => $depth,
2523
+ encoding => $encoding,
2524
+ is_valid_utf8 => $is_valid_utf8,
2525
+ };
2526
+
2527
+ =back
2528
+
2529
+ =head1 MAPPING
2530
+
2531
+ This section is copied from JSON::XS and modified to C<JSON::PP>.
2532
+ JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
2533
+
2534
+ See to L<JSON::XS/MAPPING>.
2535
+
2536
+ =head2 JSON -> PERL
2537
+
2538
+ =over 4
2539
+
2540
+ =item object
2541
+
2542
+ A JSON object becomes a reference to a hash in Perl. No ordering of object
2543
+ keys is preserved (JSON does not preserve object key ordering itself).
2544
+
2545
+ =item array
2546
+
2547
+ A JSON array becomes a reference to an array in Perl.
2548
+
2549
+ =item string
2550
+
2551
+ A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
2552
+ are represented by the same codepoints in the Perl string, so no manual
2553
+ decoding is necessary.
2554
+
2555
+ =item number
2556
+
2557
+ A JSON number becomes either an integer, numeric (floating point) or
2558
+ string scalar in perl, depending on its range and any fractional parts. On
2559
+ the Perl level, there is no difference between those as Perl handles all
2560
+ the conversion details, but an integer may take slightly less memory and
2561
+ might represent more values exactly than floating point numbers.
2562
+
2563
+ If the number consists of digits only, C<JSON> will try to represent
2564
+ it as an integer value. If that fails, it will try to represent it as
2565
+ a numeric (floating point) value if that is possible without loss of
2566
+ precision. Otherwise it will preserve the number as a string value (in
2567
+ which case you lose roundtripping ability, as the JSON number will be
2568
+ re-encoded to a JSON string).
2569
+
2570
+ Numbers containing a fractional or exponential part will always be
2571
+ represented as numeric (floating point) values, possibly at a loss of
2572
+ precision (in which case you might lose perfect roundtripping ability, but
2573
+ the JSON number will still be re-encoded as a JSON number).
2574
+
2575
+ Note that precision is not accuracy - binary floating point values cannot
2576
+ represent most decimal fractions exactly, and when converting from and to
2577
+ floating point, C<JSON> only guarantees precision up to but not including
2578
+ the least significant bit.
2579
+
2580
+ When C<allow_bignum> is enabled, the big integers
2581
+ and the numeric can be optionally converted into L<Math::BigInt> and
2582
+ L<Math::BigFloat> objects.
2583
+
2584
+ =item true, false
2585
+
2586
+ These JSON atoms become C<JSON::PP::true> and C<JSON::PP::false>,
2587
+ respectively. They are overloaded to act almost exactly like the numbers
2588
+ C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
2589
+ the C<JSON::is_bool> function.
2590
+
2591
+ print JSON::PP::true . "\n";
2592
+ => true
2593
+ print JSON::PP::true + 1;
2594
+ => 1
2595
+
2596
+ ok(JSON::true eq '1');
2597
+ ok(JSON::true == 1);
2598
+
2599
+ C<JSON> will install these missing overloading features to the backend modules.
2600
+
2601
+
2602
+ =item null
2603
+
2604
+ A JSON null atom becomes C<undef> in Perl.
2605
+
2606
+ C<JSON::PP::null> returns C<undef>.
2607
+
2608
+ =back
2609
+
2610
+
2611
+ =head2 PERL -> JSON
2612
+
2613
+ The mapping from Perl to JSON is slightly more difficult, as Perl is a
2614
+ truly typeless language, so we can only guess which JSON type is meant by
2615
+ a Perl value.
2616
+
2617
+ =over 4
2618
+
2619
+ =item hash references
2620
+
2621
+ Perl hash references become JSON objects. As there is no inherent ordering
2622
+ in hash keys (or JSON objects), they will usually be encoded in a
2623
+ pseudo-random order that can change between runs of the same program but
2624
+ stays generally the same within a single run of a program. C<JSON>
2625
+ can optionally sort the hash keys (determined by the I<canonical> flag), so
2626
+ the same data structure will serialise to the same JSON text (given same
2627
+ settings and version of JSON::XS), but this incurs a runtime overhead
2628
+ and is only rarely useful, e.g. when you want to compare some JSON text
2629
+ against another for equality.
2630
+
2631
+
2632
+ =item array references
2633
+
2634
+ Perl array references become JSON arrays.
2635
+
2636
+ =item other references
2637
+
2638
+ Other unblessed references are generally not allowed and will cause an
2639
+ exception to be thrown, except for references to the integers C<0> and
2640
+ C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
2641
+ also use C<JSON::false> and C<JSON::true> to improve readability.
2642
+
2643
+ to_json [\0,JSON::PP::true] # yields [false,true]
2644
+
2645
+ =item JSON::PP::true, JSON::PP::false, JSON::PP::null
2646
+
2647
+ These special values become JSON true and JSON false values,
2648
+ respectively. You can also use C<\1> and C<\0> directly if you want.
2649
+
2650
+ JSON::PP::null returns C<undef>.
2651
+
2652
+ =item blessed objects
2653
+
2654
+ Blessed objects are not directly representable in JSON. See the
2655
+ C<allow_blessed> and C<convert_blessed> methods on various options on
2656
+ how to deal with this: basically, you can choose between throwing an
2657
+ exception, encoding the reference as if it weren't blessed, or provide
2658
+ your own serialiser method.
2659
+
2660
+ See to L<convert_blessed>.
2661
+
2662
+ =item simple scalars
2663
+
2664
+ Simple Perl scalars (any scalar that is not a reference) are the most
2665
+ difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
2666
+ JSON C<null> values, scalars that have last been used in a string context
2667
+ before encoding as JSON strings, and anything else as number value:
2668
+
2669
+ # dump as number
2670
+ encode_json [2] # yields [2]
2671
+ encode_json [-3.0e17] # yields [-3e+17]
2672
+ my $value = 5; encode_json [$value] # yields [5]
2673
+
2674
+ # used as string, so dump as string
2675
+ print $value;
2676
+ encode_json [$value] # yields ["5"]
2677
+
2678
+ # undef becomes null
2679
+ encode_json [undef] # yields [null]
2680
+
2681
+ You can force the type to be a string by stringifying it:
2682
+
2683
+ my $x = 3.1; # some variable containing a number
2684
+ "$x"; # stringified
2685
+ $x .= ""; # another, more awkward way to stringify
2686
+ print $x; # perl does it for you, too, quite often
2687
+
2688
+ You can force the type to be a number by numifying it:
2689
+
2690
+ my $x = "3"; # some variable containing a string
2691
+ $x += 0; # numify it, ensuring it will be dumped as a number
2692
+ $x *= 1; # same thing, the choice is yours.
2693
+
2694
+ You can not currently force the type in other, less obscure, ways.
2695
+
2696
+ Note that numerical precision has the same meaning as under Perl (so
2697
+ binary to decimal conversion follows the same rules as in Perl, which
2698
+ can differ to other languages). Also, your perl interpreter might expose
2699
+ extensions to the floating point numbers of your platform, such as
2700
+ infinities or NaN's - these cannot be represented in JSON, and it is an
2701
+ error to pass those in.
2702
+
2703
+ =item Big Number
2704
+
2705
+ When C<allow_bignum> is enabled,
2706
+ C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
2707
+ objects into JSON numbers.
2708
+
2709
+
2710
+ =back
2711
+
2712
+ =head1 UNICODE HANDLING ON PERLS
2713
+
2714
+ If you are not familiar with Unicode handling in Perl,
2715
+ please check L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
2716
+
2717
+ =head2 Perl 5.8 and later
2718
+
2719
+ Perl can handle Unicode and the JSON::PP de/encode methods also work properly.
2720
+
2721
+ $json->allow_nonref->encode(chr hex 3042);
2722
+ $json->allow_nonref->encode(chr hex 12345);
2723
+
2724
+ Returns C<"\u3042"> and C<"\ud808\udf45"> respectively.
2725
+
2726
+ $json->allow_nonref->decode('"\u3042"');
2727
+ $json->allow_nonref->decode('"\ud808\udf45"');
2728
+
2729
+ Returns UTF-8 encoded strings with UTF8 flag, regarded as C<U+3042> and C<U+12345>.
2730
+
2731
+ Note that the versions from Perl 5.8.0 to 5.8.2, Perl built-in C<join> was broken,
2732
+ so JSON::PP wraps the C<join> with a subroutine. Thus JSON::PP runs slowly on those versions.
2733
+
2734
+
2735
+ =head2 Perl 5.6
2736
+
2737
+ Perl can handle Unicode and the JSON::PP de/encode methods also work.
2738
+
2739
+ =head2 Perl 5.005
2740
+
2741
+ Perl 5.005 is a byte semantics world -- all strings are sequences of bytes.
2742
+ That means the unicode handling is not available.
2743
+
2744
+ In encoding,
2745
+
2746
+ $json->allow_nonref->encode(chr hex 3042); # hex 3042 is 12354.
2747
+ $json->allow_nonref->encode(chr hex 12345); # hex 12345 is 74565.
2748
+
2749
+ Returns C<B> and C<E>: when C<chr> is given a value greater than 255, it is
2749
+ treated as C<$value % 256>, so the code above is equivalent to:
2751
+
2752
+ $json->allow_nonref->encode(chr 66);
2753
+ $json->allow_nonref->encode(chr 69);
2754
+
2755
+ In decoding,
2756
+
2757
+ $json->decode('"\u00e3\u0081\u0082"');
2758
+
2759
+ The returned is a byte sequence C<0xE3 0x81 0x82> for UTF-8 encoded
2760
+ japanese character (C<HIRAGANA LETTER A>).
2761
+ And if it is represented in Unicode code point, C<U+3042>.
2762
+
2763
+ Next,
2764
+
2765
+ $json->decode('"\u3042"');
2766
+
2767
+ We would ordinarily expect the returned value to be the Unicode character C<U+3042>.
2767
+ But this is the 5.005 world, so the result is the byte sequence C<0xE3 0x81 0x82>.
2769
+
2770
+ $json->decode('"\ud808\udf45"');
2771
+
2772
+ This is not a character C<U+12345> but bytes - C<0xf0 0x92 0x8d 0x85>.
2773
+
2774
+
2775
+ =head1 TODO
2776
+
2777
+ =over
2778
+
2779
+ =item speed
2780
+
2781
+ =item memory saving
2782
+
2783
+ =back
2784
+
2785
+
2786
+ =head1 SEE ALSO
2787
+
2788
+ Most of this document is copied and modified from the JSON::XS documentation.
2789
+
2790
+ L<JSON::XS>
2791
+
2792
+ RFC4627 (L<http://www.ietf.org/rfc/rfc4627.txt>)
2793
+
2794
+ =head1 AUTHOR
2795
+
2796
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
2797
+
2798
+
2799
+ =head1 COPYRIGHT AND LICENSE
2800
+
2801
+ Copyright 2007-2012 by Makamaka Hannyaharamitu
2802
+
2803
+ This library is free software; you can redistribute it and/or modify
2804
+ it under the same terms as Perl itself.
2805
+
2806
+ =cut
uroman/lib/JSON/backportPP/Boolean.pm ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ =head1 NAME
2
+
3
+ JSON::PP::Boolean - dummy module providing JSON::PP::Boolean
4
+
5
+ =head1 SYNOPSIS
6
+
7
+ # do not "use" yourself
8
+
9
+ =head1 DESCRIPTION
10
+
11
+ This module exists only to provide overload resolution for Storable
12
+ and similar modules. See L<JSON::PP> for more info about this class.
13
+
14
+ =cut
15
+
16
+ use JSON::backportPP ();
17
+ use strict;
18
+
19
+ 1;
20
+
21
+ =head1 AUTHOR
22
+
23
+ This idea is from L<JSON::XS::Boolean> written by
24
+ Marc Lehmann <schmorp[at]schmorp.de>
25
+
26
+ =cut
27
+
uroman/lib/JSON/backportPP/Compat5005.pm ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package # This is JSON::backportPP
2
+ JSON::backportPP5005;
3
+
4
+ use 5.005;
5
+ use strict;
6
+
7
+ my @properties;
8
+
9
+ $JSON::PP5005::VERSION = '1.10';
10
+
11
+ BEGIN {
12
+
13
+ sub utf8::is_utf8 {
14
+ 0; # It is considered that UTF8 flag off for Perl 5.005.
15
+ }
16
+
17
+ sub utf8::upgrade {
18
+ }
19
+
20
+ sub utf8::downgrade {
21
+ 1; # must always return true.
22
+ }
23
+
24
+ sub utf8::encode {
25
+ }
26
+
27
+ sub utf8::decode {
28
+ }
29
+
30
+ *JSON::PP::JSON_PP_encode_ascii = \&_encode_ascii;
31
+ *JSON::PP::JSON_PP_encode_latin1 = \&_encode_latin1;
32
+ *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
33
+ *JSON::PP::JSON_PP_decode_unicode = \&_decode_unicode;
34
+
35
+ # missing in B module.
36
+ sub B::SVp_IOK () { 0x01000000; }
37
+ sub B::SVp_NOK () { 0x02000000; }
38
+ sub B::SVp_POK () { 0x04000000; }
39
+
40
+ $INC{'bytes.pm'} = 1; # dummy
41
+ }
42
+
43
+
44
+
45
+ sub _encode_ascii {
46
+ join('', map { $_ <= 127 ? chr($_) : sprintf('\u%04x', $_) } unpack('C*', $_[0]) );
47
+ }
48
+
49
+
50
+ sub _encode_latin1 {
51
+ join('', map { chr($_) } unpack('C*', $_[0]) );
52
+ }
53
+
54
+
55
+ sub _decode_surrogates { # from http://homepage1.nifty.com/nomenclator/unicode/ucs_utf.htm
56
+ my $uni = 0x10000 + (hex($_[0]) - 0xD800) * 0x400 + (hex($_[1]) - 0xDC00); # from perlunicode
57
+ my $bit = unpack('B32', pack('N', $uni));
58
+
59
+ if ( $bit =~ /^00000000000(...)(......)(......)(......)$/ ) {
60
+ my ($w, $x, $y, $z) = ($1, $2, $3, $4);
61
+ return pack('B*', sprintf('11110%s10%s10%s10%s', $w, $x, $y, $z));
62
+ }
63
+ else {
64
+ Carp::croak("Invalid surrogate pair");
65
+ }
66
+ }
67
+
68
+
69
+ sub _decode_unicode {
70
+ my ($u) = @_;
71
+ my ($utf8bit);
72
+
73
+ if ( $u =~ /^00([89a-f][0-9a-f])$/i ) { # 0x80-0xff
74
+ return pack( 'H2', $1 );
75
+ }
76
+
77
+ my $bit = unpack("B*", pack("H*", $u));
78
+
79
+ if ( $bit =~ /^00000(.....)(......)$/ ) {
80
+ $utf8bit = sprintf('110%s10%s', $1, $2);
81
+ }
82
+ elsif ( $bit =~ /^(....)(......)(......)$/ ) {
83
+ $utf8bit = sprintf('1110%s10%s10%s', $1, $2, $3);
84
+ }
85
+ else {
86
+ Carp::croak("Invalid escaped unicode");
87
+ }
88
+
89
+ return pack('B*', $utf8bit);
90
+ }
91
+
92
+
93
+ sub JSON::PP::incr_text {
94
+ $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;
95
+
96
+ if ( $_[0]->{_incr_parser}->{incr_parsing} ) {
97
+ Carp::croak("incr_text can not be called when the incremental parser already started parsing");
98
+ }
99
+
100
+ $_[0]->{_incr_parser}->{incr_text} = $_[1] if ( @_ > 1 );
101
+ $_[0]->{_incr_parser}->{incr_text};
102
+ }
103
+
104
+
105
+ 1;
106
+ __END__
107
+
108
+ =pod
109
+
110
+ =head1 NAME
111
+
112
+ JSON::PP5005 - Helper module in using JSON::PP in Perl 5.005
113
+
114
+ =head1 DESCRIPTION
115
+
116
+ This module is loaded internally by JSON::PP; do not use it directly.
117
+
118
+ =head1 AUTHOR
119
+
120
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
121
+
122
+
123
+ =head1 COPYRIGHT AND LICENSE
124
+
125
+ Copyright 2007-2012 by Makamaka Hannyaharamitu
126
+
127
+ This library is free software; you can redistribute it and/or modify
128
+ it under the same terms as Perl itself.
129
+
130
+ =cut
131
+
uroman/lib/JSON/backportPP/Compat5006.pm ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package # This is JSON::backportPP
2
+ JSON::backportPP56;
3
+
4
+ use 5.006;
5
+ use strict;
6
+
7
+ my @properties;
8
+
9
+ $JSON::PP56::VERSION = '1.08';
10
+
11
+ BEGIN {
12
+
13
# Perl 5.6 stand-in for utf8::is_utf8: a string is treated as UTF8-flagged
# when its character length differs from its byte length.
sub utf8::is_utf8 {
    my $char_length = length $_[0];
    my $byte_length = do { use bytes; length $_[0] };
    return $char_length != $byte_length;
}
20
+
21
+
22
# Perl 5.6 stand-in for utf8::upgrade: deliberately does nothing.
sub utf8::upgrade {
    return;
}
25
+
26
+
27
# Perl 5.6 stand-in for utf8::downgrade.  Modifies $_[0] in place.
#   $_[0]  string to downgrade
#   $_[1]  when true, return 0 instead of croaking on failure
# Returns 1 on success (including strings that were not flagged at all).
sub utf8::downgrade ($;$) {
    # Nothing to do for strings without the UTF8 flag.
    return 1 unless utf8::is_utf8($_[0]);

    unless (_is_valid_utf8($_[0])) {
        Carp::croak("Wide character in subroutine entry") unless $_[1];
        return 0;
    }

    # Rebuild the string code point by code point: values below 256 are
    # packed with "C", everything else with "U".
    my $rebuilt;
    for my $code_point (unpack("U*", $_[0])) {
        $rebuilt .= pack($code_point < 256 ? "C" : "U", $code_point);
    }
    $_[0] = $rebuilt;
    return 1;
}
48
+
49
+
50
# Perl 5.6 stand-in for utf8::encode (result has the UTF8 flag off).
# Modifies $_[0] in place.
sub utf8::encode ($) {
    # Unflagged input is first repacked as characters with "U*" ...
    $_[0] = pack("U*", unpack("C*", $_[0])) unless utf8::is_utf8($_[0]);

    # ... and in every case the string is then flattened through "C*".
    $_[0] = pack("C*", unpack("C*", $_[0]));
}
59
+
60
+
61
# Perl 5.6 stand-in for utf8::decode (result has the UTF8 flag on).
# Only acts when _is_valid_utf8 accepts the string; then $_[0] is
# downgraded and repacked through "U*" in place.
sub utf8::decode ($) {
    _is_valid_utf8($_[0]) and do {
        utf8::downgrade($_[0]);
        $_[0] = pack("U*", unpack("U*", $_[0]));
    };
}
67
+
68
+
69
# Install the Perl 5.6 implementations under their JSON_PP_* names in the
# JSON::PP namespace.
*JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
*JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
*JSON::PP::JSON_PP_decode_surrogates = \&JSON::PP::_decode_surrogates;
*JSON::PP::JSON_PP_decode_unicode    = \&JSON::PP::_decode_unicode;

# B::SVp_NOK may be missing from the B module; define the constant if so.
eval q{ sub B::SVp_NOK () { 0x02000000; } } unless defined &B::SVp_NOK;
77
+
78
+ }
79
+
80
+
81
+
82
# Render $_[0] using ASCII characters only: code points up to 127 are kept
# as characters, code points up to 65535 become \uXXXX, and anything
# larger becomes two \u escapes built from JSON::PP::_encode_surrogates.
sub _encode_ascii {
    my @pieces;
    for my $code_point (_unpack_emu($_[0])) {
        if ($code_point <= 127) {
            push @pieces, chr($code_point);
        }
        elsif ($code_point <= 65535) {
            push @pieces, sprintf('\u%04x', $code_point);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', JSON::PP::_encode_surrogates($code_point));
        }
    }
    return join('', @pieces);
}
92
+
93
+
94
# Render $_[0] using Latin-1 characters only: code points up to 255 are
# kept as characters, code points up to 65535 become \uXXXX, and anything
# larger becomes two \u escapes built from JSON::PP::_encode_surrogates.
sub _encode_latin1 {
    my @pieces;
    for my $code_point (_unpack_emu($_[0])) {
        if ($code_point <= 255) {
            push @pieces, chr($code_point);
        }
        elsif ($code_point <= 65535) {
            push @pieces, sprintf('\u%04x', $code_point);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', JSON::PP::_encode_surrogates($code_point));
        }
    }
    return join('', @pieces);
}
104
+
105
+
106
# Unpack $_[0] into a list of numbers while avoiding Perl 5.6 unpack
# warnings: "U*" is used only for flagged strings that _is_valid_utf8
# accepts; every other string is unpacked with "C*".
sub _unpack_emu {
    return unpack('C*', $_[0]) unless utf8::is_utf8($_[0]);
    return unpack('U*', $_[0]) if _is_valid_utf8($_[0]);
    return unpack('C*', $_[0]);
}
111
+
112
+
113
# Classify $_[0] against a byte-level pattern for well-formed UTF-8.
# The first character decides the verdict:
#   - first character well-formed, all later ones too  -> 1
#   - first character well-formed, a later one is not  -> empty return
#   - first character malformed                        -> 0
#   - empty string                                     -> undef
sub _is_valid_utf8 {
    my $text = $_[0];
    my $verdict;    # stays undef until the first character is classified

    while ($text =~ /(?:
          (
             [\x00-\x7F]
            |[\xC2-\xDF][\x80-\xBF]
            |[\xE0][\xA0-\xBF][\x80-\xBF]
            |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
            |[\xED][\x80-\x9F][\x80-\xBF]
            |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
            |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
          )
        | (.)
        )/xg)
    {
        my $well_formed = defined $1;
        $verdict = ($well_formed ? 1 : 0) unless defined $verdict;

        # Started out as UTF-8 but hit a malformed character: give up.
        return if $verdict && !$well_formed;
    }

    return $verdict;
}
145
+
146
+
147
+ 1;
148
+ __END__
149
+
150
+ =pod
151
+
152
+ =head1 NAME
153
+
154
+ JSON::PP56 - Helper module in using JSON::PP in Perl 5.6
155
+
156
+ =head1 DESCRIPTION
157
+
158
+ This module is called internally by JSON::PP.
159
+
160
+ =head1 AUTHOR
161
+
162
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
163
+
164
+
165
+ =head1 COPYRIGHT AND LICENSE
166
+
167
+ Copyright 2007-2012 by Makamaka Hannyaharamitu
168
+
169
+ This library is free software; you can redistribute it and/or modify
170
+ it under the same terms as Perl itself.
171
+
172
+ =cut
173
+
uroman/lib/NLP/Chinese.pm ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # Chinese #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::Chinese;
8
+
9
+ $utf8 = NLP::UTF8;
10
+ %empty_ht = ();
11
+
12
# Load Chinese character -> pinyin tables from @filenames into the
# caller's hash, which is passed as a typeglob (*ht).
# The kind of file is recognised from its name (case-insensitive):
#   *unihan*             fills $ht{kHanyuPinlu}, $ht{kXHC1983},
#                        $ht{kHanyuPinyin} and $ht{kMandarin}
#   *cedict*             fills $ht{cedict} (first reading seen) and
#                        $ht{cedicts} (space-separated distinct readings)
#   *chinese_to_pinyin*  fills $ht{simple_pinyin}
# Lines starting with "#" are skipped.  Files that cannot be opened or
# recognised are reported on STDERR and skipped.
# The $n_* counters are left as package variables, as before.
sub read_chinese_tonal_pinyin_files {
    local($caller, *ht, @filenames) = @_;

    $n_kHanyuPinlu = 0;
    $n_kXHC1983 = 0;
    $n_kHanyuPinyin = 0;
    $n_kMandarin = 0;
    $n_cedict = 0;
    $n_simple_pinyin = 0;

    # $util is never assigned anywhere in package NLP::Chinese, so the
    # original $util->trim calls were made on an undefined value.  It is
    # resolved lazily here, on first use, so callers that only load
    # cedict / chinese_to_pinyin files are unaffected.
    # NOTE(review): assumes NLP::utilities provides trim() as a class
    # method, as the original calls already expected -- confirm.
    my $util;

    foreach my $filename (@filenames) {
        if ($filename =~ /unihan/i) {
            # Three-argument open with a lexical handle: the file name is
            # never interpreted as a mode, and no global handle is shared.
            if (open(my $in, "<", $filename)) {
                while (my $line = <$in>) {
                    next if $line =~ /^#/;
                    $line =~ s/\s*$//;
                    my ($u, $type, $value) = split(/\t/, $line);
                    next unless defined($u);      # empty line: no fields at all
                    next unless defined($type)
                        && $type =~ /^(kHanyuPinlu|kXHC1983|kHanyuPinyin|kMandarin)$/;

                    $util ||= $NLP::Chinese::util
                        || do { require NLP::utilities; "NLP::utilities" };
                    $u = $util->trim($u);
                    $type = $util->trim($type);
                    $value = $util->trim($value);
                    my $f = $utf8->unicode_string2string($u);

                    if ($type eq "kHanyuPinlu") {
                        $value =~ s/\(.*?\)//g;   # drop parenthesised parts
                        $value = $util->trim($value);
                        $ht{"kHanyuPinlu"}->{$f} = $caller->number_to_accent_tone($value);
                        $n_kHanyuPinlu++;
                    } elsif ($type eq "kXHC1983") {
                        # keep only what follows each ":" in the value
                        my @translits = ($value =~ /:(\S+)/g);
                        $ht{"kXHC1983"}->{$f} = join(" ", @translits);
                        $n_kXHC1983++;
                    } elsif ($type eq "kHanyuPinyin") {
                        $value =~ s/^.*://;
                        $value =~ s/,/ /g;
                        $ht{"kHanyuPinyin"}->{$f} = $value;
                        $n_kHanyuPinyin++;
                    } elsif ($type eq "kMandarin") {
                        $ht{"kMandarin"}->{$f} = $value;
                        $n_kMandarin++;
                    }
                }
                close($in);
                print "Read in $n_kHanyuPinlu kHanyuPinlu, $n_kXHC1983 n_kXHC1983, $n_kHanyuPinyin n_kHanyuPinyin $n_kMandarin n_kMandarin\n";
            } else {
                print STDERR "Can't open $filename\n";
            }
        } elsif ($filename =~ /cedict/i) {
            if (open(my $in, "<", $filename)) {
                while (my $line = <$in>) {
                    next if $line =~ /^#/;
                    $line =~ s/\s*$//;
                    # second field is the key, the [...] part its numbered pinyin
                    next unless $line =~ /^\S+\s+(\S+)\s+\[([^\[\]]+)\]/;
                    my ($f, $translit) = ($1, $2);
                    $translit = $utf8->extended_lower_case($translit);
                    $translit = $caller->number_to_accent_tone($translit);
                    $translit =~ s/\s//g;
                    if (my $old_translit = $ht{"cedict"}->{$f}) {
                        # $ht{CONFLICT}->{("DUPLICATE " . $f)} = "CEDICT($f): $old_translit\nCEDICT($f): $translit (duplicate)\n" unless $translit eq $old_translit;
                        $ht{"cedicts"}->{$f} = join(" ", $ht{"cedicts"}->{$f}, $translit) unless $old_translit eq $translit;
                    } else {
                        $ht{"cedict"}->{$f} = $translit;
                        $ht{"cedicts"}->{$f} = $translit;
                    }
                    $n_cedict++;
                }
                close($in);
                # print "Read in $n_cedict n_cedict\n";
            } else {
                # trailing newline added for consistency with the unihan branch
                print STDERR "Can't open $filename\n";
            }
        } elsif ($filename =~ /chinese_to_pinyin/i) {
            if (open(my $in, "<", $filename)) {
                while (my $line = <$in>) {
                    next if $line =~ /^#/;
                    if ($line =~ /^(\S+)\t(\S+)\s*$/) {
                        $ht{"simple_pinyin"}->{$1} = $2;
                        $n_simple_pinyin++;
                    }
                }
                close($in);
                # print "Read in $n_simple_pinyin n_simple_pinyin\n";
            } else {
                # trailing newline added for consistency with the unihan branch
                print STDERR "Can't open $filename\n";
            }
        } else {
            print STDERR "Don't know what to do with file $filename (in read_chinese_tonal_pinyin_files)\n";
        }
    }
}
112
+
113
# Return the tonal pinyin for string $s, using the tables in the caller's
# hash (passed as typeglob *ht).
#   $caller  invocant (unused here)
#   $s       string to look up
#   *ht      table hash; reads cedict, cedicts, simple_pinyin, kHanyuPinlu,
#            kXHC1983, kHanyuPinyin; writes COMBINED, PINYIN_TITLE, CONFLICT
#   $gloss   text used only in the CONFLICT log entry
# Results are cached in $ht{COMBINED}.  For multi-character strings a
# cedict entry for the whole string wins over the per-character reading.
# Characters with no known reading contribute "?".
sub tonal_pinyin {
    local($caller, $s, *ht, $gloss) = @_;

    my $result = $ht{COMBINED}->{$s};
    return $result if defined($result);

    my $cedict_pinyin  = $ht{"cedict"}->{$s} || "";
    my $cedicts_pinyin = $ht{"cedicts"}->{$s} || "";
    my $unihan_pinyin  = "";
    my @characters = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);

    # Per-character reading: first table that knows the character wins.
    foreach my $c (@characters) {
        my $pinyin;
        if ($pinyin = $ht{"simple_pinyin"}->{$c}) {
            $unihan_pinyin .= $pinyin;
        } elsif (($pinyin = $ht{"kHanyuPinlu"}->{$c})
              || ($pinyin = $ht{"kXHC1983"}->{$c})
              || ($pinyin = $ht{"kHanyuPinyin"}->{$c})
              || ($pinyin = $ht{"cedicts"}->{$c})) {
            # these tables may list several readings; keep only the first
            $pinyin =~ s/^(\S+)\s.*$/$1/;
            $unihan_pinyin .= $pinyin;
        # middle dot, katakana middle dot, multiplication sign
        } elsif ($c =~ /^(\xC2\xB7|\xE3\x83\xBB|\xC3\x97)$/) {
            $unihan_pinyin .= $c;
        # ASCII
        } elsif ($c =~ /^([\x21-\x7E])$/) {
            $unihan_pinyin .= $c;
        } else {
            $unihan_pinyin .= "?";
        }
    }

    # Build the title text listing every known reading and its sources.
    my $pinyin_title = "";
    if (($#characters >= 1) && $cedicts_pinyin) {
        foreach my $pinyin (split(/\s+/, $cedicts_pinyin)) {
            $pinyin_title .= "$s $pinyin (CEDICT)\n";
        }
        $pinyin_title .= "\n";
    }
    foreach my $c (@characters) {
        my %sources_for = ();   # reading -> comma-separated source names
        my @pinyins = ();       # readings in first-seen order
        foreach my $type (("kHanyuPinlu", "kXHC1983", "kHanyuPinyin", "cedicts")) {
            if (my $pinyin_s = $ht{$type}->{$c}) {
                foreach my $pinyin (split(/\s+/, $pinyin_s)) {
                    # The original tested membership with $util->member, but
                    # $util is never assigned in this package.  Every reading
                    # pushed onto @pinyins also gets a %sources_for entry, so
                    # the hash answers the same question.
                    push(@pinyins, $pinyin) unless exists $sources_for{$pinyin};
                    my $type2 = ($type eq "cedicts") ? "CEDICT" : $type;
                    $sources_for{$pinyin} = ($sources_for{$pinyin}) ? join(", ", $sources_for{$pinyin}, $type2) : $type2;
                }
            }
        }
        foreach my $pinyin (@pinyins) {
            my $type_s = $sources_for{$pinyin};
            $pinyin_title .= "$c $pinyin ($type_s)\n";
        }
    }
    $pinyin_title =~ s/\n$//;
    $pinyin_title =~ s/\n/&#xA;/g;

    # A reading made only of "?" is treated as no reading at all.
    $unihan_pinyin = "" if $unihan_pinyin =~ /^\?+$/;

    # Record a conflict when cedict and the per-character reading disagree.
    if (($#characters >= 1) && $cedict_pinyin && $unihan_pinyin && ($unihan_pinyin ne $cedict_pinyin)) {
        my $log = "Gloss($s): $gloss\nCEdict($s): $cedicts_pinyin\nUnihan($s): $unihan_pinyin\n";
        foreach my $type (("kHanyuPinlu", "kXHC1983", "kHanyuPinyin")) {
            my $log_line = "$type($s): ";
            foreach my $c (@characters) {
                my $pinyin = $ht{$type}->{$c} || "";
                if ($pinyin =~ / /) {
                    $log_line .= "($pinyin)";
                } elsif ($pinyin) {
                    $log_line .= $pinyin;
                } else {
                    $log_line .= "?";
                }
            }
            $log .= "$log_line\n";
        }
        $ht{CONFLICT}->{$s} = $log;
    }

    $result = $unihan_pinyin || $cedict_pinyin;
    $result = $cedict_pinyin if ($#characters > 0) && $cedict_pinyin;
    $ht{COMBINED}->{$s} = $result;
    $ht{PINYIN_TITLE}->{$s} = $pinyin_title;
    return $result;
}
201
+
202
# Vowel + tone digit -> UTF-8 bytes of the accented vowel (tones 1-4).
%number_to_accent_tone_ht = (
    "a1"  => "\xC4\x81", "a2"  => "\xC3\xA1", "a3"  => "\xC7\x8E", "a4"  => "\xC3\xA0",
    "e1"  => "\xC4\x93", "e2"  => "\xC3\xA9", "e3"  => "\xC4\x9B", "e4"  => "\xC3\xA8",
    "i1"  => "\xC4\xAB", "i2"  => "\xC3\xAD", "i3"  => "\xC7\x90", "i4"  => "\xC3\xAC",
    "o1"  => "\xC5\x8D", "o2"  => "\xC3\xB3", "o3"  => "\xC7\x92", "o4"  => "\xC3\xB2",
    "u1"  => "\xC5\xAB", "u2"  => "\xC3\xBA", "u3"  => "\xC7\x94", "u4"  => "\xC3\xB9",
    "u:1" => "\xC7\x96", "u:2" => "\xC7\x98", "u:3" => "\xC7\x9A", "u:4" => "\xC7\x9C",
    "\xC3\xBC1" => "\xC7\x96", "\xC3\xBC2" => "\xC7\x98", "\xC3\xBC3" => "\xC7\x9A", "\xC3\xBC4" => "\xC7\x9C"
);

# Convert numbered pinyin (e.g. "zhong1 guo2") into accented pinyin.
#   $caller  invocant (unused)
#   $s       text containing syllables followed by a tone digit 1-5
# Tone 5 simply drops the digit.  Otherwise the accent goes on "a" or "e"
# if present, else on the "o" of "ou", else on the last of i/o/u/u:/u-umlaut
# that is followed by no further vowel.  Syllables with no such letter keep
# their digit.  Any remaining "u:" becomes u-umlaut.
sub number_to_accent_tone {
    my ($caller, $s) = @_;

    my $out = "";
    while ($s =~ /^(.*?)((?:[a-z]|u:|\xC3\xBC)+)([1-5])(.*)$/i) {
        my ($lead, $syllable, $tone, $tail) = ($1, $2, $3, $4);
        my @split;
        if ($tone eq "5") {
            $out .= "$lead$syllable";
        } elsif ((@split = ($syllable =~ /^(.*)([ae])(.*)$/))
              || (@split = ($syllable =~ /^(.*)(o)(u.*)$/))
              || (@split = ($syllable =~ /^(.*)(u:|[iou]|\xC3\xBC)([^aeiou]*)$/))) {
            my ($before, $vowel, $after) = @split;
            my $marked = $number_to_accent_tone_ht{($vowel . $tone)} || ($vowel . $tone);
            $out .= "$lead$before" . $marked . $after;
        } else {
            $out .= "$lead$syllable$tone";
        }
        $s = $tail;
    }
    $out .= $s;
    $out =~ s/u:/\xC3\xBC/g;
    return $out;
}
232
+
233
# True when $text contains a byte sequence beginning like one of the
# UTF-8 lead patterns below: a lead byte E4-E9, E3 followed by 90-BF, or
# F0 followed by A0-AC.  Only these leading bytes are inspected.
sub string_contains_utf8_cjk_unified_ideograph_p {
    my (undef, $text) = @_;

    return ($text =~ /([\xE4-\xE9]|\xE3[\x90-\xBF]|\xF0[\xA0-\xAC])/);
}
238
+
239
+ 1;
uroman/lib/NLP/English.pm ADDED
The diff for this file is too large to render. See raw diff
 
uroman/lib/NLP/Romanizer.pm ADDED
@@ -0,0 +1,2020 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # Romanizer #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::Romanizer;
8
+
9
+ use NLP::Chinese;
10
+ use NLP::UTF8;
11
+ use NLP::utilities;
12
+ use JSON;
13
+ $utf8 = NLP::UTF8;
14
+ $util = NLP::utilities;
15
+ $chinesePM = NLP::Chinese;
16
+
17
+ my $verbosePM = 0;
18
+ %empty_ht = ();
19
+
20
+ my $braille_capital_letter_indicator = "\xE2\xA0\xA0";
21
+ my $braille_number_indicator = "\xE2\xA0\xBC";
22
+ my $braille_decimal_point = "\xE2\xA0\xA8";
23
+ my $braille_comma = "\xE2\xA0\x82";
24
+ my $braille_solidus = "\xE2\xA0\x8C";
25
+ my $braille_numeric_space = "\xE2\xA0\x90";
26
+ my $braille_letter_indicator = "\xE2\xA0\xB0";
27
+ my $braille_period = "\xE2\xA0\xB2";
28
+
29
# Constructor: returns a new empty hash-based object blessed into the
# invocant's class.  Works as both a class and an instance method.
sub new {
    my ($invocant) = @_;

    return bless({}, ref($invocant) || $invocant);
}
37
+
38
# Load a semicolon-separated Unicode data file (see path hint below)
# into the caller's hash, passed as typeglob *ht.  For every line it fills:
#   $ht{UTF_TO_CHAR_NAME}, $ht{UTF_NAME_TO_UNICODE}, $ht{UTF_NAME_TO_CODE},
#   $ht{UTF_TO_CAT}, and $ht{UTF_TO_NUMERIC} (only for a non-empty
#   numeric-value field).
# The number of lines processed is left in the package variable $n.
# An unopenable file is reported on STDERR.
sub load_unicode_data {
    local($this, *ht, $filename) = @_;
    # ../../data/UnicodeData.txt

    $n = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode and no global filehandle is shared; reading
    # into a lexical also leaves the caller's $_ untouched.
    if (open(my $in, "<", $filename)) {
        while (my $line = <$in>) {
            # Field names kept in full to document the column layout;
            # only a few of them are used below.
            my ($unicode_value, $char_name, $general_category, $canon_comb_classes,
                $bidir_category, $char_decomp_mapping, $decimal_digit_value, $digit_value,
                $numeric_value, $mirrored, $unicode_1_0_name, $comment_field,
                $uc_mapping, $lc_mapping, $title_case_mapping) = split(";", $line);
            next unless defined($unicode_value);   # split produced no fields

            my $utf8_code = $utf8->unicode_hex_string2string($unicode_value);
            $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name;
            $ht{UTF_NAME_TO_UNICODE}->{$char_name} = $unicode_value;
            $ht{UTF_NAME_TO_CODE}->{$char_name} = $utf8_code;
            $ht{UTF_TO_CAT}->{$utf8_code} = $general_category;
            $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric_value unless $numeric_value eq "";
            $n++;
        }
        close($in);
        # print STDERR "Loaded $n entries from $filename\n";
    } else {
        print STDERR "Can't open $filename\n";
    }
}
61
+
62
# Load per-character overrides from a "::slot value" style file into the
# caller's hash, passed as typeglob *ht.  Lines starting with "#" are
# skipped.  For a line with a true "u" slot, the following tables are
# filled for the corresponding character when the slot has a value:
#   r -> UTF_TO_CHAR_ROMANIZATION    num -> UTF_TO_NUMERIC
#   pic -> UTF_TO_PICTURE_DESCR      syllable-info -> UTF_TO_SYLLABLE_INFO
#   tone-mark -> UTF_TO_TONE_MARK    name -> UTF_TO_CHAR_NAME
# The package variable $n counts lines that set at least one of the first
# five.  An unopenable file is reported on STDERR.
sub load_unicode_overwrite_romanization {
    local($this, *ht, $filename) = @_;
    # ../../data/UnicodeDataOverwrite.txt

    $n = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode and no global filehandle is shared; reading
    # into a lexical also leaves the caller's $_ untouched.
    if (open(my $in, "<", $filename)) {
        while (my $line = <$in>) {
            next if $line =~ /^#/;
            my $unicode_value = $util->slot_value_in_double_colon_del_list($line, "u");
            my $romanization  = $util->slot_value_in_double_colon_del_list($line, "r");
            my $numeric       = $util->slot_value_in_double_colon_del_list($line, "num");
            my $picture       = $util->slot_value_in_double_colon_del_list($line, "pic");
            my $syllable_info = $util->slot_value_in_double_colon_del_list($line, "syllable-info");
            my $tone_mark     = $util->slot_value_in_double_colon_del_list($line, "tone-mark");
            my $char_name     = $util->slot_value_in_double_colon_del_list($line, "name");
            my $entry_processed_p = 0;
            my $utf8_code = $utf8->unicode_hex_string2string($unicode_value);
            if ($unicode_value) {
                $ht{UTF_TO_CHAR_ROMANIZATION}->{$utf8_code} = $romanization if $romanization;
                # numeric may legitimately be "0", hence the defined/ne test
                $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric if defined($numeric) && ($numeric ne "");
                $ht{UTF_TO_PICTURE_DESCR}->{$utf8_code} = $picture if $picture;
                $ht{UTF_TO_SYLLABLE_INFO}->{$utf8_code} = $syllable_info if $syllable_info;
                $ht{UTF_TO_TONE_MARK}->{$utf8_code} = $tone_mark if $tone_mark;
                $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name if $char_name;
                $entry_processed_p = 1 if $romanization || $numeric || $picture || $syllable_info || $tone_mark;
            }
            $n++ if $entry_processed_p;
        }
        close($in);
    } else {
        print STDERR "Can't open $filename\n";
    }
}
95
+
96
# Load script descriptions from a "::slot value" style file into the
# caller's hash, passed as typeglob *ht.  Lines without a "script-name"
# slot are skipped.  Tables filled per script:
#   SCRIPT_P, SCRIPT_NORM (upper-cased name and every alt-script-name,
#   as given and upper-cased), DIRECTION, SCRIPT_LANGUAGE / LANGUAGE_SCRIPT,
#   SCRIPT_ABUDIGA_DEFAULT_VOWEL (key spelling kept as is), SCRIPT_FONT.
# List-valued slots are split on commas.  The package variable $n counts
# the scripts read.  An unopenable file is reported on STDERR.
sub load_script_data {
    local($this, *ht, $filename) = @_;
    # ../../data/Scripts.txt

    $n = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode and no global filehandle is shared; reading
    # into a lexical also leaves the caller's $_ untouched.
    if (open(my $in, "<", $filename)) {
        while (my $line = <$in>) {
            my $script_name = $util->slot_value_in_double_colon_del_list($line, "script-name");
            next unless $script_name;
            my $abugida_default_vowel_s = $util->slot_value_in_double_colon_del_list($line, "abugida-default-vowel");
            my $alt_script_name_s = $util->slot_value_in_double_colon_del_list($line, "alt-script-name");
            my $language_s = $util->slot_value_in_double_colon_del_list($line, "language");
            my $direction = $util->slot_value_in_double_colon_del_list($line, "direction"); # right-to-left
            my $font_family_s = $util->slot_value_in_double_colon_del_list($line, "font-family");

            $ht{SCRIPT_P}->{$script_name} = 1;
            $ht{SCRIPT_NORM}->{(uc $script_name)} = $script_name;
            $ht{DIRECTION}->{$script_name} = $direction if $direction;
            foreach my $language (split(/,\s*/, $language_s)) {
                $ht{SCRIPT_LANGUAGE}->{$script_name}->{$language} = 1;
                $ht{LANGUAGE_SCRIPT}->{$language}->{$script_name} = 1;
            }
            foreach my $alt_script_name (split(/,\s*/, $alt_script_name_s)) {
                $ht{SCRIPT_NORM}->{$alt_script_name} = $script_name;
                $ht{SCRIPT_NORM}->{(uc $alt_script_name)} = $script_name;
            }
            foreach my $abugida_default_vowel (split(/,\s*/, $abugida_default_vowel_s)) {
                $ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$script_name}->{$abugida_default_vowel} = 1 if $abugida_default_vowel;
            }
            foreach my $font_family (split(/,\s*/, $font_family_s)) {
                $ht{SCRIPT_FONT}->{$script_name}->{$font_family} = 1 if $font_family;
            }
            $n++;
        }
        close($in);
        # print STDERR "Loaded $n entries from $filename\n";
    } else {
        print STDERR "Can't open $filename\n";
    }
}
134
+
135
# Romanize the precomposed Hangul syllables (U+AC00..U+D7A3) in $s.
#   $this            invocant (unused)
#   $s               UTF-8 text
#   $pass_through_p  when true, characters outside that range are copied
#                    to the output; otherwise they are dropped (default)
# Each syllable's offset from U+AC00 is split into lead, vowel and tail
# indices (19 x 21 x 28 combinations); "-" marks an empty slot and is
# removed from the result.
sub unicode_hangul_romanization {
    local($this, $s, $pass_through_p) = @_;

    $pass_through_p = 0 unless defined($pass_through_p);
    my @lead_roms  = qw(g gg n d dd r m b bb s ss - j jj c k t p h);
    my @vowel_roms = qw(a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i);
    my @tail_roms  = qw(- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h);

    my $romanized = "";
    foreach my $char ($utf8->split_into_utf8_characters($s, "return only chars", *empty_ht)) {
        my $code_point = $utf8->utf8_to_unicode($char);
        if (($code_point < 0xAC00) || ($code_point > 0xD7A3)) {
            # not a precomposed Hangul syllable
            $romanized .= $char if $pass_through_p;
            next;
        }
        my $offset = $code_point - 0xAC00;
        my $rom = $lead_roms[int($offset / (28*21))]
                . $vowel_roms[int($offset/28) % 21]
                . $tail_roms[$offset % 28];
        $rom =~ s/-//g;
        $romanized .= $rom;
    }
    return $romanized;
}
161
+
162
# Split a comma-separated string into a list of elements.
#   $this  invocant (unused)
#   $s     text such as: a, "b, c", 'd'
# Elements may be double- or single-quoted (with backslash-escaped quotes
# inside) so they can contain commas; each element is passed through
# $util->dequote_string.  Returns an empty list for blank input.
sub listify_comma_sep_string {
    local($this, $s) = @_;

    my @items = ();
    return @items unless $s =~ /\S/;
    $s = $util->trim2($s);

    # Peel off one "element," prefix at a time.
    while ($s =~ /^("(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^"', ]+),\s*(.*)$/) {
        my ($head, $tail) = ($1, $2);
        push(@items, $util->dequote_string($head));
        $s = $tail;
    }
    # Whatever is left is the final element.
    push(@items, $util->dequote_string($s)) if $s =~ /\S/;

    return @items;
}
178
+
179
# True when $text is non-empty and consists solely of three-byte
# sequences E2, A0-A3, 80-BF.
sub braille_string_p {
    my (undef, $text) = @_;

    return ($text =~ /^(\xE2[\xA0-\xA3][\x80-\xBF])+$/);
}
184
+
185
# Record word-position restrictions for one source -> target mapping in
# the caller's hash (typeglob *ht).  For every true flag the entry
#   $ht{TABLE}->{source}->{target} = 1
# is set, or, when $lang_code is true,
#   $ht{TABLE_LANG_SPEC}->{lang_code}->{source}->{target} = 1
# with TABLE being the name paired with that flag below.
sub register_word_boundary_info {
    local($this, *ht, $lang_code, $utf8_source_string, $utf8_target_string, $use_only_for_whole_word_p,
          $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
          $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p) = @_;

    my @restrictions = (
        [$use_only_for_whole_word_p,   "USE_ONLY_FOR_WHOLE_WORD"],
        [$use_only_at_start_of_word_p, "USE_ONLY_AT_START_OF_WORD"],
        [$use_only_at_end_of_word_p,   "USE_ONLY_AT_END_OF_WORD"],
        [$dont_use_at_start_of_word_p, "DONT_USE_AT_START_OF_WORD"],
        [$dont_use_at_end_of_word_p,   "DONT_USE_AT_END_OF_WORD"],
    );

    foreach my $restriction (@restrictions) {
        my ($enabled, $table) = @$restriction;
        next unless $enabled;
        if ($lang_code) {
            $ht{$table . "_LANG_SPEC"}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
        } else {
            $ht{$table}->{$utf8_source_string}->{$utf8_target_string} = 1;
        }
    }
}
226
+
227
# Load a romanization table (see path hint below) into the caller's hash,
# passed as typeglob *ht.  Lines starting with "#" are skipped.
#
# "::preserve" lines carry "from" (and optionally "to") code points written
# as U+XXXX or \uXXXX; every code point in that range is mapped to itself
# in $ht{UTF_CHAR_MAPPING}.
#
# All other lines are "::slot value" entries.  Slots read: s (source),
# t (target), t-alt (comma-separated alternatives), num, annotation,
# lcode (language code), p (stored as the mapping's value, default 1),
# plus the flag slots matched by the regexes below.  Entries with a true
# lcode go into the corresponding *_LANG_SPEC tables.
#
# The package variables $n (entries read) and $line_number are updated.
# An unopenable file is reported on STDERR.
sub load_romanization_table {
    local($this, *ht, $filename) = @_;
    # ../../data/romanization-table.txt

    $n = 0;
    $line_number = 0;
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode and no global filehandle is shared; reading
    # into a lexical also leaves the caller's $_ untouched.
    if (open(my $in, "<", $filename)) {
        while (my $line = <$in>) {
            $line_number++;
            next if $line =~ /^#/;

            if ($line =~ /^::preserve\s/) {
                my $from_unicode = $util->slot_value_in_double_colon_del_list($line, "from");
                my $to_unicode = $util->slot_value_in_double_colon_del_list($line, "to");
                my ($from_code_point, $to_code_point);
                if ($from_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
                    $from_unicode =~ s/^(?:U\+|\\u)//;
                    $from_code_point = hex($from_unicode);
                } else {
                    $from_code_point = "";
                }
                if ($to_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
                    $to_unicode =~ s/^(?:U\+|\\u)//;
                    $to_code_point = hex($to_unicode);
                } else {
                    # no usable "to": the range is the single "from" code point
                    $to_code_point = $from_code_point;
                }
                if ($from_code_point ne "") {
                    # print STDERR "Preserve code-points $from_unicode--$to_unicode = $from_code_point--$to_code_point\n";
                    foreach my $code_point (($from_code_point .. $to_code_point)) {
                        my $utf8_string = $utf8->unicode2string($code_point);
                        $ht{UTF_CHAR_MAPPING}->{$utf8_string}->{$utf8_string} = 1;
                    }
                    $n++;
                }
                next;
            }

            my $utf8_source_string = $util->slot_value_in_double_colon_del_list($line, "s");
            my $utf8_target_string = $util->slot_value_in_double_colon_del_list($line, "t");
            my $utf8_alt_target_string_s = $util->slot_value_in_double_colon_del_list($line, "t-alt");
            my $use_alt_in_pointed_p = ($line =~ /::use-alt-in-pointed\b/);
            my $use_only_for_whole_word_p = ($line =~ /::use-only-for-whole-word\b/);
            my $use_only_at_start_of_word_p = ($line =~ /::use-only-at-start-of-word\b/);
            my $use_only_at_end_of_word_p = ($line =~ /::use-only-at-end-of-word\b/);
            my $dont_use_at_start_of_word_p = ($line =~ /::dont-use-at-start-of-word\b/);
            my $dont_use_at_end_of_word_p = ($line =~ /::dont-use-at-end-of-word\b/);
            my $use_only_in_lower_case_enviroment_p = ($line =~ /::use-only-in-lower-case-enviroment\b/);
            my $word_external_punctuation_p = ($line =~ /::word-external-punctuation\b/);
            $utf8_source_string =~ s/\s*$//;
            $utf8_target_string =~ s/\s*$//;
            $utf8_alt_target_string_s =~ s/\s*$//;
            # strip one pair of enclosing quotes from the target
            $utf8_target_string =~ s/^"(.*)"$/$1/;
            $utf8_target_string =~ s/^'(.*)'$/$1/;
            my @utf8_alt_targets = $this->listify_comma_sep_string($utf8_alt_target_string_s);
            my $numeric = $util->slot_value_in_double_colon_del_list($line, "num");
            $numeric =~ s/\s*$//;
            my $annotation = $util->slot_value_in_double_colon_del_list($line, "annotation");
            $annotation =~ s/\s*$//;
            my $lang_code = $util->slot_value_in_double_colon_del_list($line, "lcode");
            my $prob = $util->slot_value_in_double_colon_del_list($line, "p") || 1;

            # Entries that only carry a numeric value (empty target) do not
            # create a character mapping.
            unless (($utf8_target_string eq "") && ($numeric =~ /\d/)) {
                if ($lang_code) {
                    $ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                } else {
                    $ht{UTF_CHAR_MAPPING}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                }
                if ($word_external_punctuation_p) {
                    if ($lang_code) {
                        $ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                    } else {
                        $ht{WORD_EXTERNAL_PUNCTUATION}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                    }
                }
                if ($this->braille_string_p($utf8_source_string)) {
                    # For a Braille source with an all-lower-case target, also
                    # register the capital-indicator-prefixed form with a
                    # capitalised target.
                    if (($utf8_target_string =~ /^[a-z]+$/)
                     && (! ($utf8_source_string =~ /^$braille_capital_letter_indicator/))) {
                        my $uc_utf8_source_string = "$braille_capital_letter_indicator$utf8_source_string";
                        my $uc_utf8_target_string = ucfirst $utf8_target_string;
                        if ($lang_code) {
                            $ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
                        } else {
                            $ht{UTF_CHAR_MAPPING}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
                        }
                        $this->register_word_boundary_info(*ht, $lang_code, $uc_utf8_source_string, $uc_utf8_target_string,
                            $use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
                            $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);
                    }
                    # Number indicator + cell -> single digit: remember the
                    # bare cell's digit.
                    if (($utf8_target_string =~ /^[0-9]$/)
                     && ($utf8_source_string =~ /^$braille_number_indicator./)) {
                        my $core_number_char = $utf8_source_string;
                        $core_number_char =~ s/$braille_number_indicator//;
                        $ht{BRAILLE_TO_DIGIT}->{$core_number_char} = $utf8_target_string;
                    }
                }
            }
            if ($use_only_in_lower_case_enviroment_p) {
                if ($lang_code) {
                    $ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
                } else {
                    $ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT}->{$utf8_source_string}->{$utf8_target_string} = 1;
                }
            }
            $this->register_word_boundary_info(*ht, $lang_code, $utf8_source_string, $utf8_target_string,
                $use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
                $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);

            # Alternative targets get their own mapping and restriction tables.
            foreach my $utf8_alt_target (@utf8_alt_targets) {
                if ($lang_code) {
                    $ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
                    $ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
                } else {
                    $ht{UTF_CHAR_ALT_MAPPING}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
                    $ht{USE_ALT_IN_POINTED}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
                }
                if ($use_only_for_whole_word_p) {
                    if ($lang_code) {
                        $ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    } else {
                        $ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    }
                }
                if ($use_only_at_start_of_word_p) {
                    if ($lang_code) {
                        $ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    } else {
                        $ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    }
                }
                if ($use_only_at_end_of_word_p) {
                    if ($lang_code) {
                        $ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    } else {
                        $ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    }
                }
            }
            if ($numeric =~ /\d/) {
                $ht{UTF_TO_NUMERIC}->{$utf8_source_string} = $numeric;
            }
            if ($annotation =~ /\S/) {
                $ht{UTF_ANNOTATION}->{$utf8_source_string} = $annotation;
            }
            $n++;
        }
        close($in);
        # print STDERR "Loaded $n entries from $filename\n";
    } else {
        print STDERR "Can't open $filename\n";
    }
}
374
+
375
# Map a Unicode character name (e.g. "ARABIC LETTER ALEF") to a script name.
#
# Arguments:
#   $this       invocant (not used in the body)
#   $char_name  character name to classify
#   *ht         typeglob of the resource hash; reads $ht{SCRIPT_NORM},
#               reads and writes $ht{CHAR_NAME_TO_SCRIPT}
# Returns the $ht{SCRIPT_NORM} entry of the longest matching leading part of
# the name, or "" when no leading part is registered.
sub char_name_to_script {
    local($this, $char_name, *ht) = @_;

    # Only a true cache entry short-circuits; an empty ("no script") entry is recomputed.
    return $cached_result if $cached_result = $ht{CHAR_NAME_TO_SCRIPT}->{$char_name};
    # $char_name is truncated below, so keep the name the caller asked about:
    # it is the key under which the result must be memoized.
    $orig_char_name = $char_name;
    # Drop the first class word (LETTER, SIGN, ...) and everything after it.
    $char_name =~ s/\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL)\b.*$//;
    my $script_name;
    while ($char_name) {
        last if $script_name = $ht{SCRIPT_NORM}->{(uc $char_name)};
        # No script registered under this prefix: strip the last word and retry.
        $char_name =~ s/\s*\S+\s*$//;
    }
    $script_name = "" unless defined($script_name);
    # Fix: the entry used to be stored under the truncated $char_name, which
    # the lookup at the top of this sub (keyed by the full name) would not find.
    $ht{CHAR_NAME_TO_SCRIPT}->{$orig_char_name} = $script_name;
    return $script_name;
}
390
+
391
# Tell whether a Unicode character name contains LETTER or one of the
# mark/sign phrases listed in the pattern below (vowel signs, virama-type
# signs, nukta, Hebrew points, ...).
#
# Arguments: $this (invocant, unused), $char_name.
# Returns 1 or 0; the verdict is also stored in $ht{CHAR_NAME_LETTER_PLUS}.
# NOTE(review): %ht is not a parameter of this sub; it uses whichever %ht is
# visible when called -- presumably the one aliased by the caller's
# local(*ht). Confirm against callers.
sub letter_plus_char_p {
    local($this, $char_name) = @_;

    # A stored 1 is returned directly; a stored 0 is false and gets recomputed.
    $cached_result = $ht{CHAR_NAME_LETTER_PLUS}->{$char_name};
    return $cached_result if $cached_result;

    my $verdict = 0;
    if ($char_name =~ /\b(?:LETTER|VOWEL SIGN|AU LENGTH MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN AL-LAKUNA|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN NUKTA|SIGN DOT BELOW|HEBREW POINT)\b/) {
        $verdict = 1;
    }
    $ht{CHAR_NAME_LETTER_PLUS}->{$char_name} = $verdict;
    return $verdict;
}
399
+
400
# Tell whether a Unicode character name matches one of the attached-mark
# phrases listed in the pattern below (subjoined letters, vowel signs,
# virama-type signs, Hebrew points/geresh, Arabic vowel marks, ...).
#
# Arguments: $this (invocant, unused), $char_name.
# Returns 1 or 0; the verdict is also stored in $ht{CHAR_NAME_SUBJOINED}.
# NOTE(review): %ht is not a parameter of this sub; it uses whichever %ht is
# visible when called -- presumably the one aliased by the caller's
# local(*ht). Confirm against callers.
sub subjoined_char_p {
    local($this, $char_name) = @_;

    # A stored 1 is returned directly; a stored 0 is false and gets recomputed.
    $cached_result = $ht{CHAR_NAME_SUBJOINED}->{$char_name};
    return $cached_result if $cached_result;

    my $verdict = 0;
    if ($char_name =~ /\b(?:SUBJOINED LETTER|VOWEL SIGN|AU LENGTH MARK|EMPHASIS MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN DOT BELOW|HEBREW (POINT|PUNCTUATION GERESH)|ARABIC (?:DAMMA|DAMMATAN|FATHA|FATHATAN|HAMZA|KASRA|KASRATAN|MADDAH|SHADDA|SUKUN))\b/) {
        $verdict = 1;
    }
    $ht{CHAR_NAME_SUBJOINED}->{$char_name} = $verdict;
    return $verdict;
}
408
+
409
# Hand out the next node id of a chart.
#
# Arguments: $this (invocant, unused), *chart_ht (typeglob of the chart hash).
# Increments $chart_ht{N_NODES} and returns the incremented value.
sub new_node_id {
    local($this, *chart_ht) = @_;

    $chart_ht{N_NODES}++;
    return $chart_ht{N_NODES};
}
417
+
418
# Add a node to the chart for the span $start..$end.
#
# Arguments:
#   $this      invocant; supplies new_node_id
#   $s         romanization string stored for the node
#   $start     start position of the span
#   $end       end position of the span
#   *chart_ht  typeglob of the chart hash
#   $type      node type, stored as given (callers visible in this file pass "" or "alt")
#   $comment   free-text label stored with the node
# Returns the id of the new node.
sub add_node {
    local($this, $s, $start, $end, *chart_ht, $type, $comment) = @_;

    my $node_id = $this->new_node_id(*chart_ht);
    # print STDERR "add_node($node_id, $start-$end): $s [$comment]\n" if $comment =~ /number/;
    # print STDERR "add_node($node_id, $start-$end): $s [$comment]\n" if ($start >= 0) && ($start < 50);

    # what the node carries
    $chart_ht{NODE_ROMAN}->{$node_id} = $s;
    $chart_ht{NODE_TYPE}->{$node_id} = $type;
    $chart_ht{NODE_COMMENT}->{$node_id} = $comment;

    # where the node sits
    $chart_ht{NODE_START}->{$node_id} = $start;
    $chart_ht{NODE_END}->{$node_id} = $end;

    # indexes for finding nodes by position
    $chart_ht{NODES_STARTING_AT}->{$start}->{$node_id} = 1;
    $chart_ht{NODES_ENDING_AT}->{$end}->{$node_id} = 1;
    $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}->{$node_id} = 1;

    return $node_id;
}
434
+
435
# Find a node that covers exactly the span $start..$end.
#
# Arguments: $this (invocant, unused), $start, $end, *chart_ht (typeglob of
# the chart hash).
# Returns the numerically smallest node id registered for that span in
# $chart_ht{NODES_STARTING_AND_ENDING_AT}, or "" if there is none.
sub get_node_for_span {
    local($this, $start, $end, *chart_ht) = @_;

    my $nodes_on_span = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    return "" unless defined($nodes_on_span);

    my ($lowest_id) = sort { $a <=> $b } keys %{$nodes_on_span};
    return defined($lowest_id) ? $lowest_id : "";
}
443
+
444
# Find a node of a given type that covers exactly the span $start..$end.
#
# Arguments: $this (invocant, unused), $start, $end, *chart_ht (typeglob of
# the chart hash), $type (compared with eq against $chart_ht{NODE_TYPE}).
# Returns the numerically smallest matching node id, or "" if none matches.
sub get_node_for_span_and_type {
    local($this, $start, $end, *chart_ht, $type) = @_;

    my $nodes_on_span = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    return "" unless defined($nodes_on_span);

    my ($first_match) = grep { $chart_ht{NODE_TYPE}->{$_} eq $type }
                        sort { $a <=> $b } keys %{$nodes_on_span};
    return defined($first_match) ? $first_match : "";
}
455
+
456
# Return the romanization stored for a node.
#
# Arguments: $this (invocant, unused), $node_id, *chart_id (typeglob, see
# note), $default (optional).
# Returns $chart_ht{NODE_ROMAN}->{$node_id} when defined, otherwise $default,
# otherwise "".
# NOTE(review): the typeglob argument is bound to *chart_id, but the body
# reads %chart_ht, i.e. whichever %chart_ht is visible at call time rather
# than the hash that was passed. Callers in this file pass *chart_id, so
# renaming the parameter alone would change behavior; any cleanup has to be
# coordinated with the callers.
sub get_node_roman {
    local($this, $node_id, *chart_id, $default) = @_;

    my $stored = $chart_ht{NODE_ROMAN}->{$node_id};
    return $stored if defined($stored);
    return (defined($default)) ? $default : "";
}
463
+
464
# Store a slot value on a node.
#
# Arguments: $this (invocant, unused), $node_id, $slot, $value, *chart_id
# (typeglob, see note).
# Sets $chart_ht{NODE_SLOT}->{$node_id}->{$slot} and returns the result of
# that assignment.
# NOTE(review): the typeglob argument is bound to *chart_id, but the body
# writes to %chart_ht, i.e. whichever %chart_ht is visible at call time
# rather than the hash that was passed. Any cleanup has to be coordinated
# with the callers.
sub set_node_id_slot_value {
    local($this, $node_id, $slot, $value, *chart_id) = @_;

    return $chart_ht{NODE_SLOT}{$node_id}{$slot} = $value;
}
469
+
470
# Copy slot values from one node to another.
#
# Arguments:
#   $this          invocant (unused)
#   $old_node_id   node to copy from
#   $new_node_id   node to copy to
#   *chart_id      typeglob (see note)
#   @slots         slot names to copy; if the first element is "all", every
#                  slot of the old node is copied; if empty, nothing happens
# Only defined values are copied. Membership in @slots is decided by
# $util->member.
# NOTE(review): the typeglob argument is bound to *chart_id, but the body
# works on %chart_ht, i.e. whichever %chart_ht is visible at call time rather
# than the hash that was passed. Any cleanup has to be coordinated with the
# callers.
sub copy_slot_values {
    local($this, $old_node_id, $new_node_id, *chart_id, @slots) = @_;

    if (@slots) {
        my $copy_everything = ($slots[0] eq "all");
        foreach my $slot_name (keys %{$chart_ht{NODE_SLOT}->{$old_node_id}}) {
            next unless $copy_everything || $util->member($slot_name, @slots);
            my $value = $chart_ht{NODE_SLOT}->{$old_node_id}->{$slot_name};
            $chart_ht{NODE_SLOT}->{$new_node_id}->{$slot_name} = $value if defined($value);
        }
    }
}
482
+
483
# Read a slot value of a node.
#
# Arguments: $this (invocant, unused), $node_id, $slot, *chart_id (typeglob,
# see note), $default (optional).
# Returns $chart_ht{NODE_SLOT}->{$node_id}->{$slot} when defined, otherwise
# $default, otherwise "".
# NOTE(review): the typeglob argument is bound to *chart_id, but the body
# reads %chart_ht, i.e. whichever %chart_ht is visible at call time rather
# than the hash that was passed. Any cleanup has to be coordinated with the
# callers.
sub get_node_id_slot_value {
    local($this, $node_id, $slot, *chart_id, $default) = @_;

    my $stored = $chart_ht{NODE_SLOT}->{$node_id}->{$slot};
    return $stored if defined($stored);
    return (defined($default)) ? $default : "";
}
490
+
491
# Look up a slot VALUE among the nodes that cover exactly the span
# $start..$end (despite the name, a value is returned, not a node id).
#
# Arguments: $this (invocant, unused), $start, $end, $slot, *chart_id
# (typeglob, see note), $default (optional, "" if omitted).
# Nodes are tried in ascending numeric id order; the first defined value of
# $slot is returned. Returns $default if the span has no nodes or none of
# them defines the slot.
# NOTE(review): the typeglob argument is bound to *chart_id, but the body
# reads %chart_ht, i.e. whichever %chart_ht is visible at call time rather
# than the hash that was passed. Any cleanup has to be coordinated with the
# callers.
sub get_node_for_span_with_slot_value {
    local($this, $start, $end, $slot, *chart_id, $default) = @_;

    $default = "" unless defined($default);
    my $nodes_on_span = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    return $default unless defined($nodes_on_span);

    foreach my $candidate_id (sort { $a <=> $b } keys %{$nodes_on_span}) {
        my $found = $chart_ht{NODE_SLOT}->{$candidate_id}->{$slot};
        next unless defined($found);
        return $found;
    }
    return $default;
}
503
+
504
# Find a node covering exactly the span $start..$end that has a given slot.
#
# Arguments: $this (invocant, unused), $start, $end, $slot, *chart_id
# (typeglob, see note), $default (optional, "" if omitted).
# Nodes are tried in ascending numeric id order; the id of the first node
# with a defined value for $slot is returned. Returns $default if the span
# has no nodes or none of them defines the slot.
# NOTE(review): the typeglob argument is bound to *chart_id, but the body
# reads %chart_ht, i.e. whichever %chart_ht is visible at call time rather
# than the hash that was passed. Any cleanup has to be coordinated with the
# callers.
sub get_node_for_span_with_slot {
    local($this, $start, $end, $slot, *chart_id, $default) = @_;

    $default = "" unless defined($default);
    my $nodes_on_span = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    return $default unless defined($nodes_on_span);

    foreach my $candidate_id (sort { $a <=> $b } keys %{$nodes_on_span}) {
        my $found = $chart_ht{NODE_SLOT}->{$candidate_id}->{$slot};
        next unless defined($found);
        return $candidate_id;
    }
    return $default;
}
516
+
517
# Record a two-part number segment $start..$mid..$end in the chart and chain
# it onto an already registered segment that ends at $mid.
#   e.g. 4 10 (= 40); 20 5 (= 25)
#   might become part of larger complex number span, e.g. 4 1000 3 100 20 1
#
# Arguments: $this (invocant, unused), $start, $mid, $end, *chart_id
# (typeglob, see note), $line_number (only used by the commented-out debug
# print).
# Effect: if COMPLEX_NUMERIC_END_START has a defined entry for $mid, that
# entry is set to undef and the span recorded runs from the older start to
# $end; otherwise the span recorded is $start..$end. The span is written to
# both COMPLEX_NUMERIC_START_END and COMPLEX_NUMERIC_END_START.
# NOTE(review): the typeglob argument is bound to *chart_id, but the body
# works on %chart_ht, i.e. whichever %chart_ht is visible at call time rather
# than the hash that was passed. Any cleanup has to be coordinated with the
# callers.
sub register_new_complex_number_span_segment {
    local($this, $start, $mid, $end, *chart_id, $line_number) = @_;

    # print STDERR "register_new_complex_number_span_segment $start-$mid-$end\n" if $line_number == 43;
    $old_start = $chart_ht{COMPLEX_NUMERIC_END_START}->{$mid};
    my $span_start = $start;
    if (defined($old_start)) {
        # An earlier span ends where this segment continues: extend that span.
        $chart_ht{COMPLEX_NUMERIC_END_START}->{$mid} = undef;
        $span_start = $old_start;
    }
    $chart_ht{COMPLEX_NUMERIC_START_END}->{$span_start} = $end;
    $chart_ht{COMPLEX_NUMERIC_END_START}->{$end} = $span_start;
}
532
+
533
# Romanize a line token by token, reusing earlier results.
#
# Arguments: $this (invocant; supplies romanize), $s (text), $lang_code,
# $output_style, *ht (typeglob of the resource hash, also holds the cache),
# *pinyin_ht, $initial_char_offset, $control, $line_number -- all passed
# through unchanged to romanize.
#
# Behavior:
#   - If $control asks for "return chart" or "return offset mappings"
#     (case-insensitive), the whole string goes to romanize and its result is
#     returned as is; no tokenization or caching takes place.
#   - Otherwise one trailing newline is removed, the text is split into
#     whitespace-separated tokens, and whitespace is copied to the output
#     unchanged. Tokens consisting only of ASCII are copied unchanged; other
#     tokens are romanized via romanize and memoized in
#     $ht{CACHED_ROMANIZATION}->{$lang_code}->{$token}.
# Returns the reassembled string.
# NOTE(review): the cache key is ($lang_code, token) only; $output_style and
# $control are not part of it. Confirm that callers do not vary them while
# sharing one %ht.
sub romanize_by_token_with_caching {
    local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number) = @_;

    $control = "" unless defined($control);
    my $return_chart_p = ($control =~ /return chart/i);
    my $return_offset_mappings_p = ($control =~ /return offset mappings/i);
    if ($return_chart_p || $return_offset_mappings_p) {
        return $this->romanize($s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
    }

    # Split into parallel lists: $separators[$k] precedes $tokens[$k];
    # whatever is left over at the end becomes one extra, final separator.
    my @separators = ();
    my @tokens = ();
    $s =~ s/\n$//; # Added May 2, 2019 as bug-fix (duplicate empty lines)
    while (($sep, $token, $rest) = ($s =~ /^(\s*)(\S+)(.*)$/)) {
        push(@separators, $sep);
        push(@tokens, $token);
        $s = $rest;
    }
    push(@separators, $s);

    my $result = "";
    foreach my $k (0 .. $#tokens) {
        my $current_token = $tokens[$k];
        $result .= $separators[$k];
        if ($current_token =~ /^[\x00-\x7F]*$/) { # all ASCII
            $result .= $current_token;
            next;
        }
        my $rom_token = $ht{CACHED_ROMANIZATION}->{$lang_code}->{$current_token};
        unless (defined($rom_token)) {
            $rom_token = $this->romanize($current_token, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
            $ht{CACHED_ROMANIZATION}->{$lang_code}->{$current_token} = $rom_token if defined($rom_token);
        }
        $result .= $rom_token;
    }
    my $trailing_sep = $separators[$#tokens + 1];
    $result .= $trailing_sep if defined($trailing_sep);

    return $result;
}
571
+
572
+ sub romanize {
573
+ local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number, $initial_rom_char_offset) = @_;
574
+
575
+ my $orig_lang_code = $lang_code;
576
+ # Check whether the text (to be romanized) starts with a language code directive.
577
+ if (($line_lang_code) = ($s =~ /^::lcode\s+([a-z][a-z][a-z])\s/)) {
578
+ $lang_code = $line_lang_code;
579
+ }
580
+ $initial_char_offset = 0 unless defined($initial_char_offset);
581
+ $initial_rom_char_offset = 0 unless defined($initial_rom_char_offset);
582
+ $control = "" unless defined($control);
583
+ my $return_chart_p = ($control =~ /return chart/i);
584
+ my $return_offset_mappings_p = ($control =~ /return offset mappings/i);
585
+ $line_number = "" unless defined($line_number);
586
+ my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
587
+ my $n_characters = $#chars + 1;
588
+ %chart_ht = ();
589
+ $chart_ht{N_CHARS} = $n_characters;
590
+ $chart_ht{N_NODES} = 0;
591
+ my $char = "";
592
+ my $char_name = "";
593
+ my $prev_script = "";
594
+ my $current_script = "";
595
+ my $script_start = 0;
596
+ my $script_end = 0;
597
+ my $prev_letter_plus_script = "";
598
+ my $current_letter_plus_script = "";
599
+ my $letter_plus_script_start = 0;
600
+ my $letter_plus_script_end = 0;
601
+ my $log ="";
602
+ my $n_right_to_left_chars = 0;
603
+ my $n_left_to_right_chars = 0;
604
+ my $hebrew_word_start = ""; # used to identify Hebrew words with points
605
+ my $hebrew_word_contains_point = 0;
606
+ my $current_word_start = "";
607
+ my $current_word_script = "";
608
+ my $braille_all_caps_p = 0;
609
+
610
+ # prep
611
+ foreach $i ((0 .. ($#chars + 1))) {
612
+ if ($i <= $#chars) {
613
+ $char = $chars[$i];
614
+ $chart_ht{ORIG_CHAR}->{$i} = $char;
615
+ $char_name = $ht{UTF_TO_CHAR_NAME}->{$char} || "";
616
+ $chart_ht{CHAR_NAME}->{$i} = $char_name;
617
+ $current_script = $this->char_name_to_script($char_name, *ht);
618
+ $current_script_direction = $ht{DIRECTION}->{$current_script} || '';
619
+ if ($current_script_direction eq 'right-to-left') {
620
+ $n_right_to_left_chars++;
621
+ } elsif (($char =~ /^[a-z]$/i) || ! ($char =~ /^[\x00-\x7F]$/)) {
622
+ $n_left_to_right_chars++;
623
+ }
624
+ $chart_ht{CHAR_SCRIPT}->{$i} = $current_script;
625
+ $chart_ht{SCRIPT_SEGMENT_START}->{$i} = ""; # default value, to be updated later
626
+ $chart_ht{SCRIPT_SEGMENT_END}->{$i} = ""; # default value, to be updated later
627
+ $chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = ""; # default value, to be updated later
628
+ $chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = ""; # default value, to be updated later
629
+ $subjoined_char_p = $this->subjoined_char_p($char_name);
630
+ $chart_ht{CHAR_SUBJOINED}->{$i} = $subjoined_char_p;
631
+ $letter_plus_char_p = $this->letter_plus_char_p($char_name);
632
+ $chart_ht{CHAR_LETTER_PLUS}->{$i} = $letter_plus_char_p;
633
+ $current_letter_plus_script = ($letter_plus_char_p) ? $current_script : "";
634
+ $numeric_value = $ht{UTF_TO_NUMERIC}->{$char};
635
+ $numeric_value = "" unless defined($numeric_value);
636
+ $annotation = $ht{UTF_ANNOTATION}->{$char};
637
+ $annotation = "" unless defined($annotation);
638
+ $chart_ht{CHAR_NUMERIC_VALUE}->{$i} = $numeric_value;
639
+ $chart_ht{CHAR_ANNOTATION}->{$i} = $annotation;
640
+ $syllable_info = $ht{UTF_TO_SYLLABLE_INFO}->{$char} || "";
641
+ $chart_ht{CHAR_SYLLABLE_INFO}->{$i} = $syllable_info;
642
+ $tone_mark = $ht{UTF_TO_TONE_MARK}->{$char} || "";
643
+ $chart_ht{CHAR_TONE_MARK}->{$i} = $tone_mark;
644
+ } else {
645
+ $char = "";
646
+ $char_name = "";
647
+ $current_script = "";
648
+ $current_letter_plus_script = "";
649
+ }
650
+ if ($char_name =~ /^HEBREW (LETTER|POINT|PUNCTUATION GERESH) /) {
651
+ $hebrew_word_start = $i if $hebrew_word_start eq "";
652
+ $hebrew_word_contains_point = 1 if $char_name =~ /^HEBREW POINT /;
653
+ } elsif ($hebrew_word_start ne "") {
654
+ if ($hebrew_word_contains_point) {
655
+ foreach $j (($hebrew_word_start .. ($i-1))) {
656
+ $chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$j} = 1;
657
+ }
658
+ $chart_ht{CHAR_START_OF_WORD}->{$hebrew_word_start} = 1;
659
+ $chart_ht{CHAR_END_OF_WORD}->{($i-1)} = 1;
660
+ }
661
+ $hebrew_word_start = "";
662
+ $hebrew_word_contains_point = 0;
663
+ }
664
+ my $part_of_word_p = $current_script
665
+ && ($this->letter_plus_char_p($char_name)
666
+ || $this->subjoined_char_p($char_name)
667
+ || ($char_name =~ /\b(LETTER|SYLLABLE|SYLLABICS|LIGATURE)\b/));
668
+
669
+ # Braille punctuation
670
+ my $end_offset = 0;
671
+ if ($char_name =~ /^Braille\b/i) {
672
+ if (($char =~ /^\s*$/) || ($char_name =~ /BLANK/)) {
673
+ $part_of_word_p = 0;
674
+ $braille_all_caps_p = 0;
675
+ } elsif ($chart_ht{NOT_PART_OF_WORD_P}->{$i}) {
676
+ $part_of_word_p = 0;
677
+ $braille_all_caps_p = 0;
678
+ } elsif ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$char}})
679
+ || (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$char}})) {
680
+ $part_of_word_p = 0;
681
+ $braille_all_caps_p = 0;
682
+ } elsif (($i+1 <= $#chars)
683
+ && ($s1 = $char . $chars[$i+1])
684
+ && ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s1}})
685
+ || (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s1}}))) {
686
+ $part_of_word_p = 0;
687
+ $braille_all_caps_p = 0;
688
+ $chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
689
+ } elsif (($i+2 <= $#chars)
690
+ && ($s2 = $char . $chars[$i+1] . $chars[$i+2])
691
+ && ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s2}})
692
+ || (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s2}}))) {
693
+ $part_of_word_p = 0;
694
+ $braille_all_caps_p = 0;
695
+ $chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
696
+ $chart_ht{NOT_PART_OF_WORD_P}->{($i+2)} = 1;
697
+ } elsif (($i+1 <= $#chars)
698
+ && ($char eq $braille_capital_letter_indicator)
699
+ && ($chars[$i+1] eq $braille_capital_letter_indicator)) {
700
+ $braille_all_caps_p = 1;
701
+ } else {
702
+ $part_of_word_p = 1;
703
+ }
704
+ # last period in Braille text is also not part_of_word_p
705
+ if (($char eq $braille_period)
706
+ && (($i == $#chars)
707
+ || (($i < $#chars)
708
+ && (! $this->braille_string_p($chars[$i+1]))))) {
709
+ $part_of_word_p = 0;
710
+ }
711
+ # period before other word-external punctuation is also not part_of_word_p
712
+ if (($i > 0)
713
+ && ($chars[$i-1] eq $braille_period)
714
+ && (! $part_of_word_p)
715
+ && ($current_word_start ne "")) {
716
+ $end_offset = -1;
717
+ }
718
+ } else {
719
+ $braille_all_caps_p = 0;
720
+ }
721
+ $chart_ht{BRAILLE_ALL_CAPS_P}->{$i} = $braille_all_caps_p;
722
+
723
+ if (($current_word_start ne "")
724
+ && ((! $part_of_word_p)
725
+ || ($current_script ne $current_word_script))) {
726
+ # END OF WORD
727
+ $chart_ht{CHAR_START_OF_WORD}->{$current_word_start} = 1;
728
+ $chart_ht{CHAR_END_OF_WORD}->{($i-1+$end_offset)} = 1;
729
+ my $word = join("", @chars[$current_word_start .. ($i-1+$end_offset)]);
730
+ $chart_ht{WORD_START_END}->{$current_word_start}->{$i} = $word;
731
+ $chart_ht{WORD_END_START}->{$i+$end_offset}->{$current_word_start} = $word;
732
+ # print STDERR "Word ($current_word_start-$i+$end_offset): $word ($current_word_script)\n";
733
+ $current_word_start = "";
734
+ $current_word_script = "";
735
+ }
736
+ if ($part_of_word_p && ($current_word_start eq "")) {
737
+ # START OF WORD
738
+ $current_word_start = $i;
739
+ $current_word_script = $current_script;
740
+ }
741
+ # print STDERR "$i char: $char ($current_script)\n";
742
+ unless ($current_script eq $prev_script) {
743
+ if ($prev_script && ($i-1 >= $script_start)) {
744
+ my $script_end = $i;
745
+ $chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start} = $script_end;
746
+ $chart_ht{SCRIPT_SEGMENT_END_TO_START}->{$script_end} = $script_start;
747
+ foreach $i (($script_start .. $script_end)) {
748
+ $chart_ht{SCRIPT_SEGMENT_START}->{$i} = $script_start;
749
+ $chart_ht{SCRIPT_SEGMENT_END}->{$i} = $script_end;
750
+ }
751
+ # print STDERR "Script segment $script_start-$script_end: $prev_script\n";
752
+ }
753
+ $script_start = $i;
754
+ }
755
+ unless ($current_letter_plus_script eq $prev_letter_plus_script) {
756
+ if ($prev_letter_plus_script && ($i-1 >= $letter_plus_script_start)) {
757
+ my $letter_plus_script_end = $i;
758
+ $chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$letter_plus_script_start} = $letter_plus_script_end;
759
+ $chart_ht{LETTER_TOKEN_SEGMENT_END_TO_START}->{$letter_plus_script_end} = $letter_plus_script_start;
760
+ foreach $i (($letter_plus_script_start .. $letter_plus_script_end)) {
761
+ $chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = $letter_plus_script_start;
762
+ $chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = $letter_plus_script_end;
763
+ }
764
+ # print STDERR "Script token segment $letter_plus_script_start-$letter_plus_script_end: $prev_letter_plus_script\n";
765
+ }
766
+ $letter_plus_script_start = $i;
767
+ }
768
+ $prev_script = $current_script;
769
+ $prev_letter_plus_script = $current_letter_plus_script;
770
+ }
771
+ $ht{STRING_IS_DOMINANTLY_RIGHT_TO_LEFT}->{$s} = 1 if $n_right_to_left_chars > $n_left_to_right_chars;
772
+
773
+ # main
774
+ my $i = 0;
775
+ while ($i <= $#chars) {
776
+ my $char = $chart_ht{ORIG_CHAR}->{$i};
777
+ my $current_script = $chart_ht{CHAR_SCRIPT}->{$i};
778
+ $chart_ht{CHART_CONTAINS_SCRIPT}->{$current_script} = 1;
779
+ my $script_segment_start = $chart_ht{SCRIPT_SEGMENT_START}->{$i};
780
+ my $script_segment_end = $chart_ht{SCRIPT_SEGMENT_END}->{$i};
781
+ my $char_name = $chart_ht{CHAR_NAME}->{$i};
782
+ my $subjoined_char_p = $chart_ht{CHAR_SUBJOINED}->{$i};
783
+ my $letter_plus_char_p = $chart_ht{CHAR_LETTER_PLUS}->{$i};
784
+ my $numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{$i};
785
+ my $annotation = $chart_ht{CHAR_ANNOTATION}->{$i};
786
+ # print STDERR " $char_name annotation: $annotation\n" if $annotation;
787
+ my $tone_mark = $chart_ht{CHAR_TONE_MARK}->{$i};
788
+ my $found_char_mapping_p = 0;
789
+ my $prev_char_name = ($i >= 1) ? $chart_ht{CHAR_NAME}->{($i-1)} : "";
790
+ my $prev2_script = ($i >= 2) ? $chart_ht{CHAR_SCRIPT}->{($i-2)} : "";
791
+ my $prev_script = ($i >= 1) ? $chart_ht{CHAR_SCRIPT}->{($i-1)} : "";
792
+ my $next_script = ($i < $#chars) ? $chart_ht{CHAR_SCRIPT}->{($i+1)} : "";
793
+ my $next_char = ($i < $#chars) ? $chart_ht{ORIG_CHAR}->{($i+1)} : "";
794
+ my $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char} || "";
795
+ my $prev2_letter_plus_char_p = ($i >= 2) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-2)} : 0;
796
+ my $prev_letter_plus_char_p = ($i >= 1) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-1)} : 0;
797
+ my $next_letter_plus_char_p = ($i < $#chars) ? $chart_ht{CHAR_LETTER_PLUS}->{($i+1)} : 0;
798
+ my $next_index = $i + 1;
799
+
800
+ # Braille numeric mode
801
+ if ($char eq $braille_number_indicator) {
802
+ my $offset = 0;
803
+ my $numeric_value = "";
804
+ my $digit;
805
+ while ($i+$offset < $#chars) {
806
+ $offset++;
807
+ my $offset_char = $chart_ht{ORIG_CHAR}->{$i+$offset};
808
+ if (defined($digit = $ht{BRAILLE_TO_DIGIT}->{$offset_char})) {
809
+ $numeric_value .= $digit;
810
+ } elsif (($offset_char eq $braille_decimal_point)
811
+ || ($ht{UTF_CHAR_MAPPING}->{$offset_char}->{"."})) {
812
+ $numeric_value .= ".";
813
+ } elsif ($offset_char eq $braille_comma) {
814
+ $numeric_value .= ",";
815
+ } elsif ($offset_char eq $braille_numeric_space) {
816
+ $numeric_value .= " ";
817
+ } elsif ($offset_char eq $braille_solidus) {
818
+ $numeric_value .= "/";
819
+ } elsif ($offset_char eq $braille_number_indicator) {
820
+ # stay in Braille numeric mode
821
+ } elsif ($offset_char eq $braille_letter_indicator) {
822
+ # consider as part of number, but without contributing to numeric_value
823
+ last;
824
+ } else {
825
+ $offset--;
826
+ last;
827
+ }
828
+ }
829
+ if ($offset) {
830
+ $next_index = $i + $offset + 1;
831
+ $node_id = $this->add_node($numeric_value, $i, $next_index, *chart_ht, "", "braille number");
832
+ $found_char_mapping_p = 1;
833
+ }
834
+ }
835
+
836
+ unless ($found_char_mapping_p) {
837
+ foreach $string_length (reverse(1 .. 6)) {
838
+ next if ($i + $string_length-1) > $#chars;
839
+ my $start_of_word_p = $chart_ht{CHAR_START_OF_WORD}->{$i} || 0;
840
+ my $end_of_word_p = $chart_ht{CHAR_END_OF_WORD}->{($i+$string_length-1)} || 0;
841
+ my $multi_char_substring = join("", @chars[$i..($i+$string_length-1)]);
842
+ my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
843
+ @mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings;
844
+ my @mappings_whole = ();
845
+ my @mappings_start_or_end = ();
846
+ my @mappings_other = ();
847
+ foreach $mapping (@mappings) {
848
+ next if $mapping =~ /\(__.*__\)/;
849
+ if ($ht{USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
850
+ || $ht{USE_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$mapping}) {
851
+ push(@mappings_whole, $mapping) if $start_of_word_p && $end_of_word_p;
852
+ } elsif ($ht{USE_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
853
+ || $ht{USE_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
854
+ push(@mappings_start_or_end, $mapping) if $start_of_word_p;
855
+ } elsif ($ht{USE_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
856
+ || $ht{USE_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
857
+ push(@mappings_start_or_end, $mapping) if $end_of_word_p;
858
+ } else {
859
+ push(@mappings_other, $mapping);
860
+ }
861
+ }
862
+ @mappings = @mappings_whole;
863
+ @mappings = @mappings_start_or_end unless @mappings;
864
+ @mappings = @mappings_other unless @mappings;
865
+ foreach $mapping (@mappings) {
866
+ next if $mapping =~ /\(__.*__\)/;
867
+ if ($ht{DONT_USE_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
868
+ || $ht{DONT_USE_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
869
+ next if $start_of_word_p;
870
+ }
871
+ if ($ht{DONT_USE_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
872
+ || $ht{DONT_USE_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
873
+ next if $end_of_word_p;
874
+ }
875
+ my $mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $mapping) : $mapping;
876
+ $node_id = $this->add_node($mapping2, $i, $i+$string_length, *chart_ht, "", "multi-char-mapping");
877
+ $next_index = $i + $string_length;
878
+ $found_char_mapping_p = 1;
879
+ if ($annotation) {
880
+ @annotation_elems = split(/,\s*/, $annotation);
881
+ foreach $annotation_elem (@annotation_elems) {
882
+ if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
883
+ $this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
884
+ } else {
885
+ $this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
886
+ }
887
+ }
888
+ }
889
+ }
890
+ my @alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
891
+ @alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING}->{$multi_char_substring}} unless @alt_mappings;
892
+ @alt_mappings = () if ($#alt_mappings == 0) && ($alt_mappings[0] eq "_NONE_");
893
+ foreach $alt_mapping (@alt_mappings) {
894
+ if ($chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$i}) {
895
+ next unless
896
+ $ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
897
+ || $ht{USE_ALT_IN_POINTED}->{$multi_char_substring}->{$alt_mapping};
898
+ }
899
+ if ($ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
900
+ || $ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$alt_mapping}) {
901
+ next unless $start_of_word_p && $end_of_word_p;
902
+ }
903
+ if ($ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
904
+ || $ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
905
+ next unless $start_of_word_p;
906
+ }
907
+ if ($ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
908
+ || $ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
909
+ next unless $end_of_word_p;
910
+ }
911
+ my $alt_mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $alt_mapping) : $alt_mapping;
912
+ $node_id = $this->add_node($alt_mapping2, $i, $i+$string_length, *chart_ht, "alt", "multi-char-mapping");
913
+ if ($annotation) {
914
+ @annotation_elems = split(/,\s*/, $annotation);
915
+ foreach $annotation_elem (@annotation_elems) {
916
+ if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
917
+ $this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
918
+ } else {
919
+ $this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
920
+ }
921
+ }
922
+ }
923
+ }
924
+ }
925
+ }
926
+ unless ($found_char_mapping_p) {
927
+ my $prev_node_id = $this->get_node_for_span($i-4, $i, *chart_ht)
928
+ || $this->get_node_for_span($i-3, $i, *chart_ht)
929
+ || $this->get_node_for_span($i-2, $i, *chart_ht)
930
+ || $this->get_node_for_span($i-1, $i, *chart_ht);
931
+ my $prev_char_roman = ($prev_node_id) ? $this->get_node_roman($prev_node_id, *chart_id) : "";
932
+ my $prev_node_start = ($prev_node_id) ? $chart_ht{NODE_START}->{$prev_node_id} : "";
933
+
934
+ # Number
935
+ if (($numeric_value =~ /\d/)
936
+ && (! ($char_name =~ /SUPERSCRIPT/))) {
937
+ my $prev_numeric_value = $this->get_node_for_span_with_slot_value($i-1, $i, "numeric-value", *chart_id);
938
+ my $sep = "";
939
+ $sep = " " if ($char_name =~ /^vulgar fraction /i) && ($prev_numeric_value =~ /\d/);
940
+ $node_id = $this->add_node("$sep$numeric_value", $i, $i+1, *chart_ht, "", "number");
941
+ $this->set_node_id_slot_value($node_id, "numeric-value", $numeric_value, *chart_ht);
942
+ if ((($prev_numeric_value =~ /\d/) && ($numeric_value =~ /\d\d/))
943
+ || (($prev_numeric_value =~ /\d\d/) && ($numeric_value =~ /\d/))) {
944
+ # pull in any other parts of single digits
945
+ my $j = 1;
946
+ # pull in any single digits adjoining on left
947
+ if ($prev_numeric_value =~ /^\d$/) {
948
+ while (1) {
949
+ if (($i-$j-1 >= 0)
950
+ && defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-1, $i-$j, "numeric-value", *chart_id))
951
+ && ($digit_value =~ /^\d$/)) {
952
+ $j++;
953
+ } elsif (($i-$j-2 >= 0)
954
+ && ($chart_ht{ORIG_CHAR}->{($i-$j-1)} =~ /^[.,]$/)
955
+ && defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-2, $i-$j-1, "numeric-value", *chart_id))
956
+ && ($digit_value =~ /^\d$/)) {
957
+ $j += 2;
958
+ } else {
959
+ last;
960
+ }
961
+ }
962
+ }
963
+ # pull in any single digits adjoining on right
964
+ my $k = 0;
965
+ if ($numeric_value =~ /^\d$/) {
966
+ while (1) {
967
+ if (defined($next_numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{($i+$k+1)})
968
+ && ($next_numeric_value =~ /^\d$/)) {
969
+ $k++;
970
+ } else {
971
+ last;
972
+ }
973
+ }
974
+ }
975
+ $this->register_new_complex_number_span_segment($i-$j, $i, $i+$k+1, *chart_ht, $line_number);
976
+ }
977
+ if ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
978
+ && ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
979
+ $de_accented_translit = $util->de_accent_string($tonal_translit);
980
+ if ($numeric_value =~ /^(10000|1000000000000|10000000000000000)$/) {
981
+ $chart_ht{NODE_TYPE}->{$node_id} = "alt"; # keep, but demote
982
+ $alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
983
+ } else {
984
+ $alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "alt", "CJK");
985
+ }
986
+ }
987
+
988
+ # ASCII
989
+ } elsif ($char =~ /^[\x00-\x7F]$/) {
990
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "ASCII"); # ASCII character, incl. control characters
991
+
992
+ # Emoji, dingbats, pictographs
993
+ } elsif ($char =~ /^(\xE2[\x98-\x9E]|\xF0\x9F[\x8C-\xA7])/) {
994
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "pictograph");
995
+
996
+ # Hangul (Korean)
997
+ } elsif (($char =~ /^[\xEA-\xED]/)
998
+ && ($romanized_char = $this->unicode_hangul_romanization($char))) {
999
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "Hangul");
1000
+
1001
+ # CJK (Chinese, Japanese, Korean)
1002
+ } elsif ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
1003
+ && ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
1004
+ $de_accented_translit = $util->de_accent_string($tonal_translit);
1005
+ $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
1006
+
1007
+ # Virama (cancel preceding vowel in Abudiga scripts)
1008
+ } elsif ($char_name =~ /\bSIGN (?:VIRAMA|AL-LAKUNA|ASAT|COENG|PAMAAEH)\b/) {
1009
+ # VIRAMA: cancel preceding default vowel (in Abudiga scripts)
1010
+ if (($prev_script eq $current_script)
1011
+ && (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
1012
+ && ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
1013
+ $this->add_node($prev_char_roman_consonant, $prev_node_start, $i+1, *chart_ht, "", "virama");
1014
+ } else {
1015
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-virama");
1016
+ }
1017
+
1018
+ # Nukta (special (typically foreign) variant)
1019
+ } elsif ($char_name =~ /\bSIGN (?:NUKTA)\b/) {
1020
+ # NUKTA (dot): indicates special (typically foreign) variant; normally covered by multi-mappings
1021
+ if ($prev_script eq $current_script) {
1022
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "nukta");
1023
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1024
+ $this->set_node_id_slot_value($node_id, "nukta", 1, *chart_ht);
1025
+ } else {
1026
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-nukta");
1027
+ }
1028
+
1029
+ # Zero-width character, incl. zero width space/non-joiner/joiner, left-to-right/right-to-left mark
1030
+ } elsif ($char =~ /^\xE2\x80[\x8B-\x8F\xAA-\xAE]$/) {
1031
+ if ($prev_node_id) {
1032
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
1033
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1034
+ } else {
1035
+ $this->add_node("", $i, $i+1, *chart_ht, "", "zero-width-char");
1036
+ }
1037
+ } elsif (($char =~ /^\xEF\xBB\xBF$/) && $prev_node_id) { # OK to leave byte-order-mark at beginning of line
1038
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
1039
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1040
+
1041
+ # Tone mark
1042
+ } elsif ($tone_mark) {
1043
+ if ($prev_script eq $current_script) {
1044
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "tone-mark");
1045
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1046
+ $this->set_node_id_slot_value($node_id, "tone-mark", $tone_mark, *chart_ht);
1047
+ } else {
1048
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-tone-mark");
1049
+ }
1050
+
1051
+ # Diacritic
1052
+ } elsif (($char_name =~ /\b(ACCENT|TONE|COMBINING DIAERESIS|COMBINING DIAERESIS BELOW|COMBINING MACRON|COMBINING VERTICAL LINE ABOVE|COMBINING DOT ABOVE RIGHT|COMBINING TILDE|COMBINING CYRILLIC|MUUSIKATOAN|TRIISAP)\b/) && ($ht{UTF_TO_CAT}->{$char} =~ /^Mn/)) {
1053
+ if ($prev_script eq $current_script) {
1054
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "diacritic");
1055
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1056
+ $diacritic = lc $char_name;
1057
+ $diacritic =~ s/^.*(?:COMBINING CYRILLIC|COMBINING|SIGN)\s+//i;
1058
+ $diacritic =~ s/^.*(ACCENT|TONE)/$1/i;
1059
+ $diacritic =~ s/^\s*//;
1060
+ $this->set_node_id_slot_value($node_id, "diacritic", $diacritic, *chart_ht);
1061
+ # print STDERR "diacritic: $diacritic\n";
1062
+ } else {
1063
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-diacritic");
1064
+ }
1065
+
1066
+ # Romanize to find out more
1067
+ } elsif ($char_name) {
1068
+ if (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))) {
1069
+ # print STDERR "ROM l.$line_number/$i: $romanized_char\n" if $line_number =~ /^[12]$/;
1070
+ print STDOUT "ROM l.$line_number/$i: $romanized_char\n" if $verbosePM;
1071
+
1072
+ # Empty string mapping
1073
+ if ($romanized_char eq "\"\"") {
1074
+ $this->add_node("", $i, $i+1, *chart_ht, "", "empty-string-mapping");
1075
+ # consider adding something for implausible romanizations of length 6+
1076
+
1077
+ # keep original character (instead of romanized_char lengthener, character-18b00 etc.)
1078
+ } elsif (($romanized_char =~ /^(character|lengthener|modifier)/)) {
1079
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "nevermind-keep-original");
1080
+
1081
+ # Syllabic suffix in Abudiga languages, e.g. -m, -ng
1082
+ } elsif (($romanized_char =~ /^\+(H|M|N|NG)$/i)
1083
+ && ($prev_script eq $current_script)
1084
+ && ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{"a"})) {
1085
+ my $core_suffix = $romanized_char;
1086
+ $core_suffix =~ s/^\+//;
1087
+ if ($prev_char_roman =~ /[aeiou]$/i) {
1088
+ $this->add_node($core_suffix, $i, $i+1, *chart_ht, "", "syllable-end-consonant");
1089
+ } else {
1090
+ $this->add_node(join("", $prev_char_roman, "a", $core_suffix), $prev_node_start, $i+1, *chart_ht, "", "syllable-end-consonant-with-added-a");
1091
+ $this->add_node(join("", "a", $core_suffix), $i, $i+1, *chart_ht, "backup", "syllable-end-consonant");
1092
+ }
1093
+
1094
+ # Japanese special cases
1095
+ } elsif ($char_name =~ /(?:HIRAGANA|KATAKANA) LETTER SMALL Y/) {
1096
+ if (($prev_script eq $current_script)
1097
+ && (($prev_char_roman_consonant) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])i$/i))) {
1098
+ unless ($this->get_node_for_span_and_type($prev_node_start, $i+1, *chart_ht, "")) {
1099
+ $this->add_node("$prev_char_roman_consonant$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "japanese-contraction");
1100
+ }
1101
+ } else {
1102
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "unexpected-japanese-contraction-character");
1103
+ }
1104
+ } elsif (($prev_script =~ /^(HIRAGANA|KATAKANA)$/i)
1105
+ && ($char_name eq "KATAKANA-HIRAGANA PROLONGED SOUND MARK") # Choonpu
1106
+ && (($prev_char_roman_vowel) = ($prev_char_roman =~ /([aeiou])$/i))) {
1107
+ $this->add_node("$prev_char_roman$prev_char_roman_vowel", $prev_node_start, $i+1, *chart_ht, "", "japanese-vowel-lengthening");
1108
+ } elsif (($current_script =~ /^(Hiragana|Katakana)$/i)
1109
+ && ($char_name =~ /^(HIRAGANA|KATAKANA) LETTER SMALL TU$/i) # Sokuon/Sukun
1110
+ && ($next_script eq $current_script)
1111
+ && ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
1112
+ && (($doubled_consonant) = ($romanized_next_char =~ /^(ch|[bcdfghjklmnpqrstwz])/i))) {
1113
+ # Note: $romanized_next_char could be part of a multi-character mapping
1114
+ # print STDERR "current_script: $current_script char_name: $char_name next_script: $next_script romanized_next_char: $romanized_next_char doubled_consonant: $doubled_consonant\n";
1115
+ $doubled_consonant = "t" if $doubled_consonant eq "ch";
1116
+ $this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "japanese-consonant-doubling");
1117
+
1118
+ # Greek small letter mu to micro-sign (instead of to "m") as used in abbreviations for microgram/micrometer/microliter/microsecond/micromolar/microfarad etc.
1119
+ } elsif (($char_name eq "GREEK SMALL LETTER MU")
1120
+ && (! ($prev_script =~ /^GREEK$/))
1121
+ && ($i < $#chars)
1122
+ && ($chart_ht{ORIG_CHAR}->{($i+1)} =~ /^[cfgjlmstv]$/i)) {
1123
+ $this->add_node("\xC2\xB5", $i, $i+1, *chart_ht, "", "greek-mu-to-micro-sign");
1124
+
1125
+ # Gurmukhi addak (doubles following consonant)
1126
+ } elsif (($current_script eq "Gurmukhi")
1127
+ && ($char_name eq "GURMUKHI ADDAK")) {
1128
+ if (($next_script eq $current_script)
1129
+ && ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
1130
+ && (($doubled_consonant) = ($romanized_next_char =~ /^([bcdfghjklmnpqrstvwxz])/i))) {
1131
+ $this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "gurmukhi-consonant-doubling");
1132
+ } else {
1133
+ $this->add_node("'", $i, $i+1, *chart_ht, "", "gurmukhi-unexpected-addak");
1134
+ }
1135
+
1136
+ # Subjoined character
1137
+ } elsif ($subjoined_char_p
1138
+ && ($prev_script eq $current_script)
1139
+ && (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
1140
+ && ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
1141
+ my $new_roman = "$prev_char_roman_consonant$romanized_char";
1142
+ $this->add_node($new_roman, $prev_node_start, $i+1, *chart_ht, "", "subjoined-character");
1143
+ # print STDERR " Subjoin l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
1144
+
1145
+ # Thai special case: written-pre-consonant-spoken-post-consonant
1146
+ } elsif (($char_name =~ /THAI CHARACTER/)
1147
+ && ($prev_script eq $current_script)
1148
+ && ($chart_ht{CHAR_SYLLABLE_INFO}->{($i-1)} =~ /written-pre-consonant-spoken-post-consonant/i)
1149
+ && ($prev_char_roman =~ /^[aeiou]+$/i)
1150
+ && ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]/)) {
1151
+ $this->add_node("$romanized_char$prev_char_roman", $prev_node_start, $i+1, *chart_ht, "", "thai-vowel-consonant-swap");
1152
+
1153
+ # Thai special case: THAI CHARACTER O ANG (U+0E2D "\xE0\xB8\xAD")
1154
+ } elsif ($char_name eq "THAI CHARACTER O ANG") {
1155
+ if ($prev_script ne $current_script) {
1156
+ $this->add_node("", $i, $i+1, *chart_ht, "", "thai-initial-o-ang-drop");
1157
+ } elsif ($next_script ne $current_script) {
1158
+ $this->add_node("", $i, $i+1, *chart_ht, "", "thai-final-o-ang-drop");
1159
+ } else {
1160
+ my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
1161
+ my $romanized_prev2_char = $this->romanize_char_at_position($i-2, $lang_code, $output_style, *ht, *chart_ht);
1162
+ if (($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
1163
+ && ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
1164
+ $this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonants
1165
+ } elsif (($prev2_script eq $current_script)
1166
+ && 0
1167
+ && ($prev_char_name =~ /^THAI CHARACTER MAI [A-Z]+$/) # Thai tone
1168
+ && ($romanized_prev2_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
1169
+ && ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
1170
+ $this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonant+tone-mark and consonant
1171
+ } else {
1172
+ $this->add_node("", $i, $i+1, *chart_ht, "", "thai-middle-o-ang-drop"); # drop next to vowel
1173
+ }
1174
+ }
1175
+
1176
+ # Romanization with space
1177
+ } elsif ($romanized_char =~ /\s/) {
1178
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "space");
1179
+
1180
+ # Tibetan special cases
1181
+ } elsif ($current_script eq "Tibetan") {
1182
+
1183
+ if ($subjoined_char_p
1184
+ && ($prev_script eq $current_script)
1185
+ && $prev_letter_plus_char_p
1186
+ && ($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
1187
+ $this->add_node("$prev_char_roman$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "subjoined-tibetan-character");
1188
+ } elsif ($romanized_char =~ /^-A$/i) {
1189
+ my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
1190
+ if (! $prev_letter_plus_char_p) {
1191
+ $this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-frontal-dash-a");
1192
+ } elsif (($prev_script eq $current_script)
1193
+ && ($next_script eq $current_script)
1194
+ && ($prev_char_roman =~ /[bcdfghjklmnpqrstvwxyz]$/)
1195
+ && ($romanized_next_char =~ /^[aeiou]/)) {
1196
+ $this->add_node("a'", $i, $i+1, *chart_ht, "", "tibetan-medial-dash-a");
1197
+ } elsif (($prev_script eq $current_script)
1198
+ && ($next_script eq $current_script)
1199
+ && ($prev_char_roman =~ /[aeiou]$/)
1200
+ && ($romanized_next_char =~ /[aeiou]/)) {
1201
+ $this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-reduced-medial-dash-a");
1202
+ } elsif (($prev_script eq $current_script)
1203
+ && (! ($prev_char_roman =~ /[aeiou]/))
1204
+ && (! $next_letter_plus_char_p)) {
1205
+ $this->add_node("a", $i, $i+1, *chart_ht, "", "tibetan-final-dash-a");
1206
+ } else {
1207
+ $this->add_node("a", $i, $i+1, *chart_ht, "", "unexpected-tibetan-dash-a");
1208
+ }
1209
+ } elsif (($romanized_char =~ /^[AEIOU]/i)
1210
+ && ($prev_script eq $current_script)
1211
+ && ($prev_char_roman =~ /^A$/i)
1212
+ && (! $prev2_letter_plus_char_p)) {
1213
+ $this->add_node($romanized_char, $prev_node_start, $i+1, *chart_ht, "", "tibetan-dropped-word-initial-a");
1214
+ } else {
1215
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
1216
+ }
1217
+
1218
+ # Khmer (for MUUSIKATOAN etc. see under "Diacritic" above)
1219
+ } elsif (($current_script eq "Khmer")
1220
+ && (($char_roman_consonant, $char_roman_vowel) = ($romanized_char =~ /^(.*[bcdfghjklmnpqrstvwxyz])([ao]+)-$/i))) {
1221
+ my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
1222
+ if (($next_script eq $current_script)
1223
+ && ($romanized_next_char =~ /^[aeiouy]/i)) {
1224
+ $this->add_node($char_roman_consonant, $i, $i+1, *chart_ht, "", "khmer-vowel-drop");
1225
+ } else {
1226
+ $this->add_node("$char_roman_consonant$char_roman_vowel", $i, $i+1, *chart_ht, "", "khmer-standard-unicode-based-romanization");
1227
+ }
1228
+
1229
+ # Abudiga add default vowel
1230
+ } elsif ((@abudiga_default_vowels = sort keys %{$ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}})
1231
+ && ($abudiga_default_vowel = $abudiga_default_vowels[0])
1232
+ && ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
1233
+ my $new_roman = join("", $romanized_char, $abudiga_default_vowel);
1234
+ $this->add_node($new_roman, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization-plus-abudiga-default-vowel");
1235
+ # print STDERR " Abudiga add default vowel l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
1236
+
1237
+ # Standard romanization
1238
+ } else {
1239
+ $node_id = $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
1240
+ }
1241
+ } else {
1242
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original");
1243
+ }
1244
+ } elsif (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))
1245
+ && ((length($romanized_char) <= 2)
1246
+ || ($ht{UTF_TO_CHAR_ROMANIZATION}->{$char}))) { # or from unicode_overwrite_romanization table
1247
+ $romanized_char =~ s/^""$//;
1248
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "romanized-without-character-name");
1249
+ } else {
1250
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original-without-character-name");
1251
+ }
1252
+ }
1253
+ $i = $next_index;
1254
+ }
1255
+
1256
+ $this->schwa_deletion(0, $n_characters, *chart_ht, $lang_code);
1257
+ $this->default_vowelize_tibetan(0, $n_characters, *chart_ht, $lang_code, $line_number) if $chart_ht{CHART_CONTAINS_SCRIPT}->{"Tibetan"};
1258
+ $this->assemble_numbers_in_chart(*chart_ht, $line_number);
1259
+
1260
+ if ($return_chart_p) {
1261
+ } elsif ($return_offset_mappings_p) {
1262
+ ($result, $offset_mappings, $new_char_offset, $new_rom_char_offset) = $this->best_romanized_string(0, $n_characters, *chart_ht, $control, $initial_char_offset, $initial_rom_char_offset);
1263
+ } else {
1264
+ $result = $this->best_romanized_string(0, $n_characters, *chart_ht) unless $return_chart_p;
1265
+ }
1266
+
1267
+ if ($verbosePM) {
1268
+ my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-log.txt";
1269
+ $util->append_to_file($logfile, $log) if $log && (-r $logfile);
1270
+ }
1271
+
1272
+ return ($result, $offset_mappings) if $return_offset_mappings_p;
1273
+ return *chart_ht if $return_chart_p;
1274
+ return $result;
1275
+ }
1276
+
1277
# Return $s rendered as a JSON string literal, surrounding double quotes included.
#
# $s is decoded in place with utf8::decode before encoding (so it is presumably
# a UTF-8 byte string -- confirm against callers). The JSON module only encodes
# references here, so the string is wrapped in a one-element array and the
# enclosing square brackets are peeled off again afterwards.
sub string_to_json_string {
    local($this, $s) = @_;

    utf8::decode($s);
    my $encoder = JSON->new->utf8;
    my $json_array = $encoder->encode([$s]);
    # "[<literal>]" -> "<literal>"
    (my $json_literal = $json_array) =~ s/^\[(.*)\]$/$1/;
    return $json_literal;
}
1285
+
1286
# Serialize the romanization chart between $chart_start and $chart_end as a
# sequence of JSON objects, one per romanization segment, each on its own line:
#
#    { "line": N, "start": S, "end": E, "orig": "...", "roms": [ { "rom": "..." }, ... ] },
#
# "start" and "end" are the first and last chart positions covered by the
# segment (both inclusive). Every element, including the last one, is followed
# by ",\n"; this sub does not remove the trailing separator.
#
# Positions for which no romanization segment is found fall back to a
# one-character segment whose only romanization is the original character.
sub chart_to_json_romanization_elements {
    local($this, $chart_start, $chart_end, *chart_ht, $line_number) = @_;

    my $result = "";
    my $start = $chart_start;
    while ($start < $chart_end) {
        my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
        my @best_romanizations = ();
        my $orig_segment;
        my $next_start;
        if (($end && ($start < $end))
            && (@best_romanizations = $this->best_romanizations($start, $end, *chart_ht))) {
            $orig_segment = $this->orig_string_at_span($start, $end, *chart_ht);
            $next_start = $end;
        } else {
            # No usable segment: keep the original character as its own romanization.
            # (The original code stored the never-assigned variable $orig here.)
            $orig_segment = $chart_ht{ORIG_CHAR}->{$start};
            @best_romanizations = ($orig_segment);
            $next_start = $start + 1;
        }
        # Derive the last covered position from $next_start rather than $end:
        # in the fallback branch $end can be "" (which used to yield -1) or can
        # point past the single character actually emitted.
        my $inclusive_end = $next_start - 1;
        my $guarded_orig = $this->string_to_json_string($orig_segment);
        $result .= " { \"line\": $line_number, \"start\": $start, \"end\": $inclusive_end, \"orig\": $guarded_orig, \"roms\": [";
        foreach my $i ((0 .. $#best_romanizations)) {
            my $guarded_rom = $this->string_to_json_string($best_romanizations[$i]);
            $result .= " { \"rom\": $guarded_rom";
            $result .= " }";
            $result .= "," if $i < $#best_romanizations;
        }
        $result .= " ] },\n";
        $start = $next_start;
    }
    return $result;
}
1322
+
1323
# Post-process Tibetan letter tokens in the chart between $chart_start and
# $chart_end. For each token (as delimited by LETTER_TOKEN_SEGMENT_START_TO_END)
# the lowest-numbered node of every romanization segment is collected, and
# then up to four adjustments are made. Each adjustment adds a new node over
# the same span and demotes the node it replaces (the old node is kept):
#   1. If no collected romanization contains a vowel, "a" is appended to one
#      segment: the first one for tokens of at most two segments; otherwise
#      the second-to-last one, except that a final "s" moves the target to the
#      preceding "y" or, failing that, one segment further left.
#   2. A leading "'" followed by "o" is replaced by the empty string.
#   3. A final segment of consonants ending in "y" gets "a" appended.
#   4. In any segment containing "-a", the first "-a" is replaced by "a".
#
# $lang_code and $line_number are accepted but not used in the body.
#
# NOTE(review): copy_slot_values is passed the glob *chart_id, not *chart_ht,
# exactly as in the original. This looks like a typo, but copy_slot_values is
# not visible here, so it is left unchanged -- confirm before altering.
sub default_vowelize_tibetan {
    local($this, $chart_start, $chart_end, *chart_ht, $lang_code, $line_number) = @_;

    my $token_start = $chart_start;
    my $next_token_start = $chart_start;
    while (($token_start = $next_token_start) < $chart_end) {
        # Default step is one position; replaced by the token end once a token is found.
        $next_token_start = $token_start + 1;

        next unless $chart_ht{CHAR_LETTER_PLUS}->{$token_start};
        my $current_script = $chart_ht{CHAR_SCRIPT}->{$token_start};
        next unless ($current_script eq "Tibetan");
        my $token_end = $chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$token_start};
        next unless $token_end;
        next unless $token_end > $token_start;
        $next_token_start = $token_end;

        # Collect one node (the lowest node id) per romanization segment of the token.
        my $start = $token_start;
        my @node_ids = ();
        while ($start < $token_end) {
            my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
            last unless $end && ($end > $start);
            my @alt_node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}};
            last unless @alt_node_ids;
            push(@node_ids, $alt_node_ids[0]);
            $start = $end;
        }
        # Nothing collected: the rules below would index $node_ids[0] and call
        # add_node with undefined start/end, so skip this token.
        next unless @node_ids;

        my $contains_vowel_p = 0;
        my @romanizations = ();
        foreach my $node_id (@node_ids) {
            my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
            $roman = "" unless defined($roman);
            push(@romanizations, $roman);
            $contains_vowel_p = 1 if $roman =~ /[aeiou]/i;
        }

        # Rule 1: token without any vowel gets a default "a".
        unless ($contains_vowel_p) {
            my $default_vowel_target_index;
            if ($#node_ids <= 1) {
                $default_vowel_target_index = 0;
            } elsif ($romanizations[$#romanizations] eq "s") {
                if ($romanizations[($#romanizations-1)] eq "y") {
                    $default_vowel_target_index = $#romanizations-1;
                } else {
                    $default_vowel_target_index = $#romanizations-2;
                }
            } else {
                $default_vowel_target_index = $#romanizations-1;
            }
            # Keep @romanizations in sync; rules 2 and 3 below inspect it.
            $romanizations[$default_vowel_target_index] .= "a";
            my $old_node_id = $node_ids[$default_vowel_target_index];
            my $old_start = $chart_ht{NODE_START}->{$old_node_id};
            my $old_end = $chart_ht{NODE_END}->{$old_node_id};
            my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
            my $new_roman = $old_roman . "a";
            my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-default-vowel");
            $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
            $chart_ht{NODE_TYPE}->{$old_node_id} = "backup"; # keep, but demote
        }

        # Rule 2: drop a token-initial apostrophe before "o".
        if (($romanizations[0] eq "'")
            && ($#romanizations >= 1)
            && ($romanizations[1] =~ /^[o]$/)) {
            my $old_node_id = $node_ids[0];
            my $old_start = $chart_ht{NODE_START}->{$old_node_id};
            my $old_end = $chart_ht{NODE_END}->{$old_node_id};
            my $new_node_id = $this->add_node("", $old_start, $old_end, *chart_ht, "", "tibetan-delete-apostrophe");
            $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
            $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
        }

        # Rule 3: final consonant(s)+"y" segment gets a trailing "a".
        if (($#node_ids >= 1)
            && ($romanizations[$#romanizations] =~ /^[bcdfghjklmnpqrstvwxz]+y$/)) {
            my $old_node_id = $node_ids[$#romanizations];
            my $old_start = $chart_ht{NODE_START}->{$old_node_id};
            my $old_end = $chart_ht{NODE_END}->{$old_node_id};
            my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
            my $new_roman = $old_roman . "a";
            my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-final-vowel");
            $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
            $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
        }

        # Rule 4: remove the dash from "-a" (first occurrence per node only).
        foreach my $old_node_id (@node_ids) {
            my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
            next unless $old_roman =~ /-a/;
            my $old_start = $chart_ht{NODE_START}->{$old_node_id};
            my $old_end = $chart_ht{NODE_END}->{$old_node_id};
            my $new_roman = $old_roman;
            $new_roman =~ s/-a/a/;
            my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-delete-dash");
            $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
            $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
        }
    }
}
1418
+
1419
# Delete word-final simple "a" in Devanagari (e.g. nepaala -> nepaal).
# See Wikipedia article "Schwa deletion in Indo-Aryan languages".
#
# Does nothing unless the chart is flagged as containing Devanagari. For each
# Devanagari script segment of at least two positions, the node covering the
# last position is examined. If its romanization is consonant(s) + "a" and a
# preceding node (spanning 3, 2 or 1 positions, tried in that order) contains
# a vowel, a new node holding just the consonant(s) is added over the last
# position and the old node is demoted to "alt".
#
# $lang_code is accepted but not used in the body.
sub schwa_deletion {
    local($this, $chart_start, $chart_end, *chart_ht, $lang_code) = @_;

    if ($chart_ht{CHART_CONTAINS_SCRIPT}->{"Devanagari"}) {
        my $script_start = $chart_start;
        my $next_script_start = $chart_start;
        while (($script_start = $next_script_start) < $chart_end) {
            # Default step is one position; replaced by the segment end below.
            $next_script_start = $script_start + 1;

            my $current_script = $chart_ht{CHAR_SCRIPT}->{$script_start};
            next unless ($current_script eq "Devanagari");
            my $script_end = $chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start};
            next unless $script_end;
            next unless $script_end - $script_start >= 2;
            $next_script_start = $script_end;

            my $end_node_id = $this->get_node_for_span($script_end-1, $script_end, *chart_ht);
            next unless $end_node_id;
            my $end_roman = $chart_ht{NODE_ROMAN}->{$end_node_id};
            # Lexical capture (was an undeclared package variable).
            my ($end_consonant) = ($end_roman =~ /^([bcdfghjklmnpqrstvwxz]+)a$/i);
            next unless defined($end_consonant);

            my $prev_node_id = $this->get_node_for_span($script_end-4, $script_end-1, *chart_ht)
                            || $this->get_node_for_span($script_end-3, $script_end-1, *chart_ht)
                            || $this->get_node_for_span($script_end-2, $script_end-1, *chart_ht);
            next unless $prev_node_id;
            my $prev_roman = $chart_ht{NODE_ROMAN}->{$prev_node_id};
            next unless $prev_roman =~ /[aeiou]/i;
            # TO DO: check further back for vowel (e.g. if $prev_roman eq "r" due to vowel cancelation)

            $chart_ht{NODE_TYPE}->{$end_node_id} = "alt"; # keep, but demote
            $this->add_node($end_consonant, $script_end-1, $script_end, *chart_ht, "", "devanagari-with-deleted-final-schwa");
        }
    }
}
1454
+
1455
# Assemble the best romanization of the chart span $chart_start..$chart_end.
#
# The span is consumed segment by segment: where a romanization segment with
# at least one candidate exists, the first candidate returned by
# best_romanizations is used and the position jumps to the segment end;
# otherwise the original character at the position is copied and the position
# advances by one.
#
# If $control contains the phrase "return offset mappings", the sub returns
# the list
#   (romanized string, "orig:rom,orig:rom,..." offsets, final orig offset, final rom offset)
# where offsets start at $orig_char_offset / $rom_char_offset (default 0) and
# one pair is recorded after every segment. Otherwise only the romanized
# string is returned.
sub best_romanized_string {
    local($this, $chart_start, $chart_end, *chart_ht, $control, $orig_char_offset, $rom_char_offset) = @_;

    $control = "" unless defined($control);
    my $want_offsets_p = ($control =~ /\breturn offset mappings\b/);
    my $orig_offset = $orig_char_offset || 0;
    my $rom_offset = $rom_char_offset || 0;
    my @offset_pairs = ("$orig_offset:$rom_offset");
    my $romanized = "";
    my $pos = $chart_start;
    while ($pos < $chart_end) {
        my $segment_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
        my $piece;
        if ($segment_end && ($pos < $segment_end)) {
            my @candidates = $this->best_romanizations($pos, $segment_end, *chart_ht);
            $piece = $candidates[0] if @candidates;
        }
        my $n_orig_chars;
        if (defined($piece)) {
            $n_orig_chars = $segment_end - $pos;
            $pos = $segment_end;
        } else {
            # No segment, or a segment without any candidate: pass the original character through.
            $piece = $chart_ht{ORIG_CHAR}->{$pos};
            $n_orig_chars = 1;
            $pos++;
        }
        $romanized .= $piece;
        if ($want_offsets_p) {
            $orig_offset = $orig_offset + $n_orig_chars;
            $rom_offset = $rom_offset + $utf8->length_in_utf8_chars($piece);
            push(@offset_pairs, "$orig_offset:$rom_offset");
        }
    }
    return ($romanized, join(",", @offset_pairs), $orig_offset, $rom_offset) if $want_offsets_p;
    return $romanized;
}
1510
+
1511
# Return the original text covered by chart positions $start .. $end-1,
# i.e. the concatenation of the ORIG_CHAR entries for those positions.
# An empty span ($end <= $start) yields the empty string.
sub orig_string_at_span {
    local($this, $start, $end, *chart_ht) = @_;

    my @positions = ($start .. ($end-1));
    return join("", map { $chart_ht{ORIG_CHAR}->{$_} } @positions);
}
1520
+
1521
# Return the end position of the longest romanization segment that starts at
# $start and does not extend beyond $chart_end, based on the end positions
# recorded under NODES_STARTING_AND_ENDING_AT for $start.
# Returns "" if there is no such segment or if the largest admissible end is
# not strictly greater than $start.
sub find_end_of_rom_segment {
    local($this, $start, $chart_end, *chart_ht) = @_;

    # Single pass for the largest end <= $chart_end. The result is kept in a
    # lexical; the original assigned it to an undeclared package variable $end.
    my $best_end;
    foreach my $candidate_end (keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}}) {
        next if $candidate_end > $chart_end;
        $best_end = $candidate_end if (! defined($best_end)) || ($candidate_end > $best_end);
    }
    return $best_end if defined($best_end) && ($start < $best_end);
    return "";
}
1537
+
1538
# Return the candidate romanizations for the chart span $start..$end as a list
# without duplicates.
#
# Nodes are visited in ascending node-id order and sorted into three groups by
# NODE_TYPE: "backup", "alt" and everything else (regular). Nodes without a
# NODE_ROMAN value are ignored. The result is the sorted regular romanizations
# followed by the sorted "alt" romanizations not already listed; only if both
# groups are empty are the sorted "backup" romanizations returned.
#
# The working arrays are lexical; in the original they were undeclared package
# arrays shared by all calls.
sub best_romanizations {
    local($this, $start, $end, *chart_ht) = @_;

    my @regular_romanizations = ();
    my @alt_romanizations = ();
    my @backup_romanizations = ();

    foreach my $node_id (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) {
        my $type = $chart_ht{NODE_TYPE}->{$node_id};
        my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
        next unless defined($roman);
        # NOTE(review): $backup_romanization and $alt_romanization are package
        # variables that this sub never assigns; the two defined() tests are kept
        # as in the original and look vestigial -- confirm nothing else sets them.
        if (($type eq "backup") && ! defined($backup_romanization)) {
            push(@backup_romanizations, $roman) unless $util->member($roman, @backup_romanizations);
        } elsif (($type eq "alt") && ! defined($alt_romanization)) {
            push(@alt_romanizations, $roman) unless $util->member($roman, @alt_romanizations);
        } else {
            push(@regular_romanizations, $roman) unless $util->member($roman, @regular_romanizations);
        }
    }

    # Regular candidates first, then alternatives that are not already present.
    my @regular_alt_romanizations = sort @regular_romanizations;
    foreach my $alt (sort @alt_romanizations) {
        push(@regular_alt_romanizations, $alt) unless $util->member($alt, @regular_alt_romanizations);
    }
    return @regular_alt_romanizations if @regular_alt_romanizations;
    return sort @backup_romanizations;
}
1565
+
1566
# Format a list of alternative romanizations for display: the entries are
# joined with ", ", and an empty romanization is shown as "-" so that it
# remains visible.
sub join_alt_romanizations_for_viz {
    local($this, @list) = @_;

    return join(", ", map { ($_ eq "") ? "-" : $_ } @list);
}
1580
+
1581
# Build HTML markup for the chart span $chart_start..$chart_end.
#
# The span is cut into groups: one romanization segment (or a single original
# character where no segment/romanization exists), extended by any directly
# following segments that start with a combining character. For every group
# one <span> is appended to the romanized markup and one to the original
# markup; both carry ids "span-<n>-1" / "span-<n>-2" with a shared group
# number <n> and mouse-over handlers calling highlight_elems.
#   - The romanized span's title lists alternative romanizations (if there is
#     more than one candidate) and names/code points of non-ASCII characters
#     in the romanization; spans with alternatives get a dotted underline.
#   - The original span's title describes each original character (character
#     name, CJK ideograph code point and tonal pinyin, Hangul syllable, or
#     plain code point, plus numeric value / picture description if known).
#   - After certain punctuation (see the character-name pattern at the end of
#     the loop) a <wbr> is appended to both markups.
#
# Returns (romanized markup, original markup, last group number used), where
# group numbering continues from $last_group_id_index.
sub markup_orig_rom_strings {
    local($this, $chart_start, $chart_end, *ht, *chart_ht, *pinyin_ht, $last_group_id_index) = @_;

    my $marked_up_rom = "";
    my $marked_up_orig = "";
    my $start = $chart_start;
    while ($start < $chart_end) {
        my $segment_start = $start;
        my $segment_end = $start+1;
        my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
        my $rom_segment = "";
        my $orig_segment = "";
        my $rom_title = "";
        my $orig_title = "";
        my $contains_alt_romanizations = 0;

        # Core segment of the group.
        if ($end) {
            $segment_end = $end;
            my @best_romanizations = $this->best_romanizations($start, $end, *chart_ht);
            my $best_romanization = (@best_romanizations) ? $best_romanizations[0] : undef;
            if (defined($best_romanization)) {
                $rom_segment .= $best_romanization;
                $orig_segment .= $this->orig_string_at_span($start, $end, *chart_ht);
                $segment_end = $end;
                if ($#best_romanizations >= 1) {
                    $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n");
                    $contains_alt_romanizations = 1;
                }
            } else {
                # Segment without any romanization: show the original character on both sides.
                my $segment = $this->orig_string_at_span($start, $start+1, *chart_ht);
                $rom_segment .= $segment;
                $orig_segment .= $segment;
                $segment_end = $start+1;
            }
            $start = $segment_end;
        } else {
            $rom_segment .= $chart_ht{ORIG_CHAR}->{$start};
            $orig_segment .= $this->orig_string_at_span($start, $start+1, *chart_ht);
            $segment_end = $start+1;
            $start = $segment_end;
        }

        # Absorb following segments that begin with a combining character, so that
        # they stay in the same group as their base character.
        # The two variables below are lexical; the original used undeclared package
        # variables of the same names as the lexicals in the branch above.
        my @combining_romanizations;
        my $combining_romanization;
        my $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
        my $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
        while ($next_char_is_combining_p
               && ($segment_end < $chart_end)
               && ($end = $this->find_end_of_rom_segment($segment_end, $chart_end, *chart_ht))
               && ($end > $segment_end)
               && (@combining_romanizations = $this->best_romanizations($segment_end, $end, *chart_ht))
               && defined($combining_romanization = $combining_romanizations[0])) {
            $orig_segment .= $this->orig_string_at_span($segment_end, $end, *chart_ht);
            $rom_segment .= $combining_romanization;
            if ($#combining_romanizations >= 1) {
                $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@combining_romanizations) . "\n");
                $contains_alt_romanizations = 1;
            }
            $segment_end = $end;
            $start = $segment_end;
            $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
            $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
        }

        # Tooltip for the original side: one entry per original character.
        foreach my $i (($segment_start .. ($segment_end-1))) {
            $orig_title .= "+&#x200E; &#x200E;" unless $orig_title eq "";
            my $char = $chart_ht{ORIG_CHAR}->{$i};
            my $numeric = $ht{UTF_TO_NUMERIC}->{$char};
            $numeric = "" unless defined($numeric);
            my $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
            $pic_descr = "" unless defined($pic_descr);
            my $char_name = $ht{UTF_TO_CHAR_NAME}->{$char};
            # Bytes E4 B7 80..BF encode U+4DC0..U+4DFF (Yijing hexagram symbols). They
            # fall inside the byte range tested for CJK ideographs below, so they are
            # handled first. The original printed a stale package variable $char_name
            # here; the name is now looked up for this character, and a character
            # without a name falls through to the generic code-point entry.
            my $hexagram_p = ($char =~ /^\xE4\xB7[\x80-\xBF]$/);
            if ($hexagram_p && $char_name) {
                $orig_title .= "$char_name\n";
            } elsif ((! $hexagram_p)
                     && ($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/)
                     && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
                my $unicode = $utf8->utf8_to_unicode($char);
                $orig_title .= "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode)) . "\n";
                my $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "");
                $orig_title .= "Chinese: $tonal_translit\n" if $tonal_translit;
                $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
            } elsif ((! $hexagram_p) && $char_name) {
                $orig_title .= "$char_name\n";
                $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
                $orig_title .= "Picture: $pic_descr\n" if $pic_descr =~ /\S/;
            } else {
                my $unicode = $utf8->utf8_to_unicode($char);
                if (($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
                    $orig_title .= "Hangul syllable U+" . (uc sprintf("%04x", $unicode)) . "\n";
                } else {
                    $orig_title .= "Unicode character U+" . (uc sprintf("%04x", $unicode)) . "\n";
                }
            }
        }

        # Tooltip for the romanized side: explain any non-ASCII characters left in it.
        my @non_ascii_roms = ($rom_segment =~ /([\xC0-\xFF][\x80-\xBF]*)/g);
        foreach my $rom_char (@non_ascii_roms) {
            my $rom_char_name = $ht{UTF_TO_CHAR_NAME}->{$rom_char};
            my $unicode = $utf8->utf8_to_unicode($rom_char);
            my $unicode_s = "U+" . (uc sprintf("%04x", $unicode));
            if ($rom_char_name) {
                $rom_title .= "$rom_char_name\n";
            } else {
                $rom_title .= "$unicode_s\n";
            }
        }

        $last_group_id_index++;
        # Titles are attribute values: trim them and encode line breaks as character references.
        $rom_title =~ s/\s*$//;
        $rom_title =~ s/\n/&#xA;/g;
        $orig_title =~ s/\s*$//;
        $orig_title =~ s/\n/&#xA;&#x200E;/g;
        $orig_title = "&#x202D;" . $orig_title . "&#x202C;";
        my $rom_title_clause = ($rom_title eq "") ? "" : " title=\"$rom_title\"";
        my $orig_title_clause = ($orig_title eq "") ? "" : " title=\"$orig_title\"";
        my $alt_rom_clause = ($contains_alt_romanizations) ? "border-bottom:1px dotted;" : "";
        $marked_up_rom .= "<span id=\"span-$last_group_id_index-1\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\" style=\"color:#00BB00;$alt_rom_clause\"$rom_title_clause>" . $util->guard_html($rom_segment) . "<\/span>";
        $marked_up_orig .= "<span id=\"span-$last_group_id_index-2\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\"$orig_title_clause>" . $util->guard_html($orig_segment) . "<\/span>";

        # Allow a line break after punctuation that is not followed by a space.
        my $last_char = $chart_ht{ORIG_CHAR}->{($segment_end-1)};
        my $last_char_name = ($last_char) ? $ht{UTF_TO_CHAR_NAME}->{$last_char} : undef;
        if ($last_char_name
            && ($last_char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET|BRAILLE PATTERN BLANK|TIBETAN MARK .*)$/)) {
            $marked_up_orig .= "<wbr>";
            $marked_up_rom .= "<wbr>";
        }
    }
    return ($marked_up_rom, $marked_up_orig, $last_group_id_index);
}
1699
+
1700
sub romanizations_with_alternatives {
    # Builds the romanization for chart positions $chart_start .. $chart_end-1.
    # Where a span has several best romanizations they are written as
    # "{alt1|alt2|...}"; positions without any romanization are copied through
    # from the original string.
    # $chart_start defaults to 0, $chart_end to $chart_ht{N_CHARS}.
    local($this, *ht, *chart_ht, *pinyin_ht, $chart_start, $chart_end) = @_;

    $chart_start = 0 unless defined($chart_start);
    $chart_end = $chart_ht{N_CHARS} unless defined($chart_end);

    my $output = "";
    my $pos = $chart_start;
    while ($pos < $chart_end) {
        my $span_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
        unless ($span_end) {
            # No romanization segment starts here: keep the original character.
            $output .= $chart_ht{ORIG_CHAR}->{$pos};
            $pos = $pos + 1;
            next;
        }
        my @candidates = $this->best_romanizations($pos, $span_end, *chart_ht);
        if (! @candidates) {
            # Segment found, but nothing to romanize it with: advance one character only.
            $output .= $this->orig_string_at_span($pos, $pos+1, *chart_ht);
            $pos = $pos + 1;
        } else {
            $output .= (scalar(@candidates) == 1)
                     ? $candidates[0]
                     : ("{" . join("|", @candidates) . "}");
            $pos = $span_end;
        }
    }
    return $output;
}
1742
+
1743
sub quick_romanize {
    # Table-only romanization of string $s: greedily matches the longest
    # substring (4 down to 1 characters) found in the language-specific
    # mapping table for $lang_code, then in the general mapping table.
    # Characters without any table entry are copied unchanged.
    # input:  $s (utf8 string), $lang_code, *ht (romanization tables)
    # output: romanized string
    local($this, $s, $lang_code, *ht) = @_;

    my $result = "";
    my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);

    # Resolve both tables once, using plain rvalue lookups: calling keys() on
    # a missing entry would autovivify an empty hash for every probed
    # substring and make the tables grow with each call.
    my $lang_key = defined($lang_code) ? $lang_code : "";
    my $lang_spec_table = $ht{UTF_CHAR_MAPPING_LANG_SPEC}
                        ? $ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_key}
                        : undef;
    my $general_table = $ht{UTF_CHAR_MAPPING};

    while (@chars) {
        my $found_match_in_table_p = 0;
        foreach my $n_chars (reverse(1..4)) {
            next if $n_chars > scalar(@chars);
            my $substring = join("", @chars[0..($n_chars-1)]);
            my @mappings = ();
            foreach my $table ($lang_spec_table, $general_table) {
                next unless $table && $table->{$substring};
                # sorted, so that the choice among several romanizations
                # does not depend on Perl's per-process hash ordering
                @mappings = sort keys %{$table->{$substring}};
                last if @mappings;
            }
            next unless @mappings;
            $result .= $mappings[0];
            splice(@chars, 0, $n_chars);
            $found_match_in_table_p = 1;
            last;
        }
        $result .= shift(@chars) unless $found_match_in_table_p;
    }
    return $result;
}
1772
+
1773
sub char_is_combining_char {
    # True iff the category recorded for character $c in $ht{UTF_TO_CAT}
    # starts with "M" (the mark categories).
    # Returns 0 for an empty/false $c or a character without category entry.
    local($this, $c, *ht) = @_;

    my $category = ($c) ? $ht{UTF_TO_CAT}->{$c} : undef;
    return 0 unless $category;
    return $category =~ /^M/;
}
1781
+
1782
sub mark_up_string_for_mouse_over {
    # Wraps the characters of $s in <span title="..."> elements so that a
    # browser shows character information (Unicode name, numeric value,
    # picture description, tonal pinyin) on mouse-over.
    # input:  $s (utf8 string), *ht (Unicode tables), $control (optional;
    #         "NO-ASCII" leaves plain ASCII characters unwrapped),
    #         *pinyin_ht (passed on to tonal_pinyin)
    # output: HTML string
    # All working variables are lexical, so that repeated or nested calls
    # cannot interfere through package globals.
    local($this, $s, *ht, $control, *pinyin_ht) = @_;

    $control = "" unless defined($control);
    my $no_ascii_p = ($control =~ /NO-ASCII/);
    my $result = "";
    my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    while (@chars) {
        my $char = shift @chars;
        my $numeric = $ht{UTF_TO_NUMERIC}->{$char};
        $numeric = "" unless defined($numeric);
        my $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
        $pic_descr = "" unless defined($pic_descr);
        my $next_char = (@chars) ? $chars[0] : "";
        my $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
        my ($char_name, $unicode, $tonal_translit, $title);
        if ($no_ascii_p
            && ($char =~ /^[\x00-\x7F]*$/)
            && ! $next_char_is_combining_p) {
            $result .= $util->guard_html($char);
        } elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
            $unicode = $utf8->utf8_to_unicode($char);
            $title = "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode));
            $title .= "&#xA;Chinese: $tonal_translit" if $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "");
            $title .= "&#xA;Number: $numeric" if $numeric =~ /\d/;
            $result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>";
        } elsif ($char_name = $ht{UTF_TO_CHAR_NAME}->{$char}) {
            $title = $char_name;
            $title .= "&#xA;Number: $numeric" if $numeric =~ /\d/;
            $title .= "&#xA;Picture: $pic_descr" if $pic_descr =~ /\S/;
            my $char_plus = $char;
            while ($next_char_is_combining_p) {
                # combining marks (Mn: non-spacing, Mc: spacing combining, Me: enclosing)
                # are shown together with their base character in a single span
                my $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char};
                # a mark without a name entry is listed by its code point
                $next_char_name = "U+" . (uc sprintf("%04x", $utf8->utf8_to_unicode($next_char)))
                    unless defined($next_char_name);
                $title .= "&#xA;+ $next_char_name";
                $char_plus .= shift @chars;
                $next_char = (@chars) ? $chars[0] : "";
                $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
            }
            $result .= "<span title=\"$title\">" . $util->guard_html($char_plus) . "<\/span>";
            # allow a line break after CJK punctuation
            $result .= "<wbr>" if $char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET)$/;
        } elsif (($unicode = $utf8->utf8_to_unicode($char))
                 && ($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
            $title = "Hangul syllable U+" . (uc sprintf("%04x", $unicode));
            $result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>";
        } else {
            $result .= $util->guard_html($char);
        }
    }
    return $result;
}
1833
+
1834
sub romanize_char_at_position_incl_multi {
    # Romanizes the character at chart position $i, preferring an entry in the
    # language-specific mapping table, then the general mapping table, and
    # finally falling back to romanize_char_at_position.
    # Returns "" if there is no character at position $i.
    local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

    my $char = $chart_ht{ORIG_CHAR}->{$i};
    return "" unless defined($char);

    # Plain rvalue lookups only: keys() on a missing entry would autovivify
    # an empty hash in the mapping tables for every character looked up.
    my $lang_key = defined($lang_code) ? $lang_code : "";
    my $lang_spec_table = $ht{UTF_CHAR_MAPPING_LANG_SPEC}
                        ? $ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_key}
                        : undef;
    foreach my $table ($lang_spec_table, $ht{UTF_CHAR_MAPPING}) {
        next unless $table && $table->{$char};
        # sorted, so that the pick is independent of hash ordering
        my @mappings = sort keys %{$table->{$char}};
        return $mappings[0] if @mappings;
    }
    return $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht);
}
1845
+
1846
sub romanize_char_at_position {
    # Romanizes the single character at chart position $i.
    # ASCII is returned unchanged; otherwise an entry in
    # $ht{UTF_TO_CHAR_ROMANIZATION} is used if present, else the romanization
    # is derived from the character name via romanize_charname.
    # Derived romanizations that look implausible are tallied in
    # $ht{SUSPICIOUS_ROMANIZATION}.
    local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

    my $char = $chart_ht{ORIG_CHAR}->{$i};
    return "" unless defined($char);
    return $char if $char =~ /^[\x00-\x7F]$/; # ASCII

    my $romanization = $ht{UTF_TO_CHAR_ROMANIZATION}->{$char};
    return $romanization if $romanization;

    my $char_name = $chart_ht{CHAR_NAME}->{$i};
    $romanization = $this->romanize_charname($char_name, $lang_code, $output_style, *ht, $char);

    my $plausible_p =
        (length($romanization) < 4)
        || ($romanization =~ /\s/)
        || ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,3}[aeiou]-$/) # Khmer ngo-/nyo-/pho- OK
        || ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,2}[aeiougw][aeiou]{1,2}$/) # Canadian, Ethiopic syllable OK
        || ($romanization =~ /^(allah|bbux|nyaa|nnya|quuv|rrep|shch|shur|syrx)$/i) # Arabic; Yi; Ethiopic syllable nyaa; Cyrillic letter shcha
        || (($char_name =~ /^(YI SYLLABLE|VAI SYLLABLE|ETHIOPIC SYLLABLE|CANADIAN SYLLABICS|CANADIAN SYLLABICS CARRIER)\s+(\S+)$/) && (length($romanization) <= 5));
    unless ($plausible_p) {
        my $previous_count = $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization} || 0;
        $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization} = $previous_count + 1;
    }
    return $romanization;
}
1867
+
1868
sub romanize_charname {
    # Derives a romanization from a Unicode character name, e.g.
    #   "ARABIC LETTER BEH" -> "b", "CYRILLIC CAPITAL LETTER SHCHA" -> "SHCH".
    # input:  $char_name (Unicode name), $lang_code and $output_style (used
    #         as cache keys), *ht (holds the ROMANIZE_CHARNAME cache),
    #         $char (the utf8 character itself; consulted for Thai)
    # output: romanization string (upper case iff the name contains CAPITAL)
    # Results are cached in $ht{ROMANIZE_CHARNAME} under the ORIGINAL
    # character name, i.e. the same key that is used for the lookup.
    local($this, $char_name, $lang_code, $output_style, *ht, $char) = @_;

    $char_name = "" unless defined($char_name);
    my $cached_result = $ht{ROMANIZE_CHARNAME}->{$char_name}->{$lang_code}->{$output_style};
    # print STDERR "(C) romanize_charname($char_name): $cached_result\n" if $cached_result && ($char_name =~ /middle/i);
    return $cached_result if defined($cached_result);
    my $orig_char_name = $char_name;

    # qualifiers trailing the actual letter name; applied before and after prefix stripping
    my $trailing_qualifier_re = qr/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$/;

    # strip script and character-class prefix
    $char_name =~ s/^.* LETTER\s+([A-Z]+)-\d+$/$1/; # HENTAIGANA LETTER A-3
    $char_name =~ s/^.* LETTER\s+//;
    $char_name =~ s/^.* SYLLABLE\s+B\d\d\d\s+//; # Linear B syllables
    $char_name =~ s/^.* SYLLABLE\s+//;
    $char_name =~ s/^.* SYLLABICS\s+//;
    $char_name =~ s/^.* LIGATURE\s+//;
    $char_name =~ s/^.* VOWEL SIGN\s+//;
    $char_name =~ s/^.* CONSONANT SIGN\s+//;
    $char_name =~ s/^.* CONSONANT\s+//;
    $char_name =~ s/^.* VOWEL\s+//;
    $char_name =~ s/ WITH .*$//;
    $char_name =~ s/ WITHOUT .*$//;
    $char_name =~ s/$trailing_qualifier_re//;
    $char_name =~ s/^([A-Z]+)\d+$/$1/; # Linear B syllables etc.
    # strip leading modifiers; repeated because a substitution may expose another trailing modifier
    foreach $_ ((1 .. 3)) {
        $char_name =~ s/^.*\b(?:ABKHASIAN|ACADEMY|AFRICAN|AIVILIK|AITON|AKHMIMIC|ALEUT|ALI GALI|ALPAPRAANA|ALTERNATE|ALTERNATIVE|AMBA|ARABIC|ARCHAIC|ASPIRATED|ATHAPASCAN|BASELINE|BLACKLETTER|BARRED|BASHKIR|BERBER|BHATTIPROLU|BIBLE-CREE|BIG|BINOCULAR|BLACKFOOT|BLENDED|BOTTOM|BROAD|BROKEN|CANDRA|CAPITAL|CARRIER|CHILLU|CLOSE|CLOSED|COPTIC|CROSSED|CRYPTOGRAMMIC|CURLED|CURLY|CYRILLIC|DANTAJA|DENTAL|DIALECT-P|DIAERESIZED|DOTLESS|DOUBLE|DOUBLE-STRUCK|EASTERN PWO KAREN|EGYPTOLOGICAL|FARSI|FINAL|FLATTENED|GLOTTAL|GREAT|GREEK|HALF|HIGH|INITIAL|INSULAR|INVERTED|IOTIFIED|JONA|KANTAJA|KASHMIRI|KHAKASSIAN|KHAMTI|KHANDA|KINNA|KIRGHIZ|KOMI|L-SHAPED|LATINATE|LITTLE|LONG|LONG-LEGGED|LOOPED|LOW|MAHAAPRAANA|MALAYALAM|MANCHU|MANDAILING|MATHEMATICAL|MEDIAL|MIDDLE-WELSH|MON|MONOCULAR|MOOSE-CREE|MULTIOCULAR|MUURDHAJA|N-CREE|NARROW|NASKAPI|NDOLE|NEUTRAL|NIKOLSBURG|NORTHERN|NUBIAN|NUNAVIK|NUNAVUT|OJIBWAY|OLD|OPEN|ORKHON|OVERLONG|PALI|PERSIAN|PHARYNGEAL|PRISHTHAMATRA|R-CREE|REDUPLICATION|REVERSED|ROMANIAN|ROUND|ROUNDED|RUDIMENTA|RUMAI PALAUNG|SANSKRIT|SANYAKA|SARA|SAYISI|SCRIPT|SEBATBEIT|SEMISOFT|SGAW KAREN|SHAN|SHARP|SHWE PALAUNG|SHORT|SIBE|SIDEWAYS|SIMALUNGUN|SMALL|SOGDIAN|SOFT|SOUTH-SLAVEY|SOUTHERN|SPIDERY|STIRRUP|STRAIGHT|STRETCHED|SUBSCRIPT|SWASH|TAI LAING|TAILED|TAILLESS|TAALUJA|TH-CREE|TALL|THREE-LEGGED|TURNED|TODO|TOP|TROKUTASTI|TUAREG|UKRAINIAN|UNBLENDED|VISIGOTHIC|VOCALIC|VOICED|VOICELESS|VOLAPUK|WAVY|WESTERN PWO KAREN|WEST-CREE|WESTERN|WIDE|WOODS-CREE|Y-CREE|YENISEI|YIDDISH)\s+//;
    }
    $char_name =~ s/$trailing_qualifier_re//;
    if ($char_name =~ /THAI CHARACTER/) {
        $char_name =~ s/^THAI CHARACTER\s+//;
        if ($char =~ /^\xE0\xB8[\x81-\xAE]/) {
            # Thai consonants
            $char_name =~ s/^([^AEIOU]*).*/$1/i;
        } elsif ($char_name =~ /^SARA [AEIOU]/) {
            # Thai vowels
            $char_name =~ s/^SARA\s+//;
        } else {
            $char_name = $char;
        }
    }
    if ($orig_char_name =~ /(HIRAGANA LETTER|KATAKANA LETTER|SYLLABLE|LIGATURE)/) {
        $char_name = lc $char_name;
    } elsif ($char_name =~ /\b(ANUSVARA|ANUSVARAYA|NIKAHIT|SIGN BINDI|TIPPI)\b/) {
        $char_name = "+m";
    } elsif ($char_name =~ /\bSCHWA\b/) {
        $char_name = "e";
    } elsif ($char_name =~ /\bIOTA\b/) {
        $char_name = "i";
    } elsif ($char_name =~ /\s/) {
        # multi-word remainder: keep as is
    } elsif ($orig_char_name =~ /KHMER LETTER/) {
        $char_name .= "-";
    } elsif ($orig_char_name =~ /CHEROKEE LETTER/) {
        # use whole letter as is
    } elsif ($orig_char_name =~ /KHMER INDEPENDENT VOWEL/) {
        $char_name =~ s/q//;
    } elsif ($orig_char_name =~ /LETTER/) {
        # reduce a letter name to its consonant onset or its vowel
        $char_name =~ s/^[AEIOU]+([^AEIOU]+)$/$1/i;
        $char_name =~ s/^([^-AEIOUY]+)[AEIOU].*/$1/i;
        $char_name =~ s/^(Y)[AEIOU].*/$1/i if $orig_char_name =~ /\b(?:BENGALI|DEVANAGARI|GURMUKHI|GUJARATI|KANNADA|MALAYALAM|MODI|MYANMAR|ORIYA|TAMIL|TELUGU|TIBETAN)\b.*\bLETTER YA\b/;
        $char_name =~ s/^(Y[AEIOU]+)[^AEIOU].*$/$1/i;
        $char_name =~ s/^([AEIOU]+)[^AEIOU]+[AEIOU].*/$1/i;
    }

    my $result = ($orig_char_name =~ /\bCAPITAL\b/) ? (uc $char_name) : (lc $char_name);
    # print STDERR "(R) romanize_charname($orig_char_name): $result\n" if $orig_char_name =~ /middle/i;
    $ht{ROMANIZE_CHARNAME}->{$orig_char_name}->{$lang_code}->{$output_style} = $result;
    return $result;
}
1933
+
1934
sub assemble_numbers_in_chart {
    # For every span registered in $chart_ht{COMPLEX_NUMERIC_START_END}
    # (start position => end position), collects the romanized numeric values
    # of the individual characters, combines them with assemble_number and
    # adds the result to the chart as a "complex-number" node.
    local($this, *chart_ht, $line_number) = @_;

    # NOTE: $start and $i are deliberately left as package variables, as in
    # the rest of this file; callees are not visible here, so their scoping
    # is not changed.
    foreach $start (sort { $a <=> $b } keys %{$chart_ht{COMPLEX_NUMERIC_START_END}}) {
        my $end = $chart_ht{COMPLEX_NUMERIC_START_END}->{$start};
        my @numbers = ();
        foreach $i (($start .. ($end-1))) {
            my $orig_char = $chart_ht{ORIG_CHAR}->{$i};
            # pass the chart itself (was *chart_id, an unrelated glob)
            my $node_id = $this->get_node_for_span_with_slot($i, $i+1, "numeric-value", *chart_ht);
            if (defined($node_id)) {
                my $number = $chart_ht{NODE_ROMAN}->{$node_id};
                if (defined($number)) {
                    push(@numbers, $number);
                } elsif ($orig_char =~ /^[.,]$/) { # decimal point, comma separator
                    push(@numbers, $orig_char);
                } else {
                    print STDERR "Found no romanization for node_id $node_id ($i-" . ($i+1) . ") in assemble_numbers_in_chart\n" if $verbosePM;
                }
            } else {
                print STDERR "Found no node_id for span $i-" . ($i+1) . " in assemble_numbers_in_chart\n" if $verbosePM;
            }
        }
        # the elements are joined by middle dot U+00B7, the separator expected by assemble_number
        my $complex_number = $this->assemble_number(join("\xC2\xB7", @numbers), $line_number);
        # print STDERR "assemble_numbers_in_chart l.$line_number $start-$end $complex_number (@numbers)\n";
        $this->add_node($complex_number, $start, $end, *chart_ht, "", "complex-number");
    }
}
1961
+
1962
sub assemble_number {
    # Combines a sequence of number elements into as few numbers as possible.
    # input:  $s: number elements separated by middle dot (U+00B7),
    #         $line_number (only used for debugging)
    # output: the remaining number(s), again separated by middle dot
    # e.g. 10 9 100 7 10 8 = 1978
    local($this, $s, $line_number) = @_;

    my $middot = "\xC2\xB7";
    my @tokens = split(/$middot/, $s); # middle dot U+00B7

    # assemble single digit numbers, e.g. 1 7 5 -> 175
    my $i = 0;
    while ($i < $#tokens) {
        if ($tokens[$i] =~ /^\d$/) {
            my $j = $i+1;
            while (($j <= $#tokens) && ($tokens[$j] =~ /^[0-9.,]$/)) {
                $j++;
            }
            $j--;
            if ($j>$i) {
                my $new_token = join("", @tokens[$i .. $j]);
                $new_token =~ s/,//g; # drop thousands separators
                splice(@tokens, $i, $j-$i+1, $new_token);
            }
        }
        $i++;
    }

    # combine multiplicatively/additively around powers of ten, smallest power first
    foreach my $power ((10, 100, 1000, 10000, 100000, 1000000, 100000000, 1000000000, 1000000000000)) {
        # pattern for a multiple of $power with a single leading digit, e.g. [1-9]00
        my $gen_pattern = $power;
        $gen_pattern =~ s/^1/\[1-9\]/;
        for (my $i=0; $i <= $#tokens; $i++) {
            if ($tokens[$i] == $power) {
                if (($i > 0) && ($tokens[($i-1)] < $power)) {
                    # e.g. 7 10 -> 70
                    splice(@tokens, $i-1, 2, ($tokens[($i-1)] * $tokens[$i]));
                    $i--;
                    if (($i < $#tokens) && ($tokens[($i+1)] < $power)) {
                        # e.g. 70 8 -> 78
                        splice(@tokens, $i, 2, ($tokens[$i] + $tokens[($i+1)]));
                        $i--;
                    }
                }
            }
            # 400 30 (e.g. Egyptian)
            # $i may have dropped to -1 above; without the $i >= 0 guard
            # $tokens[-1] would address the LAST token and the loop could
            # spin forever (e.g. on 0 10 0 20).
            if (($i >= 0)
                && ($tokens[$i] =~ /^$gen_pattern$/) && ($i < $#tokens) && ($tokens[($i+1)] < $power)) {
                splice(@tokens, $i, 2, ($tokens[$i] + $tokens[($i+1)]));
                $i--;
            }
        }
        last if $#tokens == 0;
    }
    my $result = join($middot, @tokens);
    if ($verbosePM) {
        my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-number-log.txt";
        $util->append_to_file($logfile, "$s -> $result\n") if -r $logfile;
    }
    return $result;
}
2018
+
2019
+ 1;
2020
+
uroman/lib/NLP/UTF8.pm ADDED
@@ -0,0 +1,1404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # UTF8 #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::UTF8;
8
+
9
+ use NLP::utilities;
10
+ $util = NLP::utilities;
11
+
12
+ %empty_ht = ();
13
+
14
sub new {
    # Constructor: returns an empty hash-based object blessed into the
    # class of $caller (which may be a class name or an existing object).
    local($caller) = @_;

    return bless({}, ref($caller) || $caller);
}
22
+
23
sub unicode_string2string {
    # input: string that might contain unicode sequences such as "U+0627" or "\u0627"
    # output: string in pure utf-8
    # Every "U+" or "\u" followed by four hex digits is replaced by the utf-8
    # encoding of that code point; everything else, including newlines, is
    # kept as is.
    local($caller,$s) = @_;

    return $s unless defined($s);
    # a global substitution does not involve ".", so multi-line strings and
    # trailing newlines are handled correctly
    $s =~ s/(?:U\+|\\u)([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])/$caller->unicode_hex_string2string($1)/ge;
    return $s;
}
43
+
44
sub unicode_hex_string2string {
    # input: "0627" (interpreted as hex code)
    # output: utf-8 string for Arabic letter alef; "" for undefined input
    local($caller,$unicode) = @_;

    return (defined($unicode)) ? $caller->unicode2string(hex($unicode)) : "";
}
52
+
53
sub unicode2string {
    # input: non-neg integer, e.g. 0x627
    # output: utf-8 string for Arabic letter alef
    #         ("" for undefined, negative or too large input)
    local($caller,$d) = @_;

    return "" unless defined($d) && $d >= 0;
    return sprintf("%c",$d) if $d <= 0x7F;

    # Sequence forms for 2 to 6 bytes: [marker bits of the lead byte,
    # largest value the lead byte can carry besides the marker].
    my @forms = ([0xC0, 0x1F], [0xE0, 0xF], [0xF0, 0x7], [0xF8, 0x3], [0xFC, 0x1]);
    my @continuation_bytes = ();
    foreach my $form (@forms) {
        my ($lead_marker, $max_lead_value) = @{$form};
        # peel off the lowest 6 bits as the next continuation byte (10xxxxxx)
        unshift(@continuation_bytes, ($d & 0x3F) | 0x80);
        $d >>= 6;
        if ($d <= $max_lead_value) {
            return join("", map { sprintf("%c", $_) } (($d | $lead_marker), @continuation_bytes));
        }
    }
    return ""; # bad input
}
81
+
82
sub html2utf8 {
    # input: string that might contain decimal HTML character references such as "&#233;"
    # output: string in which the references for code points 160-255,
    #         1500-1699 and 19968-40879 are replaced by utf-8 characters;
    #         all other references are left untouched
    local($caller, $string) = @_;

    return $string unless defined($string) && ($string =~ /\&\#\d{3,5};/);

    my $s = $string;
    # Each reference is judged on its own, so a reference outside the
    # supported ranges no longer keeps the ones before it from being
    # converted, and references on any line of a multi-line string are found.
    $s =~ s{\&\#(\d+);}{
        my $code = $1;
        (   (($code >= 160) && ($code <= 255))
         || (($code >= 1500) && ($code <= 1699))
         || (($code >= 19968) && ($code <= 40879)))
            ? $caller->unicode2string($code)
            : ("\&\#" . $code . ";");
    }ge;
    return $s;
}
102
+
103
sub xhtml2utf8 {
    # input: string that might contain hexadecimal character references such as "&#x627;"
    # output: string in which all such references (2 to 5 hex digits) are
    #         replaced by the corresponding utf-8 characters
    local($caller, $string) = @_;

    return $string unless defined($string) && ($string =~ /\&\#x[0-9a-fA-F]{2,5};/);

    my $s = $string;
    # A global substitution finds references on every line of a multi-line
    # string. It is repeated until nothing is left to replace, as before,
    # so a reference that only emerges from a replacement is decoded as well.
    1 while ($s =~ s/\&\#x([0-9a-fA-F]{2,5});/$caller->unicode_hex_string2string($1)/ge);
    return $s;
}
119
+
120
sub utf8_marker {
    # output: the three bytes EF BB BF (utf-8 byte order mark) followed by a newline
    return "\xEF\xBB\xBF\n";
}
123
+
124
sub enforcer {
    # input: string that might not conform to utf-8
    # output: string in pure utf-8, with a few "smart replacements" and possibly "?"
    # Well-formed 1 to 5 byte sequences are copied. Any other byte is
    # replaced by "?" if $no_repair is set; otherwise a few Windows-1252
    # punctuation bytes are mapped to plain equivalents, bytes C0-FF are
    # re-encoded as the Latin-1 letters they stand for, and the rest
    # becomes "?".
    local($caller,$s,$no_repair) = @_;

    return $s if $s =~ /^[\x00-\x7F]*$/;

    $no_repair = 0 unless defined($no_repair);
    my $orig = $s;
    my $result = "";

    # The input is consumed from the front. None of the patterns relies on
    # "." stopping at a newline, so multi-line input is processed to the end.
    while ($s ne "") {
        # run of ASCII
        if ($s =~ s/^([\x00-\x7F]+)//) {
            $result .= $1;
            next;
        }
        # well-formed multi-byte sequence (2 to 5 bytes)
        if ($s =~ s/^([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])//) {
            $result .= $1;
            next;
        }
        # stray byte
        last unless $s =~ s/^(.)//s;
        my $c = $1;
        if ($no_repair) { $result .= "?"; }
        elsif ($c =~ /\x85/) { $result .= "..."; }
        elsif ($c =~ /\x91/) { $result .= "'"; }
        elsif ($c =~ /\x92/) { $result .= "'"; }
        elsif ($c =~ /\x93/) { $result .= $caller->unicode2string(0x201C); }
        elsif ($c =~ /\x94/) { $result .= $caller->unicode2string(0x201D); }
        elsif ($c =~ /[\xC0-\xFF]/) {
            # Latin-1 letter: U+00C0..U+00FF is C3 80..C3 BF in utf-8
            my $c2 = $c;
            $c2 =~ tr/[\xC0-\xFF]/[\x80-\xBF]/;
            $result .= "\xC3$c2";
        } else {
            $result .= "?";
        }
    }
    $result .= "\n" if ($orig =~ /\n$/) && ! ($result =~ /\n$/);
    return $result;
}
181
+
182
sub split_into_utf8_characters {
    # input: utf8 string
    # output: list of sub-strings, each representing a utf8 character
    #
    # $group_control selects optional behavior by keyword (several keywords
    # may be combined in one string):
    #   "ASCII numbers", "ASCII spaces", "ASCII punct", "ASCII chars"
    #                      group the respective ASCII material into one element
    #                      (ASCII spaces are consumed without producing an element)
    #   "ASCII all"        all four of the above
    #   "XML chars"        turn character references (&#1575; &#x627; &eacute;)
    #                      into the utf8 character; named entities are looked up
    #                      in $ht{HTML_ENTITY_NAME_TO_DECUNICODE}
    #   "XML tags"         keep a complete XML tag as one element
    #   "return only chars"  return just the list of characters; otherwise the
    #                      list is preceded by two elements: the bytes skipped
    #                      as non-utf8 and the end-of-token string (one "0"/"1"
    #                      per element; "1" if followed by whitespace or the end)
    #   "return trailing whitespaces"  also return trailing spaces/tabs and newline
    #
    # NOTE: the working variables are package variables, as before.
    local($caller,$string,$group_control, *ht) = @_;

    @characters = ();
    $end_of_token_p_string = "";
    $skipped_bytes = "";
    $group_control = "" unless defined($group_control);
    $group_ascii_numbers = ($group_control =~ /ASCII numbers/);
    $group_ascii_spaces = ($group_control =~ /ASCII spaces/);
    $group_ascii_punct = ($group_control =~ /ASCII punct/);
    $group_ascii_chars = ($group_control =~ /ASCII chars/);
    $group_xml_chars = ($group_control =~ /XML chars/);
    $group_xml_tags = ($group_control =~ /XML tags/);
    $return_only_chars = ($group_control =~ /return only chars/);
    $return_trailing_whitespaces = ($group_control =~ /return trailing whitespaces/);
    if ($group_control =~ /ASCII all/) {
        $group_ascii_numbers = 1;
        $group_ascii_spaces = 1;
        $group_ascii_chars = 1;
        $group_ascii_punct = 1;
    }
    if ($group_control =~ /(XML chars and tags|XML tags and chars)/) {
        $group_xml_chars = 1;
        $group_xml_tags = 1;
    }
    $orig_string = $string;
    $string .= " "; # sentinel space; removed again after the loop
    while ($string =~ /\S/) {
        # one-character UTF-8 = ASCII
        if ($string =~ /^[\x00-\x7F]/) {
            if ($group_xml_chars
                && (($dec_unicode, $rest) = ($string =~ /^&#(\d+);(.*)$/s))
                && ($utf8_char = $caller->unicode2string($dec_unicode))) {
                push(@characters, $utf8_char);
                $string = $rest;
            } elsif ($group_xml_chars
                && (($hex_unicode, $rest) = ($string =~ /^&#x([0-9a-f]{1,6});(.*)$/is))
                && ($utf8_char = $caller->unicode_hex_string2string($hex_unicode))) {
                push(@characters, $utf8_char);
                $string = $rest;
            } elsif ($group_xml_chars
                && (($html_entity_name, $rest) = ($string =~ /^&([a-z]{1,6});(.*)$/is))
                && ($dec_unicode = $ht{HTML_ENTITY_NAME_TO_DECUNICODE}->{$html_entity_name})
                && ($utf8_char = $caller->unicode2string($dec_unicode))
                ) {
                push(@characters, $utf8_char);
                $string = $rest;
            } elsif ($group_xml_tags
                # The attribute group must be non-capturing: with a capturing
                # group, $rest would receive the last attribute (or undef)
                # instead of the remainder of the string.
                && (($tag, $rest) = ($string =~ /^(<\/?[a-zA-Z][-_:a-zA-Z0-9]*(?:\s+[a-zA-Z][-_:a-zA-Z0-9]*=\"[^"]*\")*\s*\/?>)(.*)$/s))) {
                push(@characters, $tag);
                $string = $rest;
            } elsif ($group_ascii_numbers && ($string =~ /^[12]\d\d\d\.[01]?\d.[0-3]?\d([^0-9].*)?$/)) {
                # date such as 2023.12.25
                ($date) = ($string =~ /^(\d\d\d\d\.\d?\d.\d?\d)([^0-9].*)?$/);
                push(@characters,$date);
                $string = substr($string, length($date));
            } elsif ($group_ascii_numbers && ($string =~ /^\d/)) {
                # number, possibly with thousands separators and decimals
                ($number) = ($string =~ /^(\d+(,\d\d\d)*(\.\d+)?)/);
                push(@characters,$number);
                $string = substr($string, length($number));
            } elsif ($group_ascii_spaces && ($string =~ /^(\s+)/)) {
                # whitespace is consumed, but does not become an element
                ($space) = ($string =~ /^(\s+)/);
                $string = substr($string, length($space));
            } elsif ($group_ascii_punct && (($punct_seq) = ($string =~ /^(-+|\.+|[:,%()"])/))) {
                push(@characters,$punct_seq);
                $string = substr($string, length($punct_seq));
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^(\$[A-Z]*|[A-Z]{1,3}\$)/))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            } elsif ($group_ascii_chars && (($abbrev) = ($string =~ /^((?:Jan|Feb|Febr|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|Mr|Mrs|Dr|a.m|p.m)\.)/))) {
                push(@characters,$abbrev);
                $string = substr($string, length($abbrev));
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^(second|minute|hour|day|week|month|year|inch|foot|yard|meter|kilometer|mile)-(?:long|old)/i))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)-/i))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            } elsif ($group_ascii_chars && (($word) = ($string =~ /^([a-zA-Z]+)(?:[ ,;%?|()"]|'s |' |\. |\d+[:hms][0-9 ])/))) {
                push(@characters,$word);
                $string = substr($string, length($word));
            } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x27\x2A-\x7E]+)/)) { # exclude ()
                ($ascii) = ($string =~ /^([\x21-\x27\x2A-\x7E]+)/); # ASCII black-characters
                push(@characters,$ascii);
                $string = substr($string, length($ascii));
            } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x7E]+)/)) {
                ($ascii) = ($string =~ /^([\x21-\x7E]+)/); # ASCII black-characters
                push(@characters,$ascii);
                $string = substr($string, length($ascii));
            } elsif ($group_ascii_chars && ($string =~ /^([\x00-\x7F]+)/)) {
                ($ascii) = ($string =~ /^([\x00-\x7F]+)/);
                push(@characters,$ascii);
                $string = substr($string, length($ascii));
            } else {
                # no grouping applies: single ASCII character
                push(@characters,substr($string, 0, 1));
                $string = substr($string, 1);
            }

        # two-character UTF-8
        } elsif ($string =~ /^[\xC0-\xDF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 2));
            $string = substr($string, 2);

        # three-character UTF-8
        } elsif ($string =~ /^[\xE0-\xEF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 3));
            $string = substr($string, 3);

        # four-character UTF-8
        } elsif ($string =~ /^[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 4));
            $string = substr($string, 4);

        # five-character UTF-8
        } elsif ($string =~ /^[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 5));
            $string = substr($string, 5);

        # six-character UTF-8
        } elsif ($string =~ /^[\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters,substr($string, 0, 6));
            $string = substr($string, 6);

        # not a UTF-8 character
        } else {
            $skipped_bytes .= substr($string, 0, 1);
            $string = substr($string, 1);
        }

        # one flag per element added in this round: "1" if whitespace (or the end) follows
        $end_of_token_p_string .= ($string =~ /^\S/) ? "0" : "1"
            if $#characters >= length($end_of_token_p_string);
    }
    $string =~ s/ $//; # remove previously added space, but keep original spaces
    if ($return_trailing_whitespaces) {
        while ($string =~ /^[ \t]/) {
            push(@characters,substr($string, 0, 1));
            $string = substr($string, 1);
        }
        push(@characters, "\n") if $orig_string =~ /\n$/;
    }
    return ($return_only_chars) ? @characters : ($skipped_bytes, $end_of_token_p_string, @characters);
}
325
+
326
# Find the longest run of consecutive characters shared by $s1 and $s2.
#
# Both strings are split with split_into_utf8_characters; if that reports any
# skipped bytes for either string, 0 is returned.
#
# $info_type selects what is returned:
#   "max-ratio1"  length of the best shared run divided by the character count of $s1 (0 if empty)
#   "max-ratio2"  same, relative to $s2
#   "substring"   the best shared run itself
#   otherwise     a ";"-separated "key=value" summary string
#
# All working variables are lexical, so a call no longer overwrites package
# globals such as $length1 or $info.
sub max_substring_info {
   my ($caller, $s1, $s2, $info_type) = @_;

   my ($skipped_bytes1, $end_of_token_p_string1, @char_list1)
      = $caller->split_into_utf8_characters($s1, "", *empty_ht);
   my ($skipped_bytes2, $end_of_token_p_string2, @char_list2)
      = $caller->split_into_utf8_characters($s2, "", *empty_ht);
   return 0 if $skipped_bytes1 || $skipped_bytes2;

   my $best_substring_start1 = 0;
   my $best_substring_start2 = 0;
   my $best_substring_length = 0;

   foreach my $start_pos2 (0 .. $#char_list2) {
      # No start position this late can beat the current best.
      last if $start_pos2 + $best_substring_length > $#char_list2;
      foreach my $start_pos1 (0 .. $#char_list1) {
         last if $start_pos1 + $best_substring_length > $#char_list1;
         my $matching_length = 0;
         while (($start_pos1 + $matching_length <= $#char_list1)
             && ($start_pos2 + $matching_length <= $#char_list2)
             && ($char_list1[$start_pos1+$matching_length] eq $char_list2[$start_pos2+$matching_length])) {
            $matching_length++;
         }
         if ($matching_length > $best_substring_length) {
            $best_substring_length = $matching_length;
            $best_substring_start1 = $start_pos1;
            $best_substring_start2 = $start_pos2;
         }
      }
   }

   my $length1 = $#char_list1 + 1;
   my $length2 = $#char_list2 + 1;
   if ($info_type =~ /^max-ratio1$/) {
      return ($length1 > 0) ? ($best_substring_length / $length1) : 0;
   } elsif ($info_type =~ /^max-ratio2$/) {
      return ($length2 > 0) ? ($best_substring_length / $length2) : 0;
   } elsif ($info_type =~ /^substring$/) {
      return join("", @char_list1[$best_substring_start1 .. $best_substring_start1+$best_substring_length-1]);
   } else {
      my $info = "s1=$s1;s2=$s2";
      $info .= ";best_substring_length=$best_substring_length";
      $info .= ";best_substring_start1=$best_substring_start1";
      $info .= ";best_substring_start2=$best_substring_start2";
      $info .= ";length1=$length1";
      $info .= ";length2=$length2";
      return $info;
   }
}
374
+
375
# Count how many leading characters $s1 and $s2 have in common.
#
# A "character" here is one byte plus any UTF-8 continuation bytes
# (\x80-\xBF) that follow it.
#
# The patterns use /s so that "." also matches a newline. Without it, a string
# with an embedded newline failed to match, both captures were undef, and
# the comparison wrongly counted one shared character and then stopped.
sub n_shared_chars_at_start {
   my ($caller, $s1, $s2) = @_;

   my $n = 0;
   while (($s1 ne "") && ($s2 ne "")) {
      my ($c1, $rest1) = ($s1 =~ /^(.[\x80-\xBF]*)(.*)$/s);
      my ($c2, $rest2) = ($s2 =~ /^(.[\x80-\xBF]*)(.*)$/s);
      last unless $c1 eq $c2;
      $n++;
      ($s1, $s2) = ($rest1, $rest2);
   }
   return $n;
}
392
+
393
# Return the byte length (1-6) announced by the UTF-8 lead byte found at
# $byte_offset in $string (offset 0 if not given).
# Returns 0 if there is no byte at that position or the byte is not a valid
# lead byte (continuation bytes \x80-\xBF, \xFE, \xFF).
# Only the lead byte is inspected; continuation bytes are not verified.
sub char_length {
   my ($caller, $string, $byte_offset) = @_;

   my $tail = $byte_offset ? substr($string, $byte_offset) : $string;
   return 0 unless defined($tail) && ($tail ne "");

   my $lead = ord($tail);
   return 1 if $lead <= 0x7F;
   return 0 if $lead <= 0xBF;   # continuation byte, not a lead byte
   return 2 if $lead <= 0xDF;
   return 3 if $lead <= 0xEF;
   return 4 if $lead <= 0xF7;
   return 5 if $lead <= 0xFB;
   return 6 if $lead <= 0xFD;
   return 0;
}
405
+
406
# Return the number of characters in the UTF-8 byte string $s, counted as
# the number of bytes that are not continuation bytes (\x80-\xBF).
sub length_in_utf8_chars {
   my ($caller, $s) = @_;

   return length($s) unless defined($s);   # same result length() gives for undef
   # tr with /c and an empty replacement only counts; it does not modify $s.
   my $n_non_continuation_bytes = ($s =~ tr/\x80-\xBF//c);
   return $n_non_continuation_bytes;
}
413
+
414
# Return how many bytes the next $char_length characters of $string occupy,
# starting at $byte_offset (default 0).
# Character widths come from char_length; as soon as it reports 0 for a
# position, $undef_return_value (default -1) is returned instead.
sub byte_length_of_n_chars {
   my ($caller, $char_length, $string, $byte_offset, $undef_return_value) = @_;

   $byte_offset = 0 unless defined($byte_offset);
   $undef_return_value = -1 unless defined($undef_return_value);

   my $n_bytes = 0;
   foreach my $nth_char (1 .. $char_length) {
      my $width = $caller->char_length($string, $byte_offset + $n_bytes);
      return $undef_return_value unless $width;
      $n_bytes += $width;
   }
   return $n_bytes;
}
428
+
429
# Replace everything in $string that is not tab, linefeed or printable ASCII.
#
# $replacement (default "HEX") selects the notation:
#   "HEX"      <HEX-...> built by utf8_to_hex
#   "Unicode"  <U...> using the upper-cased result of utf8_to_unicode
#   "\u"       \u followed by the upper-cased 4-digit hex of utf8_to_unicode
#   "U+4"      <U+...> using the upper-cased result of utf8_to_4hex_unicode
#   any other value: each byte \x80-\xFF is replaced by $replacement itself
#              (ASCII control characters are left alone in this mode).
#
# In the four keyword modes a unit is an ASCII control character, a UTF-8
# sequence, or a single stray non-ASCII byte. The last alternative covers a lead
# byte without its continuation bytes: previously such a byte stopped the
# scan and the rest of the string was returned unreplaced.
# The string is rewritten in one substitution pass instead of re-matching
# and copying the remaining tail for every replaced unit.
sub replace_non_ASCII_bytes {
   my ($caller, $string, $replacement) = @_;

   $replacement = "HEX" unless defined($replacement);

   unless ($replacement =~ /^(Unicode|U\+4|\\u|HEX)$/) {
      $string =~ s/[\x80-\xFF]/$replacement/g;
      return $string;
   }

   my $render;
   if ($replacement =~ /Unicode/) {
      $render = sub { "<U" . uc($caller->utf8_to_unicode($_[0])) . ">" };
   } elsif ($replacement =~ /\\u/) {
      $render = sub { "\\u" . uc(sprintf("%04x", $caller->utf8_to_unicode($_[0]))) };
   } elsif ($replacement =~ /U\+4/) {
      $render = sub { "<U+" . uc($caller->utf8_to_4hex_unicode($_[0])) . ">" };
   } else {
      $render = sub { "<HEX-" . $caller->utf8_to_hex($_[0]) . ">" };
   }

   my $unit = qr/[\x00-\x08\x0B-\x1F\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF8-\xFF][\x80-\xBF]+|[\x80-\xBF]|[\xC0-\xFF]/;

   $string = "" unless defined($string);
   # "$1" passes a copy, so regex matches inside the callee cannot disturb it.
   $string =~ s/($unit)/$render->("$1")/ge;
   return $string;
}
454
+
455
# True if $string consists only of tab, linefeed, printable ASCII and
# structurally well-formed 2-, 3- and 4-byte UTF-8 sequences.
# Other ASCII control characters make the string invalid.
sub valid_utf8_string_p {
   my ($caller, $string) = @_;

   return $string =~ /^(?:
        [\x09\x0A\x20-\x7E]
      | [\xC0-\xDF][\x80-\xBF]
      | [\xE0-\xEF][\x80-\xBF][\x80-\xBF]
      | [\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
   )*$/x;
}
460
+
461
# True if $string consists only of ASCII bytes (control characters included)
# and structurally well-formed 2-, 3- and 4-byte UTF-8 sequences.
sub valid_utf8_string_incl_ascii_control_p {
   my ($caller, $string) = @_;

   return $string =~ /^(?:
        [\x00-\x7F]
      | [\xC0-\xDF][\x80-\xBF]
      | [\xE0-\xEF][\x80-\xBF][\x80-\xBF]
      | [\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
   )*$/x;
}
466
+
467
# Return the upper-case, two-digit hex code of every byte of $s, concatenated
# (e.g. "\xE2\x82\xAC" gives "E282AC").
# Uses only lexical variables; the package global $hex, which other subs
# in this file also assign, is no longer overwritten.
sub utf8_to_hex {
   my ($caller, $s) = @_;

   return "" unless defined($s);
   return join("", map { sprintf("%02X", ord($_)) } split(//, $s));
}
476
+
477
# Convert a surface string of hex byte codes, each optionally prefixed by a
# literal backslash-x (e.g. \xE2\x80\xBA or E280BA), into the bytes they name.
# Decoding stops at the first position that is not such a two-digit code.
#
# The scan is anchored with \G instead of re-matching "(.*)$" on the tail,
# so a newline later in the string no longer prevents the leading codes
# from being decoded, and package globals $hex and $rest are left alone.
sub hex_to_utf8 {
   my ($caller, $s) = @_;

   my $utf8 = "";
   return $utf8 unless defined($s);
   while ($s =~ /\G(?:\\x)?([0-9A-Fa-f]{2})/gc) {
      $utf8 .= chr(hex($1));
   }
   return $utf8;
}
488
+
489
# Return the value computed by utf8_to_unicode for $s as lower-case hex,
# zero-padded to at least four digits.
sub utf8_to_4hex_unicode {
   my ($caller, $s) = @_;

   my ($code_point) = $caller->utf8_to_unicode($s);
   return sprintf("%4.4x", $code_point);
}
494
+
495
# Decode the UTF-8 bytes of a single character $s into its code point number.
#
# Each lead byte contributes its payload bits and each continuation byte
# shifts in six more bits. ASCII bytes now contribute their own value;
# previously they were skipped, so every ASCII character (including the
# control characters passed in by replace_non_ASCII_bytes) decoded to 0.
# Bytes \xFE and \xFF are ignored. Working variables are lexical, so the
# package globals $unicode and $c are no longer overwritten.
sub utf8_to_unicode {
   my ($caller, $s) = @_;

   my $unicode = 0;
   return $unicode unless defined($s);
   foreach my $byte (map { ord($_) } split(//, $s)) {
      if ($byte <= 0x7F) {
         $unicode = $unicode * 128 + $byte;
      } elsif ($byte <= 0xBF) {
         $unicode = $unicode * 64 + ($byte & 0x3F);
      } elsif ($byte <= 0xDF) {
         $unicode = $unicode * 32 + ($byte & 0x1F);
      } elsif ($byte <= 0xEF) {
         $unicode = $unicode * 16 + ($byte & 0x0F);
      } elsif ($byte <= 0xF7) {
         $unicode = $unicode * 8 + ($byte & 0x07);
      } elsif ($byte <= 0xFB) {
         $unicode = $unicode * 4 + ($byte & 0x03);
      } elsif ($byte <= 0xFD) {
         $unicode = $unicode * 2 + ($byte & 0x01);
      }
   }
   return $unicode;
}
517
+
518
# Return $string with every character outside printable ASCII (space to "~")
# replaced by <HEX-XX>, XX being its upper-case hex code.
# Done in a single substitution with lexical variables only, so the package
# globals $char and $hex are no longer overwritten.
sub charhex {
   my ($caller, $string) = @_;

   return "" unless defined($string);
   $string =~ s/([^ -~])/sprintf("<HEX-%02X>", ord($1))/ge;
   return $string;
}
535
+
536
# Convert a Windows-1252 byte string to UTF-8.
#
#   $norm_to_ascii_p             (default 1) map ellipsis, single quotes, dashes
#                                and small tilde (\x85 \x91 \x92 \x96 \x97 \x98)
#                                to ASCII look-alikes instead of UTF-8
#   $preserve_potential_utf8s_p  (default 1) copy byte runs that already form a
#                                2- to 5-byte UTF-8 sequence unchanged
#
# Pure-ASCII input is returned as is. Bytes \xA0-\xFF are encoded as the
# Latin-1 code point of the same value; bytes \x80-\x9F go through the
# table below; bytes with no mapping (\x81 \x8D \x8F \x90 \x9D) become "?".
#
# The string is scanned once with \G-anchored matches rather than being
# re-copied after every byte, and no package global is written.
sub windows1252_to_utf8 {
   my ($caller, $s, $norm_to_ascii_p, $preserve_potential_utf8s_p) = @_;

   return $s if $s =~ /^[\x00-\x7F]*$/; # all ASCII

   $norm_to_ascii_p = 1 unless defined($norm_to_ascii_p);
   $preserve_potential_utf8s_p = 1 unless defined($preserve_potential_utf8s_p);

   my %utf8_for = (
      "\x80" => "\xE2\x82\xAC",  # Euro sign
      "\x82" => "\xE2\x80\x9A",  # single low quotation mark
      "\x83" => "\xC6\x92",      # Latin small letter f with hook
      "\x84" => "\xE2\x80\x9E",  # double low quotation mark
      "\x85" => "\xE2\x80\xA6",  # horizontal ellipsis (three dots)
      "\x86" => "\xE2\x80\xA0",  # dagger
      "\x87" => "\xE2\x80\xA1",  # double dagger
      "\x88" => "\xCB\x86",      # circumflex
      "\x89" => "\xE2\x80\xB0",  # per mille sign
      "\x8A" => "\xC5\xA0",      # Latin capital letter S with caron
      "\x8B" => "\xE2\x80\xB9",  # single left-pointing angle quotation mark
      "\x8C" => "\xC5\x92",      # OE ligature
      "\x8E" => "\xC5\xBD",      # Latin capital letter Z with caron
      "\x91" => "\xE2\x80\x98",  # left single quotation mark
      "\x92" => "\xE2\x80\x99",  # right single quotation mark
      "\x93" => "\xE2\x80\x9C",  # left double quotation mark
      "\x94" => "\xE2\x80\x9D",  # right double quotation mark
      "\x95" => "\xE2\x80\xA2",  # bullet
      "\x96" => "\xE2\x80\x93",  # n dash
      "\x97" => "\xE2\x80\x94",  # m dash
      "\x98" => "\xCB\x9C",      # small tilde
      "\x99" => "\xE2\x84\xA2",  # trade mark sign
      "\x9A" => "\xC5\xA1",      # Latin small letter s with caron
      "\x9B" => "\xE2\x80\xBA",  # single right-pointing angle quotation mark
      "\x9C" => "\xC5\x93",      # oe ligature
      "\x9E" => "\xC5\xBE",      # Latin small letter z with caron
      "\x9F" => "\xC5\xB8",      # Latin capital letter Y with diaeresis
   );
   # ASCII stand-ins used only when $norm_to_ascii_p is set.
   my %ascii_for = (
      "\x85" => "...",
      "\x91" => "`",
      "\x92" => "'",
      "\x96" => "-",
      "\x97" => "-",
      "\x98" => "~",
   );

   my $result = "";
   while ((pos($s) || 0) < length($s)) {
      if ($s =~ /\G([\x00-\x7F]+)/gc) {
         $result .= $1;                                  # ASCII run
      } elsif ($preserve_potential_utf8s_p
            && ($s =~ /\G([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3}|[\xF8-\xFB][\x80-\xBF]{4})/gc)) {
         $result .= $1;                                  # already a UTF-8 sequence
      } else {
         $s =~ /\G(.)/gcs;                               # consume exactly one byte
         my $byte = $1;
         my $code = ord($byte);
         if (($code >= 0xA0) && ($code <= 0xBF)) {
            $result .= "\xC2$byte";
         } elsif (($code >= 0xC0) && ($code <= 0xFF)) {
            $result .= "\xC3" . chr($code - 0x40);
         } elsif ($norm_to_ascii_p && exists($ascii_for{$byte})) {
            $result .= $ascii_for{$byte};
         } elsif (exists($utf8_for{$byte})) {
            $result .= $utf8_for{$byte};
         } else {
            $result .= "?";
         }
      }
   }
   return $result;
}
629
+
630
# Delete control characters (except tab and linefeed), zero-width characters,
# byte order mark, directional marks, join marks, variation selectors and
# Arabic tatweel from the UTF-8 byte string $s.
sub delete_weird_stuff {
   my ($caller, $s) = @_;

   $s =~ s/(
        [\x00-\x08\x0B-\x1F\x7F]          # ASCII control characters except tab and linefeed
      | \xC2[\x80-\x9F]                   # C1 control characters
      | \xD9\x80                          # Arabic tatweel
      | \xE2\x80[\x8B-\x8F]               # zero-width characters and directional marks
      | \xEF\xB8[\x80-\x8F]               # variation selectors
      | \xEF\xBB\xBF                      # byte order mark
      | \xF3\xA0[\x84-\x87][\x80-\xBF]    # variation selectors supplement
   )//gx;
   return $s;
}
638
+
639
# Return the number of characters in the UTF-8 byte string $s, i.e. its
# length once all continuation bytes (\x80-\xBF) are removed.
# The working copy is lexical; the package global $s2, which other subs in
# this file use as a parameter name, is no longer overwritten.
sub number_of_utf8_character {
   my ($caller, $s) = @_;

   (my $without_continuation_bytes = $s) =~ s/[\x80-\xBF]//g;
   return length($without_continuation_bytes);
}
646
+
647
# Return a regular-expression source string (a bare alternation, without
# enclosing group) that matches one capital letter as UTF-8 bytes:
# A-Z and other Latin-based capital letters with accents, umlauts and other
# decorations from Latin-1 (\xC3..) and Latin Extended-A (\xC4.. and \xC5..).
#
# Fixes relative to the previous list: the stray "4" after \x96 (a typo that
# made "\xC4" followed by the digit 4 match) is removed, and the omitted
# capitals \xC4\x92 (E with macron), \xC5\x94 (R with acute) and \xC5\xAE
# (U with ring above) are added.
sub cap_letter_reg_exp {
   return "[A-Z]"
        . "|\xC3[\x80-\x96\x98-\x9E]"
        . "|\xC4[\x80\x82\x84\x86\x88\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB9\xBB\xBD\xBF]"
        . "|\xC5[\x81\x83\x85\x87\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB8\xB9\xBB\xBD]";
}
651
+
652
# Make the accented lower-case letters in the regular-expression source $s
# match either case: each such letter (UTF-8 bytes) is rewritten as its lead
# byte followed by a two-byte character class holding the upper-case and
# lower-case second byte, e.g. \xC3\xA9 becomes \xC3[\x89\xA9].
# Covers Latin-1 \xC3\xA0-\xC3\xBE (except \xC3\xB7, the division sign) and
# \xC5\x91, \xC5\xA1, \xC5\xB1.
sub regex_extended_case_expansion {
   my ($caller, $s) = @_;

   # Latin-1: the upper-case second byte is 0x20 below the lower-case one.
   $s =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3[" . chr(ord($1) - 0x20) . $1 . "]"/ge;
   # Latin Extended-A: the upper-case second byte directly precedes the lower-case one.
   $s =~ s/\xC5([\x91\xA1\xB1])/"\xC5[" . chr(ord($1) - 1) . $1 . "]"/ge;

   return $s;
}
695
+
696
# Lower-case $s (UTF-8 bytes): ASCII A-Z plus the capital letters of
# Latin-1, Latin Extended-A, Greek, Cyrillic and Fullwidth Latin.
#
# All mappings are written as byte escapes and byte arithmetic rather than
# literal non-ASCII characters, so they do not depend on the encoding in
# which this file is stored. This restores two mappings whose literal
# characters had degraded to plain ASCII: the fullwidth letters
# (\xEF\xBC\xA1-\xBA, previously a no-op) and the IJ ligature (\xC4\xB2).
# As before, \xC4\xB0 (I with dot above) maps to \xC4\xB1 (dotless i).
sub extended_lower_case {
   my ($caller, $s) = @_;

   $s =~ tr/A-Z/a-z/;

   # Latin-1: capitals \xC3\x80-\x9E except \x97 (multiplication sign);
   # the small letter is 0x20 higher.
   if ($s =~ /\xC3[\x80-\x9F]/) {
      $s =~ s/\xC3([\x80-\x96\x98-\x9E])/"\xC3" . chr(ord($1) + 0x20)/ge;
   }
   # Latin Extended-A: capital and small letters alternate.
   if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
      # \xC4\x80-\xB7: capital on even byte, small letter on the next odd byte
      $s =~ s/\xC4([\x80-\xB7])/"\xC4" . chr(ord($1) | 1)/ge;
      # \xC4\xB9-\xBE: capital on odd byte, small letter on the next even byte
      $s =~ s/\xC4([\xB9\xBB\xBD])/"\xC4" . chr(ord($1) + 1)/ge;
      # L with middle dot: the small letter falls under the next lead byte
      $s =~ s/\xC4\xBF/\xC5\x80/g;
      # \xC5\x81-\x88 and \xC5\xB9-\xBE: capital on odd byte
      $s =~ s/\xC5([\x81\x83\x85\x87\xB9\xBB\xBD])/"\xC5" . chr(ord($1) + 1)/ge;
      # \xC5\x8A-\xB7: capital on even byte
      $s =~ s/\xC5([\x8A-\xB7])/"\xC5" . chr(ord($1) | 1)/ge;
   }
   # Greek letters
   if ($s =~ /\xCE[\x86-\xAB]/) {
      # Alpha .. Omicron
      $s =~ s/\xCE([\x91-\x9F])/"\xCE" . chr(ord($1) + 0x20)/ge;
      # Pi .. Omega, Iota and Upsilon with dialytika (\xA2 is unassigned)
      $s =~ s/\xCE([\xA0\xA1\xA3-\xAB])/"\xCF" . chr(ord($1) - 0x20)/ge;
      # letters with tonos
      $s =~ s/\xCE\x86/\xCE\xAC/g;
      $s =~ s/\xCE([\x88-\x8A])/"\xCE" . chr(ord($1) + 0x25)/ge;
      $s =~ s/\xCE\x8C/\xCF\x8C/g;
      $s =~ s/\xCE([\x8E\x8F])/"\xCF" . chr(ord($1) - 1)/ge;
   }
   # Cyrillic letters
   if ($s =~ /\xD0[\x80-\xAF]/) {
      $s =~ s/\xD0([\x90-\x9F])/"\xD0" . chr(ord($1) + 0x20)/ge;
      $s =~ s/\xD0([\xA0-\xAF])/"\xD1" . chr(ord($1) - 0x20)/ge;
      $s =~ s/\xD0([\x80-\x8F])/"\xD1" . chr(ord($1) + 0x10)/ge;
   }
   # Fullwidth A-Z
   if ($s =~ /\xEF\xBC[\xA1-\xBA]/) {
      $s =~ s/\xEF\xBC([\xA1-\xBA])/"\xEF\xBD" . chr(ord($1) - 0x20)/ge;
   }

   return $s;
}
918
+
919
# Upper-case $s (UTF-8 bytes): ASCII a-z, the Latin-1 small letters
# \xC3\xA0-\xC3\xBE (except \xC3\xB7, the division sign) and the three
# Latin Extended-A letters \xC5\x91, \xC5\xA1 and \xC5\xB1.
sub extended_upper_case {
   my ($caller, $s) = @_;

   $s =~ tr/a-z/A-Z/;

   if ($s =~ /[\xC3-\xC5][\x80-\xBF]/) {
      # Latin-1: the capital's second byte is 0x20 below the small letter's.
      $s =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3" . chr(ord($1) - 0x20)/ge;
      # Latin Extended-A: the capital directly precedes the small letter.
      $s =~ s/\xC5([\x91\xA1\xB1])/"\xC5" . chr(ord($1) - 1)/ge;
   }

   return $s;
}
963
+
964
# Return $s with its first character (ASCII, or a 2- or 3-byte UTF-8
# sequence) upper-cased via extended_upper_case; the rest is unchanged.
# If $s does not start with such a character, it is returned as is.
#
# The pattern uses /s so the remainder may contain newlines. Without it a
# trailing newline was silently dropped from the result, and a string with
# an embedded newline was returned without being capitalized at all.
sub extended_first_upper_case {
   my ($caller, $s) = @_;

   if ($s =~ /^([\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF])(.*)$/s) {
      my ($first_char, $rest) = ($1, $2);
      return $caller->extended_upper_case($first_char) . $rest;
   }
   return $s;
}
973
+
974
# Undo a double UTF-8 conversion of two-byte characters: a lead byte
# \xC2-\xC5 that was re-encoded as \xC3\x82-\xC3\x85 and is followed by a
# re-encoded continuation byte (\xC2 plus \x80-\xBF) is collapsed back to the
# original two bytes.
sub repair_doubly_converted_utf8_strings {
   my ($caller, $s) = @_;

   return $s unless $s =~ /\xC3[\x82-\x85]\xC2[\x80-\xBF]/;

   # Applied one after another, in this order, over the whole string.
   foreach my $rule (["\x82", "\xC2"], ["\x83", "\xC3"], ["\x84", "\xC4"], ["\x85", "\xC5"]) {
      my ($mangled_byte, $lead_byte) = @$rule;
      $s =~ s/\xC3$mangled_byte\xC2([\x80-\xBF])/$lead_byte$1/g;
   }
   return $s;
}
985
+
986
# Repair UTF-8 text that was damaged by running it (or Windows-1252 text)
# through a Latin1-to-UTF8 converter. Three passes, in this order:
#   1. re-encoded \xE2\x80.. punctuation (\xC3\xA2\xC2\x80\xC2..) is collapsed
#   2. \xC2 followed by \x80-\x9F is replaced by the UTF-8 form that
#      windows1252_to_utf8 gives for that byte (kept as is if it gives "?")
#   3. a fixed list of longer misconversion patterns is replaced
#
# Passes 1 and 2 are single global substitutions instead of loops that
# re-matched and copied the remaining tail for every hit, and all working
# variables are lexical, so package globals such as $pre and $post are no
# longer overwritten.
sub repair_misconverted_windows_to_utf8_strings {
   my ($caller, $s) = @_;

   # correcting conversions of UTF8 using Latin1-to-UTF converter
   $s =~ s/\xC3\xA2\xC2\x80\xC2([\x90-\xEF])/\xE2\x80$1/g;

   # correcting conversions of Windows1252-to-UTF8 using Latin1-to-UTF converter
   if ($s =~ /\xC2[\x80-\x9F]/) {
      $s =~ s{\xC2([\x80-\x9F])}{
         my $c_windows = $1;   # copy before the callee runs its own matches
         my $c_utf8 = $caller->windows1252_to_utf8($c_windows, 0);
         ($c_utf8 eq "?") ? ("\xC2" . $c_windows) : $c_utf8;
      }ge;
   }

   if ($s =~ /\xC3/) {
      $s =~ s/\xC3\xA2\xE2\x80\x9A\xC2\xAC/\xE2\x82\xAC/g; # x80 -> Euro sign
      # x81 codepoint undefined in Windows 1252
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xA1/\xE2\x80\x9A/g; # x82 -> single low-9 quotation mark
      $s =~ s/\xC3\x86\xE2\x80\x99/\xC6\x92/g;             # x83 -> Latin small letter f with hook
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xBE/\xE2\x80\x9E/g; # x84 -> double low-9 quotation mark
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA6/\xE2\x80\xA6/g; # x85 -> horizontal ellipsis
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA0/\xE2\x80\xA0/g; # x86 -> dagger
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA1/\xE2\x80\xA1/g; # x87 -> double dagger
      $s =~ s/\xC3\x8B\xE2\x80\xA0/\xCB\x86/g;             # x88 -> modifier letter circumflex accent
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB0/\xE2\x80\xB0/g; # x89 -> per mille sign
      $s =~ s/\xC3\x85\xC2\xA0/\xC5\xA0/g;                 # x8A -> Latin capital letter S with caron
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB9/\xE2\x80\xB9/g; # x8B -> single left-pointing angle quotation mark
      $s =~ s/\xC3\x85\xE2\x80\x99/\xC5\x92/g;             # x8C -> Latin capital ligature OE
      # x8D codepoint undefined in Windows 1252
      $s =~ s/\xC3\x85\xC2\xBD/\xC5\xBD/g;                 # x8E -> Latin capital letter Z with caron
      # x8F codepoint undefined in Windows 1252
      # x90 codepoint undefined in Windows 1252
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xCB\x9C/\xE2\x80\x98/g;     # x91 a-circumflex+euro+small tilde -> left single quotation mark
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2/\xE2\x80\x99/g; # x92 a-circumflex+euro+trademark -> right single quotation mark
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\x93/\xE2\x80\x9C/g;     # x93 a-circumflex+euro+Latin small ligature oe -> left double quotation mark
      # x94 maps through undefined intermediate code point
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA2/\xE2\x80\xA2/g;     # x95 a-circumflex+euro+cent sign -> bullet
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C/\xE2\x80\x93/g; # x96 a-circumflex+euro+left double quotation mark -> en dash
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D/\xE2\x80\x94/g; # x97 a-circumflex+euro+right double quotation mark -> em dash
      $s =~ s/\xC3\x8B\xC5\x93/\xCB\x9C/g;                     # x98 Latin capital e diaeresis+Latin small ligature oe -> small tilde
      $s =~ s/\xC3\xA2\xE2\x80\x9E\xC2\xA2/\xE2\x84\xA2/g; # x99 -> trade mark sign
      $s =~ s/\xC3\x85\xC2\xA1/\xC5\xA1/g;                 # x9A -> Latin small letter s with caron
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xBA/\xE2\x80\xBA/g; # x9B -> single right-pointing angle quotation mark
      $s =~ s/\xC3\x85\xE2\x80\x9C/\xC5\x93/g;             # x9C -> Latin small ligature oe
      # x9D codepoint undefined in Windows 1252
      $s =~ s/\xC3\x85\xC2\xBE/\xC5\xBE/g;                 # x9E -> Latin small letter z with caron
      $s =~ s/\xC3\x85\xC2\xB8/\xC5\xB8/g;                 # x9F -> Latin capital letter Y with diaeresis
      $s =~ s/\xC3\xAF\xC2\xBF\xC2\xBD/\xEF\xBF\xBD/g;     # replacement character
   }

   return $s;
}
1048
+
1049
# Convert the Latin-1 byte string $s to UTF-8: bytes \x80-\xBF get the lead
# byte \xC2, bytes \xC0-\xFF become \xC3 followed by the byte minus 0x40;
# ASCII is copied unchanged.
# Done in one substitution pass instead of re-matching and copying the
# remaining tail for every non-ASCII byte, and without writing the package
# globals $pre, $c and $post.
sub latin1_to_utf {
   my ($caller, $s) = @_;

   return "" unless defined($s);
   $s =~ s/([\x80-\xFF])/(ord($1) < 0xC0) ? "\xC2$1" : "\xC3" . chr(ord($1) - 0x40)/ge;
   return $s;
}
1066
+
1067
# True if the character-type description $char_type contains, as a whole
# word, "diacritic", "letter" or "syllable", or one of "CJK", "hiragana",
# "kana", "katakana" followed by whitespace and "character".
# In list context the regex captures are returned, as with any list-context match.
sub character_type_is_letter_type {
   my ($caller, $char_type) = @_;

   return ($char_type =~ /\b(
        (CJK|hiragana|kana|katakana)\s+character
      | diacritic
      | letter
      | syllable
   )\b/x);
}
1072
+
1073
+ sub character_type {
1074
+ local($caller, $c) = @_;
1075
+
1076
+ if ($c =~ /^[\x00-\x7F]/) {
1077
+ return "XML tag" if $c =~ /^<.*>$/;
1078
+ return "ASCII Latin letter" if $c =~ /^[a-z]$/i;
1079
+ return "ASCII digit" if $c =~ /^[0-9]$/i;
1080
+ return "ASCII whitespace" if $c =~ /^[\x09-\x0D\x20]$/;
1081
+ return "ASCII control-character" if $c =~ /^[\x00-\x1F\x7F]$/;
1082
+ return "ASCII currency" if $c eq "\$";
1083
+ return "ASCII punctuation";
1084
+ } elsif ($c =~ /^[\xC0-\xDF]/) {
1085
+ return "non-UTF8 (invalid)" unless $c =~ /^[\xC0-\xDF][\x80-\xBF]$/;
1086
+ return "non-shortest-UTF8 (invalid)" if $c =~ /[\xC0-\xC1]/;
1087
+ return "non-ASCII control-character" if $c =~ /\xC2[\x80-\x9F]/;
1088
+ return "non-ASCII whitespace" if $c =~ /\xC2\xA0/;
1089
+ return "non-ASCII currency" if $c =~ /\xC2[\xA2-\xA5]/;
1090
+ return "fraction" if $c =~ /\xC2[\xBC-\xBE]/; # NEW
1091
+ return "superscript digit" if $c =~ /\xC2[\xB2\xB3\xB9]/;
1092
+ return "non-ASCII Latin letter" if $c =~ /\xC2\xB5/; # micro sign
1093
+ return "non-ASCII punctuation" if $c =~ /\xC2[\xA0-\xBF]/;
1094
+ return "non-ASCII punctuation" if $c =~ /\xC3[\x97\xB7]/;
1095
+ return "non-ASCII Latin letter" if $c =~ /\xC3[\x80-\xBF]/;
1096
+ return "Latin ligature letter" if $c =~ /\xC4[\xB2\xB3]/;
1097
+ return "Latin ligature letter" if $c =~ /\xC5[\x92\x93]/;
1098
+ return "non-ASCII Latin letter" if $c =~ /[\xC4-\xC8]/;
1099
+ return "non-ASCII Latin letter" if $c =~ /\xC9[\x80-\x8F]/;
1100
+ return "IPA" if $c =~ /\xC9[\x90-\xBF]/;
1101
+ return "IPA" if $c =~ /\xCA[\x80-\xBF]/;
1102
+ return "IPA" if $c =~ /\xCB[\x80-\xBF]/;
1103
+ return "combining-diacritic" if $c =~ /\xCC[\x80-\xBF]/;
1104
+ return "combining-diacritic" if $c =~ /\xCD[\x80-\xAF]/;
1105
+ return "Greek punctuation" if $c =~ /\xCD[\xBE]/; # Greek question mark
1106
+ return "Greek punctuation" if $c =~ /\xCE[\x87]/; # Greek semicolon
1107
+ return "Greek letter" if $c =~ /\xCD[\xB0-\xBF]/;
1108
+ return "Greek letter" if $c =~ /\xCE/;
1109
+ return "Greek letter" if $c =~ /\xCF[\x80-\xA1\xB3\xB7\xB8\xBA\xBB]/;
1110
+ return "Coptic letter" if $c =~ /\xCF[\xA2-\xAF]/;
1111
+ return "Cyrillic letter" if $c =~ /[\xD0-\xD3]/;
1112
+ return "Cyrillic letter" if $c =~ /\xD4[\x80-\xAF]/;
1113
+ return "Armenian punctuation" if $c =~ /\xD5[\x9A-\x9F]/;
1114
+ return "Armenian punctuation" if $c =~ /\xD6[\x89-\x8F]/;
1115
+ return "Armenian letter" if $c =~ /\xD4[\xB0-\xBF]/;
1116
+ return "Armenian letter" if $c =~ /\xD5/;
1117
+ return "Armenian letter" if $c =~ /\xD6[\x80-\x8F]/;
1118
+ return "Hebrew accent" if $c =~ /\xD6[\x91-\xAE]/;
1119
+ return "Hebrew punctuation" if $c =~ /\xD6\xBE/;
1120
+ return "Hebrew punctuation" if $c =~ /\xD7[\x80\x83\x86\xB3\xB4]/;
1121
+ return "Hebrew point" if $c =~ /\xD6[\xB0-\xBF]/;
1122
+ return "Hebrew point" if $c =~ /\xD7[\x81\x82\x87]/;
1123
+ return "Hebrew letter" if $c =~ /\xD7[\x90-\xB2]/;
1124
+ return "other Hebrew" if $c =~ /\xD6[\x90-\xBF]/;
1125
+ return "other Hebrew" if $c =~ /\xD7/;
1126
+ return "Arabic currency" if $c =~ /\xD8\x8B/; # Afghani sign
1127
+ return "Arabic punctuation" if $c =~ /\xD8[\x89-\x8D\x9B\x9E\x9F]/;
1128
+ return "Arabic punctuation" if $c =~ /\xD9[\xAA-\xAD]/;
1129
+ return "Arabic punctuation" if $c =~ /\xDB[\x94]/;
1130
+ return "Arabic tatweel" if $c =~ /\xD9\x80/;
1131
+ return "Arabic letter" if $c =~ /\xD8[\xA0-\xBF]/;
1132
+ return "Arabic letter" if $c =~ /\xD9[\x81-\x9F]/;
1133
+ return "Arabic letter" if $c =~ /\xD9[\xAE-\xBF]/;
1134
+ return "Arabic letter" if $c =~ /\xDA[\x80-\xBF]/;
1135
+ return "Arabic letter" if $c =~ /\xDB[\x80-\x95]/;
1136
+ return "Arabic Indic digit" if $c =~ /\xD9[\xA0-\xA9]/;
1137
+ return "Arabic Indic digit" if $c =~ /\xDB[\xB0-\xB9]/;
1138
+ return "other Arabic" if $c =~ /[\xD8-\xDB]/;
1139
+ return "Syriac punctuation" if $c =~ /\xDC[\x80-\x8F]/;
1140
+ return "Syriac letter" if $c =~ /\xDC[\x90-\xAF]/;
1141
+ return "Syriac diacritic" if $c =~ /\xDC[\xB0-\xBF]/;
1142
+ return "Syriac diacritic" if $c =~ /\xDD[\x80-\x8A]/;
1143
+ return "Thaana letter" if $c =~ /\xDE/;
1144
+ } elsif ($c =~ /^[\xE0-\xEF]/) {
1145
+ return "non-UTF8 (invalid)" unless $c =~ /^[\xE0-\xEF][\x80-\xBF]{2,2}$/;
1146
+ return "non-shortest-UTF8 (invalid)" if $c =~ /\xE0[\x80-\x9F]/;
1147
+ return "Arabic letter" if $c =~ /\xE0\xA2[\xA0-\xBF]/; # extended letters
1148
+ return "other Arabic" if $c =~ /\xE0\xA3/; # extended characters
1149
+ return "Devanagari punctuation" if $c =~ /\xE0\xA5[\xA4\xA5]/; # danda, double danda
1150
+ return "Devanagari digit" if $c =~ /\xE0\xA5[\xA6-\xAF]/;
1151
+ return "Devanagari letter" if $c =~ /\xE0[\xA4-\xA5]/;
1152
+ return "Bengali digit" if $c =~ /\xE0\xA7[\xA6-\xAF]/;
1153
+ return "Bengali currency" if $c =~ /\xE0\xA7[\xB2-\xB9]/;
1154
+ return "Bengali letter" if $c =~ /\xE0[\xA6-\xA7]/;
1155
+ return "Gurmukhi digit" if $c =~ /\xE0\xA9[\xA6-\xAF]/;
1156
+ return "Gurmukhi letter" if $c =~ /\xE0[\xA8-\xA9]/;
1157
+ return "Gujarati digit" if $c =~ /\xE0\xAB[\xA6-\xAF]/;
1158
+ return "Gujarati letter" if $c =~ /\xE0[\xAA-\xAB]/;
1159
+ return "Oriya digit" if $c =~ /\xE0\xAD[\xA6-\xAF]/;
1160
+ return "Oriya fraction" if $c =~ /\xE0\xAD[\xB2-\xB7]/;
1161
+ return "Oriya letter" if $c =~ /\xE0[\xAC-\xAD]/;
1162
+ return "Tamil digit" if $c =~ /\xE0\xAF[\xA6-\xAF]/;
1163
+ return "Tamil number" if $c =~ /\xE0\xAF[\xB0-\xB2]/; # number (10, 100, 1000)
1164
+ return "Tamil letter" if $c =~ /\xE0[\xAE-\xAF]/;
1165
+ return "Telegu digit" if $c =~ /\xE0\xB1[\xA6-\xAF]/;
1166
+ return "Telegu fraction" if $c =~ /\xE0\xB1[\xB8-\xBE]/;
1167
+ return "Telegu letter" if $c =~ /\xE0[\xB0-\xB1]/;
1168
+ return "Kannada digit" if $c =~ /\xE0\xB3[\xA6-\xAF]/;
1169
+ return "Kannada letter" if $c =~ /\xE0[\xB2-\xB3]/;
1170
+ return "Malayalam digit" if $c =~ /\xE0\xB5[\x98-\x9E\xA6-\xB8]/;
1171
+ return "Malayalam punctuation" if $c =~ /\xE0\xB5\xB9/; # date mark
1172
+ return "Malayalam letter" if $c =~ /\xE0[\xB4-\xB5]/;
1173
+ return "Sinhala digit" if $c =~ /\xE0\xB7[\xA6-\xAF]/;
1174
+ return "Sinhala punctuation" if $c =~ /\xE0\xB7\xB4/;
1175
+ return "Sinhala letter" if $c =~ /\xE0[\xB6-\xB7]/;
1176
+ return "Thai currency" if $c =~ /\xE0\xB8\xBF/;
1177
+ return "Thai digit" if $c =~ /\xE0\xB9[\x90-\x99]/;
1178
+ return "Thai character" if $c =~ /\xE0[\xB8-\xB9]/;
1179
+ return "Lao punctuation" if $c =~ /\xE0\xBA\xAF/; # Lao ellipsis
1180
+ return "Lao digit" if $c =~ /\xE0\xBB[\x90-\x99]/;
1181
+ return "Lao character" if $c =~ /\xE0[\xBA-\xBB]/;
1182
+ return "Tibetan punctuation" if $c =~ /\xE0\xBC[\x81-\x94]/;
1183
+ return "Tibetan sign" if $c =~ /\xE0\xBC[\x95-\x9F]/;
1184
+ return "Tibetan digit" if $c =~ /\xE0\xBC[\xA0-\xB3]/;
1185
+ return "Tibetan punctuation" if $c =~ /\xE0\xBC[\xB4-\xBD]/;
1186
+ return "Tibetan letter" if $c =~ /\xE0[\xBC-\xBF]/;
1187
+ return "Myanmar digit" if $c =~ /\xE1\x81[\x80-\x89]/;
1188
+ return "Myanmar digit" if $c =~ /\xE1\x82[\x90-\x99]/; # Myanmar Shan digits
1189
+ return "Myanmar punctuation" if $c =~ /\xE1\x81[\x8A-\x8B]/;
1190
+ return "Myanmar letter" if $c =~ /\xE1[\x80-\x81]/;
1191
+ return "Myanmar letter" if $c =~ /\xE1\x82[\x80-\x9F]/;
1192
+ return "Georgian punctuation" if $c =~ /\xE1\x83\xBB/;
1193
+ return "Georgian letter" if $c =~ /\xE1\x82[\xA0-\xBF]/;
1194
+ return "Georgian letter" if $c =~ /\xE1\x83/;
1195
+ return "Georgian letter" if $c =~ /\xE1\xB2[\x90-\xBF]/; # Georgian Mtavruli capital letters
1196
+ return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/; # Georgian small letters (Khutsuri)
1197
+ return "Korean Hangul letter" if $c =~ /\xE1[\x84-\x87]/;
1198
+ return "Ethiopic punctuation" if $c =~ /\xE1\x8D[\xA0-\xA8]/;
1199
+ return "Ethiopic digit" if $c =~ /\xE1\x8D[\xA9-\xB1]/;
1200
+ return "Ethiopic number" if $c =~ /\xE1\x8D[\xB2-\xBC]/;
1201
+ return "Ethiopic syllable" if $c =~ /\xE1[\x88-\x8D]/;
1202
+ return "Cherokee letter" if $c =~ /\xE1\x8E[\xA0-\xBF]/;
1203
+ return "Cherokee letter" if $c =~ /\xE1\x8F/;
1204
+ return "Canadian punctuation" if $c =~ /\xE1\x90\x80/; # Canadian Syllabics hyphen
1205
+ return "Canadian punctuation" if $c =~ /\xE1\x99\xAE/; # Canadian Syllabics full stop
1206
+ return "Canadian syllable" if $c =~ /\xE1[\x90-\x99]/;
1207
+ return "Canadian syllable" if $c =~ /\xE1\xA2[\xB0-\xBF]/;
1208
+ return "Canadian syllable" if $c =~ /\xE1\xA3/;
1209
+ return "Ogham whitespace" if $c =~ /\xE1\x9A\x80/;
1210
+ return "Ogham letter" if $c =~ /\xE1\x9A[\x81-\x9A]/;
1211
+ return "Ogham punctuation" if $c =~ /\xE1\x9A[\x9B-\x9C]/;
1212
+ return "Runic punctuation" if $c =~ /\xE1\x9B[\xAB-\xAD]/;
1213
+ return "Runic letter" if $c =~ /\xE1\x9A[\xA0-\xBF]/;
1214
+ return "Runic letter" if $c =~ /\xE1\x9B/;
1215
+ return "Khmer currency" if $c =~ /\xE1\x9F\x9B/;
1216
+ return "Khmer digit" if $c =~ /\xE1\x9F[\xA0-\xA9]/;
1217
+ return "Khmer letter" if $c =~ /\xE1[\x9E-\x9F]/;
1218
+ return "Mongolian punctuation" if $c =~ /\xE1\xA0[\x80-\x8A]/;
1219
+ return "Mongolian digit" if $c =~ /\xE1\xA0[\x90-\x99]/;
1220
+ return "Mongolian letter" if $c =~ /\xE1[\xA0-\xA1]/;
1221
+ return "Mongolian letter" if $c =~ /\xE1\xA2[\x80-\xAF]/;
1222
+ return "Buginese letter" if $c =~ /\xE1\xA8[\x80-\x9B]/;
1223
+ return "Buginese punctuation" if $c =~ /\xE1\xA8[\x9E-\x9F]/;
1224
+ return "Balinese letter" if $c =~ /\xE1\xAC/;
1225
+ return "Balinese letter" if $c =~ /\xE1\xAD[\x80-\x8F]/;
1226
+ return "Balinese digit" if $c =~ /\xE1\xAD[\x90-\x99]/;
1227
+ return "Balinese puncutation" if $c =~ /\xE1\xAD[\x9A-\xA0]/;
1228
+ return "Balinese symbol" if $c =~ /\xE1\xAD[\xA1-\xBF]/;
1229
+ return "Sundanese digit" if $c =~ /\xE1\xAE[\xB0-\xB9]/;
1230
+ return "Sundanese letter" if $c =~ /\xE1\xAE/;
1231
+ return "Cyrillic letter" if $c =~ /\xE1\xB2[\x80-\x8F]/;
1232
+ return "Sundanese punctuation" if $c =~ /\xE1\xB3[\x80-\x8F]/;
1233
+ return "IPA" if $c =~ /\xE1[\xB4-\xB6]/;
1234
+ return "non-ASCII Latin letter" if $c =~ /\xE1[\xB8-\xBB]/;
1235
+ return "Greek letter" if $c =~ /\xE1[\xBC-\xBF]/;
1236
+ return "non-ASCII whitespace" if $c =~ /\xE2\x80[\x80-\x8A\xAF]/;
1237
+ return "zero-width space" if $c =~ /\xE2\x80\x8B/;
1238
+ return "zero-width non-space" if $c =~ /\xE2\x80\x8C/;
1239
+ return "zero-width joiner" if $c =~ /\xE2\x80\x8D/;
1240
+ return "directional mark" if $c =~ /\xE2\x80[\x8E-\x8F\xAA-\xAE]/;
1241
+ return "non-ASCII punctuation" if $c =~ /\xE2\x80[\x90-\xBF]/;
1242
+ return "non-ASCII punctuation" if $c =~ /\xE2\x81[\x80-\x9E]/;
1243
+ return "superscript letter" if $c =~ /\xE2\x81[\xB1\xBF]/;
1244
+ return "superscript digit" if $c =~ /\xE2\x81[\xB0-\xB9]/;
1245
+ return "superscript punctuation" if $c =~ /\xE2\x81[\xBA-\xBE]/;
1246
+ return "subscript digit" if $c =~ /\xE2\x82[\x80-\x89]/;
1247
+ return "subscript punctuation" if $c =~ /\xE2\x82[\x8A-\x8E]/;
1248
+ return "non-ASCII currency" if $c =~ /\xE2\x82[\xA0-\xBF]/;
1249
+ return "letterlike symbol" if $c =~ /\xE2\x84/;
1250
+ return "letterlike symbol" if $c =~ /\xE2\x85[\x80-\x8F]/;
1251
+ return "fraction" if $c =~ /\xE2\x85[\x90-\x9E]/; # NEW
1252
+ return "Roman number" if $c =~ /\xE2\x85[\xA0-\xBF]/; # NEW
1253
+ return "arrow symbol" if $c =~ /\xE2\x86[\x90-\xBF]/;
1254
+ return "arrow symbol" if $c =~ /\xE2\x87/;
1255
+ return "mathematical operator" if $c =~ /\xE2[\x88-\x8B]/;
1256
+ return "technical symbol" if $c =~ /\xE2[\x8C-\x8F]/;
1257
+ return "enclosed alphanumeric" if $c =~ /\xE2\x91[\xA0-\xBF]/;
1258
+ return "enclosed alphanumeric" if $c =~ /\xE2[\x92-\x93]/;
1259
+ return "box drawing" if $c =~ /\xE2[\x94-\x95]/;
1260
+ return "geometric shape" if $c =~ /\xE2\x96[\xA0-\xBF]/;
1261
+ return "geometric shape" if $c =~ /\xE2\x97/;
1262
+ return "pictograph" if $c =~ /\xE2[\x98-\x9E]/;
1263
+ return "arrow symbol" if $c =~ /\xE2\xAC[\x80-\x91\xB0-\xBF]/;
1264
+ return "geometric shape" if $c =~ /\xE2\xAC[\x92-\xAF]/;
1265
+ return "arrow symbol" if $c =~ /\xE2\xAD[\x80-\x8F\x9A-\xBF]/;
1266
+ return "geometric shape" if $c =~ /\xE2\xAD[\x90-\x99]/;
1267
+ return "arrow symbol" if $c =~ /\xE2\xAE[\x80-\xB9]/;
1268
+ return "geometric shape" if $c =~ /\xE2\xAE[\xBA-\xBF]/;
1269
+ return "geometric shape" if $c =~ /\xE2\xAF[\x80-\x88\x8A-\x8F]/;
1270
+ return "symbol" if $c =~ /\xE2[\xAC-\xAF]/;
1271
+ return "Coptic fraction" if $c =~ /\xE2\xB3\xBD/;
1272
+ return "Coptic punctuation" if $c =~ /\xE2\xB3[\xB9-\xBF]/;
1273
+ return "Coptic letter" if $c =~ /\xE2[\xB2-\xB3]/;
1274
+ return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/;
1275
+ return "Tifinagh punctuation" if $c =~ /\xE2\xB5\xB0/;
1276
+ return "Tifinagh letter" if $c =~ /\xE2\xB4[\xB0-\xBF]/;
1277
+ return "Tifinagh letter" if $c =~ /\xE2\xB5/;
1278
+ return "Ethiopic syllable" if $c =~ /\xE2\xB6/;
1279
+ return "Ethiopic syllable" if $c =~ /\xE2\xB7[\x80-\x9F]/;
1280
+ return "non-ASCII punctuation" if $c =~ /\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]/;
1281
+ return "symbol" if $c =~ /\xE3\x80[\x91\x92\xA0\xB6\xB7]/;
1282
+ return "Japanese hiragana character" if $c =~ /\xE3\x81/;
1283
+ return "Japanese hiragana character" if $c =~ /\xE3\x82[\x80-\x9F]/;
1284
+ return "Japanese katakana character" if $c =~ /\xE3\x82[\xA0-\xBF]/;
1285
+ return "Japanese katakana character" if $c =~ /\xE3\x83/;
1286
+ return "Bopomofo letter" if $c =~ /\xE3\x84[\x80-\xAF]/;
1287
+ return "Korean Hangul letter" if $c =~ /\xE3\x84[\xB0-\xBF]/;
1288
+ return "Korean Hangul letter" if $c =~ /\xE3\x85/;
1289
+ return "Korean Hangul letter" if $c =~ /\xE3\x86[\x80-\x8F]/;
1290
+ return "Bopomofo letter" if $c =~ /\xE3\x86[\xA0-\xBF]/;
1291
+ return "CJK stroke" if $c =~ /\xE3\x87[\x80-\xAF]/;
1292
+ return "Japanese kana character" if $c =~ /\xE3\x87[\xB0-\xBF]/;
1293
+ return "CJK symbol" if $c =~ /\xE3[\x88-\x8B]/;
1294
+ return "CJK square Latin abbreviation" if $c =~ /\xE3\x8D[\xB1-\xBA]/;
1295
+ return "CJK square Latin abbreviation" if $c =~ /\xE3\x8E/;
1296
+ return "CJK square Latin abbreviation" if $c =~ /\xE3\x8F[\x80-\x9F\xBF]/;
1297
+ return "CJK character" if $c =~ /\xE4[\xB8-\xBF]/;
1298
+ return "CJK character" if $c =~ /[\xE5-\xE9]/;
1299
+ return "Yi syllable" if $c =~ /\xEA[\x80-\x92]/;
1300
+ return "Lisu letter" if $c =~ /\xEA\x93[\x90-\xBD]/;
1301
+ return "Lisu punctuation" if $c =~ /\xEA\x93[\xBE-\xBF]/;
1302
+ return "Cyrillic letter" if $c =~ /\xEA\x99/;
1303
+ return "Cyrillic letter" if $c =~ /\xEA\x9A[\x80-\x9F]/;
1304
+ return "modifier tone" if $c =~ /\xEA\x9C[\x80-\xA1]/;
1305
+ return "Javanese punctuation" if $c =~ /\xEA\xA7[\x81-\x8D\x9E-\x9F]/;
1306
+ return "Javanese digit" if $c =~ /\xEA\xA7[\x90-\x99]/;
1307
+ return "Javanese letter" if $c =~ /\xEA\xA6/;
1308
+ return "Javanese letter" if $c =~ /\xEA\xA7[\x80-\x9F]/;
1309
+ return "Ethiopic syllable" if $c =~ /\xEA\xAC[\x80-\xAF]/;
1310
+ return "Cherokee letter" if $c =~ /\xEA\xAD[\xB0-\xBF]/;
1311
+ return "Cherokee letter" if $c =~ /\xEA\xAE/;
1312
+ return "Meetai Mayek digit" if $c =~ /\xEA\xAF[\xB0-\xB9]/;
1313
+ return "Meetai Mayek letter" if $c =~ /\xEA\xAF/;
1314
+ return "Korean Hangul syllable" if $c =~ /\xEA[\xB0-\xBF]/;
1315
+ return "Korean Hangul syllable" if $c =~ /[\xEB-\xEC]/;
1316
+ return "Korean Hangul syllable" if $c =~ /\xED[\x80-\x9E]/;
1317
+ return "Klingon letter" if $c =~ /\xEF\xA3[\x90-\xA9]/;
1318
+ return "Klingon digit" if $c =~ /\xEF\xA3[\xB0-\xB9]/;
1319
+ return "Klingon punctuation" if $c =~ /\xEF\xA3[\xBD-\xBE]/;
1320
+ return "Klingon symbol" if $c =~ /\xEF\xA3\xBF/;
1321
+ return "private use character" if $c =~ /\xEE/;
1322
+ return "Latin typographic ligature" if $c =~ /\xEF\xAC[\x80-\x86]/;
1323
+ return "Hebrew presentation letter" if $c =~ /\xEF\xAC[\x9D-\xBF]/;
1324
+ return "Hebrew presentation letter" if $c =~ /\xEF\xAD[\x80-\x8F]/;
1325
+ return "Arabic presentation letter" if $c =~ /\xEF\xAD[\x90-\xBF]/;
1326
+ return "Arabic presentation letter" if $c =~ /\xEF[\xAE-\xB7]/;
1327
+ return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\x90-\x99]/;
1328
+ return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\xB0-\xBF]/;
1329
+ return "non-ASCII punctuation" if $c =~ /\xEF\xB9[\x80-\xAB]/;
1330
+ return "Arabic presentation letter" if $c =~ /\xEF\xB9[\xB0-\xBF]/;
1331
+ return "Arabic presentation letter" if $c =~ /\xEF\xBA/;
1332
+ return "Arabic presentation letter" if $c =~ /\xEF\xBB[\x80-\xBC]/;
1333
+ return "byte-order mark/zero-width no-break space" if $c eq "\xEF\xBB\xBF";
1334
+ return "fullwidth currency" if $c =~ /\xEF\xBC\x84/;
1335
+ return "fullwidth digit" if $c =~ /\xEF\xBC[\x90-\x99]/;
1336
+ return "fullwidth Latin letter" if $c =~ /\xEF\xBC[\xA1-\xBA]/;
1337
+ return "fullwidth Latin letter" if $c =~ /\xEF\xBD[\x81-\x9A]/;
1338
+ return "fullwidth punctuation" if $c =~ /\xEF\xBC/;
1339
+ return "fullwidth punctuation" if $c =~ /\xEF\xBD[\x9B-\xA4]/;
1340
+ return "halfwidth Japanese punctuation" if $c =~ /\xEF\xBD[\xA1-\xA4]/;
1341
+ return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBD[\xA5-\xBF]/;
1342
+ return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBE[\x80-\x9F]/;
1343
+ return "fullwidth currency" if $c =~ /\xEF\xBF[\xA0-\xA6]/;
1344
+ return "replacement character" if $c eq "\xEF\xBF\xBD";
1345
+ } elsif ($c =~ /[\xF0-\xF7]/) {
1346
+ return "non-UTF8 (invalid)" unless $c =~ /[\xF0-\xF7][\x80-\xBF]{3,3}$/;
1347
+ return "non-shortest-UTF8 (invalid)" if $c =~ /\xF0[\x80-\x8F]/;
1348
+ return "Linear B syllable" if $c =~ /\xF0\x90\x80/;
1349
+ return "Linear B syllable" if $c =~ /\xF0\x90\x81[\x80-\x8F]/;
1350
+ return "Linear B symbol" if $c =~ /\xF0\x90\x81[\x90-\x9F]/;
1351
+ return "Linear B ideogram" if $c =~ /\xF0\x90[\x82-\x83]/;
1352
+ return "Gothic letter" if $c =~ /\xF0\x90\x8C[\xB0-\xBF]/;
1353
+ return "Gothic letter" if $c =~ /\xF0\x90\x8D[\x80-\x8F]/;
1354
+ return "Phoenician letter" if $c =~ /\xF0\x90\xA4[\x80-\x95]/;
1355
+ return "Phoenician number" if $c =~ /\xF0\x90\xA4[\x96-\x9B]/;
1356
+ return "Phoenician punctuation" if $c =~ /\xF0\x90\xA4\x9F/; # word separator
1357
+ return "Old Hungarian number" if $c =~ /\xF0\x90\xB3[\xBA-\xBF]/;
1358
+ return "Old Hungarian letter" if $c =~ /\xF0\x90[\xB2-\xB3]/;
1359
+ return "Cuneiform digit" if $c =~ /\xF0\x92\x90/; # numberic sign
1360
+ return "Cuneiform digit" if $c =~ /\xF0\x92\x91[\x80-\xAF]/; # numberic sign
1361
+ return "Cuneiform punctuation" if $c =~ /\xF0\x92\x91[\xB0-\xBF]/;
1362
+ return "Cuneiform sign" if $c =~ /\xF0\x92[\x80-\x95]/;
1363
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x81\xA8/;
1364
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x82[\xAD-\xB6]/;
1365
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x86[\x90\xBC-\xBF]/;
1366
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x87[\x80-\x84]/;
1367
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8D[\xA2-\xAB]/;
1368
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8E[\x86-\x92]/;
1369
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8F[\xBA-\xBF]/;
1370
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x90[\x80-\x83]/;
1371
+ return "Egyptian hieroglyph" if $c =~ /\xF0\x93[\x80-\x90]/;
1372
+ return "enclosed alphanumeric" if $c =~ /\xF0\x9F[\x84-\x87]/;
1373
+ return "Mahjong symbol" if $c =~ /\xF0\x9F\x80[\x80-\xAF]/;
1374
+ return "Domino symbol" if $c =~ /\xF0\x9F\x80[\xB0-\xBF]/;
1375
+ return "Domino symbol" if $c =~ /\xF0\x9F\x81/;
1376
+ return "Domino symbol" if $c =~ /\xF0\x9F\x82[\x80-\x9F]/;
1377
+ return "Playing card symbol" if $c =~ /\xF0\x9F\x82[\xA0-\xBF]/;
1378
+ return "Playing card symbol" if $c =~ /\xF0\x9F\x83/;
1379
+ return "CJK symbol" if $c =~ /\xF0\x9F[\x88-\x8B]/;
1380
+ return "pictograph" if $c =~ /\xF0\x9F[\x8C-\x9B]/;
1381
+ return "geometric shape" if $c =~ /\xF0\x9F[\x9E-\x9F]/;
1382
+ return "non-ASCII punctuation" if $c =~ /\xF0\x9F[\xA0-\xA3]/;
1383
+ return "pictograph" if $c =~ /\xF0\x9F[\xA4-\xAB]/;
1384
+ return "CJK character" if $c =~ /\xF0[\xA0-\xAF]/;
1385
+ return "tag" if $c =~ /\xF3\xA0[\x80-\x81]/;
1386
+ return "variation selector" if $c =~ /\xF3\xA0[\x84-\x87]/;
1387
+ return "private use character" if $c =~ /\xF3[\xB0-\xBF]/;
1388
+ return "private use character" if $c =~ /\xF4[\x80-\x8F]/;
1389
+ # ...
1390
+ } elsif ($c =~ /[\xF8-\xFB]/) {
1391
+ return "non-UTF8 (invalid)" unless $c =~ /[\xF8-\xFB][\x80-\xBF]{4,4}$/;
1392
+ } elsif ($c =~ /[\xFC-\xFD]/) {
1393
+ return "non-UTF8 (invalid)" unless $c =~ /[\xFC-\xFD][\x80-\xBF]{5,5}$/;
1394
+ } elsif ($c =~ /\xFE/) {
1395
+ return "non-UTF8 (invalid)" unless $c =~ /\xFE][\x80-\xBF]{6,6}$/;
1396
+ } else {
1397
+ return "non-UTF8 (invalid)";
1398
+ }
1399
+ return "other character";
1400
+ }
1401
+
1402
+ 1;
1403
+
1404
+
uroman/lib/NLP/stringDistance.pm ADDED
@@ -0,0 +1,724 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # stringDistance #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::stringDistance;
8
+
9
+ use List::Util qw(min max);
10
# Class-name handles used for method dispatch by the subs in this package
# (e.g. $utf8->split_into_utf8_characters(...)).  They are package globals
# because the subs refer to them without qualification.
($utf8, $util, $romanizer) = ("NLP::UTF8", "NLP::utilities", "NLP::Romanizer");

# Placeholder hash handed (as *dummy_ht) to helpers that take a hash argument.
%dummy_ht = ();
15
+
16
# Records, for rule string $s, every proper prefix together with each longer
# prefix of $s (up to and including $s itself):
#   $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$prefix}->{$longer_prefix} = 1
#   $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$prefix}               = 1
# %ht is received as a typeglob (*ht) and is updated in place.
# Strings of fewer than two characters add nothing.
sub rule_string_expansion {
    local($this, *ht, $s, $lang_code) = @_;

    my @chars = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);
    my $prefix = "";
    foreach my $short_end (0 .. $#chars - 1) {
        # Grow the short prefix by one character per outer iteration.
        $prefix .= $chars[$short_end];
        my $longer_prefix = $prefix;
        foreach my $long_end ($short_end + 1 .. $#chars) {
            $longer_prefix .= $chars[$long_end];
            $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$prefix}->{$longer_prefix} = 1;
            $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$prefix} = 1;
        }
    }
}
31
+
32
# Loads string-distance cost rules from $filename into %ht.
#
# Arguments:
#   $filename  path of the rule file; blank lines and '#' comment lines are
#              skipped, a leading UTF-8 byte-order mark is removed
#   *ht        typeglob of the hash to fill (updated in place)
#   $verbose   optional; when true, a summary line is printed to STDERR
#
# Each rule line is read through $util->slot_value_in_double_colon_del_list
# for the slots s1, s2, cost, lc1, lc2, left1, left2, right1, right2,
# in-lc1, in-lc2, out-lc1 and out-lc2.  Lines with both strings empty, with a
# missing or non-numeric cost, or with a plain "left"/"right" slot are
# reported on STDERR and skipped.
#
# For every (lc1, lc2) language-code pair the rule is stored under a fresh
# rule id:
#   $ht{COST}->{$lc1}->{$lc2}->{$s1}->{$s2}->{$rule_id} = $cost
#   $ht{LEFT1|LEFT2|RIGHT1|RIGHT2|INLC1|INLC2|OUTLC1|OUTLC2}->{$rule_id}
#   $ht{RULE_STRING}->{$lc}->{$s} = 1
# Unless the rule is fully symmetric, the mirrored rule (sides swapped) is
# stored as well under its own rule id.
#
# Dies if the file cannot be opened.
sub load_string_distance_data {
    local($this, $filename, *ht, $verbose) = @_;

    $verbose = 0 unless defined($verbose);
    # Lexical handle and 3-argument open: does not clobber a caller's global IN
    # handle and never interprets characters in $filename as an open mode.
    open(my $in_fh, "<", $filename) || die "Could not open $filename: $!";
    my $line_number = 0;
    my $n_cost_rules = 0;
    while (defined(my $line = <$in_fh>)) {
        $line_number++;
        $line =~ s/^\xEF\xBB\xBF//;   # strip UTF-8 byte-order mark
        $line =~ s/\s*$//;
        next if $line =~ /^\s*(\#.*)?$/;
        print STDERR "** Warning: line $line_number contains suspicious control character: $line\n" if $line =~ /[\x00-\x1F]/;
        my $s1 = $util->slot_value_in_double_colon_del_list($line, "s1");
        my $s2 = $util->slot_value_in_double_colon_del_list($line, "s2");
        $s1 = $util->dequote_string($s1); # 'can\'t' => can't
        $s2 = $util->dequote_string($s2);
        my $cost = $util->slot_value_in_double_colon_del_list($line, "cost");
        if (($s1 eq "") && ($s2 eq "")) {
            print STDERR "Ignoring bad line $line_number in $filename, because both s1 and s2 are empty strings\n";
            next;
        }
        unless ($cost =~ /^\d+(\.\d+)?$/) {
            if ($cost eq "") {
                print STDERR "Ignoring bad line $line_number in $filename, because of missing cost\n";
            } else {
                print STDERR "Ignoring bad line $line_number in $filename, because of ill-formed cost $cost\n";
            }
            next;
        }
        my $lang_code1_s = $util->slot_value_in_double_colon_del_list($line, "lc1");
        my $lang_code2_s = $util->slot_value_in_double_colon_del_list($line, "lc2");
        # An absent language code is represented by the single code "".
        my @lang_codes_1 = ($lang_code1_s eq "") ? ("") : split(/,\s*/, $lang_code1_s);
        my @lang_codes_2 = ($lang_code2_s eq "") ? ("") : split(/,\s*/, $lang_code2_s);
        my $left_context1 = $util->slot_value_in_double_colon_del_list($line, "left1");
        my $left_context2 = $util->slot_value_in_double_colon_del_list($line, "left2");
        my $right_context1 = $util->slot_value_in_double_colon_del_list($line, "right1");
        my $right_context2 = $util->slot_value_in_double_colon_del_list($line, "right2");
        # Plain "left"/"right" slots (without the 1/2 suffix) are rejected.
        my $bad_left = $util->slot_value_in_double_colon_del_list($line, "left");
        if ($bad_left) {
            print STDERR "** Warning: slot '::left $bad_left' in line $line_number\n";
            next;
        }
        my $bad_right = $util->slot_value_in_double_colon_del_list($line, "right");
        if ($bad_right) {
            print STDERR "** Warning: slot '::right $bad_right' in line $line_number\n";
            next;
        }
        my $in_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "in-lc1");
        my $in_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "in-lc2");
        my $out_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "out-lc1");
        my $out_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "out-lc2");
        # Left contexts must be written as /regex/; the slashes are stripped.
        if ($left_context1) {
            if ($left_context1 =~ /^\/.*\/$/) {
                $left_context1 =~ s/^\///;
                $left_context1 =~ s/\/$//;
            } else {
                print STDERR "Ignoring unrecognized non-regular-express ::left1 $left_context1 in $line_number of $filename\n";
                $left_context1 = "";
            }
        }
        if ($left_context2) {
            if ($left_context2 =~ /^\/.*\/$/) {
                $left_context2 =~ s/^\///;
                $left_context2 =~ s/\/$//;
            } else {
                # Report first, then clear, so the message shows the bad value.
                print STDERR "Ignoring unrecognized non-regular-express ::left2 $left_context2 in $line_number of $filename\n";
                $left_context2 = "";
            }
        }
        # Right contexts must be a sequence of bracketed items: [..][..]...
        if ($right_context1) {
            unless ($right_context1 =~ /^(\[[^\[\]]*\])+$/) {
                print STDERR "Ignoring unrecognized right-context ::right1 $right_context1 in $line_number of $filename\n";
                $right_context1 = "";
            }
        }
        if ($right_context2) {
            unless ($right_context2 =~ /^(\[[^\[\]]*\])+$/) {
                print STDERR "Ignoring unrecognized right-context ::right2 $right_context2 in $line_number of $filename\n";
                $right_context2 = "";
            }
        }
        foreach my $lang_code1 (@lang_codes_1) {
            foreach my $lang_code2 (@lang_codes_2) {
                $n_cost_rules++;
                my $cost_rule_id = $n_cost_rules;
                $ht{COST}->{$lang_code1}->{$lang_code2}->{$s1}->{$s2}->{$cost_rule_id} = $cost;
                $ht{RULE_STRING}->{$lang_code1}->{$s1} = 1;
                $ht{RULE_STRING}->{$lang_code2}->{$s2} = 1;
                $ht{LEFT1}->{$cost_rule_id} = $left_context1;
                $ht{LEFT2}->{$cost_rule_id} = $left_context2;
                $ht{RIGHT1}->{$cost_rule_id} = $right_context1;
                $ht{RIGHT2}->{$cost_rule_id} = $right_context2;
                $ht{INLC1}->{$cost_rule_id} = $in_lang_codes1;
                $ht{INLC2}->{$cost_rule_id} = $in_lang_codes2;
                $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes1;
                $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes2;
                # Store the mirrored rule too, unless both sides are identical
                # in every respect (the mirror would then be a duplicate).
                unless (($s1 eq $s2)
                     && ($lang_code1 eq $lang_code2)
                     && ($left_context1 eq $left_context2)
                     && ($right_context1 eq $right_context2)
                     && ($in_lang_codes1 eq $in_lang_codes2)
                     && ($out_lang_codes1 eq $out_lang_codes2)) {
                    $n_cost_rules++;
                    $cost_rule_id = $n_cost_rules;
                    $ht{COST}->{$lang_code2}->{$lang_code1}->{$s2}->{$s1}->{$cost_rule_id} = $cost;
                    $ht{LEFT1}->{$cost_rule_id} = $left_context2;
                    $ht{LEFT2}->{$cost_rule_id} = $left_context1;
                    $ht{RIGHT1}->{$cost_rule_id} = $right_context2;
                    $ht{RIGHT2}->{$cost_rule_id} = $right_context1;
                    $ht{INLC1}->{$cost_rule_id} = $in_lang_codes2;
                    $ht{INLC2}->{$cost_rule_id} = $in_lang_codes1;
                    $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes2;
                    $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes1;
                }
                $this->rule_string_expansion(*ht, $s1, $lang_code1);
                $this->rule_string_expansion(*ht, $s2, $lang_code2);
            }
        }
    }
    close($in_fh);
    print STDERR "Read in $n_cost_rules rules from $line_number lines in $filename\n" if $verbose;
}
157
+
158
# Fills %chart_ht (passed as *chart_ht) with a linear chart for string $s:
# N_CHARS is set to the number of characters, N_NODES is reset to 0, and
# $romanizer->add_node is called once per character with the span
# (index, index + 1) and two empty-string trailing arguments.
sub romanized_string_to_simple_chart {
    local($this, $s, *chart_ht) = @_;

    my @chars = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);
    $chart_ht{N_CHARS} = scalar(@chars);
    $chart_ht{N_NODES} = 0;
    foreach my $pos (0 .. $#chars) {
        my $next_pos = $pos + 1;
        $romanizer->add_node($chars[$pos], $pos, $next_pos, *chart_ht, "", "");
    }
}
168
+
169
# Maps the positions of chart %chart_ht onto a finer "linear" position scale
# in %sd_ht (both passed as typeglobs), keyed by $chart_id.
#
# Pass 1 walks chart positions 0 .. max(NODES_ENDING_AT) and assigns linear
# positions:
#   - every node whose romanization has more than one character gets one extra
#     linear position per inner character boundary
#     (SPLITPOS2LINPOS / LINPOS2SPLITPOS);
#   - every chart position at which some node ends gets a linear position
#     (POS2LINPOS / LINPOS2POS).
# Pass 2 records, for each node, one single-character edge per romanization
# character in $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}->{$to}->{$char};
# a node with an empty romanization yields an edge labelled "".
# Finally MAXPOS and MAXLINPOS are stored for the chart.
#
# $verbose (optional) turns on tracing to STDERR.
sub linearize_chart_points {
    local($this, *chart_ht, $chart_id, *sd_ht, $verbose) = @_;

    $verbose = 0 unless defined($verbose);
    print STDERR "Linearize $chart_id\n" if $verbose;
    my $current_chart_pos = 0;
    my $current_linear_chart_pos = 0;
    $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
    $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
    print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
    my @end_chart_positions = keys %{$chart_ht{NODES_ENDING_AT}};
    my $end_chart_pos = (@end_chart_positions) ? max(@end_chart_positions) : 0;
    $sd_ht{MAXPOS}->{$chart_id} = $end_chart_pos;
    print STDERR " Chart span: $current_chart_pos-$end_chart_pos\n" if $verbose;

    # Pass 1: hand out linear positions.
    while ($current_chart_pos < $end_chart_pos) {
        my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
        foreach my $node_id (@node_ids) {
            my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
            my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
            print STDERR " $current_chart_pos/$current_linear_chart_pos node: $node_id $roman_s (@roman_chars)\n" if $verbose;
            if ($#roman_chars >= 1) {
                # One extra linear position before each non-initial character.
                foreach my $i ((1 .. $#roman_chars)) {
                    $current_linear_chart_pos++;
                    $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i} = $current_linear_chart_pos;
                    $sd_ht{LINPOS2SPLITPOS}->{$chart_id}->{$current_linear_chart_pos}->{$current_chart_pos}->{$node_id}->{$i} = 1;
                    print STDERR " LINPOS2SPLITPOS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id I: $i\n" if $verbose;
                }
            }
        }
        $current_chart_pos++;
        if ($util->member($current_chart_pos, @end_chart_positions)) {
            $current_linear_chart_pos++;
            $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
            $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
            print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
        }
    }

    # Pass 2: record the per-character edges between linear positions.
    $current_chart_pos = 0;
    while ($current_chart_pos <= $end_chart_pos) {
        # Only used for the trace line below; "?" marks an unmapped position.
        my $linpos_label = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
        $linpos_label = "?" unless defined($linpos_label);
        my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
        foreach my $node_id (@node_ids) {
            my $end_pos = $chart_ht{NODE_END}->{$node_id};
            my $end_linpos = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_pos};
            my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
            my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
            print STDERR " LINROM.$chart_id LIN: $linpos_label POS: $current_chart_pos NODE: $node_id CHARS: @roman_chars\n" if $verbose;
            if (@roman_chars) {
                foreach my $i ((0 .. $#roman_chars)) {
                    # First character starts at the node's own position, later
                    # ones at the split positions assigned in pass 1.
                    my $from_linear_chart_pos
                        = (($i == 0)
                           ? $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos}
                           : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i});
                    print STDERR " FROM.$chart_id I: $i POS: $current_chart_pos NODE: $node_id FROM: $from_linear_chart_pos\n" if $verbose;
                    # Last character ends at the node's end, earlier ones at
                    # the next split position.
                    my $to_linear_chart_pos
                        = (($i == $#roman_chars)
                           ? $end_linpos
                           : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{($i+1)});
                    print STDERR " TO.$chart_id I: $i POS: $current_chart_pos NODE: $node_id TO: $to_linear_chart_pos\n" if $verbose;
                    my $roman_char = $roman_chars[$i];
                    $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{$roman_char} = 1;
                }
            } else {
                # Empty romanization: link to the next chart position that has
                # a linear position.
                my $from_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
                my $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+1)};
                my $offset = 1;
                # Pass 1 assigns no linear position beyond $end_chart_pos, so
                # stop searching there instead of looping forever; the UNDEF
                # report below then covers the failure.
                while ((! defined($to_linear_chart_pos))
                       && (($current_chart_pos + $offset) < $end_chart_pos)) {
                    $offset++;
                    $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+$offset)};
                }
                if (defined($from_linear_chart_pos) && defined($to_linear_chart_pos)) {
                    $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{""} = 1;
                } else {
                    print STDERR " UNDEF.$chart_id from: "
                        . ((defined($from_linear_chart_pos)) ? $from_linear_chart_pos : "?")
                        . " to: "
                        . ((defined($to_linear_chart_pos)) ? $to_linear_chart_pos : "?")
                        . "\n";
                }
            }
        }
        $current_chart_pos++;
    }
    $sd_ht{MAXLINPOS}->{$chart_id} = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_chart_pos};
}
257
+
258
# Visits every edge label stored in $sd_ht{LIN_IJ_ROMAN}->{$chart_id}
# (start and end positions in ascending numeric order, labels in string
# order) and, for each label that the rule tables in %ht mark as having an
# expansion (for $lang_code or for the language-neutral code ""), starts the
# recursive expansion in expand_lin_ij_roman_rec.
# %sd_ht and %ht are passed as typeglobs.
sub expand_lin_ij_roman {
    local($this, *sd_ht, $chart_id, $lang_code, *ht) = @_;

    foreach my $from (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}}) {
        foreach my $to (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}}) {
            foreach my $label (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}->{$to}}) {
                my $expandable = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$label}
                              || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$label};
                next unless $expandable;
                $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $from, $to, $label, $lang_code, *ht);
            }
        }
    }
}
272
+
273
# Recursive step of expand_lin_ij_roman.  $roman is the label accumulated for
# the span $start-$end.  For every edge leaving $end, the edge label is
# appended to $roman; if the result is a known rule string (for $lang_code or
# for the language-neutral code ""), it is recorded as a new edge from $start
# to that edge's end.  The search continues from the longer string while the
# rule tables say it still has an expansion.
# %sd_ht and %ht are passed as typeglobs; %sd_ht is updated in place.
sub expand_lin_ij_roman_rec {
    local($this, *sd_ht, $chart_id, $start, $end, $roman, $lang_code, *ht) = @_;

    # Nothing to do if no rule string extends $roman.
    return unless $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$roman}
               || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman};
    foreach my $next_end (keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}}) {
        foreach my $suffix (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}->{$next_end}}) {
            my $combined = $roman . $suffix;
            $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}->{$next_end}->{$combined} = 1
                if $ht{RULE_STRING}->{$lang_code}->{$combined}
                || $ht{RULE_STRING}->{""}->{$combined};
            next unless $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$combined}
                     || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$combined};
            $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $start, $next_end, $combined, $lang_code, *ht);
        }
    }
}
294
+
295
# Walks the back-pointers (PREC_I / PREC_J) stored in %sd_ht for the chart
# pair "$chart1_id/$chart2_id" from the end positions (MAXLINPOS) back to 0/0.
#
# Arguments:
#   *sd_ht        typeglob of the string-distance hash
#   $chart1_id, $chart2_id   chart identifiers
#   $control      option string; /verbose/ and /chunks/ are recognized
#   $line_number  recorded as the example key for each chunk (chunks mode)
#   $cost         overall cost; chunks are tagged with "*" when it exceeds 5
#
# Returns "mismatch" if the pair is flagged in $sd_ht{MISMATCH}.
# Without "chunks": returns a space-separated trace in left-to-right order,
# one "roman1/roman2:cost" item per step with non-zero incremental cost
# (with "verbose": every step, prefixed by its position spans).
# With "chunks": maximal runs of non-zero-cost steps are counted in
# $sd_ht{N_COST_CHUNK} and indexed by $line_number in $sd_ht{EX_COST_CHUNK};
# no trace string is returned in this mode.
sub trace_string_distance {
    local($this, *sd_ht, $chart1_id, $chart2_id, $control, $line_number, $cost) = @_;

    my $chart_comb_id = join("/", $chart1_id, $chart2_id);
    return "mismatch" if $sd_ht{MISMATCH}->{$chart_comb_id};
    my $chart1_end = $sd_ht{MAXLINPOS}->{$chart1_id};
    my $chart2_end = $sd_ht{MAXLINPOS}->{$chart2_id};
    my $verbose = ($control =~ /verbose/);
    my $chunks_p = ($control =~ /chunks/);
    my @traces = ();
    # Per-step romanizations and incremental costs, collected back to front.
    my @r1_s = ();
    my @r2_s = ();
    my @ic_s = ();

    while ($chart1_end || $chart2_end) {
        my $incr_cost = $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        my $prec_i = $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        my $prec_j = $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        if ($incr_cost || $verbose || $chunks_p) {
            my $roman1 = $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
            my $roman2 = $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
            if ($verbose) {
                push(@traces, "$prec_i-$chart1_end/$prec_j-$chart2_end:$roman1/$roman2:$incr_cost");
            } elsif (defined($roman1)) {
                push(@traces, "$roman1/$roman2:$incr_cost");
            } else {
                # No step recorded for this cell: report it, showing "?" for a
                # missing back-pointer instead of interpolating undef.
                my $print_prec_i = (defined($prec_i)) ? $prec_i : "?";
                my $print_prec_j = (defined($prec_j)) ? $prec_j : "?";
                print STDERR " $print_prec_i-$chart1_end, $print_prec_j-$chart2_end\n";
            }
            if ($chunks_p) {
                push(@r1_s, $roman1);
                push(@r2_s, $roman2);
                push(@ic_s, $incr_cost);
            }
        }
        $chart1_end = $prec_i;
        $chart2_end = $prec_j;
    }
    if ($chunks_p) {
        my $r1 = "";
        my $r2 = "";
        my $tc = 0;
        my $in_chunk = 0;
        foreach my $i ((0 .. $#ic_s)) {
            if ($ic_s[$i]) {
                # Steps were collected right to left, so prepend.
                $r1 = $r1_s[$i] . $r1;
                $r2 = $r2_s[$i] . $r2;
                $tc += $ic_s[$i];
                $in_chunk = 1;
            } elsif ($in_chunk) {
                # A zero-cost step closes the current chunk.
                my $chunk = "$r1/$r2/$tc";
                $chunk .= "*" if defined($cost) && ($cost > 5);
                $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1;
                $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1;
                $r1 = "";
                $r2 = "";
                $tc = 0;
                $in_chunk = 0;
            }
        }
        if ($in_chunk) {
            # Chunk that extends to the start of the strings.
            my $chunk = "$r1/$r2/$tc";
            $chunk .= "*" if defined($cost) && ($cost > 5);
            $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1;
            $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1;
        }
    } else {
        return join(" ", reverse @traces);
    }
}
377
+
378
# Decide whether the chart material to the right of $start_pos satisfies a
# right-context rule.
#
# Arguments:
#   $this               - object the method is invoked on (used for recursion)
#   $right_context_rule - "" (always satisfied) or a sequence of bracketed
#                         character sets such as "[aeiou][n$]"; each set is
#                         tested against the first character of the
#                         romanization on one chart edge
#   *sd_ht              - typeglob of the string-distance hash; edges are read
#                         from $sd_ht{LIN_IJ_ROMAN}->{chart}->{start}->{end}->{roman}
#   $chart_id           - chart whose edges are inspected
#   $start_pos          - linear chart position where the right context begins
#
# Returns 1 if some sequence of edges satisfies every set of the rule, else 0.
# A set that is empty or contains "$" is also satisfied when no edge starts at
# $start_pos.  A rule that does not start with "[" is never satisfied.
sub right_context_match {
   local($this, $right_context_rule, *sd_ht, $chart_id, $start_pos) = @_;

   return 1 if $right_context_rule eq "";

   # Split off the first bracketed set.  These are lexicals (not package
   # globals) so that the recursive calls below cannot clobber them.
   my ($item, $rest) = ($right_context_rule =~ /^\[([^\[\]]*)\]*(.*)$/);
   return 0 unless defined($item);

   my @end_positions = keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}};
   return 1 if ($#end_positions == -1)
            && (($item eq "") || ($item =~ /\$/));
   # An empty set cannot match any character, and /^[]/ would not compile.
   return 0 if $item eq "";

   # Escape "$" so that it is a literal member of the character class below.
   my $guarded_item = $item;
   $guarded_item =~ s/\$/\\\$/g;

   foreach my $end_pos (@end_positions) {
      my @romans = keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}->{$end_pos}};
      foreach my $roman (@romans) {
         next unless $roman =~ /^[$guarded_item]/;
         # Succeed as soon as one edge leads to a full match; otherwise keep
         # trying the remaining edges instead of giving up on the first one,
         # so the result no longer depends on hash key order.
         return 1 if $this->right_context_match($rest, *sd_ht, $chart_id, $end_pos);
         # The rest of the rule depends only on $end_pos, so other
         # romanizations on this same edge cannot change the outcome.
         last;
      }
   }
   return 0;
}
400
+
401
# Compute the cheapest alignment cost between two linearized romanization
# charts by dynamic programming over pairs of chart positions.
#
# Arguments:
#   $this        - object the method is invoked on
#   *sd_ht       - typeglob of the string-distance hash.
#                  Read: MAXLINPOS, LIN_IJ_ROMAN, LINPOS2SPLITPOS.
#                  Written (under the key "$chart1_id/$chart2_id"): COST_IJ,
#                  PREC_I, PREC_J, ROMAN1, ROMAN2, COMB_LEFT_ROMAN1,
#                  COMB_LEFT_ROMAN2, INCR_COST_IJ, COST_RULE, MISMATCH.
#   $chart1_id, $chart2_id   - ids of the two charts inside %sd_ht
#   $lang_code1, $lang_code2 - language codes; rules stored under the code ""
#                  are tried in addition to the language-specific ones
#   *ht          - typeglob of the rule hash (RULE_STRING,
#                  RULE_STRING_HAS_EXPANSION, COST, LEFT1, LEFT2, RIGHT1, RIGHT2)
#   $control     - optional; tracing is printed to STDERR if it matches /verbose/i
#
# Returns the cost stored for the pair of chart end positions, or 99.9999
# (also setting $sd_ht{MISMATCH}) when no alignment reaches that pair.
# Relies on $util (defined elsewhere in this file) for member().
sub string_distance {
   local($this, *sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control) = @_;

   $control = "" unless defined($control);
   my $verbose = ($control =~ /verbose/i);
   my $chart_comb_id = join("/", $chart1_id, $chart2_id);

   my $chart1_end_pos = $sd_ht{MAXLINPOS}->{$chart1_id};
   my $chart2_end_pos = $sd_ht{MAXLINPOS}->{$chart2_id};
   print STDERR "string_distance.$chart_comb_id $chart1_end_pos/$chart2_end_pos\n" if $verbose;
   # Seed the table: aligning the two empty prefixes costs nothing.
   $sd_ht{COST_IJ}->{$chart_comb_id}->{0}->{0} = 0;
   $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{0}->{0} = "";
   $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{0}->{0} = "";
   foreach $chart1_start ((0 .. $chart1_end_pos)) {
      my $prev_further_expansion_possible = 0;
      my @chart1_ends = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}};
      my $max_chart1_ends = (@chart1_ends) ? $chart1_ends[$#chart1_ends] : -1;
      foreach $chart1_end (($chart1_start .. $chart1_end_pos)) {
         # Decides (see "last" below) whether longer spans from this start are still worth trying.
         my $further_expansion_possible = ($chart1_start == $chart1_end)
                                       || defined($sd_ht{LINPOS2SPLITPOS}->{$chart1_id}->{$chart1_start})
                                       || ($chart1_end < $max_chart1_ends);
         # The empty span contributes the empty romanization.
         my @romans1 = (($chart1_start == $chart1_end)
                          ? ("")
                          : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}->{$chart1_end}}));
         if ($#romans1 == -1) {
            $further_expansion_possible = 1 if $prev_further_expansion_possible;
         } else {
            $prev_further_expansion_possible = 0;
         }
         foreach $roman1 (@romans1) {
            # Only strings that occur in some rule can take part in an alignment step.
            next unless $ht{RULE_STRING}->{$lang_code1}->{$roman1}
                     || $ht{RULE_STRING}->{""}->{$roman1};
            foreach $lang_code1o (($lang_code1, "")) {
               foreach $lang_code2o (($lang_code2, "")) {
                  # Chart-2 positions already reachable together with $chart1_start;
                  # the list grows below while it is being traversed.
                  my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}});
                  foreach $chart2_start (@chart2_starts) {
                     foreach $chart2_end (($chart2_start .. $chart2_end_pos)) {
                        print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end\n" if $verbose;
                        my @romans2 = (($chart2_start == $chart2_end)
                                         ? ("")
                                         : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart2_id}->{$chart2_start}->{$chart2_end}}));
                        foreach $roman2 (@romans2) {
                           if ($roman1 eq $roman2) {
                              # Identical romanizations align at no cost.
                              print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2 (IDENTITY)\n" if $verbose;
                              my $cost = 0;
                              my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                              my $combined_cost = $preceding_cost + $cost;
                              my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
                              if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                 $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                 push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                                 $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
                                 $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
                                 $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
                                 $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
                                 $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                    = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
                                 $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                    = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
                                 $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
                                 $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = "IDENTITY";
                                 print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
                              }
                           } else {
                              next unless $ht{RULE_STRING}->{$lang_code2o}->{$roman2};
                              print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2\n" if $verbose;
                              next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2});
                              my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}};
                              foreach $cost_rule_id (@cost_rule_ids) {
                                 ## check whether any context requirements are satisfied
                                 # left context rules are regular expressions, matched
                                 # against the romanization accumulated so far
                                 my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id};
                                 if ($left_context_rule1) {
                                    my $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                    if (defined($comb_left_roman1)) {
                                       next unless $comb_left_roman1 =~ /$left_context_rule1/;
                                    } else {
                                       print STDERR " No comb_left_roman1 value for $chart_comb_id $chart1_start,$chart2_start\n";
                                    }
                                 }
                                 my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id};
                                 if ($left_context_rule2) {
                                    my $comb_left_roman2 = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                    if (defined($comb_left_roman2)) {
                                       next unless $comb_left_roman2 =~ /$left_context_rule2/;
                                    } else {
                                       print STDERR " No comb_left_roman2 value for $chart_comb_id $chart1_start,$chart2_start\n";
                                    }
                                 }
                                 # right context rules are checked against the chart edges
                                 # that follow the current span
                                 my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id};
                                 if ($right_context_rule1) {
                                    my $match_p = $this->right_context_match($right_context_rule1, *sd_ht, $chart1_id, $chart1_end);
                                    next unless $match_p;
                                 }
                                 my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id};
                                 if ($right_context_rule2) {
                                    my $match_p = $this->right_context_match($right_context_rule2, *sd_ht, $chart2_id, $chart2_end);
                                    next unless $match_p;
                                 }
                                 my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}->{$cost_rule_id};
                                 my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                 my $combined_cost = $preceding_cost + $cost;
                                 my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
                                 if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                    $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                    push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                                    $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
                                    $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
                                    $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
                                    $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
                                    $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                       = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
                                    $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                       = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
                                    $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
                                    $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost_rule_id;
                                    print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
                                 }
                              }
                           }
                        }
                     }
                  }
               }
            }
            $further_expansion_possible = 1
               if $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code1}->{$roman1}
               || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman1};
         }
         # Stop extending the chart-1 span once nothing longer can match.
         last unless $further_expansion_possible;
         $prev_further_expansion_possible = 1 if $further_expansion_possible;
      }
   }
   my $total_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end_pos}->{$chart2_end_pos};
   unless (defined($total_cost)) {
      # No alignment covers both charts completely.
      $total_cost = 99.9999;
      $sd_ht{MISMATCH}->{$chart_comb_id} = 1;
   }
   return $total_cost;
}
553
+
554
# Debugging aid: write the linearized romanization edges of two charts to a
# filehandle.
#
# Arguments:
#   $this       - object the method is invoked on (unused)
#   *sd_ht      - typeglob of the string-distance hash; edges are read from
#                 $sd_ht{LIN_IJ_ROMAN}->{chart}->{from}->{to}->{roman}
#   $chart1_id, $chart2_id - ids of the charts to list, in that order
#   *OUT        - typeglob of the output filehandle
#
# Positions are listed in ascending numeric order, romanizations in string order.
sub print_sd_ht {
   local($this, *sd_ht, $chart1_id, $chart2_id, *OUT) = @_;

   print OUT "string-distance chart:\n";
   for my $id ($chart1_id, $chart2_id) {
      print OUT "SD chart $id:\n";
      my $edges_by_start = \%{$sd_ht{LIN_IJ_ROMAN}->{$id}};
      for my $from (sort { $a <=> $b } keys %$edges_by_start) {
         my $edges_by_end = \%{$edges_by_start->{$from}};
         for my $to (sort { $a <=> $b } keys %$edges_by_end) {
            my $romans = \%{$edges_by_end->{$to}};
            for my $roman (sort keys %$romans) {
               print OUT " Lnode($from-$to): $roman\n";
            }
         }
      }
   }
}
569
+
570
# Debugging aid: write the nodes of a uroman chart to a filehandle.
#
# Arguments:
#   $this     - object the method is invoked on (unused)
#   *chart_ht - typeglob of the chart hash; reads NODES_STARTING_AT,
#               NODES_STARTING_AND_ENDING_AT->{start}->{end}->{node_id} and
#               NODE_ROMAN->{node_id}
#   *OUT      - typeglob of the output filehandle
#
# Start and end positions are listed in ascending numeric order; nodes that
# share both positions come out in hash order.
sub print_chart_ht {
   local($this, *chart_ht, *OUT) = @_;

   print OUT "uroman chart:\n";
   my @start_positions = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AT}};
   for my $start (@start_positions) {
      my $nodes_by_end = \%{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}};
      for my $end (sort { $a <=> $b } keys %$nodes_by_end) {
         for my $node_id (keys %{$nodes_by_end->{$end}}) {
            $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
            print OUT " Node $node_id ($start-$end): $roman_s\n";
         }
      }
   }
}
583
+
584
# Normalize a UTF-8 encoded byte string before it is romanized.
#
# Arguments:
#   $this - object the method is invoked on (unused)
#   $s    - string to normalize
#
# Returns a copy of $s in which
#   * en-dash and em-dash byte sequences are replaced by "-",
#   * a character repeated 3 or more times in a row is shortened to 2,
#   * every run of spaces and tabs is replaced by a single space.
# Deleting the zero width non-joiner (\xE2\x80\x8C) is deliberately not done here.
sub normalize_string {
   local($this, $s) = @_;

   my $normalized = $s;
   # en-dash, em-dash
   $normalized =~ s/(\xE2\x80[\x93-\x94])/-/g;
   # One character is a lead byte plus its continuation bytes; \1+ finds repeats of it.
   $normalized =~ s/([\x00-\x7F\xC0-\xFE][\x80-\xBF]*)\1+/$1$1/g;
   $normalized =~ s/[ \t]+/ /g;

   return $normalized;
}
594
+
595
# Source of chart ids; incremented once for every chart that is built, so
# each call of string_distance_by_chart consumes two ids.
my $string_distance_chart_id = 0;

# Compute the string distance between two raw strings by romanizing each of
# them into a chart and aligning the two charts.
#
# Arguments:
#   $this        - object the method is invoked on
#   $s1, $s2     - the two strings to compare
#   $lang_code1, $lang_code2 - language codes passed to the romanizer and to
#                  the distance computation
#   *ht          - typeglob of the rule hash
#   *pinyin_ht   - typeglob of the hash passed on to the romanizer
#   $control     - optional control string passed to string_distance
#
# Returns the cost computed by string_distance.  The package hash %sd_ht is
# emptied and refilled by this call.  Relies on $utf8 and $romanizer, which
# are defined elsewhere in this file.
sub string_distance_by_chart {
   local($this, $s1, $s2, $lang_code1, $lang_code2, *ht, *pinyin_ht, $control) = @_;

   $control = "" unless defined($control);
   %sd_ht = ();

   # Chart for the first string.
   $s1 = $this->normalize_string($s1);
   my $lowered1 = $utf8->extended_lower_case($s1);
   my $chart1_id = ++$string_distance_chart_id;
   *chart_ht = $romanizer->romanize($lowered1, $lang_code1, "", *ht, *pinyin_ht, 0, "return chart", $chart1_id);
   $this->linearize_chart_points(*chart_ht, $chart1_id, *sd_ht);
   $this->expand_lin_ij_roman(*sd_ht, $chart1_id, $lang_code1, *ht);

   # Chart for the second string.
   $s2 = $this->normalize_string($s2);
   my $lowered2 = $utf8->extended_lower_case($s2);
   my $chart2_id = ++$string_distance_chart_id;
   *chart_ht = $romanizer->romanize($lowered2, $lang_code2, "", *ht, *pinyin_ht, 0, "return chart", $chart2_id);
   $this->linearize_chart_points(*chart_ht, $chart2_id, *sd_ht);
   $this->expand_lin_ij_roman(*sd_ht, $chart2_id, $lang_code2, *ht);

   my $distance = $this->string_distance(*sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control);
   return $distance;
}
621
+
622
# Number of distances actually computed by
# quick_romanized_string_distance_by_chart (cache hits are not counted).
my $n_quick_romanized_string_distance = 0;

# Compute the string distance between two strings directly on their
# characters, without building romanization charts.
#
# Arguments:
#   $this        - object the method is invoked on (unused)
#   $s1, $s2     - the two strings; both are lower-cased with lc before use
#   *ht          - typeglob of the rule hash (COST, LEFT1, LEFT2, RIGHT1,
#                  RIGHT2; CACHED_QRSD is read and written when caching)
#   $control     - optional; results are cached if it matches /cache/
#   $lang_code1, $lang_code2 - optional language codes; rules stored under
#                  the code "" are always tried as well
#
# Returns the cheapest cost of rewriting $s1 into $s2 with identity steps
# (cost 0) and cost rules, or 99.99 if no sequence of steps covers both
# strings.  Relies on $util (defined elsewhere in this file) for member().
#
# NOTE(review): the cache is keyed on the two lower-cased strings only, so
# cached results are shared between calls with different language codes.
sub quick_romanized_string_distance_by_chart {
   local($this, $s1, $s2, *ht, $control, $lang_code1, $lang_code2) = @_;

   $s1 = lc $s1;
   $s2 = lc $s2;
   $control = "" unless defined($control);
   $lang_code1 = "" unless defined($lang_code1);
   $lang_code2 = "" unless defined($lang_code2);
   my $cache_p = ($control =~ /cache/);
   my $total_cost;
   if ($cache_p) {
      $total_cost = $ht{CACHED_QRSD}->{$s1}->{$s2};
      return $total_cost if defined($total_cost);
   }
   my @lang_codes1 = ($lang_code1 eq "") ? ("") : ($lang_code1, "");
   my @lang_codes2 = ($lang_code2 eq "") ? ("") : ($lang_code2, "");
   my $chart1_end_pos = length($s1);
   my $chart2_end_pos = length($s2);
   # $sd_ht{COST_IJ}->{i}->{j}: cheapest known cost of aligning the first i
   # characters of $s1 with the first j characters of $s2.
   my %sd_ht = ();
   $sd_ht{COST_IJ}->{0}->{0} = 0;
   foreach my $chart1_start ((0 .. $chart1_end_pos)) {
      foreach my $chart1_end (($chart1_start .. $chart1_end_pos)) {
         my $substr1 = substr($s1, $chart1_start, ($chart1_end-$chart1_start));
         # Positions in $s2 already reachable together with $chart1_start;
         # the list grows below while it is being traversed.
         my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart1_start}});
         foreach my $chart2_start (@chart2_starts) {
            foreach my $chart2_end (($chart2_start .. $chart2_end_pos)) {
               my $substr2 = substr($s2, $chart2_start, ($chart2_end-$chart2_start));
               if ($substr1 eq $substr2) {
                  # Identical substrings align at no cost, whatever the language codes.
                  my $cost = 0;
                  my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start};
                  if (defined($preceding_cost)) {
                     my $combined_cost = $preceding_cost + $cost;
                     my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end};
                     if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                        $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost;
                        push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                     }
                  }
               } else {
                  foreach my $lang_code1o (@lang_codes1) {
                     foreach my $lang_code2o (@lang_codes2) {
                        next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2});
                        my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2}};
                        foreach my $cost_rule_id (@cost_rule_ids) {
                           my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2}->{$cost_rule_id};
                           # Left contexts are regular expressions matched against the
                           # part of the string that precedes the current substring.
                           my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id};
                           next if $left_context_rule1
                                && (! (substr($s1, 0, $chart1_start) =~ /$left_context_rule1/));
                           my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id};
                           next if $left_context_rule2
                                && (! (substr($s2, 0, $chart2_start) =~ /$left_context_rule2/));
                           # Right contexts must match at the start of the remainder; a
                           # rule whose first bracketed set contains "$" also accepts
                           # an empty remainder.
                           my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id};
                           my $right_context1 = substr($s1, $chart1_end);
                           next if $right_context_rule1
                                && (! (($right_context1 =~ /^$right_context_rule1/)
                                    || (($right_context_rule1 =~ /^\[[^\[\]]*\$/)
                                     && ($right_context1 eq ""))));
                           my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id};
                           my $right_context2 = substr($s2, $chart2_end);
                           next if $right_context_rule2
                                && (! (($right_context2 =~ /^$right_context_rule2/)
                                    || (($right_context_rule2 =~ /^\[[^\[\]]*\$/)
                                     && ($right_context2 eq ""))));
                           my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start};
                           my $combined_cost = $preceding_cost + $cost;
                           my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end};
                           if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                              $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost;
                              push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                           }
                        }
                     }
                  }
               }
            }
         }
      }
   }
   $total_cost = $sd_ht{COST_IJ}->{$chart1_end_pos}->{$chart2_end_pos};
   $total_cost = 99.99 unless defined($total_cost);
   $ht{CACHED_QRSD}->{$s1}->{$s2} = $total_cost if $cache_p;
   $n_quick_romanized_string_distance++;
   return $total_cost;
}
718
+
719
# Return the current value of the file-level counter
# $n_quick_romanized_string_distance.
sub get_n_quick_romanized_string_distance {
   my $n_computed = $n_quick_romanized_string_distance;
   return $n_computed;
}
722
+
723
+ 1;
724
+
uroman/lib/NLP/utilities.pm ADDED
The diff for this file is too large to render. See raw diff
 
uroman/tarballs/uroman-v1.0.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:912655beef069e5abb43c8fc4c3c4428fd0af6f4a1697accc98277933d3e1ee5
3
+ size 440252
uroman/tarballs/uroman-v1.1.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df990f6096a10e093ac5f28c2b86d5ef9e9098ef7472855843f9a841bb3b963d
3
+ size 507234
uroman/tarballs/uroman-v1.2.4.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77d707f3c17d5c45869b80fe71caee6023d1d9949ccffb446626f374605a25e2
3
+ size 503690
uroman/tarballs/uroman-v1.2.5.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e9044afff8b4483f43a99b1fb1279889336760d76245ee93f300e660a46660
3
+ size 575581
uroman/tarballs/uroman-v1.2.6.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02f6f73b067b972a8f7d408da2f9b22741629af67f55b2ea768d11710fbf40a4
3
+ size 567522
uroman/tarballs/uroman-v1.2.7.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbb51506ed3ea6dcb902c824e62bea39b3741f6526564ba05d6e0083d8d876e5
3
+ size 566800
uroman/tarballs/uroman-v1.2.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c69e56d9c5eea9416ae00ca4dd859a1ef5129c1867778b66ad2f811f0fd33c9
3
+ size 494625
uroman/test/multi-script.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::lcode deu Grüße aus Bordeaux
2
+ ::lcode tur İstanbul, Türkiye'de yer alan şehir ve ülkenin 81 ilinden biri.
3
+ ::lcode eng ⠠⠺⠑⠀⠓⠕⠇⠙⠀⠘⠮⠀⠞⠗⠥⠹⠎⠀⠞⠕⠀⠆⠀⠎⠑⠇⠋⠤⠑⠧⠊⠙⠢⠞⠂⠀⠞⠀⠁⠇⠇⠀⠍⠑⠝⠀⠜⠑⠀⠉⠗⠂⠞⠫⠀⠑⠟⠥⠁⠇⠂⠀⠞⠀⠮⠽⠀⠜⠑⠀⠑⠝⠙⠪⠫⠀⠃⠽⠀⠸⠮⠀⠠⠉⠗⠑⠁⠞⠕⠗⠀⠾⠀⠉⠻⠞⠁⠔⠀⠥⠝⠁⠇⠊⠑⠝⠁⠃⠇⠑⠀⠠⠐⠗⠎⠂⠀⠞⠀⠁⠍⠰⠛⠀⠘⠮⠀⠜⠑⠀⠠⠇⠊⠋⠑⠂⠀⠠⠇⠊⠃⠻⠞⠽⠀⠯⠀⠮⠀⠏⠥⠗⠎⠥⠊⠞⠀⠷⠀⠠⠓⠁⠏⠏⠊⠰⠎⠲
4
+ ::lcode ell Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.
5
+ ::lcode rus Герма́ния (нем. Deutschland), официальное название — Федерати́вная Респу́блика Герма́ния (нем. Bundesrepublik Deutschland), ФРГ (нем. BRD) — государство в Западной Европе. Площадь территории — 357 021 км². Численность населения по переписи 2011 года — более 80 миллионов человек. [2][6].
6
+ ::lcode ukr Володи́мир Олекса́ндрович Зеле́нський (нар. 25 січня 1978, Кривий Ріг) — український державний діяч, політик, шоумен, актор, комік, режисер, продюсер та сценарист, шостий Президент України з 20 травня 2019 року.
7
+ ::lcode srp Сва људска бића рађају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства.
8
+ ::lcode ara كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
9
+ ::lcode fas کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لس‌آنجلس، سن دیگو، سن خوزه و سان‌فرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است.
10
+ ::lcode uig ئامېرىكا قوشما شتاتلىرى بولسا شىمالىي ئامېرىكاغا جايلاشقان بىر دۆلەت. ئۇنىڭ پايتەختى بولسا ۋاشىنگتون، ئەڭ چوڭ شەھىرى بولسا نيۇيورك شەھىرى. دۆلەت تىلى بولسا ئېنگلىزتىلى. ھازىرقى زۇڭتۇڭ باراك ئوباما. بۇ دۆلەت ئەسلىدە ئەنگىلىيەنىڭ مۇستەملىكىسى بولۇپ ۋاشىنگىتوننىڭ رەھپەرلىكىدە 1776 يىلى 7 ئاينىڭ 4 كۇنى مۇستەقىل بولغان، يەر مەيدانى 9 مىلىيون 826 مىڭ 630 كۋادىرات كلومېتىر، نوپۇسى 306 مىللىيون 142 مىڭ، بۇلارنىڭ ئاسساسلىق دىنى خرىستىئان دىنى.
11
+ ::lcode amh ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
12
+ ::lcode hin कैलिफ़ोर्निया शब्द का पहला अर्थ था जो क्षेत्र जहाँ आज बाहा कैलिफ़ोर्निया प्रायद्वीप, नेवाडा, यूटा और एरिज़ोना, नया मेक्सिको, और वायोमिंग के कई विभाग स्थित हैं।
13
+ ::lcode mar लंडन (इंग्लिश: London ) हे इंग्लंडचे व युनायटेड किंग्डमचे राजधानीचे व सर्वात मोठे शहर तसेच युरोपियन संघामधील सर्वात मोठे महानगर क्षेत्र आहे.
14
+ ::lcode nep यसको उचाइ समुन्द्र सतहबाट ८,८४८ मीटर (२९,०२८ फीट) छ। यो नेपालको सोलुखुम्बु जिल्लाको खुम्जुङ्ग गा. वि. स. मा पर्छ ।
15
+ ::lcode tam தமிழ்நாடு (Tamil Nadu) இந்தியாவின் 29 மாநிலங்களில் ஒன்றாகும். தமிழ்நாடு, தமிழகம் என்றும் பரவலாக அழைக்கப்படுகிறது.
16
+ ::lcode mal ഇന്ത്യയുടെ തെക്കുപടിഞ്ഞാറെ അറ്റത്തുള്ള സംസ്ഥാനമാണ് കേരളം.
17
+ ::lcode ori ଓଡ଼ିଶା ଭାରତର ପୂର୍ବ ଉପକୂଳରେ ଥିବା ଏକ ପ୍ରଶାସନିକ ରାଜ୍ୟ । ଏହାର ଉତ୍ତର-ପୂର୍ବରେ ପଶ୍ଚିମବଙ୍ଗ, ଉତ୍ତରରେ ଝାଡ଼ଖଣ୍ଡ, ପଶ୍ଚିମ ଓ ଉତ୍ତର-ପଶ୍ଚିମରେ ଛତିଶଗଡ଼, ଦକ୍ଷିଣ ଓ ଦକ୍ଷିଣ-ପଶ୍ଚିମରେ ଆନ୍ଧ୍ରପ୍ରଦେଶ ଅବସ୍ଥିତ । ଏହା ଆୟତନ ହିସାବରେ ନବମ ଓ ଜନସଂଖ୍ୟା ହିସାବରେ ଏଗାରତମ ରାଜ୍ୟ । ଓଡ଼ିଆ ଭାଷା ରାଜ୍ୟର ସରକାରୀ ଭାଷା । ୨୦୦୧ ଜନଗଣନା ଅନୁସାରେ ରାଜ୍ୟର ପ୍ରାୟ ୩୩.୨ ନିୟୁତ ଲୋକ ଓଡ଼ିଆ ଭାଷା ବ୍ୟବହାର କରନ୍ତି ।
18
+ ::lcode zho 加拿大在一万四千年前即有原住民在此生活。
19
+ ::lcode heb כֹּל עוֹד בַּלֵּבָב פְּנִימָה נֶפֶשׁ יְהוּדִי הוֹמִיָּה וּלְפַאֲתֵי מִזְרָח, קָדִימָה, עַיִן לְצִיּוֹן צוֹפִיָּה, עוֹד לֹא אָבְדָה תִּקְוָתֵנוּ, הַתִּקְוָה בַּת שְׁנוֹת אַלְפַּיִם לִהְיוֹת עַם חָפְשִׁי בְּאַרְצֵנוּ, אֶרֶץ צִיּוֹן וִירוּשָׁלַיִם.
20
+ ::lcode yid דווקא איז אן העברעישער זשורנאל וואס באשרייבט די יידיש־שפראכיקע קולטור. עס איז דערשינען געווארן תמוז ה'תשס"ז (יולי 2006).
21
+ ::lcode hye Տալնոեի շրջան (ուկր.՝ Тальнівський район), շրջան Ուկրաինայի Չերկասիի մարզում։ Ստեղծվել է 1923 թվականին։ Վարչական կենտրոնը՝ Տալնոե։ Աշխարհագրությունը Շրջանի տարածքի մակերեսը կազմում է 917 կմ²։ Բնակչություն
22
+ ::lcode tai มีประเทศอิสระ 2 ประเทศ คือ ซานมารีโนและนครรัฐวาติกัน เป็นดินแดนที่ล้อมรอบไปด้วยพื้นที่ของอิตาลี ในขณะที่เมืองกัมปีโอเนดีตาเลีย เป็นดินแดนส่วนแยกของอิตาลีที่ถูกล้อมรอบด้วยพื้นที่ประเทศสวิตเซอร์แลนด์
23
+ 북쪽에는 인도네시아와 동티모르, 파푸아 뉴기니, 북동쪽에는 솔로몬 제도와 바누아투, 누벨칼레도니, 그리고 남동쪽에는 뉴질랜드가 있다.
24
+ ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗೀ... ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗಿ ಭವ ಭವದಿ ಭತಿಸಿಹೇ ಭವತಿ ದೂರ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ || ಬಾ ಇಲ್ಲಿ ||
25
+ ვეპხის ტყაოსანი შოთა რუსთაველი ღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე, მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა.
26
+ ᚛ᚐᚅᚋ ᚋᚖᚂᚓᚌᚖᚋᚏᚔᚇ ᚋᚐᚉᚔ ᚍᚓᚉᚒᚋᚓᚅ᚜
27
+ ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬
28
+ 𓊪𓏏𓍯𓃭𓐝𓇌𓋴
29
+ チェコスロバキア
30
+ ལྷ་ས་གྲོང་ཁྱེར
31
+ ᓵᓕ ᓴᕕᐊᕐᔪᒃ ᐃᒻᒥᓂᒃ ᓂᓪᓕᕈᑎᖃᓲᖑᕗᖅ ᑕᐃᑦᓱᒪᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ. ᐃᒻᒥᓂᓪᓗᑕᐅᖅ ᓂᓪᓕᕈᑎᖃᓱᖑᒻᒥᓱᓂ ᐅᓪᓗᒥᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ.
32
+ ⴰⵎⴰⴳⵔⴰⴷ 1 ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ.
uroman/test/multi-script.uroman-ref.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::lcode deu Gruesse aus Bordeaux
2
+ ::lcode tur Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri.
3
+ ::lcode eng We hold ⠘e truos to ; self-evid⠢t, t all men aee cr,te equal, t ey aee endoee by ⠸e Creator u cita⠔ unalienable ⠠⠐rs, t amg ⠘e aee Life, Libity ⠯ e pursuit a Happis.
4
+ ::lcode ell To Los Andzeles (sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou.
5
+ ::lcode rus Germaniya (nem. Deutschland), ofitsialnoe nazvanie — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoi Evrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — bolee 80 millionov chelovek. [2][6].
6
+ ::lcode ukr Volodimir Oleksandrovich Zelensky (nar. 25 sichnya 1978, Krivy Rig) — ukrayinsky derzhavny diyach, politik, shoumen, aktor, komik, rezhiser, prodyuser ta stsenarist, shosty Prezident Ukrayini z 20 travnya 2019 roku.
7
+ ::lcode srp Sva ljudska bitsha radjaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sveshtshu i treba jedni prema drugima da postupaju u dukhu bratstva.
8
+ ::lcode ara knda (balinjlyzya: Canada) hy dwla fy amryka alshmalya ttalf mn 10 mqat'at wthlatha aqalym. tq' fy alqsm alshmaly mn alqara wtmtd mn almhyt alatlsy fy alshrq ila almhyt alhadye fy alghrb wtmtd shmalan fy almhyt almtjmd alshmaly. knda hy albld althany 'almyan mn hyth almsaha alklya. kma an hdwd knda almshtrka m' alwlayat almthda mn aljnwb walshmal alghrby hy alatwl fy al'alm.
9
+ ::lcode fas kalifrnia (bh anglisi: California) ialti dr ghrb amrika br kranh' aqianws aram ast. mrkz an sakramntw w shhrhai mhm an lsanjls, sn digw, sn khwzh w sanfransiskw hstnd.hmtchnin in ialt pr jm'it trin ialt amrika ast.
10
+ ::lcode uig yeameraka qwshma shtatlara bwlsa shamalay yeamerakagha jaylashqan bar doelaet. yeunang paytaekhta bwlsa vashangtwn, yeaeng tchwng shaehara bwlsa nyuywrk shaehara. doelaet tala bwlsa yeenglaztala. hazarqa zungtung barak yewbama. bu doelaet yeaesladae yeaengalayaenang mustaemlakasa bwlup vashangatwnnang raehpaerlakadae 1776 yala 7 yeaynang 4 kuna mustaeqal bwlghan, yaer maeydana 9 malaywn 826 mang 630 kvadarat klwmetar, nwpusa 306 mallaywn 142 mang, bularnang yeassaslaq dana khrastayean dana.
11
+ ::lcode amh iteyopheyaa kaaalame sosetu teleqe yaaberehaame hayemaanotoche gaare taarikaawi genenyunate alaate.
12
+ ::lcode hin kailiphorniyaa shabda kaa pahalaa artha thaa jo kssetra jahaam aaj baahaa kailiphorniyaa praayadviip, nevaaddaa, yuuttaa aur erijonaa, nayaa meksiko, aur vaayomimga ke kaii vibhaag sthit haim.
13
+ ::lcode mar lamddan (imglish: London ) he imglamddace va yunaayattedd kimgddamace raajadhaaniice va sarvaat motthe shahar tasec yuropiyan samghaamadhiil sarvaat motthe mahaanagar kssetra aahe.
14
+ ::lcode nep yasako ucaai samundra satahabaatt 8,848 miittar (29,028 phiitt) cha. yo nepaalako solukhumbu jillaako khumjungga gaa. vi. sa. maa parcha .
15
+ ::lcode tam tamilnaadu (Tamil Nadu) intiyaavin 29 maanilangkalil onraakum. tamilnaadu, tamilakam enrum paravalaaka alaikkappadukiratu.
16
+ ::lcode mal intyayutte tekkupattinynyaarre arrrrattulllla samsthaanamaann keerallam.
17
+ ::lcode ori oddishaa bhaaratara puurba upakuullare thibaa eka prashaasanika raajya . ehaara uttara-puurbare pashcimabangga, uttarare jhaaddakhanndda, pashcima o uttara-pashcimare chatishagadda, dakssinna o dakssinna-pashcimare aandhrapradesha abasthita . ehaa aayatana hisaabare nabama o janasamkhyaa hisaabare egaaratama raajya . oddiaa bhaassaa raajyara sarakaarii bhaassaa . 2001 janagannanaa anusaare raajyara praaya 33.2 niyuta loka oddiaa bhaassaa byabahaara karanti .
18
+ ::lcode zho jianadazai14000nianqianjiyouyuanzhuminzaicishenghuo.
19
+ ::lcode heb kol 'od balevav penimah nefesh yehudi homiyah ulefa'ate mizerach, qadimah, 'ayin letsiyon tsofiyah, 'od lo avedah tiqvatenu, hatiqvah bat shenot 'alepayim liheyot 'am chafeshiy be'aretsenu, erets tsiyon virushalayim.
20
+ ::lcode yid dvvqa ayz an h'vr'ysh'r zshvrnal vvas vashryyvt dy yydysh-shfrakyq' qvltvr. 's ayz d'rshyn'n g'vvarn tmvz h'tshs"z (yvly 2006).
21
+ ::lcode hye Talnoei shrjan (ukr., Talnivsky raion), shrjan Ukrainayi Cherkasii marzum. Steghtsvel e 1923 tvakanin. Varchakan kentrone, Talnoe. Ashkharhagrutyune Shrjani taratski makerese kazmum e 917 km². Bnakchutyun
22
+ ::lcode tai miipratesisra 2 prates kuee saanmaariinolaeankrratwaatikan peondindaentiilomrobpaidwypueentiikongitaalii naiknatiimeueengkampiionediitaaleiiy peondindaenswnyaekkongitaaliitiituuklomrobdwypueentiipratesswitserlaend
23
+ bugjjogeneun indonesiawa dongtimoreu, papua nyugini, bugdongjjogeneun solromon jedowa banuatu, nubelkalredoni, geurigo namdongjjogeneun nyujilraendeuga issda.
24
+ baa illi sambhavisu imdenna hrdayadali nityavuu avataripa satyaavataara mannnnaagi maravaagi migavaagi kagavaagii... mannnnaagi maravaagi migavaagi kagavaagi bhava bhavadi bhatisihee bhavati duura nityavuu avataripa satyaavataara || baa illi ||
25
+ vepxis tqaosani shota rustaveli ghmertsi shemvedre, nutu kvla damxsnas sophlisa shromasa, tsetsxls, tsqalsa da mitsasa, haerta tana mromasa; momtsnes phrteni da aghvphrinde, mivhxvde mas chemsa ndomasa, dghisit da ghamit vhxedvide mzisa elvata krtomaasa.
26
+ anm moilegoimrid maki vekumen
27
+ ic mag glas eotan ond hit ne hearmiath me.
28
+ ptolmys
29
+ chekosurobakia
30
+ lha·sa·grong·khyer
31
+ saali safiaryok imminik nillirotiqasoongofoq taitsomanitatsayaonirarsoni. imminillotaoq nillirotiqasongommisoni ollominitatsayaonirarsoni.
32
+ amagrad 1 ar d ttlalan middn gan ilellitn mgaddan gh waddur d izrfan, yili ak darsn unlli d ufrak, illa flla sn ad ttmyawasn ngratsn s tagmat.
uroman/test/string-similarity-test-input.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ trap strap
2
+ colour color
3
+ labeling labelling
4
+ organisation organization
5
+ Philadelphia Filadelfia
6
+ Vladimir Volodymyr
7
+ Moskva Moskvoy
uroman/test/string-similarity-test-output-ref.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Lang-code-1: eng Lang-code-2: eng
2
+ trap strap 1
3
+ colour color 0.1
4
+ labeling labelling 0.02
5
+ organisation organization 0.1
6
+ Philadelphia Filadelfia 0.02
7
+ Vladimir Volodymyr 0.5
8
+ Moskva Moskvoy 0.5
uroman/text/amh.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
2
+ ክርስትናን በአራተኛው ምዕተ-ዓመት ተቀብላለች።
3
+ ከሕዝቡ አንድ ሶስተኛው እስላም ነው።
4
+ የመጀመሪያው የእስላም ሂጅራ ወደ ኢትዮጵያ ነው የተከናወነው።
5
+ ነጋሽ በአፍሪካ የመጀመሪያው የእስላም መቀመጫ ናት።
6
+ እስከ ፲፱፻፸ ዎቹ ድረስ ብዙ ቤተ-እስራኤሎች በኢትዮጵያ ይኖሩ ነበር።
7
+ የራስ ተፈሪ እንቅስቃሴ ኢትዮጵያን በትልቅ ክብር ነው የሚያያት።
uroman/text/ara.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
2
+ أراضي كندا مأهولة منذ آلاف السنين من قبل مجموعات مختلفة من السكان الأصليين. مع حلول أواخر القرن الخامس عشر بدأت الحملات البريطانية والفرنسية استكشاف المنطقة ومن ثم استوطنتها على طول ساحل المحيط الأطلسي. تنازلت فرنسا عن ما يقرب من جميع مستعمراتها في أمريكا الشمالية في عام 1763 بعد حرب السنوات السبع. في عام 1867، مع اتحاد ثلاثة مستعمرات بريطانية في أمريكا الشمالية عبر كونفدرالية تشكلت كندا باعتبارها كيانًا فدراليًا ذا سيادة يضم أربع مقاطعات. بدأ ذلك عملية اتسعت فيها مساحة كندا وتوسع حكمها الذاتي عن المملكة المتحدة. تجلت هذه الاستقلالية من خلال تشريع وستمنستر عام 1931 وبلغت ذروتها في صورة قانون كندا عام 1982 والذي قطع الاعتماد القانوني لكندا على البرلمان البريطاني.
3
+ كندا دولة فيدرالية يحكمها نظام ديمقراطي تمثيلي وملكية دستورية حيث الملكة إليزابيث الثانية قائدة للدولة. الأمة الكندية أمة ثنائية اللغة حيث الإنكليزية والفرنسية لغتان رسميتان على المستوى الاتحادي. تعد كندا واحدة من أكثر دول العالم تطوراً، حيث تمتلك اقتصاداً متنوعاً وتعتمد على مواردها الطبيعية الوفيرة، وعلى التجارة وبخاصة مع الولايات المتحدة اللتان تربطهما علاقة طويلة ومعقدة. كندا عضو في مجموعة الدول الصناعية السبع ومجموعة الثماني ومجموعة العشرين وحلف شمال الأطلسي ومنظمة التعاون والتنمية الاقتصادية ومنظمة التجارة العالمية ودول الكومنولث والفرنكوفونية ومنظمة الدول الأمريكية والإبيك والأمم المتحدة. تمتلك كندا واحداً من أعلى مستويات المعيشة في العالم حيث مؤشر التنمية البشرية يضعها في المرتبة الثامنة عالمياً.
uroman/text/ben.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ বার্লিন (জার্মান: Berlin বেয়ালিন্‌') জার্মানির রাজধানী, এবং ইউরোপ মহাদেশের একটি ঐতিহাসিক শহর। বার্লিন শহরে ৩৪ লক্ষেরও বেশি লোক বাস করেন। শহরটি একাধারে একটি শহর এবং জার্মানির একটি রাজ্য। বার্লিনের আয়তন ৩৪৩ বর্গমাইল; এটির আয়তন প্যারিস শহরের প্রায় ৯ গুণ।
2
+ বার্লিন একটি বহুসাংস্কৃতিক শহর। বিশ্বের ১৮৪টি দেশ থেকে আগত প্রায় ৪ লক্ষ ৩০ হাজার অভিবাসী বার্লিনে বাস করে। এদের মধ্যে তুরস্ক থেকে আগত অভিবাসীরা সংখ্যা সবচেয়ে বেশি; বার্লিনে প্রায় ১ লক্ষ ১৯ হাজার তুর্কি অভিবাসী বাস করে। তুরস্কের বাইরে বার্লিনেই ইউরোপে তুর্কিদের সবচেয়ে বড় সম্প্রদায় অবস্থিত।
3
+ ১৯৪৯ সাল থেকে ১৯৯০ পর্যন্ত বার্লিন পূর্ব বার্লিন ও পশ্চিম বার্লিন---এই দুই ভাগে বিভক্ত ছিল। ১৯৬১ সালে পূর্ব জার্মান সরকার সেখানকার নাগরিকদের পশ্চিম বার্লিনে পালিয়ে যাওয়া ঠেকাতে দুই বার্লিনের মাঝে একটি দেয়াল তুলে দেয়। দেয়ালটি ১৯৬১ সাল থেকে ১৯৮৯ সাল পর্যন্ত টিকে ছিল। ঐ সময় ৫ হাজারেরও বেশি ব্যক্তি দেয়ালটি টপকানোর চেষ্টা করে; এদের মধ্যে ৩২০০ জনকে গ্রেফতার করা হয় এবং ১৯১ জন নিহত হয়।
4
+ ১৯৮৯ সালে দেয়ালটি ভেঙে ফেলার পর বার্লিনের ব্রান্ডেনবুর্গ ফটক পূর্ব ও পশ্চিম বার্লিনের পুনঃএকত্রীকরণের প্রতীক হিসেবে দাঁড়িয়ে আছে।
5
+ বার্লিনের স্থানীয় ফুটবল দলের নাম হের্টা বে এস ৎসে বের্লিন। তারা ঘরোয়া ম্যাচগুলি বার্লিনের "অলিম্পিয়াষ্টাডিয়ন" নামের স্টেডিয়ামে খেলে থাকে। এই স্টেডিয়ামেই ১৯৩৬ সালের গ্রীষ্মকালীন অলিম্পিক্‌স অনুষ্ঠিত হয়।
6
+ বার্লিনে কুকুর পোষা খুবই ব্যয়বহুল একটি কাজ। কুকুরের মালিককে প্রতি বছর দেড়শ ইউরো কর দিতে হয়।
7
+ বার্লিনের কাউফ্‌হাউস ডেস ভেস্টেন্‌স (Kaufhaus des Westens, সংক্ষেপে KaDeWe, কাডেভে) ইউরোপের বৃহত্তম ডিপার্টমেন্ট স্টোর। এর আট তলাবিশিষ্ট ভবনে প্রায় ৪ লক্ষ জিনিস বেচা কেনা হয়।
8
+ মার্কিন যুক্তরাষ্ট্রের লস অ্যাঞ্জেলেস বার্লিনের ভগ্নী শহর।
uroman/text/bod.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ཁྲིན་ཀོན་ཆུས
2
+ ལྷ་ས་གྲོང་ཁྱེར
3
+ [[ཁྲིན་ཀོན་ཆུས་‎ཞེས་པ་ནི་རྒྱ་ནག་གཞུང་གིས་བཙན་འཛུལ་བྱས་རྗེས་བཏགས་པའི་མིང་ཞིག་ཡིན་པ་དང། དེ་ནི་ད་ལྟའི་ཆར་ལྷ་ས་གྲོང་ཁྱེར་གྱི་ཁོངས་གཏོགས་རྫོང་ཁག་བདུན་པོ་ཕུད་པའི་གྲོང་ཁྱེར་ནང་ཁུལ་གྱི་ས་ཁུལ་ཁག་བསྡུས་པའི་གནས་དེར་ཁྲེང་ཀོན་ཆུས་ཞེས་པའི་ཁོངས་སུ་གཏོགས་པར་བཤད་ཡོད་ཅིང། ནུབ་ཏུ་སྟོད་ལུང་ས་འབྲེལ་འབྲས་སྤུངས་དན་བག་ཡན་དང་ཤར་དུ་གཤོངས་ཀ་གླིང་ཡན་ཙམ་དུ་ཡིན་ཚོད་འདུག]]
uroman/text/egy.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 𓈎𓃭𓇋𓍯𓊪𓄿𓆓𓂋𓄿𓏏𓆇
2
+ 𓊪𓏏𓍯𓃭𓐝𓇌𓋴
3
+ 𓆿𓍧𓎇𓏻
4
+ 𓇌𓊪𓏲𓌙𓈉
5
+
uroman/text/ell.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.
2
+ Βρίσκεται στη δυτική ακτή των Η.Π.Α., στην πολιτεία της Καλιφόρνιας.
3
+ Έχει 3,85 εκατομμύρια κατοίκους σύμφωνα με εκτίμηση του 2006 και έκταση 1.214,9 τετραγωνικών χιλιομέτρων.
4
+ Η αχανής μητροπολιτική περιοχή του Λος Άντζελες εκτιμάται ότι αριθμεί περίπου 13 εκατομμύρια κατοίκους, οι οποίοι αποκαλούνται Angelenos.
5
+ Η πόλη αποτελεί ένα από τα πιο κοσμοπολίτικα μέρη στον κόσμο, καθώς κατοικούν άνθρωποι προερχόμενοι από κάθε γωνιά της γης, που προσελκύονται από το ευχάριστο κλίμα, τον έντονο και γεμάτο ενέργεια τρόπο ζωής αλλά και την υπόσχεση του αμερικανικού ονείρου.
6
+
7
+ Γερούν Ντάισελμπλουμ
8
+ Γιώργος Κωνσταντινίδης
uroman/text/fas.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لس‌آنجلس، سن دیگو، سن خوزه و سان‌فرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است.
2
+ نام این ایالت از زبان اسپانیولی و به خصوص از رمانی به نام Las sergas de Esplandián گرفته شده، و متعلق به شخصیتی از این داستان است به نام ملکه Califia که احتمالاً از واژهٔ عربی «خلیفه» گرفته شده.[۲]
3
+ کالیفرنیا پرجمعیت‌ترین ایالت ایالات متحده آمریکاست و نیز بزرگ‌ترین جمعیت ایرانی تبار خارج از خاور میانه را در خود جای داده است[۳] به طوری که چندین تن از اعضای شورای شهر بورلی هیلز ایرانی‌الاصل هستند.
4
+ کالیفرنیا نهمین اقتصاد جهان است. در سال ۲۰۱۲، این ایالت تولید ناخالص داخلی برابر با ۱٬۹۵۸٬۹۰۴تریلیون دلار داشت، که نزدیک به تولید ناخالص داخلی کشور ایتالیا (۲٬۰۱۳٬۳۷۵ میلیون دلار) بود.[۴]
5
+ کالیفرنیا صنعت فناوری اطلاعات و رایانه‌ای بسیار پیشرفته‌ای دارد به طوری که شرکتهای اوراکل، سیسکو سیستمز، اینتل، گوگل، یاهو، شرکت ای‌ام‌دی، سان مایکروسیستمز، و نیز شرکت رایانه‌ای اپل و شِوران نیز در این ایالت مرکزیت دارند.
6
+ علاوه بر این، دو عدد از آزمایشگاه‌های فدرال بزرگ آمریکا در این ایالت قرار دارند از جمله آزمایشگاه ملی لارنس لیورمور و آزمایشگاه ملی لارنس برکلی.