shasenem Akmyradov committed on
Commit
9bff372
0 Parent(s):

Duplicate from Akmyradov/TurkmenTTSweSTT

Browse files

Co-authored-by: Yslam <Akmyradov@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +94 -0
  4. asr.py +41 -0
  5. data/asr/all_langs.tsv +1 -0
  6. data/lid/all_langs.tsv +4017 -0
  7. data/tts/all_langs.tsv +1 -0
  8. requirements.txt +11 -0
  9. tts.py +173 -0
  10. uroman/.gitignore +35 -0
  11. uroman/LICENSE.txt +11 -0
  12. uroman/README.md +165 -0
  13. uroman/README.txt +141 -0
  14. uroman/bin/de-accent.pl +201 -0
  15. uroman/bin/string-distance.pl +99 -0
  16. uroman/bin/uroman-quick.pl +58 -0
  17. uroman/bin/uroman-tsv.sh +28 -0
  18. uroman/bin/uroman.pl +138 -0
  19. uroman/data/Chinese_to_Pinyin.txt +0 -0
  20. uroman/data/Scripts.txt +135 -0
  21. uroman/data/UnicodeData.txt +0 -0
  22. uroman/data/UnicodeDataOverwrite.txt +442 -0
  23. uroman/data/romanization-table-arabic-block.txt +179 -0
  24. uroman/data/romanization-table.txt +2019 -0
  25. uroman/data/romanization-table.v1.2.1.txt +814 -0
  26. uroman/data/string-distance-cost-rules.txt +896 -0
  27. uroman/lib/JSON.pm +2317 -0
  28. uroman/lib/JSON/backportPP.pm +2806 -0
  29. uroman/lib/JSON/backportPP/Boolean.pm +27 -0
  30. uroman/lib/JSON/backportPP/Compat5005.pm +131 -0
  31. uroman/lib/JSON/backportPP/Compat5006.pm +173 -0
  32. uroman/lib/NLP/Chinese.pm +239 -0
  33. uroman/lib/NLP/English.pm +0 -0
  34. uroman/lib/NLP/Romanizer.pm +2020 -0
  35. uroman/lib/NLP/UTF8.pm +1404 -0
  36. uroman/lib/NLP/stringDistance.pm +724 -0
  37. uroman/lib/NLP/utilities.pm +0 -0
  38. uroman/tarballs/uroman-v1.0.tar.gz +3 -0
  39. uroman/tarballs/uroman-v1.1.tar.gz +3 -0
  40. uroman/tarballs/uroman-v1.2.4.tar.gz +3 -0
  41. uroman/tarballs/uroman-v1.2.5.tar.gz +3 -0
  42. uroman/tarballs/uroman-v1.2.6.tar.gz +3 -0
  43. uroman/tarballs/uroman-v1.2.7.tar.gz +3 -0
  44. uroman/tarballs/uroman-v1.2.tar.gz +3 -0
  45. uroman/test/multi-script.txt +32 -0
  46. uroman/test/multi-script.uroman-ref.txt +32 -0
  47. uroman/test/string-similarity-test-input.txt +7 -0
  48. uroman/test/string-similarity-test-output-ref.txt +8 -0
  49. uroman/text/amh.txt +7 -0
  50. uroman/text/ara.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MMS
3
+ emoji: ⚡
4
+ colorFrom: pink
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.32.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: cc-by-nc-4.0
11
+ duplicated_from: Akmyradov/TurkmenTTSweSTT
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ from asr import transcribe
4
+ from tts import synthesize, TTS_EXAMPLES
5
+
6
# Maps task name ("tts", "asr", "lid") -> {language code: language name},
# filled from data/<task>/all_langs.tsv when the module is imported.
ALL_LANGUAGES = {}

for task in ["tts", "asr", "lid"]:
    ALL_LANGUAGES.setdefault(task, {})
    # Explicit UTF-8: names such as "Norwegian Bokmål" are not ASCII, and the
    # platform default encoding is not guaranteed to decode them.
    with open(f"data/{task}/all_langs.tsv", encoding="utf-8") as f:
        for line in f:
            # Drop the trailing newline so it does not end up inside the
            # language name (and therefore inside every dropdown label).
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing lines
            # First whitespace run (space or tab) separates code from name;
            # the name itself may contain spaces and commas.
            iso, name = line.split(maxsplit=1)
            ALL_LANGUAGES[task][iso] = name
14
+
15
+
16
def identify(microphone, file_upload):
    """Return language-identification scores for a recorded or uploaded clip.

    Args:
        microphone: Path to the microphone recording, or ``None``.
        file_upload: Path to the uploaded audio file, or ``None``.

    Returns:
        An ``"ERROR: ..."`` string when neither input was supplied; otherwise
        a dict mapping ``"<code>: <language name>"`` labels to scores.

    NOTE(review): the scores below are a hard-coded placeholder; the decoded
    audio is not passed to any model yet.
    """
    LID_SAMPLING_RATE = 16_000

    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # The microphone recording takes precedence when both inputs are present.
    audio_fp = microphone if microphone is not None else file_upload
    # Still decode the clip so an unreadable file fails here, exactly as it
    # did before; the samples themselves are unused until a model is wired in.
    librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)

    raw_output = {"eng": 0.9, "hin": 0.04, "heb": 0.03, "ara": 0.02, "fra": 0.01}
    lid_names = ALL_LANGUAGES["lid"]
    return {(k + ": " + lid_names[k]): v for k, v in raw_output.items()}
34
+
35
+
36
# Root container; the task tabs are mounted into it further down.
demo = gr.Blocks()

# Speech-to-text tab: a microphone recording or an uploaded clip, plus the
# language whose adapter should be used for transcription.
mms_transcribe = gr.Interface(
    title="Speech-to-text",
    description="Transcribe audio!",
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(
            # Each choice is rendered as "<code>: <name>".
            choices=[": ".join(entry) for entry in ALL_LANGUAGES["asr"].items()],
            value="tuk-script_latin: Turkmen",
            label="Language",
        ),
    ],
    outputs="text",
    allow_flagging="never",
)
54
+
55
# Text-to-speech tab: input text, target language and a speed slider; shows
# the generated audio together with the text that was actually synthesized.
mms_synthesize = gr.Interface(
    title="Text-to-speech",
    description="Generate audio!",
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(
            # Each choice is rendered as "<code>: <name>".
            choices=[": ".join(entry) for entry in ALL_LANGUAGES["tts"].items()],
            value="tuk-script_latin: Turkmen",
            label="Language",
        ),
        gr.Slider(label="Speed", minimum=0.1, maximum=4.0, step=0.1, value=1.0),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Text(label="Filtered text after removing OOVs"),
    ],
    examples=TTS_EXAMPLES,
    allow_flagging="never",
)
75
+
76
# Language-identification tab: a microphone recording or an uploaded clip in,
# a ranked label widget out.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=10),
    title="Language Identification",
    # Fixed user-facing typo: was "Identity the language of audio!".
    description="Identify the language of audio!",
    allow_flagging="never",
)

# Mount the three task interfaces as tabs inside the root Blocks container.
with demo:
    gr.TabbedInterface(
        [mms_synthesize, mms_transcribe, mms_identify],
        ["Text-to-speech", "Speech-to-text", "Language Identification"],
    )

demo.launch()
asr.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ from transformers import Wav2Vec2ForCTC, AutoProcessor
3
+ import torch
4
+
5
# Checkpoint that both the processor and the CTC model are loaded from.
MODEL_ID = "facebook/mms-1b-all"

# Sample rate that audio is resampled to on load and that is reported to the
# processor in transcribe().
ASR_SAMPLING_RATE = 16_000

# Loaded once at import time and shared by every transcribe() call.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
processor = AutoProcessor.from_pretrained(MODEL_ID)
12
+
13
+
14
def transcribe(microphone, file_upload, lang):
    """Transcribe a microphone recording or an uploaded audio file.

    Args:
        microphone: Path to the microphone recording, or ``None``.
        file_upload: Path to the uploaded audio file, or ``None``.
        lang: Dropdown entry of the form ``"<code>: <name>"``; only the part
            before the first colon is used as the language code.

    Returns:
        The decoded transcription, prefixed by a warning when both audio
        inputs were supplied, or an ``"ERROR: ..."`` string when the audio or
        the language selection is missing.
    """
    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # Validate the language before any audio decoding or adapter loading so a
    # missing/blank selection yields a readable message instead of a crash.
    lang_code = lang.split(":")[0].strip() if lang else ""
    if not lang_code:
        return "ERROR: You have to select a language"

    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence when both inputs are present.
    audio_fp = microphone if microphone is not None else file_upload
    audio_samples = librosa.load(audio_fp, sr=ASR_SAMPLING_RATE, mono=True)[0]

    # Switch the shared tokenizer and model to the requested language.
    processor.tokenizer.set_target_lang(lang_code)
    model.load_adapter(lang_code)

    inputs = processor(
        audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**inputs).logits

    # Greedy decoding: highest-scoring token per frame of the first item.
    ids = torch.argmax(outputs, dim=-1)[0]
    transcription = processor.decode(ids)
    return warn_output + transcription
data/asr/all_langs.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ tuk-script_latin Turkmen
data/lid/all_langs.tsv ADDED
@@ -0,0 +1,4017 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ara Arabic
2
+ eng English
3
+ cmn Chinese, Mandarin
4
+ spa Spanish
5
+ fra French
6
+ mlg Malagasy
7
+ ful Fulah
8
+ swe Swedish
9
+ por Portuguese
10
+ zlm Malay
11
+ sun Sunda
12
+ tuk Turkmen
13
+ vie Vietnamese
14
+ kor Korean
15
+ hin Hindi
16
+ ben Bengali
17
+ som Somali
18
+ asm Assamese
19
+ swh Swahili
20
+ urd Urdu
21
+ hau Hausa
22
+ ind Indonesian
23
+ tat Tatar
24
+ bod Tibetan, Central
25
+ tel Telugu
26
+ mon Mongolian
27
+ aze Azerbaijani
28
+ rus Russian
29
+ tgl Tagalog
30
+ tur Turkish
31
+ mar Marathi
32
+ amh Amharic
33
+ ron Romanian
34
+ yor Yoruba
35
+ tha Thai
36
+ slv Slovene
37
+ heb Hebrew
38
+ mkd Macedonian
39
+ bel Belarusian
40
+ nya Chichewa
41
+ mal Malayalam
42
+ bul Bulgarian
43
+ hun Hungarian
44
+ hat Haitian Creole
45
+ fas Persian
46
+ hrv Croatian
47
+ cat Catalan
48
+ tam Tamil
49
+ orm Oromo
50
+ kmr Kurdish, Northern
51
+ nld Dutch
52
+ cak Kaqchikel
53
+ afr Afrikaans
54
+ pol Polish
55
+ jav Javanese
56
+ lin Lingala
57
+ cym Welsh
58
+ kik Gikuyu
59
+ nob Norwegian Bokmål
60
+ grn Guarani
61
+ snd Sindhi
62
+ kaz Kazakh
63
+ isl Icelandic
64
+ uzb Uzbek
65
+ bos Bosnian
66
+ mya Burmese
67
+ lat Latin
68
+ deu German, Standard
69
+ npi Nepali
70
+ che Chechen
71
+ yue Chinese, Yue
72
+ kat Georgian
73
+ kan Kannada
74
+ lit Lithuanian
75
+ mam Mam
76
+ sqi Albanian
77
+ hye Armenian
78
+ jpn Japanese
79
+ ell Greek
80
+ crh Crimean Tatar
81
+ lav Latvian
82
+ khm Khmer
83
+ bak Bashkort
84
+ poh Poqomchi’
85
+ quc K’iche’
86
+ pan Punjabi, Eastern
87
+ ixl Ixil
88
+ xog Soga
89
+ ces Czech
90
+ tgk Tajik
91
+ cfm Chin, Falam
92
+ fao Faroese
93
+ guj Gujarati
94
+ aka Akan
95
+ ukr Ukrainian
96
+ glg Galician
97
+ ltz Luxembourgish
98
+ sxn Sangir
99
+ sna Shona
100
+ lao Lao
101
+ mlt Maltese
102
+ sin Sinhala
103
+ lug Ganda
104
+ aiw Aari
105
+ kia Kim
106
+ ayo Ayoreo
107
+ dtp Kadazan Dusun
108
+ cmo Mnong, Central
109
+ nhx Nahuatl, Isthmus-Mecayapan
110
+ gag Gagauz
111
+ tzj Tz’utujil
112
+ tuv Turkana
113
+ acr Achi
114
+ mri Maori
115
+ eus Basque
116
+ pus Pushto
117
+ quy Quechua, Ayacucho
118
+ srp Serbian
119
+ ita Italian
120
+ nno Norwegian Nynorsk
121
+ xsm Kasem
122
+ luo Dholuo
123
+ ory Odia
124
+ gur Farefare
125
+ cac Chuj
126
+ quh Quechua, South Bolivian
127
+ ewe Éwé
128
+ kbp Kabiyè
129
+ saq Samburu
130
+ slk Slovak
131
+ xon Konkomba
132
+ fin Finnish
133
+ mos Mòoré
134
+ bwq Bobo Madaré, Southern
135
+ yao Yao
136
+ hne Chhattisgarhi
137
+ rif Tarifit
138
+ new Newar
139
+ hus Huastec
140
+ dyu Jula
141
+ bre Breton
142
+ guh Guahibo
143
+ bis Bislama
144
+ yid Yiddish
145
+ txa Tombonuo
146
+ mnk Mandinka
147
+ uig Uyghur
148
+ bqc Boko
149
+ dan Danish
150
+ ngl Lomwe
151
+ pse Malay, Central
152
+ bam Bamanankan
153
+ mtg Una
154
+ pmf Pamona
155
+ onb Lingao
156
+ ntm Nateni
157
+ tso Tsonga
158
+ bno Bantoanon
159
+ teo Ateso
160
+ uhn Damal
161
+ ycl Lolopo
162
+ bus Bokobaru
163
+ ttq Tamajaq, Tawallammat
164
+ mcr Menya
165
+ seh Sena
166
+ kru Kurux
167
+ lok Loko
168
+ est Estonian
169
+ tpi Tok Pisin
170
+ zne Zande
171
+ bxk Bukusu
172
+ mzi Mazatec, Ixcatlán
173
+ amf Hamer-Banna
174
+ rel Rendille
175
+ sck Sadri
176
+ lcp Lawa, Western
177
+ gbo Grebo, Northern
178
+ adx Tibetan, Amdo
179
+ tcc Datooga
180
+ cnh Chin, Hakha
181
+ pwg Gapapaiwa
182
+ wlx Wali
183
+ rjs Rajbanshi
184
+ thl Tharu, Dangaura
185
+ xal Kalmyk-Oirat
186
+ dos Dogosé
187
+ lis Lisu
188
+ txu Kayapó
189
+ sxb Suba
190
+ gng Ngangam
191
+ ifa Ifugao, Amganad
192
+ beh Biali
193
+ poe Popoloca, San Juan Atzingo
194
+ dga Dagaare, Southern
195
+ dsh Daasanach
196
+ vmw Makhuwa
197
+ mup Malvi
198
+ lnd Lundayeh
199
+ kbo Keliko
200
+ cwa Kabwa
201
+ rol Romblomanon
202
+ khg Tibetan, Khams
203
+ nko Nkonya
204
+ dgi Dagara, Northern
205
+ kml Kalinga, Tanudan
206
+ nxq Naxi
207
+ acn Achang
208
+ pxm Mixe, Quetzaltepec
209
+ wal Wolaytta
210
+ ctg Chittagonian
211
+ dnw Dani, Western
212
+ pui Puinave
213
+ lew Kaili, Ledo
214
+ bfa Bari
215
+ mqj Mamasa
216
+ rmc Romani, Carpathian
217
+ mhy Ma’anyan
218
+ xsr Sherpa
219
+ gri Ghari
220
+ bfy Bagheli
221
+ kqp Kimré
222
+ frd Fordata
223
+ ayr Aymara, Central
224
+ mip Mixtec, Apasco-Apoala
225
+ nym Nyamwezi
226
+ tzh Tzeltal
227
+ kcg Tyap
228
+ tex Tennet
229
+ lbw Tolaki
230
+ sda Toraja-Sa’dan
231
+ kdt Kuay
232
+ bfo Birifor, Malba
233
+ qxl Quichua, Salasaca Highland
234
+ ttc Tektiteko
235
+ bfz Pahari, Mahasu
236
+ mhx Lhao Vo
237
+ sbp Sangu
238
+ mco Mixe, Coatlán
239
+ mbu Mbula-Bwazza
240
+ mxt Mixtec, Jamiltepec
241
+ nzi Nzema
242
+ suz Sunwar
243
+ hlt Chin, Matu
244
+ tzo Tzotzil
245
+ any Anyin
246
+ gna Kaansa
247
+ sid Sidamo
248
+ alp Alune
249
+ maj Mazatec, Jalapa de Díaz
250
+ zim Mesme
251
+ knj Akateko
252
+ zar Zapotec, Rincón
253
+ mxb Mixtec, Tezoatlán
254
+ bdu Oroko
255
+ bbc Batak Toba
256
+ ddn Dendi
257
+ obo Manobo, Obo
258
+ krs Gbaya
259
+ zaq Zapotec, Aloápam
260
+ ife Ifè
261
+ soy Miyobe
262
+ trs Triqui, Chicahuaxtla
263
+ mbj Nadëb
264
+ tuo Tucano
265
+ atb Zaiwa
266
+ vif Vili
267
+ mim Mixtec, Alacatlatzala
268
+ grc Greek, Ancient
269
+ cek Chin, Eastern Khumi
270
+ kfx Pahari, Kullu
271
+ naw Nawuri
272
+ tgj Tagin
273
+ xed Hdi
274
+ hnn Hanunoo
275
+ had Hatam
276
+ kij Kilivila
277
+ nlc Nalca
278
+ kek Q’eqchi’
279
+ rej Rejang
280
+ fon Fon
281
+ amk Ambai
282
+ kyb Kalinga, Butbut
283
+ dnj Dan
284
+ oku Oku
285
+ gil Kiribati
286
+ mag Magahi
287
+ lln Lele
288
+ pil Yom
289
+ pls Popoloca, San Marcos Tlacoyalco
290
+ box Buamu
291
+ kwf Kwara’ae
292
+ mgd Moru
293
+ xtm Mixtec, Magdalena Peñasco
294
+ ctd Chin, Tedim
295
+ akb Batak Angkola
296
+ nlg Gela
297
+ bmq Bomu
298
+ bmv Bum
299
+ mgo Meta’
300
+ cla Ron
301
+ rug Roviana
302
+ enx Enxet
303
+ mpm Mixtec, Yosondúa
304
+ gof Gofa
305
+ bom Berom
306
+ mbc Macushi
307
+ btx Batak Karo
308
+ did Didinga
309
+ mej Meyah
310
+ bgq Bagri
311
+ maa Mazatec, San Jerónimo Tecóatl
312
+ nmz Nawdm
313
+ mfk Mofu, North
314
+ aeu Akeu
315
+ mqn Moronene
316
+ tob Toba
317
+ hlb Halbi
318
+ nin Ninzo
319
+ kqe Kalagan
320
+ lex Luang
321
+ mkl Mokole
322
+ icr Islander English Creole
323
+ lns Lamnso’
324
+ tlj Talinga-Bwisi
325
+ bzh Buang, Mapos
326
+ bdh Baka
327
+ kle Kulung
328
+ pib Yine
329
+ vut Vute
330
+ btd Batak Dairi
331
+ xmm Malay, Manado
332
+ yka Yakan
333
+ btt Bete-Bendi
334
+ hoc Ho
335
+ yba Yala
336
+ mib Mixtec, Atatlahuca
337
+ kpq Korupun-Sela
338
+ xsb Sambal
339
+ muy Muyang
340
+ zyp Chin, Zyphe
341
+ bbo Konabéré
342
+ krc Karachay-Balkar
343
+ eka Ekajuk
344
+ mcp Makaa
345
+ bqj Bandial
346
+ mcq Ese
347
+ ybb Yemba
348
+ hyw Armenian, Western
349
+ tmc Tumak
350
+ mih Mixtec, Chayuco
351
+ blt Tai Dam
352
+ zpz Zapotec, Texmelucan
353
+ tng Tobanga
354
+ not Nomatsigenga
355
+ pny Pinyin
356
+ nuj Nyole
357
+ bhz Bada
358
+ kvn Kuna, Border
359
+ lje Rampi
360
+ sne Bidayuh, Bau
361
+ ndy Lutos
362
+ ksb Shambala
363
+ nhy Nahuatl, Northern Oaxaca
364
+ kwd Kwaio
365
+ moz Mukulu
366
+ cmr Mro-Khimi
367
+ xuo Kuo
368
+ zpu Zapotec, Yalálag
369
+ avn Avatime
370
+ pap Papiamentu
371
+ pss Kaulong
372
+ akp Siwu
373
+ ted Krumen, Tepo
374
+ rro Waima
375
+ muv Muthuvan
376
+ gau Gadaba, Mudhili
377
+ ake Akawaio
378
+ guq Aché
379
+ lsi Lacid
380
+ cul Kulina
381
+ tna Tacana
382
+ cle Chinantec, Lealao
383
+ iri Rigwe
384
+ flr Fuliiru
385
+ bkd Binukid
386
+ bmr Muinane
387
+ twb Tawbuid
388
+ ikk Ika
389
+ tbl Tboli
390
+ mnw Mon
391
+ asa Asu
392
+ abi Abidji
393
+ yaz Lokaa
394
+ bgw Bhatri
395
+ miy Mixtec, Ayutla
396
+ gai Mbore
397
+ smo Samoan
398
+ cnl Chinantec, Lalana
399
+ far Fataleka
400
+ poi Popoluca, Highland
401
+ tgo Sudest
402
+ gud Dida, Yocoboué
403
+ kak Kalanguya
404
+ gub Guajajára
405
+ yre Yaouré
406
+ cso Chinantec, Sochiapam
407
+ gwr Gwere
408
+ ati Attié
409
+ urt Urat
410
+ mil Mixtec, Peñoles
411
+ ndv Ndut
412
+ rnl Ranglong
413
+ sch Sakachep
414
+ zpc Zapotec, Choapan
415
+ tom Tombulu
416
+ tnt Tontemboan
417
+ atg Ivbie North-Okpela-Arhe
418
+ kdl Tsikimba
419
+ mto Mixe, Totontepec
420
+ bov Tuwuli
421
+ myy Macuna
422
+ ava Avar
423
+ ami Amis
424
+ luc Aringa
425
+ plw Palawano, Brooke’s Point
426
+ cab Garifuna
427
+ sey Paicoca
428
+ zpg Zapotec, Guevea de Humboldt
429
+ xnj Chingoni
430
+ kdc Kutu
431
+ zpt Zapotec, San Vicente Coatlán
432
+ prk Wa, Parauk
433
+ qxr Quichua, Cañar Highland
434
+ nga Ngbaka
435
+ ubl Bikol, Buhi’non
436
+ crs Seychelles French Creole
437
+ cwe Kwere
438
+ pps Popoloca, San Luís Temalacayuca
439
+ bjw Bakwé
440
+ aia Arosi
441
+ taq Tamasheq
442
+ idd Ede Idaca
443
+ ceb Cebuano
444
+ blh Kuwaa
445
+ kfw Naga, Kharam
446
+ gqr Gor
447
+ suc Subanon, Western
448
+ cok Cora, Santa Teresa
449
+ kzf Kaili, Da’a
450
+ myv Erzya
451
+ mge Mango
452
+ tly Talysh
453
+ udm Udmurt
454
+ tmf Toba-Maskoy
455
+ cbi Chachi
456
+ kqr Kimaragang
457
+ yas Nugunu
458
+ nsu Nahuatl, Sierra Negra
459
+ pez Penan, Eastern
460
+ moa Mwan
461
+ dgk Dagba
462
+ tao Yami
463
+ lon Lomwe, Malawi
464
+ kog Kogi
465
+ tlb Tobelo
466
+ azg Amuzgo, San Pedro Amuzgos
467
+ xtd Mixtec, Diuxi-Tilantongo
468
+ bqp Bisã
469
+ kpv Komi-Zyrian
470
+ hwc Hawaii Pidgin
471
+ cpu Ashéninka, Pichis
472
+ yat Yambeta
473
+ kje Kisar
474
+ met Mato
475
+ zmz Mbandja
476
+ ury Orya
477
+ cpb Ashéninka, Ucayali-Yurúa
478
+ bep Behoa
479
+ yea Ravula
480
+ zga Kinga
481
+ asg Cishingini
482
+ kaq Capanahua
483
+ jun Juang
484
+ knb Kalinga, Lubuagan
485
+ kyf Kouya
486
+ rap Rapa Nui
487
+ ess Yupik, Saint Lawrence Island
488
+ stn Owa
489
+ byr Yipma
490
+ sjm Mapun
491
+ mjv Mannan
492
+ rub Gungu
493
+ kjh Khakas
494
+ kmd Kalinga, Majukayang
495
+ dbq Daba
496
+ wap Wapishana
497
+ blx Ayta, Mag-Indi
498
+ kne Kankanaey
499
+ arl Arabela
500
+ abp Ayta, Abellen
501
+ tuf Tunebo, Central
502
+ cgc Kagayanen
503
+ ksr Borong
504
+ ojb Ojibwa, Northwestern
505
+ cbr Kakataibo-Kashibo
506
+ chv Chuvash
507
+ ktj Krumen, Plapo
508
+ omw Tairora, South
509
+ cjo Ashéninka, Pajonal
510
+ mhr Mari, Meadow
511
+ atq Aralle-Tabulahan
512
+ rkt Rangpuri
513
+ ium Iu Mien
514
+ crt Chorote, Iyojwa’ja
515
+ nog Nogai
516
+ snn Siona
517
+ tte Bwanabwana
518
+ tvw Sedoa
519
+ pjt Pitjantjatjara
520
+ nlk Yali, Ninia
521
+ tih Murut, Timugon
522
+ ppk Uma
523
+ lid Nyindrou
524
+ cui Cuiba
525
+ cot Caquinte
526
+ tav Tatuyo
527
+ log Logo
528
+ prt Prai
529
+ boj Anjam
530
+ huu Witoto, Murui
531
+ mqf Momuna
532
+ med Melpa
533
+ snp Siane
534
+ dah Gwahatike
535
+ tnr Ménik
536
+ tbk Tagbanwa, Calamian
537
+ mtj Moskona
538
+ men Mende
539
+ ubu Umbu-Ungu
540
+ agu Awakateko
541
+ kmu Kanite
542
+ trn Trinitario
543
+ zaj Zaramo
544
+ dnt Dani, Mid Grand Valley
545
+ qvh Quechua, Huamalíes-Dos de Mayo Huánuco
546
+ mcd Sharanahua
547
+ urb Kaapor
548
+ wsg Gondi, Adilabad
549
+ war Waray-Waray
550
+ ame Yanesha’
551
+ cof Tsafiki
552
+ bbb Barai
553
+ hap Hupla
554
+ law Lauje
555
+ crq Chorote, Iyo’wujwa
556
+ bor Borôro
557
+ kri Krio
558
+ nhe Nahuatl, Eastern Huasteca
559
+ bjr Binumarien
560
+ xte Ketengban
561
+ eip Lik
562
+ dav Dawida
563
+ mpd Machinere
564
+ mai Maithili
565
+ sil Sisaala, Tumulung
566
+ pis Pijin
567
+ crk Cree, Plains
568
+ kyz Kayabí
569
+ ngu Nahuatl, Guerrero
570
+ guo Guayabero
571
+ mnx Sougb
572
+ nij Ngaju
573
+ qva Quechua, Ambo-Pasco
574
+ lif Limbu
575
+ bvz Bauzi
576
+ awa Awadhi
577
+ kir Kyrgyz
578
+ kin Kinyarwanda
579
+ iba Iban
580
+ niy Ngiti
581
+ nas Naasioi
582
+ knk Kuranko
583
+ gog Gogo
584
+ gvc Wanano
585
+ mdm Mayogo
586
+ pkb Kipfokomo
587
+ sho Shanga
588
+ gbm Garhwali
589
+ dig Chidigo
590
+ bsq Bassa
591
+ tye Kyanga
592
+ gux Gourmanchéma
593
+ yal Yalunka
594
+ zyb Zhuang, Yongbei
595
+ run Rundi
596
+ bky Bokyi
597
+ yan Mayangna
598
+ tbt Tembo
599
+ set Sentani
600
+ oci Occitan
601
+ nyy Nyakyusa-Ngonde
602
+ shn Shan
603
+ bcc Balochi, Southern
604
+ kno Kono
605
+ yaa Yaminahua
606
+ bwu Buli
607
+ bgr Chin, Bawm
608
+ mfz Mabaan
609
+ keo Kakwa
610
+ led Lendu
611
+ kue Kuman
612
+ grt Garo
613
+ sus Susu
614
+ mdy Male
615
+ sah Yakut
616
+ dug Chiduruma
617
+ pkr Kurumba, Attapady
618
+ tir Tigrigna
619
+ suk Sukuma
620
+ san Sanskrit
621
+ kdj Ng’akarimojong
622
+ nyf Kigiryama
623
+ bem Bemba
624
+ hak Chinese, Hakka
625
+ dag Dagbani
626
+ nan Chinese, Min Nan
627
+ kdh Tem
628
+ gum Misak
629
+ hnj Hmong Njua
630
+ aha Ahanta
631
+ lsm Saamya-Gwe
632
+ nyn Nyankore
633
+ lam Lamba
634
+ tgw Sénoufo, Tagwana
635
+ kde Makonde
636
+ lhu Lahu
637
+ wme Wambule
638
+ guc Wayuu
639
+ mur Murle
640
+ kam Kamba
641
+ bru Bru, Eastern
642
+ nsk Naskapi
643
+ guk Gumuz
644
+ cas Tsimané
645
+ nnw Nuni, Southern
646
+ jow Jowulu
647
+ bvc Baelelea
648
+ gjn Gonja
649
+ cko Anufo
650
+ rim Nyaturu
651
+ mfi Wandala
652
+ thf Thangmi
653
+ trq Triqui, San Martín Itunyoso
654
+ bmu Somba-Siawari
655
+ ade Adele
656
+ rmy Romani, Vlax
657
+ nim Nilamba
658
+ mbb Manobo, Western Bukidnon
659
+ mxv Mixtec, Metlatónoc
660
+ ses Songhay, Koyraboro Senni
661
+ dyo Jola-Fonyi
662
+ taj Tamang, Eastern
663
+ mnb Muna
664
+ sbd Samo, Southern
665
+ hui Huli
666
+ esi Inupiatun, North Alaskan
667
+ wba Warao
668
+ kqn Kaonde
669
+ spy Sabaot
670
+ raw Rawang
671
+ kbr Kafa
672
+ tem Themne
673
+ bst Basketo
674
+ oss Ossetic
675
+ omi Omi
676
+ qul Quechua, North Bolivian
677
+ car Carib
678
+ kff Koya
679
+ ptu Bambam
680
+ mev Maan
681
+ mgh Makhuwa-Meetto
682
+ cly Chatino, Eastern Highland
683
+ mpx Misima-Panaeati
684
+ kus Kusaal
685
+ mwq Chin, Müün
686
+ khq Songhay, Koyra Chiini
687
+ nia Nias
688
+ urk Urak Lawoi’
689
+ spp Sénoufo, Supyire
690
+ dzo Dzongkha
691
+ sgb Ayta, Mag-antsi
692
+ kma Konni
693
+ iou Tuma-Irumu
694
+ lef Lelemi
695
+ nst Naga, Tangshang
696
+ udg Muduga
697
+ vag Vagla
698
+ kum Kumyk
699
+ maw Mampruli
700
+ quz Quechua, Cusco
701
+ kaa Karakalpak
702
+ mpg Marba
703
+ yva Yawa
704
+ bgc Haryanvi
705
+ bim Bimoba
706
+ fij Fijian
707
+ bud Ntcham
708
+ ceg Chamacoco
709
+ tpm Tampulma
710
+ mrj Mari, Hill
711
+ nus Nuer
712
+ sba Ngambay
713
+ lom Loma
714
+ bib Bisa
715
+ twu Termanu
716
+ acd Gikyode
717
+ mak Makasar
718
+ cni Asháninka
719
+ pbb Nasa
720
+ qvm Quechua, Margos-Yarowilca-Lauricocha
721
+ zab Zapotec, Western Tlacolula Valley
722
+ csk Jola-Kasa
723
+ gxx Wè Southern
724
+ bgt Bughotu
725
+ yuz Yuracare
726
+ emp Emberá, Northern
727
+ mzj Manya
728
+ mfq Moba
729
+ guw Gun
730
+ kac Jingpho
731
+ ilo Ilocano
732
+ qvo Quichua, Napo
733
+ las Lama
734
+ ctu Chol
735
+ cdj Churahi
736
+ yam Yamba
737
+ dip Dinka, Northeastern
738
+ kfy Kumaoni
739
+ sig Paasaal
740
+ srx Sirmauri
741
+ mie Mixtec, Ocotepec
742
+ tca Ticuna
743
+ cap Chipaya
744
+ nav Navajo
745
+ mca Maka
746
+ pce Palaung, Ruching
747
+ upv Uripiv-Wala-Rano-Atchin
748
+ bgd Bareli, Rathwi
749
+ blz Balantak
750
+ dik Dinka, Southwestern
751
+ gbi Galela
752
+ dgo Dogri
753
+ nnb Nande
754
+ cax Chiquitano
755
+ myb Mbay
756
+ txq Tii
757
+ dhi Dhimal
758
+ mad Madura
759
+ shk Shilluk
760
+ ktb Kambaata
761
+ quw Quichua, Tena Lowland
762
+ rav Sampang
763
+ sag Sango
764
+ nyu Nyungwe
765
+ ljp Lampung Api
766
+ mzm Mumuye
767
+ stb Subanen, Northern
768
+ pab Parecís
769
+ mzw Deg
770
+ mhi Ma’di
771
+ gor Gorontalo
772
+ agd Agarabi
773
+ gnd Zulgo-Gemzek
774
+ xnr Kangri
775
+ mor Moro
776
+ kyu Kayah, Western
777
+ ese Ese Ejja
778
+ myk Sénoufo, Mamara
779
+ zaw Zapotec, Mitla
780
+ cme Cerma
781
+ aaz Amarasi
782
+ cnt Chinantec, Tepetotutla
783
+ zpo Zapotec, Amatlán
784
+ anv Denya
785
+ ach Acholi
786
+ mwv Mentawai
787
+ kfb Kolami, Northwestern
788
+ otn Otomi, Tenango
789
+ kbq Kamano
790
+ kss Kisi, Southern
791
+ jiv Shuar
792
+ dop Lukpa
793
+ nhw Nahuatl, Western Huasteca
794
+ ahk Akha
795
+ tbz Ditammari
796
+ zas Zapotec, Santo Domingo Albarradas
797
+ huv Huave, San Mateo del Mar
798
+ xtn Mixtec, Northern Tlaxiaco
799
+ bex Jur Modo
800
+ crn Cora, El Nayar
801
+ cuk Kuna, San Blas
802
+ gbk Gaddi
803
+ toi Tonga
804
+ key Kupia
805
+ ifb Ifugao, Batad
806
+ ztq Zapotec, Quioquitani-Quierí
807
+ nag Nagamese
808
+ toc Totonac, Coyutla
809
+ ken Kenyang
810
+ agr Awajún
811
+ bfd Bafut
812
+ kyq Kenga
813
+ ker Kera
814
+ ntr Delo
815
+ usp Uspanteko
816
+ alz Alur
817
+ mas Maasai
818
+ lme Pévé
819
+ nhu Noone
820
+ dwr Dawro
821
+ ksp Kabba
822
+ ncu Chumburung
823
+ min Minangkabau
824
+ wol Wolof
825
+ hif Hindi, Fiji
826
+ tll Tetela
827
+ bba Baatonum
828
+ cco Chinantec, Comaltepec
829
+ tbc Takia
830
+ lia Limba, West-Central
831
+ mgq Malila
832
+ mnf Mundani
833
+ hil Hiligaynon
834
+ kyc Kyaka
835
+ ozm Koonzime
836
+ gyr Guarayu
837
+ pcm Pidgin, Nigerian
838
+ sml Sama, Central
839
+ npl Nahuatl, Southeastern Puebla
840
+ tby Tabaru
841
+ lem Nomaande
842
+ udu Uduk
843
+ xsu Sanumá
844
+ soq Kanasi
845
+ tik Tikar
846
+ ibg Ibanag
847
+ zpl Zapotec, Lachixío
848
+ sbl Sambal, Botolan
849
+ itv Itawit
850
+ noa Woun Meu
851
+ ace Aceh
852
+ ign Ignaciano
853
+ shp Shipibo-Conibo
854
+ jbu Jukun Takum
855
+ kub Kutep
856
+ knf Mankanya
857
+ mvp Duri
858
+ jac Jakalteko
859
+ wwa Waama
860
+ biv Birifor, Southern
861
+ kkj Kako
862
+ ter Terêna
863
+ pbi Parkwa
864
+ csy Chin, Siyin
865
+ xrb Karaboro, Eastern
866
+ mxq Mixe, Juquila
867
+ mfh Matal
868
+ bht Bhattiyali
869
+ fal Fali, South
870
+ adj Adioukrou
871
+ mcu Mambila, Cameroon
872
+ otq Otomi, Querétaro
873
+ bpr Blaan, Koronadal
874
+ miq Mískito
875
+ tee Tepehua, Huehuetla
876
+ mrw Maranao
877
+ nfr Nafaanra
878
+ izr Izere
879
+ bzi Bisu
880
+ sas Sasak
881
+ cou Wamey
882
+ cbt Shawi
883
+ lwo Luwo
884
+ ban Bali
885
+ kab Amazigh
886
+ cbs Kashinawa
887
+ prf Paranan
888
+ nhi Nahuatl, Zacatlán-Ahuacatlán-Tepetzintla
889
+ dyi Sénoufo, Djimini
890
+ cnw Chin, Ngawn
891
+ zaa Zapotec, Sierra de Juárez
892
+ mfe Morisyen
893
+ mio Mixtec, Pinotepa Nacional
894
+ kjb Q’anjob’al
895
+ myx Masaaba
896
+ con Cofán
897
+ bkv Bekwarra
898
+ sur Mwaghavul
899
+ eza Ezaa
900
+ qxn Quechua, Northern Conchucos Ancash
901
+ lgg Lugbara
902
+ cya Chatino, Nopala
903
+ zao Zapotec, Ozolotepec
904
+ kez Kukele
905
+ sja Epena
906
+ bdq Bahnar
907
+ acf Lesser Antillean French Creole
908
+ ruf Luguru
909
+ cce Chopi
910
+ old Mochi
911
+ acu Achuar-Shiwiar
912
+ jmc Machame
913
+ xpe Kpelle, Liberia
914
+ alj Alangan
915
+ awb Awa
916
+ srn Sranan Tongo
917
+ zad Zapotec, Cajonos
918
+ lob Lobi
919
+ tsz Purepecha
920
+ ote Otomi, Mezquital
921
+ bcl Bikol, Central
922
+ mbt Manobo, Matigsalug
923
+ yua Maya, Yucatec
924
+ sgw Sebat Bet Gurage
925
+ tue Tuyuca
926
+ kao Xaasongaxango
927
+ mjl Mandeali
928
+ maz Mazahua, Central
929
+ miz Mixtec, Coatzospan
930
+ qvw Quechua, Huaylla Wanca
931
+ cpa Chinantec, Palantla
932
+ kxc Konso
933
+ bss Akoose
934
+ laj Lango
935
+ nyo Nyoro
936
+ ndp Kebu
937
+ hag Hanga
938
+ lip Sekpele
939
+ agn Agutaynen
940
+ mfy Mayo
941
+ nod Thai, Northern
942
+ zos Zoque, Francisco León
943
+ gde Gude
944
+ qub Quechua, Huallaga
945
+ tri Trió
946
+ way Wayana
947
+ umb Umbundu
948
+ gwi Gwich’in
949
+ qwh Quechua, Huaylas Ancash
950
+ bsc Oniyan
951
+ qvn Quechua, North Junín
952
+ ncj Nahuatl, Northern Puebla
953
+ tnk Kwamera
954
+ mit Mixtec, Southern Puebla
955
+ irk Iraqw
956
+ djk Aukan
957
+ vun Vunjo
958
+ rai Ramoaaina
959
+ mda Mada
960
+ gym Ngäbere
961
+ wob Wè Northern
962
+ pam Kapampangan
963
+ mop Maya, Mopán
964
+ tpp Tepehua, Pisaflores
965
+ mzk Mambila, Nigeria
966
+ hig Kamwe
967
+ tap Taabwa
968
+ hto Witoto, Minika
969
+ pww Karen, Pwo Northern
970
+ kxm Khmer, Northern
971
+ pbc Patamona
972
+ ifu Ifugao, Mayoyao
973
+ heh Hehe
974
+ bnp Bola
975
+ nwb Nyabwa
976
+ pko Pökoot
977
+ jam Jamaican English Creole
978
+ gej Gen
979
+ sld Sissala
980
+ iqw Ikwo
981
+ pae Pagibete
982
+ tac Tarahumara, Western
983
+ zai Zapotec, Isthmus
984
+ alt Altai, Southern
985
+ snw Selee
986
+ ann Obolo
987
+ lee Lyélé
988
+ bao Waimaha
989
+ klv Maskelynes
990
+ izz Izii
991
+ pag Pangasinan
992
+ thk Kitharaka
993
+ hay Haya
994
+ mog Mongondow
995
+ krj Kinaray-a
996
+ klu Klao
997
+ apb Sa’a
998
+ gmv Gamo
999
+ ycn Yucuna
1000
+ kqy Koorete
1001
+ msy Aruamu
1002
+ qvs Quechua, San Martín
1003
+ ood Tohono O’odham
1004
+ cbc Carapana
1005
+ stp Tepehuan, Southeastern
1006
+ bts Batak Simalungun
1007
+ enb Markweeta
1008
+ bcw Bana
1009
+ muh Mündü
1010
+ adh Jopadhola
1011
+ gkn Gokana
1012
+ tgp Tangoa
1013
+ ziw Zigula
1014
+ kpz Kupsapiiny
1015
+ poy Pogolo
1016
+ daa Dangaléat
1017
+ tnn Tanna, North
1018
+ shi Tachelhit
1019
+ guu Yanomamö
1020
+ kdi Kumam
1021
+ ata Pele-Ata
1022
+ bav Vengo
1023
+ neb Toura
1024
+ mif Mofu-Gudur
1025
+ mbh Mangseng
1026
+ srm Saramaccan
1027
+ vid Vidunda
1028
+ vmy Mazatec, Ayautla
1029
+ nnq Ngindo
1030
+ dts Dogon, Toro So
1031
+ ilb Ila
1032
+ ngp Ngulu
1033
+ tpt Tepehua, Tlachichilco
1034
+ kki Kagulu
1035
+ gvl Gulay
1036
+ chz Chinantec, Ozumacín
1037
+ ndj Ndamba
1038
+ toh Tonga
1039
+ zae Zapotec, Yareni
1040
+ caa Ch’orti’
1041
+ pau Palauan
1042
+ zpi Zapotec, Santa María Quiegolani
1043
+ cjp Cabécar
1044
+ bng Benga
1045
+ bjv Bedjond
1046
+ cuc Chinantec, Usila
1047
+ krl Karelian
1048
+ wmw Mwani
1049
+ nch Nahuatl, Central Huasteca
1050
+ nse Nsenga
1051
+ ndz Ndogo
1052
+ meq Merey
1053
+ mah Marshallese
1054
+ gso Gbaya, Southwest
1055
+ kwi Awa-Cuaiquer
1056
+ qve Quechua, Eastern Apurímac
1057
+ mza Mixtec, Santa María Zacatepec
1058
+ rng Ronga
1059
+ azz Nahuatl, Highland Puebla
1060
+ hns Hindustani, Sarnami
1061
+ npy Napu
1062
+ bps Blaan, Sarangani
1063
+ mqb Mbuko
1064
+ ura Urarina
1065
+ zty Zapotec, Yatee
1066
+ inb Inga
1067
+ cwt Kuwaataay
1068
+ yli Yali, Angguruk
1069
+ pad Paumarí
1070
+ mox Molima
1071
+ zpm Zapotec, Mixtepec
1072
+ tos Totonac, Highland
1073
+ bzj Belize English Creole
1074
+ apr Arop-Lokep
1075
+ ifk Ifugao, Tuwali
1076
+ nca Iyo
1077
+ boa Bora
1078
+ rmo Romani, Sinte
1079
+ jic Tol
1080
+ ded Dedua
1081
+ waw Waiwai
1082
+ saj Sahu
1083
+ lnl Banda, South Central
1084
+ pir Piratapuyo
1085
+ quf Quechua, Lambayeque
1086
+ sri Siriano
1087
+ kdn Kunda
1088
+ cbv Cacua
1089
+ lac Lacandon
1090
+ mpp Migabac
1091
+ gam Kandawo
1092
+ qvc Quechua, Cajamarca
1093
+ qvz Quichua, Northern Pastaza
1094
+ qxh Quechua, Panao
1095
+ lai Lambya
1096
+ hub Wampís
1097
+ jvn Javanese, Suriname
1098
+ coe Koreguaje
1099
+ ify Kallahan, Keley-i
1100
+ nab Nambikuára, Southern
1101
+ mir Mixe, Isthmus
1102
+ apf Agta, Pahanan
1103
+ des Desano
1104
+ lww Lewo
1105
+ cbu Kandozi-Chapra
1106
+ tfr Teribe
1107
+ beq Beembe
1108
+ nbw Ngbandi, Southern
1109
+ loq Lobala
1110
+ tbg Tairora, North
1111
+ avu Avokaya
1112
+ mcb Matsigenka
1113
+ bto Bikol, Rinconada
1114
+ mnh Mono
1115
+ lgl Wala
1116
+ yad Yagua
1117
+ qxo Quechua, Southern Conchucos
1118
+ hno Hindko, Northern
1119
+ bxg Bangala
1120
+ pao Paiute, Northern
1121
+ ibo Igbo
1122
+ jnj Yemsa
1123
+ sgj Surgujia
1124
+ ldi Laari
1125
+ sab Buglere
1126
+ bci Baoulé
1127
+ bxh Buhutu
1128
+ haw Hawaiian
1129
+ tnc Tanimuca-Letuama
1130
+ mfx Melo
1131
+ tyv Tuvan
1132
+ neq Mixe, North Central
1133
+ wbi Vwanji
1134
+ bcq Bench
1135
+ ksw Karen, S’gaw
1136
+ guz Ekegusii
1137
+ mkw Kituba
1138
+ ore Maijuna
1139
+ ige Igede
1140
+ bjz Baruga
1141
+ zca Zapotec, Coatecas Altas
1142
+ mer Kimîîru
1143
+ aui Anuki
1144
+ arn Mapudungun
1145
+ zul Zulu
1146
+ kxf Kawyaw
1147
+ alw Alaba-K’abeena
1148
+ xho Xhosa
1149
+ loz Lozi
1150
+ mww Hmong Daw
1151
+ mey Hassaniyya
1152
+ ijc Izon
1153
+ mwt Moken
1154
+ bza Bandi
1155
+ lun Lunda
1156
+ kby Kanuri, Manga
1157
+ pov Guinea-Bissau Creole
1158
+ bdg Bonggi
1159
+ ipi Ipili
1160
+ sfw Esahie
1161
+ knc Kanuri, Yerwa
1162
+ syl Sylheti
1163
+ bho Bhojpuri
1164
+ tum Tumbuka
1165
+ tdy Tadyawan
1166
+ nso Sotho, Northern
1167
+ lbj Ladakhi
1168
+ ckb Kurdish, Central
1169
+ ndc Ndau
1170
+ bwr Bura-Pabir
1171
+ pci Duruwa
1172
+ dje Zarma
1173
+ bax Bamun
1174
+ top Totonac, Papantla
1175
+ gkp Kpelle, Guinea
1176
+ lub Luba-Katanga
1177
+ qug Quichua, Chimborazo Highland
1178
+ lus Mizo
1179
+ csh Chin, Asho
1180
+ gvr Gurung
1181
+ tew Tewa
1182
+ cag Nivaclé
1183
+ bev Bété, Daloa
1184
+ ggu Gban
1185
+ vai Vai
1186
+ tiv Tiv
1187
+ dgr Tlicho
1188
+ epo Esperanto
1189
+ srr Serer-Sine
1190
+ elm Eleme
1191
+ maf Mafa
1192
+ abk Abkhaz
1193
+ ijn Kalabari
1194
+ lua Luba-Kasai
1195
+ kck Kalanga
1196
+ ngb Ngbandi, Northern
1197
+ zpq Zapotec, Zoogocho
1198
+ etu Ejagham
1199
+ gvs Gumawana
1200
+ bft Balti
1201
+ tzm Tamazight, Central Atlas
1202
+ ida Luidakho-Luisukha-Lutirichi
1203
+ enl Enlhet
1204
+ ada Dangme
1205
+ nzb Njebi
1206
+ xdy Malayic Dayak
1207
+ aca Achagua
1208
+ ktu Kituba
1209
+ ebu Kiembu
1210
+ pdt Plautdietsch
1211
+ gaa Ga
1212
+ swk Sena, Malawi
1213
+ awn Awngi
1214
+ okr Kirike
1215
+ kvj Psikye
1216
+ xkl Kenyah, Mainstream
1217
+ knp Kwanja
1218
+ krw Krahn, Western
1219
+ mzl Mixe, Mazatlán
1220
+ ndi Samba Leko
1221
+ mug Musgu
1222
+ soe Songomeno
1223
+ sea Semai
1224
+ kfc Konda-Dora
1225
+ lol Mongo-Nkundu
1226
+ tsc Tswa
1227
+ idu Idoma
1228
+ mni Meitei
1229
+ trc Triqui, Copala
1230
+ mgr Mambwe-Lungu
1231
+ mcn Masana
1232
+ lrc Luri, Northern
1233
+ kfi Kurumba, Kannada
1234
+ bzw Basa
1235
+ mzz Maiadomu
1236
+ mrt Marghi Central
1237
+ rml Romani, Baltic
1238
+ rhg Rohingya
1239
+ urh Urhobo
1240
+ lag Langi
1241
+ its Isekiri
1242
+ ego Eggon
1243
+ gle Irish
1244
+ ubr Ubir
1245
+ hdy Hadiyya
1246
+ jen Dza
1247
+ sru Suruí
1248
+ ngc Ngombe
1249
+ lmp Limbum
1250
+ isn Isanzu
1251
+ kqs Kissi, Northern
1252
+ kpm Koho
1253
+ nup Nupe-Nupe-Tako
1254
+ mwm Sar
1255
+ kng Koongo
1256
+ nnc Nancere
1257
+ bkm Kom
1258
+ tui Tupuri
1259
+ ogo Khana
1260
+ lic Hlai
1261
+ mkn Malay, Kupang
1262
+ wed Wedau
1263
+ ald Alladian
1264
+ ksf Bafia
1265
+ dur Dii
1266
+ jaa Jamamadí
1267
+ kmb Kimbundu
1268
+ mua Mundang
1269
+ cje Chru
1270
+ igb Ebira
1271
+ gya Gbaya, Northwest
1272
+ skr Saraiki
1273
+ dow Doyayo
1274
+ dww Dawawa
1275
+ iso Isoko
1276
+ giz Giziga
1277
+ bum Bulu
1278
+ zza Zaza
1279
+ mfa Malay, Pattani
1280
+ snf Noon
1281
+ mgw Matumbi
1282
+ bin Edo
1283
+ vmk Makhuwa-Shirima
1284
+ dua Duala
1285
+ kea Kabuverdianu
1286
+ sef Sénoufo, Cebaara
1287
+ kaj Jju
1288
+ kqf Kakabai
1289
+ ayz Mai Brat
1290
+ ksz Kodaku
1291
+ ncl Nahuatl, Michoacán
1292
+ bzd Bribri
1293
+ ssn Waata
1294
+ mro Mru
1295
+ bhi Bhilali
1296
+ wes Pidgin, Cameroon
1297
+ adi Adi
1298
+ efi Efik
1299
+ ena Apal
1300
+ nde Ndebele
1301
+ ast Asturian
1302
+ mhw Mbukushu
1303
+ bbj Ghomálá’
1304
+ geb Kire
1305
+ igl Igala
1306
+ aoi Anindilyakwa
1307
+ rao Rao
1308
+ nnh Ngiemboon
1309
+ byv Medumba
1310
+ sat Santhali
1311
+ dzg Dazaga
1312
+ gnn Gumatj
1313
+ bhb Bhili
1314
+ swp Suau
1315
+ sgc Kipsigis
1316
+ wim Wik-Mungkan
1317
+ viv Iduna
1318
+ ady Adyghe
1319
+ krr Krung
1320
+ fan Fang
1321
+ coh Chichonyi-Chidzihana-Chikauma
1322
+ nbq Nggem
1323
+ gvo Gavião do Jiparaná
1324
+ glk Gilaki
1325
+ acz Acheron
1326
+ mwf Murrinh-Patha
1327
+ wbp Warlpiri
1328
+ tod Toma
1329
+ unr Mundari
1330
+ khe Korowai
1331
+ ntj Ngaanyatjarra
1332
+ wnc Wantoat
1333
+ suj Shubi
1334
+ emk Maninkakan, Eastern
1335
+ kel Kela
1336
+ dks Dinka, Southeastern
1337
+ zav Zapotec, Yatzachi
1338
+ jra Jarai
1339
+ dhg Dhangu-Djangu
1340
+ wlo Wolio
1341
+ bmk Ghayavi
1342
+ lgr Lengo
1343
+ njz Nyishi
1344
+ lue Luvale
1345
+ mhu Digaro-Mishmi
1346
+ tsn Setswana
1347
+ beo Bedamuni
1348
+ lgm Lega-Mwenga
1349
+ haq Ha
1350
+ trp Kok Borok
1351
+ tdh Thulung
1352
+ tuy Tugen
1353
+ lzz Laz
1354
+ gvj Guajá
1355
+ gom Konkani, Goan
1356
+ kjl Kham, Western Parbate
1357
+ tke Takwane
1358
+ mpj Martu Wangka
1359
+ ven Venda
1360
+ xer Xerénte
1361
+ nyd Olunyole
1362
+ byd Benyadu’
1363
+ snc Sinaugoro
1364
+ sdr Sadri, Oraon
1365
+ toq Toposa
1366
+ wod Wolani
1367
+ nhr Naro
1368
+ tvs Taveta
1369
+ ble Balanta-Kentohe
1370
+ bcp Bali
1371
+ rag Lulogooli
1372
+ jmx Mixtec, Western Juxtlahuaca
1373
+ bvd Baeggu
1374
+ bvu Malay, Bukit
1375
+ dbj Ida’an
1376
+ her Herero
1377
+ mwc Are
1378
+ sou Thai, Southern
1379
+ ktz Juǀ’hoansi
1380
+ rmn Romani, Balkan
1381
+ qxu Quechua, Arequipa-La Unión
1382
+ nmn !Xóõ
1383
+ haj Hajong
1384
+ bee Byangsi
1385
+ wbf Wara
1386
+ sot Sotho, Southern
1387
+ fmu Muria, Far Western
1388
+ swb Comorian, Maore
1389
+ dde Doondo
1390
+ mve Marwari
1391
+ mlk Kiwilwana
1392
+ mjt Sauria Paharia
1393
+ bjg Bidyogo
1394
+ jmd Yamdena
1395
+ mwn Nyamwanga
1396
+ yml Iamalele
1397
+ kha Khasi
1398
+ mzp Movima
1399
+ tvk Ambrym, Southeast
1400
+ tkr Tsakhur
1401
+ dim Dime
1402
+ mix Mixtec, Mixtepec
1403
+ tbo Tawala
1404
+ lma Limba, East
1405
+ pln Palenquero
1406
+ koe Suri, Kacipo-Bale
1407
+ glv Manx
1408
+ kjg Khmu
1409
+ wof Wolof, Gambian
1410
+ kjc Konjo, Coastal
1411
+ xuu Khwedam
1412
+ brv Bru, Western
1413
+ aoz Uab Meto
1414
+ evn Evenki
1415
+ tsb Tsamai
1416
+ djr Djambarrpuyngu
1417
+ mch Maquiritari
1418
+ kgk Kaiwá
1419
+ klr Khaling
1420
+ gno Gondi, Northern
1421
+ nuy Nunggubuyu
1422
+ srq Sirionó
1423
+ sep Sénoufo, Sìcìté
1424
+ oki Okiek
1425
+ trd Turi
1426
+ msc Maninka, Sankaran
1427
+ twm Monpa, Tawang
1428
+ rki Rakhine
1429
+ mfv Mandjak
1430
+ mhs Buru
1431
+ mjx Mahali
1432
+ ggw Gogodala
1433
+ nfa Dhao
1434
+ mym Me’en
1435
+ hvn Hawu
1436
+ nuz Nahuatl, Tlamacazapa
1437
+ are Arrarnta, Western
1438
+ lbm Lodhi
1439
+ hni Hani
1440
+ chf Chontal, Tabasco
1441
+ mtd Mualang
1442
+ div Maldivian
1443
+ the Tharu, Central
1444
+ rgs Roglai, Southern
1445
+ nys Nyungar
1446
+ tpe Tippera
1447
+ eyo Keiyo
1448
+ ghr Ghera
1449
+ kls Kalasha
1450
+ lrm Olumarama
1451
+ pmy Malay, Papuan
1452
+ lbx Lawangan
1453
+ akh Angal Heneng
1454
+ kpc Curripaco
1455
+ sco Scots
1456
+ lwg Oluwanga
1457
+ kay Kamayurá
1458
+ zac Zapotec, Ocotlán
1459
+ ccp Chakma
1460
+ pof Poke
1461
+ seg Segeju
1462
+ nos Nisu, Eastern
1463
+ abt Ambulas
1464
+ llc Lele
1465
+ sbe Saliba
1466
+ khz Keapara
1467
+ yup Yukpa
1468
+ khw Khowar
1469
+ bjn Banjar
1470
+ kyg Keyagana
1471
+ tab Tabasaran
1472
+ wci Gbe, Waci
1473
+ llg Lole
1474
+ lig Ligbi
1475
+ tcz Chin, Thado
1476
+ tog Tonga
1477
+ bqi Bakhtiâri
1478
+ psa Awyu, Asue
1479
+ knx Kendayan
1480
+ wat Kaninuwa
1481
+ xem Kembayan
1482
+ suv Puroik
1483
+ hix Hixkaryána
1484
+ bmf Bom-Kim
1485
+ bkx Baikeno
1486
+ imo Imbongu
1487
+ cjs Shor
1488
+ cto Embera Catío
1489
+ nyk Nyaneka
1490
+ tet Tetun
1491
+ slu Selaru
1492
+ xmc Makhuwa-Marrevone
1493
+ knu Kono
1494
+ rgu Rikou
1495
+ bgz Banggai
1496
+ zam Zapotec, Miahuatlán
1497
+ xdn
1498
+ iru Irula
1499
+ mbp Malayo
1500
+ ymm Maay
1501
+ kuj Kuria
1502
+ bfg Kayan, Busang
1503
+ thq Tharu, Mid-Eastern
1504
+ otd Ot Danum
1505
+ tnv Tangchangya
1506
+ esg Gondi, Aheri
1507
+ ajg Aja
1508
+ dwy Dhuwaya
1509
+ yrl Nhengatu
1510
+ kud ’Auhelawa
1511
+ mau Mazatec, Huautla
1512
+ loe Saluan
1513
+ kiw Kiwai, Northeast
1514
+ zin Zinza
1515
+ bbr Girawa
1516
+ srb Sora
1517
+ gup Gunwinggu
1518
+ pht Phu Thai
1519
+ ztg Zapotec, Xanaguía
1520
+ tpa Taupota
1521
+ blr Blang
1522
+ awi Aekyom
1523
+ pgg Pangwali
1524
+ snk Soninke
1525
+ nni Nuaulu, North
1526
+ hts Hadza
1527
+ scg Sanggau
1528
+ xdo Kwandu
1529
+ adq Adangbe
1530
+ cnk Chin, Khumi
1531
+ nza Mbembe, Tigon
1532
+ agg Angor
1533
+ ina Interlingua (International Auxiliary Language Association)
1534
+ maq Mazatec, Chiquihuitlán
1535
+ blo Anii
1536
+ ctp Chatino, Western Highland
1537
+ lbf Tinani
1538
+ xta Mixtec, Alcozauca
1539
+ tix Tiwa, Southern
1540
+ mee Mengen
1541
+ dnn Dzùùngoo
1542
+ kap Bezhta
1543
+ ssy Saho
1544
+ yon Yongkom
1545
+ tlr Talise
1546
+ duc Duna
1547
+ tro Naga, Tarao
1548
+ tth Ta’oih, Upper
1549
+ kpo Ikposo
1550
+ nuf Nusu
1551
+ pbo Papel
1552
+ lla Lala-Roba
1553
+ mki Dhatki
1554
+ ckt Chukchi
1555
+ pri Paicî
1556
+ pnb Punjabi, Western
1557
+ rah Rabha
1558
+ fli Fali Muchella
1559
+ eto Eton
1560
+ beu Blagar
1561
+ xsq Makhuwa-Saka
1562
+ bhw Biak
1563
+ atd Manobo, Ata
1564
+ zpv Zapotec, Chichicapan
1565
+ sza Semelai
1566
+ bob Aweer
1567
+ afz Obokuitai
1568
+ mui Musi
1569
+ tkt Tharu, Kathariya
1570
+ phr Pahari-Potwari
1571
+ bha Bharia
1572
+ tdt Tetun Dili
1573
+ ton Tongan
1574
+ nwi Tanna, Southwest
1575
+ olu Kuvale
1576
+ mxx Mahou
1577
+ uki Kui
1578
+ mgp Magar, Eastern
1579
+ zgb Zhuang, Guibei
1580
+ bxr Buriat, Russia
1581
+ tsj Tshangla
1582
+ gwn Gwandara
1583
+ bon Bine
1584
+ enq Enga
1585
+ qxp Quechua, Puno
1586
+ bji Burji
1587
+ onr One, Northern
1588
+ xky Uma’ Lasan
1589
+ awu Awyu, Central
1590
+ kvo Dobel
1591
+ xav Xavánte
1592
+ yiu Awu
1593
+ sdq Semandang
1594
+ pdu Kayan
1595
+ vaa Vaagri Booli
1596
+ shr Shi
1597
+ kvw Wersing
1598
+ mvv Murut, Tahol
1599
+ blb Bilua
1600
+ ckh Chak
1601
+ kei Kei
1602
+ jml Jumli
1603
+ knl Keninjal
1604
+ tpr Tuparí
1605
+ pwo Karen, Pwo Western
1606
+ dgc Agta, Casiguran Dumagat
1607
+ bug Bugis
1608
+ age Angal
1609
+ kmw Komo
1610
+ sei Seri
1611
+ cbn Nyahkur
1612
+ ria Riang
1613
+ asy Asmat, Yaosakor
1614
+ nes Kinnauri, Bhoti
1615
+ mrr Maria
1616
+ oyb Oy
1617
+ vah Varhadi-Nagpuri
1618
+ gnk ǁGana
1619
+ gah Alekano
1620
+ ghe Ghale, Southern
1621
+ aoj Mufian
1622
+ kps Tehit
1623
+ tpx Me’phaa, Acatepec
1624
+ jab Hyam
1625
+ vaj Northwestern !Kung
1626
+ sie Simaa
1627
+ pcf Paliyan
1628
+ itl Itelmen
1629
+ gld Nanai
1630
+ hmd Miao, Large Flowery
1631
+ skx Seko Padang
1632
+ yoy Yoy
1633
+ dhw Danuwar
1634
+ sbu Stod Bhoti
1635
+ bun Sherbro
1636
+ khb Lü
1637
+ leu Kara
1638
+ kas Kashmiri
1639
+ hii Hinduri
1640
+ djo Jangkang
1641
+ krn Sapo
1642
+ bap Bantawa
1643
+ iii Nuosu
1644
+ row Dela-Oenale
1645
+ brx Boro
1646
+ lir Liberian English
1647
+ apz Safeyoka
1648
+ ssw Swati
1649
+ kib Koalib
1650
+ bmb Bembe
1651
+ cao Chácobo
1652
+ nbe Naga, Konyak
1653
+ jna Jangshung
1654
+ kca Khanty
1655
+ zyn Zhuang, Yongnan
1656
+ kpy Koryak
1657
+ peg Pengo
1658
+ tnl Lenakel
1659
+ nti Natioro
1660
+ gaj Gadsup
1661
+ lep Lepcha
1662
+ mxn Moi
1663
+ dry Darai
1664
+ kmc Dong, Southern
1665
+ kup Kunimaipa
1666
+ tqo Toaripi
1667
+ kqb Kovai
1668
+ ksd Kuanua
1669
+ hea Miao, Northern Qiandong
1670
+ pcc Bouyei
1671
+ dre Dolpo
1672
+ mxj Miju-Mishmi
1673
+ lyn Luyana
1674
+ kxv Kuvi
1675
+ cns Asmat, Central
1676
+ aix Aighon
1677
+ rwr Marwari
1678
+ anu Anuak
1679
+ aso Dano
1680
+ ino Inoke-Yate
1681
+ ncm Nambo
1682
+ kfq Korku
1683
+ dhn Dangi
1684
+ nii Nii
1685
+ bzf Boikin
1686
+ srl Isirawa
1687
+ bpe Bauni
1688
+ ong Olo
1689
+ mho Mashi
1690
+ sdo Bidayuh Serian
1691
+ kfv Kurmukar
1692
+ cch Atsam
1693
+ agx Aghul
1694
+ ewo Ewondo
1695
+ dta Daur
1696
+ mlu To’abaita
1697
+ zik Zimakani
1698
+ yom Yombe
1699
+ lae Pattani
1700
+ wbr Wagdi
1701
+ dar Dargwa
1702
+ mrm Mwerlap
1703
+ hmt Hamtai
1704
+ vay Wayu
1705
+ dib Dinka, South Central
1706
+ cdm Chepang
1707
+ ola Walungge
1708
+ yiz Azhe
1709
+ lri Olumarachi
1710
+ xmz Mori Bawah
1711
+ tpj Ñandeva
1712
+ kgp Kaingang
1713
+ bcf Bamu
1714
+ wib Toussian, Southern
1715
+ mji Kim Mun
1716
+ fwe Fwe
1717
+ apw Apache, Western
1718
+ xri Krikati-Timbira
1719
+ thr Tharu, Rana
1720
+ afe Utugwang-Irungene-Afrike
1721
+ gea Geruma
1722
+ gwj ǀGwi
1723
+ kai Karekare
1724
+ sgp Singpho
1725
+ ahl Igo
1726
+ pav Pakaásnovos
1727
+ zzj Zhuang, Zuojiang
1728
+ sip Sikkimese
1729
+ ybi Yamphu
1730
+ cli Chakali
1731
+ xtl Mixtec, Tijaltepec
1732
+ cro Crow
1733
+ pmi Pumi, Northern
1734
+ nmi Nyam
1735
+ kcl Kala
1736
+ ish Esan
1737
+ rab Chamling
1738
+ kvf Kabalai
1739
+ kwv Kaba Naa, Sara
1740
+ bwi Baniwa
1741
+ mrd Magar, Western
1742
+ kfk Kinnauri
1743
+ cfa Dikaka
1744
+ pex Petats
1745
+ aly Alyawarr
1746
+ lot Otuho
1747
+ twe Teiwa
1748
+ ygr Yagaria
1749
+ afu Awutu
1750
+ gol Gola
1751
+ dhd Dhundari
1752
+ bku Buhid
1753
+ ppt Pa
1754
+ ulu Uma’ Lung
1755
+ syw Syuba
1756
+ ekg Ekari
1757
+ boq Bogaya
1758
+ tsx Mubami
1759
+ stt Stieng, Budeh
1760
+ kwl Kofyar
1761
+ bzy Abanglekuo
1762
+ mjc Mixtec, San Juan Colorado
1763
+ tnp Whitesands
1764
+ njb Naga, Nocte
1765
+ mle Manambu
1766
+ ram Canela
1767
+ bas Basaa
1768
+ kjp Karen, Pwo Eastern
1769
+ shj Shatt
1770
+ hut Humla
1771
+ pud Punan Aput
1772
+ att Atta, Pamplona
1773
+ wbm Wa, Vo
1774
+ xuj Kurumba, Jennu
1775
+ bhj Bahing
1776
+ dhm Dhimba
1777
+ les Lese
1778
+ amn Amanab
1779
+ ass Ipulo
1780
+ kge Komering
1781
+ bwx Bunu, Bu-Nao
1782
+ onp Sartang
1783
+ nmo Naga, Moyon
1784
+ gju Gujari
1785
+ haz Hazaragi
1786
+ snx Sam
1787
+ bfb Bareli, Pauri
1788
+ kyo Klon
1789
+ tdf Talieng
1790
+ mgm Mambae
1791
+ swv Shekhawati
1792
+ blk Pa’o
1793
+ kqm Khisa
1794
+ ikx Ik
1795
+ yig Nasu, Wusa
1796
+ twh Tai Dón
1797
+ tjg Tunjung
1798
+ kpb Kurumba, Mullu
1799
+ kzs Sugut Dusun
1800
+ szb Ngalum
1801
+ ysn Sani
1802
+ bzz Evant
1803
+ nbu Naga, Rongmei
1804
+ cgk Chocangacakha
1805
+ kbd Kabardian
1806
+ cua Cua
1807
+ ntp Tepehuan, Northern
1808
+ zpj Zapotec, Quiavicuzas
1809
+ aii Assyrian Neo-Aramaic
1810
+ kpr Korafe-Yegha
1811
+ tpu Tampuan
1812
+ mfc Mba
1813
+ xra Krahô
1814
+ aai Miniafia Oyan
1815
+ shg Shua
1816
+ brg Baure
1817
+ tsg Tausug
1818
+ giw Duoluo
1819
+ myl Moma
1820
+ mks Mixtec, Silacayoapan
1821
+ say Saya
1822
+ goj Gowlan
1823
+ ywq Yi, Wuding-Luquan
1824
+ tsr Akei
1825
+ niq Nandi
1826
+ mtr Mewari
1827
+ lml Hano
1828
+ wtm Mewati
1829
+ mde Maba
1830
+ cik Kinnauri, Chitkuli
1831
+ dwz Dewas Rai
1832
+ uar Tairuma
1833
+ ian Iatmul
1834
+ lar Larteh
1835
+ ttr Tera
1836
+ dby Dibiyaso
1837
+ pah Tenharim
1838
+ wlv Bermejo Wichí
1839
+ mpr Vangunu
1840
+ uth ut-Hun
1841
+ krv Kavet
1842
+ mrg Mising
1843
+ grv Grebo, Central
1844
+ bpx Bareli, Palya
1845
+ dob Dobu
1846
+ knv Tabo
1847
+ scp Hyolmo
1848
+ shy Tachawit
1849
+ lbe Lak
1850
+ sya Siang
1851
+ loy Lhowa
1852
+ cux Cuicatec, Tepeuxila
1853
+ ybh Yakkha
1854
+ sso Essono
1855
+ ztp Zapotec, Loxicha
1856
+ jul Jirel
1857
+ kgq Kamoro
1858
+ dao Chin, Daai
1859
+ wad Wamesa
1860
+ mnz Moni
1861
+ kbc Kadiwéu
1862
+ agw Kahua
1863
+ wmt Walmajarri
1864
+ bco Kaluli
1865
+ pkh Pangkhua
1866
+ meu Motu
1867
+ gjk Koli, Kachi
1868
+ uss us-Saare
1869
+ raa Dungmali
1870
+ nkb Naga, Khoibu
1871
+ aau Abau
1872
+ bde Bade
1873
+ mzr Marubo
1874
+ sax Sa
1875
+ txo Toto
1876
+ mte Mono
1877
+ sdp Sherdukpen
1878
+ hmo Motu, Hiri
1879
+ gdb Gadaba, Pottangi Ollar
1880
+ tic Tira
1881
+ mdk Mangbutu
1882
+ baa Babatana
1883
+ sjp Surjapuri
1884
+ kun Kunama
1885
+ kbl Kanembu
1886
+ mql Mbelime
1887
+ qud Quichua, Calderón Highland
1888
+ lpo Lipo
1889
+ arr Karo
1890
+ kty Kango
1891
+ klw Tado
1892
+ mke Mawchi
1893
+ nfu Mfumte
1894
+ soi Sonha
1895
+ tar Tarahumara, Central
1896
+ xub Kurumba, Betta
1897
+ klz Kabola
1898
+ lra Bakati’, Rara
1899
+ mxu Mada
1900
+ kwx Khirwar
1901
+ mdr Mandar
1902
+ hoe Horom
1903
+ lsr Aruop
1904
+ mbz Mixtec, Amoltepec
1905
+ lbq Wampar
1906
+ mdd Mbum
1907
+ plj Polci
1908
+ all Allar
1909
+ kjo Kinnauri, Pahari
1910
+ xmt Matbat
1911
+ kft Kanjari
1912
+ mcf Matses
1913
+ tbf Mandara
1914
+ sif Siamou
1915
+ tio Teop
1916
+ tcy Tulu
1917
+ lnu Longuda
1918
+ ica Ede Ica
1919
+ bpp Kaure
1920
+ juk Wapan
1921
+ shb Ninam
1922
+ grj Grebo, Southern
1923
+ bec Iceve-Maci
1924
+ mvg Mixtec, Yucuañe
1925
+ cnb Chin, Uppu
1926
+ skj Seke
1927
+ noe Nimadi
1928
+ tba Aikanã
1929
+ sly Selayar
1930
+ dot Dass
1931
+ sfm Miao, Small Flowery
1932
+ yss Yessan-Mayo
1933
+ blw Balangao
1934
+ slr Salar
1935
+ soa Thai Song
1936
+ bla Blackfoot
1937
+ tan Tangale
1938
+ bns Bundeli
1939
+ xtc Katcha-Kadugli-Miri
1940
+ nmf Naga, Tangkhul
1941
+ grd Guruntum-Mbaaru
1942
+ amr Amarakaeri
1943
+ puu Punu
1944
+ mlm Mulam
1945
+ lec Leco
1946
+ bcs Hohumono
1947
+ byn Bilen
1948
+ ott Otomi, Temoaya
1949
+ arv Arbore
1950
+ xkk Kachok
1951
+ mjg Tu
1952
+ pnq Pana
1953
+ asc Asmat, Casuarina Coast
1954
+ aks Akaselem
1955
+ mmg Ambrym, North
1956
+ tld Talaud
1957
+ bkq Bakairí
1958
+ ort Oriya, Adivasi
1959
+ kxz Kerewo
1960
+ kwj Kwanga
1961
+ cub Cubeo
1962
+ eja Jola-Felupe
1963
+ wbl Wakhi
1964
+ uri Urim
1965
+ zua Zeem
1966
+ kjd Kiwai, Southern
1967
+ ruk Kuce
1968
+ lbk Bontok, Central
1969
+ bfw Bondo
1970
+ jao Yanyuwa
1971
+ hca Andaman Hindi Creole
1972
+ ssx Samberigi
1973
+ ldl Kaan
1974
+ byx Qaqet
1975
+ nku Kulango, Bouna
1976
+ gec Grebo, Gboloo
1977
+ zlj Zhuang, Liujiang
1978
+ bge Bauria
1979
+ btu Batu
1980
+ nlx Nahali
1981
+ hmr Hmar
1982
+ tcu Tarahumara, Southeastern
1983
+ lax Tiwa
1984
+ lhm Lhomi
1985
+ kdp Nikyob-Nindem
1986
+ tes Tengger
1987
+ mdb Morigi
1988
+ msi Malay, Sabah
1989
+ rog Roglai, Northern
1990
+ jda Jad
1991
+ zpa Zapotec, Lachiguiri
1992
+ poc Poqomam
1993
+ mgu Magi
1994
+ nnu Dwang
1995
+ kui Kuikúro-Kalapálo
1996
+ llp Efate, North
1997
+ kxj Kulfa
1998
+ mjz Majhi
1999
+ jms Mashi
2000
+ nto Ntomba
2001
+ hsn Chinese, Xiang
2002
+ bhu Bhunjia
2003
+ nfd Ahwai
2004
+ ksg Kusaghe
2005
+ kzr Karang
2006
+ lyg Lyngngam
2007
+ prp Parsi
2008
+ lle Lele
2009
+ kex Kukna
2010
+ brh Brahui
2011
+ bkk Brokskat
2012
+ wuu Chinese, Wu
2013
+ gry Grebo, Barclayville
2014
+ bgp Balochi, Eastern
2015
+ pai Pye
2016
+ cta Chatino, Tataltepec
2017
+ cog Chong
2018
+ oro Orokolo
2019
+ pug Phuie
2020
+ swi Sui
2021
+ inj Inga, Jungle
2022
+ wmo Wom
2023
+ kcv Kete
2024
+ cna Changthang
2025
+ xkf Khengkha
2026
+ jer Jere
2027
+ bca Bai, Central
2028
+ kua Oshiwambo
2029
+ roh Romansh
2030
+ mxe Mele-Fila
2031
+ jmn Naga, Makuri
2032
+ dus Dumi
2033
+ ssk Sunam
2034
+ bqg Bago-Kusuntu
2035
+ pwr Powari
2036
+ jbj Arandai
2037
+ yet Yetfa
2038
+ lhi Lahu Shi
2039
+ aar Afar
2040
+ ksu Khamyang
2041
+ mxy Mixtec, Southeastern Nochixtlán
2042
+ tcn Tichurong
2043
+ lmx Laimbue
2044
+ xua Kurumba, Alu
2045
+ khr Kharia
2046
+ zyj Zhuang, Youjiang
2047
+ mng Mnong, Eastern
2048
+ roo Rotokas
2049
+ anr Andh
2050
+ mdv Mixtec, Santa Lucía Monteverde
2051
+ msm Manobo, Agusan
2052
+ nbl Ndebele
2053
+ cin Cinta Larga
2054
+ sjl Miji
2055
+ saw Sawi
2056
+ xkz Kurtokha
2057
+ npb Nupbikha
2058
+ cnc Côông
2059
+ muk Mugom
2060
+ foi Foi
2061
+ sqq Sou
2062
+ tdd Tai Nüa
2063
+ kil Kariya
2064
+ bma Lame
2065
+ dad Marik
2066
+ bix Bijori
2067
+ nao Naaba
2068
+ pwb Panawa
2069
+ bhx Bhalay
2070
+ aro Araona
2071
+ qwa Quechua, Corongo Ancash
2072
+ gga Gao
2073
+ zau Zangskari
2074
+ brt Bitare
2075
+ tyz Tày
2076
+ keu Akebu
2077
+ anm Anal
2078
+ lro Laro
2079
+ ssb Sama, Southern
2080
+ der Deori
2081
+ kad Adara
2082
+ esk Inupiatun, Northwest Alaska
2083
+ clo Chontal, Lowland Oaxaca
2084
+ bli Bolia
2085
+ tuz Turka
2086
+ bra Braj Bhasha
2087
+ nnm Namia
2088
+ sui Suki
2089
+ tgs Nume
2090
+ gbe Niksek
2091
+ xwe Gbe, Xwela
2092
+ kfp Korwa
2093
+ apt Apatani
2094
+ dzl Dzalakha
2095
+ mpq Matís
2096
+ hal Halang
2097
+ bio Nai
2098
+ jib Jibu
2099
+ kph Kplang
2100
+ hia Lamang
2101
+ yij Yindjibarndi
2102
+ chq Chinantec, Quiotepec
2103
+ xbi Kombio
2104
+ mpc Mangarrayi
2105
+ ebo Teke-Eboo
2106
+ tcs Torres Strait Creole
2107
+ kvi Kwang
2108
+ zyg Zhuang, Yang
2109
+ bww Bwa
2110
+ kpl Kpala
2111
+ hoy Holiya
2112
+ nhp Nahuatl, Isthmus-Pajapan
2113
+ abo Abon
2114
+ dai Day
2115
+ zom Zo
2116
+ lea Lega-Shabunda
2117
+ kej Kadar
2118
+ aup Makayam
2119
+ tcx Toda
2120
+ kmi Kami
2121
+ jio Jiamao
2122
+ bhd Bhadrawahi
2123
+ cav Cavineña
2124
+ bda Bayot
2125
+ ppq Pefiyahe
2126
+ bbk Babanki
2127
+ apu Apurinã
2128
+ ahr Ahirani
2129
+ wsi Wusi
2130
+ tdj Tajio
2131
+ myu Mundurukú
2132
+ kzq Kaike
2133
+ bfu Gahri
2134
+ sgh Shughni
2135
+ kfg Kudiya
2136
+ bcn Bali
2137
+ ygw Yagwoia
2138
+ ttv Titan
2139
+ iyo Mesaka
2140
+ pcn Abishi
2141
+ lkt Lakota
2142
+ aim Aimol
2143
+ tcf Me’phaa, Malinaltepec
2144
+ fod Foodo
2145
+ phk Phake
2146
+ scu Shumcho
2147
+ lch Luchazi
2148
+ nbm Ngbaka Ma’bo
2149
+ bei Bakati’
2150
+ jid Bu
2151
+ sce Dongxiang
2152
+ noi Noiri
2153
+ hmj Ge
2154
+ tyr Tai Daeng
2155
+ rop Kriol
2156
+ tsv Tsogo
2157
+ nbr Numana
2158
+ kvx Koli, Parkari
2159
+ ums Pendau
2160
+ dka Dakpakha
2161
+ alu ’Are’are
2162
+ pid Piaroa
2163
+ mab Mixtec, Yutanduchi
2164
+ gaq Gata’
2165
+ kgy Kyerung
2166
+ abs Malay, Ambonese
2167
+ alk Alak
2168
+ gdn Umanakaina
2169
+ ths Thakali
2170
+ khn Khandesi
2171
+ gaw Nobonob
2172
+ aac Ari
2173
+ tvd Tsuvadi
2174
+ bkr Bakumpai
2175
+ xkb Nago, Northern
2176
+ aot Atong
2177
+ lmn Lambadi
2178
+ kgr Abun
2179
+ moc Mocoví
2180
+ mbk Malol
2181
+ sss So
2182
+ dbv Dungu
2183
+ ngt Kriang
2184
+ tja Tajuasohn
2185
+ kif Kham, Eastern Parbate
2186
+ okv Orokaiva
2187
+ qvi Quichua, Imbabura Highland
2188
+ esu Yupik, Central
2189
+ bby Befang
2190
+ koi Komi-Permyak
2191
+ cvg Chug
2192
+ gdr Wipi
2193
+ kxp Koli, Wadiyari
2194
+ mme Mae
2195
+ pmj Pumi, Southern
2196
+ suy Suyá
2197
+ vas Vasavi
2198
+ suo Bouni
2199
+ nbc Naga, Chang
2200
+ bvr Burarra
2201
+ tts Thai, Northeastern
2202
+ diu Gciriku
2203
+ ndx Nduga
2204
+ bkl Berik
2205
+ lhp Lhokpu
2206
+ alf Elege
2207
+ wog Wogamusin
2208
+ bxa Bauro
2209
+ xwl Gbe, Western Xwla
2210
+ jae Yabem
2211
+ xbr Kambera
2212
+ bwd Bwaidoka
2213
+ nar Iguta
2214
+ dcc Deccan
2215
+ bjx Kalinga, Vanaw
2216
+ yes Nyankpa
2217
+ kul Kulere
2218
+ ssi Sansi
2219
+ hre Hre
2220
+ mtt Mota
2221
+ ysp Lolopo, Southern
2222
+ auc Waorani
2223
+ thy Tha
2224
+ dza Tunzuii
2225
+ tkb Buksa
2226
+ lkr Päri
2227
+ skn Subanon, Kolibugan
2228
+ tgd Ciwogai
2229
+ myp Pirahã
2230
+ eve Even
2231
+ bgg Bugun
2232
+ ril Riang Lang
2233
+ dbm Duguri
2234
+ bew Betawi
2235
+ aps Orop
2236
+ aon Weri
2237
+ dub Dubli
2238
+ hld Halang Doan
2239
+ jwi Jwira-Pepesa
2240
+ ayg Ginyanga
2241
+ wno Wano
2242
+ bfr Bazigar
2243
+ kpk Kpan
2244
+ bcg Baga Pokur
2245
+ avt Au
2246
+ nke Duke
2247
+ stk Aramba
2248
+ mkz Makasae
2249
+ hms Miao, Southern Qiandong
2250
+ duh Dungra Bhil
2251
+ scl Shina
2252
+ bfm Mmen
2253
+ ctl Chinantec, Tlacoatzintepec
2254
+ kra Kumal
2255
+ hmg Miao, Southwestern Guiyang
2256
+ zay Zayse
2257
+ faa Fasu
2258
+ lpn Naga, Long Phuri
2259
+ bqv Koro Wachi
2260
+ mpt Mian
2261
+ zak Zanaki
2262
+ pne Penan, Western
2263
+ apn Apinayé
2264
+ sbx Seberuang
2265
+ anp Angika
2266
+ bdv Bodo Parja
2267
+ juy Juray
2268
+ dso Desiya
2269
+ ndd Nde-Nsele-Nta
2270
+ ich Etkywan
2271
+ bkc Baka
2272
+ lez Lezgi
2273
+ lsh Lish
2274
+ mig Mixtec, San Miguel el Grande
2275
+ bdi Burun
2276
+ buu Budu
2277
+ ktn Karitiâna
2278
+ lbo Laven
2279
+ spn Sanapaná
2280
+ kgj Kham, Gamal
2281
+ kky Guugu Yimidhirr
2282
+ bjj Kanauji
2283
+ hve Huave, San Dionisio del Mar
2284
+ ghs Guhu-Samane
2285
+ vav Varli
2286
+ pih Pitcairn-Norfolk
2287
+ pcg Paniya
2288
+ ldj Lemoro
2289
+ brr Birao
2290
+ emn Eman
2291
+ lhl Lohar, Lahul
2292
+ pnc Pannei
2293
+ mnl Tiale
2294
+ ncq Katang, Northern
2295
+ xac Kachari
2296
+ xsn Sanga
2297
+ muz Mursi
2298
+ gwd Ale
2299
+ saf Safaliba
2300
+ dir Dirim
2301
+ dmg Kinabatangan, Upper
2302
+ isu Isu
2303
+ tpq Kinnauri, Chhoyul
2304
+ yuf Havasupai-Walapai-Yavapai
2305
+ oub Glio-Oubi
2306
+ ngn Ngwo
2307
+ fai Faiwol
2308
+ moi Mboi
2309
+ muo Mubako
2310
+ cih Chinali
2311
+ wew Wejewa
2312
+ luj Luna
2313
+ lkh Lakha
2314
+ wti Berta
2315
+ mse Musey
2316
+ bwo Borna
2317
+ nxr Ninggerum
2318
+ gru Kistane
2319
+ wiu Witu
2320
+ ndr Ndoola
2321
+ kmo Kwoma
2322
+ ksm Kumba
2323
+ ggb Gbii
2324
+ tqu Touo
2325
+ gia Kija
2326
+ aol Alor
2327
+ ute Ute-Southern Paiute
2328
+ xtj Mixtec, San Juan Teita
2329
+ khj Kuturmi
2330
+ bvh Bure
2331
+ kwc Likwala
2332
+ doz Dorze
2333
+ kga Koyaga
2334
+ cqd Miao, Chuanqiandian Cluster
2335
+ cjv Chuave
2336
+ hmb Songhay, Humburi Senni
2337
+ nac Narak
2338
+ iws Iwam, Sepik
2339
+ kxw Konai
2340
+ kmy Koma
2341
+ tww Tuwari
2342
+ arg Aragonese
2343
+ tig Tigré
2344
+ irx Kamberau
2345
+ ktv Katu, Eastern
2346
+ cdh Chambeali
2347
+ tis Itneg, Masadiit
2348
+ yeu Yerukula
2349
+ nzy Nzakambay
2350
+ drg Rungus
2351
+ wau Waurá
2352
+ mln Malango
2353
+ rmb Rembarrnga
2354
+ ldb Duya
2355
+ mjs Miship
2356
+ baw Bambili-Bambui
2357
+ dmo Kemedzung
2358
+ qxs Qiang, Southern
2359
+ kjq Keres, Western
2360
+ kwa Dâw
2361
+ azo Awing
2362
+ cjk Chokwe
2363
+ jeh Jeh
2364
+ drs Gedeo
2365
+ arh Arhuaco
2366
+ zdj Comorian, Ngazidja
2367
+ yaq Yaqui
2368
+ gyz Gyaazi
2369
+ fir Firan
2370
+ hbn Heiban
2371
+ ayb Gbe, Ayizo
2372
+ yde Yangum Dey
2373
+ gby Gbari
2374
+ byc Ubaghara
2375
+ bac Badui
2376
+ nhb Beng
2377
+ nms Letemboi
2378
+ pll Palaung, Shwe
2379
+ bwe Karen, Bwe
2380
+ ibb Ibibio
2381
+ agl Fembe
2382
+ nnp Naga, Wancho
2383
+ wmd Mamaindê
2384
+ kmt Kemtuik
2385
+ wja Waja
2386
+ bol Bole
2387
+ bhf Odiai
2388
+ xty Mixtec, Yoloxóchitl
2389
+ sgz Sursurunga
2390
+ apj Apache, Jicarilla
2391
+ drd Darmiya
2392
+ mqu Mandari
2393
+ brd Baram
2394
+ oym Wayampi
2395
+ uis Uisai
2396
+ eot Beti
2397
+ zpk Zapotec, Tlacolulita
2398
+ lbn Rmeet
2399
+ nqg Nago, Southern
2400
+ sme Saami, North
2401
+ zaz Zari
2402
+ sen Sénoufo, Nanerigé
2403
+ pca Popoloca, Santa Inés Ahuatempan
2404
+ biz Baloi
2405
+ brb Brao
2406
+ ppo Folopa
2407
+ amb Ambo
2408
+ krx Karon
2409
+ kwn Kwangali
2410
+ yiq Miqie
2411
+ gmb Gula’alaa
2412
+ res Reshe
2413
+ plc Palawano, Central
2414
+ bab Bainouk-Gunyuño
2415
+ kvb Kubu
2416
+ ymk Makwe
2417
+ nxk Naga, Kokak
2418
+ nut Nung
2419
+ dio Dibo
2420
+ tva Vaghua
2421
+ aez Aeka
2422
+ aoe Angal Enen
2423
+ bqh Baima
2424
+ otx Otomi, Texcatepec
2425
+ gdf Guduf-Gava
2426
+ mfl Putai
2427
+ adl Adi, Galo
2428
+ yay Agwagwune
2429
+ gas Garasia, Adiwasi
2430
+ aio Aiton
2431
+ tkx Tangko
2432
+ brf Bira
2433
+ usi Usoi
2434
+ vmz Mazatec, Mazatlán
2435
+ hru Hruso
2436
+ nja Nzanyi
2437
+ mfn Mbembe, Cross River
2438
+ ekr Yace
2439
+ nud Gala
2440
+ otr Otoro
2441
+ fie Fyer
2442
+ kwe Kwerba
2443
+ mgb Mararit
2444
+ yno Yong
2445
+ bef Benabena
2446
+ nux Mehek
2447
+ sto Stoney
2448
+ fqs Momu
2449
+ sbn Sindhi Bhil
2450
+ liq Libido
2451
+ jbm Bijim
2452
+ bfh Blafe
2453
+ isi Nkem-Nkum
2454
+ vig Viemo
2455
+ heg Helong
2456
+ kvl Kayaw
2457
+ thz Tamajeq, Tayart
2458
+ rin Nungu
2459
+ nco Sibe
2460
+ siw Motuna
2461
+ vmc Mixtec, Juxtlahuaca
2462
+ lev Pantar, Western
2463
+ mvn Minaveha
2464
+ tpl Me’phaa, Tlacoapa
2465
+ uiv Iyive
2466
+ pua Purepecha, Western Highland
2467
+ rnd Ruund
2468
+ cjm Cham, Eastern
2469
+ sym Samo, Maya
2470
+ bbt Mburku
2471
+ gvn Kuku-Yalanji
2472
+ kbx Ap Ma
2473
+ nsm Naga, Sumi
2474
+ bys Burak
2475
+ tlf Telefol
2476
+ mzq Mori Atas
2477
+ pck Chin, Paite
2478
+ hoo Holoholo
2479
+ wrm Warumungu
2480
+ tek Teke, Ibali
2481
+ zkr Zakhring
2482
+ ywl Lalu, Western
2483
+ mjw Karbi
2484
+ kmn Awtuw
2485
+ khs Kasua
2486
+ bnj Bangon
2487
+ mfd Mendankwe-Nkwen
2488
+ pqa Pa’a
2489
+ swo Shanenawa
2490
+ los Loniu
2491
+ nma Naga, Maram
2492
+ dgz Daga
2493
+ stj Samo, Matya
2494
+ ayu Ayu
2495
+ mxs Mixtec, Huitepec
2496
+ bpn Dzao Min
2497
+ tlx Khehek
2498
+ nbn Kuri
2499
+ ynq Yendang
2500
+ grh Tugbiri-Niragu
2501
+ juo Jiba
2502
+ amu Amuzgo, Guerrero
2503
+ myw Muyuw
2504
+ ybj Hasha
2505
+ hio Tshuwau
2506
+ kix Naga, Khiamniungan
2507
+ pma Paama
2508
+ bej Bedawiyet
2509
+ dni Dani, Lower Grand Valley
2510
+ naq Khoekhoe
2511
+ mrq Marquesan, North
2512
+ mrn Cheke Holo
2513
+ dgh Dghwede
2514
+ bau Mbat
2515
+ ite Itene
2516
+ crw Chrau
2517
+ ndb Kenswei Nsei
2518
+ nuk Nuu-chah-nulth
2519
+ dnd Daonda
2520
+ nlu Nchumbulu
2521
+ sge Segai
2522
+ can Chambri
2523
+ sre Bakati’, Sara
2524
+ kfr Kacchi
2525
+ hul Vula’a
2526
+ kid Koshin
2527
+ cyo Cuyonon
2528
+ ykm Kap
2529
+ ktm Kurti
2530
+ bsf Bauchi
2531
+ pio Piapoco
2532
+ kkc Odoodee
2533
+ thm Aheu
2534
+ xkn Kayan, Kayan River
2535
+ gfk Patpatar
2536
+ gel ut-Ma’in
2537
+ bsh Kati
2538
+ pmq Pame, Northern
2539
+ bfj Bafanji
2540
+ xwg Kwegu
2541
+ sng Sanga
2542
+ szp Suabo
2543
+ fvr Fur
2544
+ zwa Zay
2545
+ svs Savosavo
2546
+ chw Chuwabu
2547
+ nlv Nahuatl, Orizaba
2548
+ bsp Baga Sitemu
2549
+ bdl Bajau, Indonesian
2550
+ khy Kele
2551
+ ito Itonama
2552
+ naj Nalu
2553
+ bdd Bunama
2554
+ emb Embaloh
2555
+ zps Zapotec, Coatlán
2556
+ kee Keres, Eastern
2557
+ ukw Ukwuani-Aboh-Ndoni
2558
+ ldm Landoma
2559
+ duw Dusun Witu
2560
+ mxp Mixe, Tlahuitoltepec
2561
+ zln Zhuang, Lianshan
2562
+ zns Mangas
2563
+ blf Buol
2564
+ ksn Kasiguranin
2565
+ prm Kibiri
2566
+ lmd Lumun
2567
+ lop Lopa
2568
+ yev Yeri
2569
+ kwk Kwakwala
2570
+ tcp Chin, Tawr
2571
+ int Intha
2572
+ clj Chin, Laitu
2573
+ jit Jita
2574
+ mgc Morokodo
2575
+ ags Esimbi
2576
+ tvu Tunen
2577
+ ghk Karen, Geko
2578
+ hue Huave, San Francisco del Mar
2579
+ kkn Kon Keu
2580
+ pbm Mazatec, Puebla and Northeastern
2581
+ snl Sangil
2582
+ jkp Karen, Paku
2583
+ yrk Nenets
2584
+ ciw Chippewa
2585
+ mlf Mal
2586
+ pym Pyam
2587
+ vrs Varisi
2588
+ nnd Ambae, West
2589
+ akg Anakalangu
2590
+ udi Udi
2591
+ kys Kayan, Baram
2592
+ lky Lokoya
2593
+ bui Bongili
2594
+ zkd Kadu
2595
+ ihp Iha
2596
+ cdr Kamuku
2597
+ anj Anor
2598
+ ndm Ndam
2599
+ lga Lungga
2600
+ hmw Miao, Western Mashan
2601
+ zkn Kanan
2602
+ bpz Bilba
2603
+ taw Tay
2604
+ mez Menominee
2605
+ wuv Wuvulu-Aua
2606
+ mkk Byep
2607
+ aki Aiome
2608
+ gue Gurindji
2609
+ bse Wushi
2610
+ dsq Tadaksahak
2611
+ spt Spiti Bhoti
2612
+ hoj Haroti
2613
+ aom Ömie
2614
+ mdt Mbere
2615
+ nbb Ndoe
2616
+ ape Bukiyip
2617
+ eky Kayah, Eastern
2618
+ itd Tidung, Southern
2619
+ mcc Bitur
2620
+ kzi Kelabit
2621
+ bhq Tukang Besi South
2622
+ dia Dia
2623
+ asb Assiniboine
2624
+ wyy Fijian, Western
2625
+ nna Nyangumarta
2626
+ twx Tewe
2627
+ mlq Maninkakan, Western
2628
+ uta Itang
2629
+ hmz Sinicized Miao
2630
+ aof Bragat
2631
+ rue Rusyn
2632
+ pbs Pame, Central
2633
+ kio Kiowa
2634
+ tdn Tondano
2635
+ snm Ma’di, Southern
2636
+ cod Kukama-Kukamiria
2637
+ cde Chenchu
2638
+ ppl Nahuat
2639
+ tdg Tamang, Western
2640
+ jmb Zumbun
2641
+ eit Eitiep
2642
+ wni Comorian, Ndzwani
2643
+ tlp Totonac, Filomena Mata-Coahuitlán
2644
+ ilk Bogkalot
2645
+ nri Naga, Chokri
2646
+ kyv Kewat
2647
+ scs Slavey, North
2648
+ kji Zabana
2649
+ tku Totonac, Upper Necaxa
2650
+ byp Bumaji
2651
+ xkt Kantosi
2652
+ kcc Lubila
2653
+ yuq Yuqui
2654
+ cho Choctaw
2655
+ hot Malei
2656
+ kku Tumi
2657
+ bmi Bagirmi
2658
+ wlc Comorian, Mwali
2659
+ auu Auye
2660
+ wle Wolane
2661
+ mmm Maii
2662
+ cdo Chinese, Min Dong
2663
+ nez Nez Perce
2664
+ ukp Bukpe
2665
+ kwo Kwomtari
2666
+ zpx Zapotec, San Baltazar Loxicha
2667
+ mlv Mwotlap
2668
+ ppm Papuma
2669
+ bqr Burusu
2670
+ wut Wutung
2671
+ tji Tujia, Northern
2672
+ bbq Bamali
2673
+ ttk Totoro
2674
+ ets Etsako
2675
+ yin Riang Lai
2676
+ gim Gimi
2677
+ kow Kugama
2678
+ ksa Shuwa-Zamani
2679
+ git Gitxsan
2680
+ erk Efate, South
2681
+ vmx Mixtec, Tamazola
2682
+ duv Duvle
2683
+ bgf Bangandu
2684
+ wms Wambon
2685
+ isd Isnag
2686
+ pmx Naga, Poumai
2687
+ doy Dompo
2688
+ nak Nakanai
2689
+ bze Bozo, Jenaama
2690
+ gis Giziga, North
2691
+ miu Mixtec, Cacaloxtepec
2692
+ bzu Burmeso
2693
+ ckx Caka
2694
+ duu Drung
2695
+ jmr Kamara
2696
+ lur Laura
2697
+ wlw Walak
2698
+ rar Cook Islands Maori
2699
+ osi Osing
2700
+ mmd Maonan
2701
+ kmm Kom
2702
+ kvr Kerinci
2703
+ ncr Nchane
2704
+ for Fore
2705
+ bgn Balochi, Western
2706
+ gnm Ginuman
2707
+ alx Amol
2708
+ xks Kumbewaha
2709
+ lkn Lakon
2710
+ mbl Maxakalí
2711
+ bri Mokpwe
2712
+ mov Mohave
2713
+ pot Potawatomi
2714
+ pnu Bunu, Jiongnai
2715
+ djm Dogon, Jamsay
2716
+ ula Fungwa
2717
+ nnj Nyangatom
2718
+ ybl Yukuben
2719
+ aab Arum
2720
+ has Haisla
2721
+ alh Alawa
2722
+ mea Menka
2723
+ pum Puma
2724
+ spo Spokane
2725
+ cyb Cayubaba
2726
+ nbh Ngamo
2727
+ ont Ontenu
2728
+ ahp Aizi, Aproumu
2729
+ bpy Bishnupuriya
2730
+ utr Etulo
2731
+ auk Heiyoho
2732
+ bdb Basap
2733
+ klo Kapya
2734
+ nrf Guernésiais
2735
+ tmn Taman
2736
+ mvo Marovo
2737
+ kla Klamath-Modoc
2738
+ jnl Rawat
2739
+ jad Jahanka
2740
+ hrm Miao, Horned
2741
+ hoa Hoava
2742
+ mus Muskogee
2743
+ dna Dani, Upper Grand Valley
2744
+ btg Bété, Gagnoa
2745
+ ngs Gvoko
2746
+ lmu Lamenu
2747
+ add Lidzonka
2748
+ pha Pa-Hng
2749
+ kvq Karen, Geba
2750
+ pch Pardhan
2751
+ bgs Tagabawa
2752
+ nir Nimboran
2753
+ bcy Bacama
2754
+ var Huarijío
2755
+ sjo Xibe
2756
+ jle Ngile
2757
+ cuv Cuvok
2758
+ smf Auwe
2759
+ cnq Chung
2760
+ bhh Bukharic
2761
+ dox Mositacha
2762
+ ior Inor
2763
+ oma Omaha-Ponca
2764
+ abz Abui
2765
+ kza Karaboro, Western
2766
+ rbb Palaung, Rumai
2767
+ bfq Badaga
2768
+ kht Khamti
2769
+ sps Saposa
2770
+ syk Sukur
2771
+ slp Lamaholot
2772
+ jax Malay, Jambi
2773
+ byo Biyo
2774
+ qvj Quichua, Loja Highland
2775
+ bnx Bangubangu
2776
+ ngw Ngwaba
2777
+ krf Koro
2778
+ loa Loloda
2779
+ cox Nanti
2780
+ wwo Dorig
2781
+ akc Mpur
2782
+ kal Greenlandic
2783
+ siu Sinagen
2784
+ aqm Atohwaim
2785
+ rmt Domari
2786
+ nhn Nahuatl, Central
2787
+ jum Jumjum
2788
+ nix Hema
2789
+ ncg Nisga’a
2790
+ ccl Cutchi-Swahili
2791
+ kvu Yinbaw
2792
+ tnb Tunebo, Western
2793
+ cpx Chinese, Pu-Xian
2794
+ kgo Krongo
2795
+ nxd Ngando
2796
+ coj Cochimi
2797
+ grx Muno
2798
+ bfs Bai, Southern
2799
+ cov Cao Miao
2800
+ cbj Ede Cabe
2801
+ loh Narim
2802
+ iry Iraya
2803
+ cky Cakfem-Mushere
2804
+ bsn Barasana-Eduria
2805
+ xkv Kgalagadi
2806
+ itz Itza’
2807
+ tgc Tigak
2808
+ boh Boma
2809
+ mck Mbunda
2810
+ ccg Samba Daka
2811
+ piy Piya-Kwonci
2812
+ how Honi
2813
+ pwm Molbog
2814
+ tds Doutai
2815
+ ldg Lenyima
2816
+ csa Chinantec, Chiltepec
2817
+ cbk Chavacano
2818
+ ibl Ibaloi
2819
+ kql Kyenele
2820
+ smq Samo
2821
+ uya Doko-Uyanga
2822
+ tkd Tukudede
2823
+ cry Kyoli
2824
+ clk Idu-Mishmi
2825
+ cut Cuicatec, Teutila
2826
+ apm Apache, Mescalero-Chiricahua
2827
+ bya Batak
2828
+ nyi Ama
2829
+ nih Nyiha, Tanzania
2830
+ hbb Nya Huba
2831
+ huc ǂ’Amkhoe
2832
+ cdi Chodri
2833
+ rhp Yahang
2834
+ bcj Bardi
2835
+ pei Chichimeco-Jonaz
2836
+ gdl Dirasha
2837
+ emg Mewahang, Eastern
2838
+ mmz Mabaale
2839
+ afo Ajiri
2840
+ bhs Buwal
2841
+ lht Lo-Toga
2842
+ ktp Kaduo
2843
+ xns Kanashi
2844
+ sjb Sajau Basap
2845
+ pow Popoloca, San Felipe Otlaltepec
2846
+ rad Rade
2847
+ gut Maléku Jaíka
2848
+ vam Dumo
2849
+ kis Kis
2850
+ bet Bété, Guiberoua
2851
+ lva Makuva
2852
+ zoc Zoque, Copainalá
2853
+ goa Guro
2854
+ bkg Buraka
2855
+ yae Pumé
2856
+ won Wongo
2857
+ gpa Gupa-Abawa
2858
+ sde Vori
2859
+ mls Masalit
2860
+ jiu Jinuo, Youle
2861
+ bmd Baga Manduri
2862
+ czt Chin, Zotung
2863
+ tvn Tavoyan
2864
+ zng Mang
2865
+ ijj Ede Ije
2866
+ dms Dampelas
2867
+ mlw Moloko
2868
+ wow Wawonii
2869
+ png Pangu
2870
+ ikw Ikwere
2871
+ dtb Kadazan, Labuk-Kinabatangan
2872
+ bey Beli
2873
+ ntu Natügu
2874
+ sua Sulka
2875
+ kcx Kachama-Ganjule
2876
+ ekl Kol
2877
+ mhp Malay, Balinese
2878
+ slz Ma’ya
2879
+ skt Sakata
2880
+ dez Dengese
2881
+ ogc Ogbah
2882
+ byz Waran
2883
+ yui Wajiara
2884
+ kdx Kam
2885
+ erh Eruwa
2886
+ atp Atta, Pudtol
2887
+ qws Quechua, Sihuas Ancash
2888
+ ale Aleut
2889
+ lcm Tungag
2890
+ pbp Badyara
2891
+ anc Ngas
2892
+ khl Lusi
2893
+ mkc Siliput
2894
+ knm Kanamarí
2895
+ yah Yazgulyam
2896
+ hml Miao, Luopohe
2897
+ mfb Bangka
2898
+ mxl Gbe, Maxi
2899
+ lgt Pahi
2900
+ das Daho-Doo
2901
+ njo Naga, Ao
2902
+ iar Purari
2903
+ nou Ewage-Notu
2904
+ moh Mohawk
2905
+ tvl Tuvaluan
2906
+ yuy Yugur, East
2907
+ kvt Lahta
2908
+ sku Sakao
2909
+ hra Hrangkhol
2910
+ nka Nkoya
2911
+ crx Carrier
2912
+ tif Tifal
2913
+ pia Pima Bajo
2914
+ ppi Paipai
2915
+ nbp Nnam
2916
+ btm Batak Mandailing
2917
+ jya Jiarong
2918
+ mxd Modang
2919
+ psn Panasuan
2920
+ puc Punan Merap
2921
+ tty Sikaritai
2922
+ mzb Tumzabt
2923
+ zmb Zimba
2924
+ kdu Kadaru
2925
+ nnz Nda’nda’
2926
+ nmb V’ënen Taut
2927
+ tcd Tafi
2928
+ weh Weh
2929
+ jni Janji
2930
+ txn Tarangan, West
2931
+ pem Phende
2932
+ xod Kokoda
2933
+ byj Bina
2934
+ bpw Bo
2935
+ bbf Baibai
2936
+ sol Solos
2937
+ mmc Mazahua, Michoacán
2938
+ pta Pai Tavytera
2939
+ khc Tukang Besi North
2940
+ nau Nauruan
2941
+ llu Lau
2942
+ pnz Pana
2943
+ kuy Kuuku-Ya’u
2944
+ wbq Waddar
2945
+ wud Wudu
2946
+ mbi Manobo, Ilianen
2947
+ ikt Inuinnaqtun
2948
+ bhp Bima
2949
+ mdj Mangbetu
2950
+ swj Sira
2951
+ xom Komo
2952
+ rir Ribun
2953
+ sbr Murut, Sembakung
2954
+ tfn Tanaina
2955
+ pwa Pawaia
2956
+ msw Mansoanka
2957
+ zpn Zapotec, Santa Inés Yatzechi
2958
+ rkm Marka
2959
+ aun One, Molmo
2960
+ mxa Mixtec, Northwest Oaxaca
2961
+ abr Abron
2962
+ bxs Busam
2963
+ bly Notre
2964
+ gro Groma
2965
+ mvz Mesqan
2966
+ yum Quechan
2967
+ nxg Ngad’a
2968
+ akw Akwa
2969
+ kmp Gimme
2970
+ kfh Kurichiya
2971
+ ged Gade
2972
+ yuj Karkar-Yuri
2973
+ hwo Hwana
2974
+ pkt Maleng
2975
+ agc Agatu
2976
+ mgi Migili
2977
+ akt Akolet
2978
+ bkw Bekwel
2979
+ dun Dusun Deyah
2980
+ mrh Chin, Mara
2981
+ dgd Dagaari Dioula
2982
+ kci Kamantan
2983
+ yak Yakama
2984
+ kch Vono
2985
+ bxq Beele
2986
+ chx Chantyal
2987
+ gra Garasia, Rajput
2988
+ kih Kilmeri
2989
+ ono Onondaga
2990
+ adn Adang
2991
+ aug Aguna
2992
+ bqt Bamukumbit
2993
+ mum Maiwala
2994
+ atu Reel
2995
+ hop Hopi
2996
+ bhy Bhele
2997
+ zms Mbesa
2998
+ prx Purig
2999
+ bjp Fanamaket
3000
+ odu Odual
3001
+ azd Nahuatl, Eastern Durango
3002
+ bje Biao-Jiao Mien
3003
+ mct Mengisa
3004
+ njm Naga, Angami
3005
+ liu Logorik
3006
+ pwn Paiwan
3007
+ mav Sateré-Mawé
3008
+ gnu Gnau
3009
+ jub Wannu
3010
+ sez Chin, Senthang
3011
+ mgg Mpumpong
3012
+ ost Osatu
3013
+ vkl Kulisusu
3014
+ kbj Kari
3015
+ bag Tuki
3016
+ bjt Balanta-Ganja
3017
+ mkf Miya
3018
+ ngi Ngizim
3019
+ mds Maria
3020
+ gvf Golin
3021
+ thd Kuuk Thayorre
3022
+ rau Raute
3023
+ sse Sama, Balangingih
3024
+ nhz Nahuatl, Santa María la Alta
3025
+ cvn Chinantec, Valle Nacional
3026
+ nba Nyemba
3027
+ hnd Hindko, Southern
3028
+ nbi Naga, Mao
3029
+ bil Bille
3030
+ xmh Kugu-Muminh
3031
+ bip Bila
3032
+ zhi Zhire
3033
+ aal Afade
3034
+ mfg Mogofin
3035
+ wan Wan
3036
+ kkf Monpa, Kalaktang
3037
+ nyq Nayini
3038
+ ors Orang Seletar
3039
+ bbp Banda, West Central
3040
+ yle Yélî Dnye
3041
+ taz Tocho
3042
+ dri C’Lela
3043
+ nbv Ngamambo
3044
+ mqg Malay, Kota Bangun Kutai
3045
+ mdu Mboko
3046
+ aty Aneityum
3047
+ mbq Maisin
3048
+ hav Havu
3049
+ ner Yahadian
3050
+ glw Glavda
3051
+ nyb Nyangbo
3052
+ clt Chin, Lautu
3053
+ jiy Jinuo, Buyuan
3054
+ qxa Quechua, Chiquián
3055
+ win Ho-Chunk
3056
+ chr Cherokee
3057
+ vkn Koro Nulu
3058
+ quv Sakapulteko
3059
+ wrs Waris
3060
+ nit Kolami, Southeastern
3061
+ ver Verre
3062
+ nmk Namakura
3063
+ czh Chinese, Huizhou
3064
+ wrp Waropen
3065
+ mmx Madak
3066
+ yis Yis
3067
+ kce Kaivi
3068
+ ddg Fataluku
3069
+ sle Sholaga
3070
+ ega Ega
3071
+ jnd Jandavra
3072
+ kxx Likuba
3073
+ kna Dera
3074
+ yer Tarok
3075
+ amm Sawiyanu
3076
+ onn Onobasulu
3077
+ tce Tutchone, Southern
3078
+ buh Bunu, Younuo
3079
+ sst Sinasina
3080
+ wsk Waskia
3081
+ dln Darlong
3082
+ teq Temein
3083
+ org Oring
3084
+ cfg Karimjo
3085
+ nce Yale
3086
+ lgq Logba
3087
+ yif Ache
3088
+ kfo Koro
3089
+ jog Jogi
3090
+ nkx Nkoroo
3091
+ asr Asuri
3092
+ ktc Kholok
3093
+ gbz Dari, Zoroastrian
3094
+ kvy Yintale
3095
+ kvv Kola
3096
+ oia Oirata
3097
+ rdb Rudbari
3098
+ ymb Yambes
3099
+ sad Sandawe
3100
+ ntk Ikoma-Nata-Isenye
3101
+ dru Rukai
3102
+ bjh Bahinemo
3103
+ ywa Kalou
3104
+ nmc Ngam
3105
+ nat Cahungwarya
3106
+ ato Atong
3107
+ liw Col
3108
+ qux Quechua, Yauyos
3109
+ shw Shwai
3110
+ cfd Cara
3111
+ pip Pero
3112
+ zts Zapotec, Tilquiapan
3113
+ mcs Mambai
3114
+ sgi Nizaa
3115
+ mhl Mauwake
3116
+ ndu Dugun
3117
+ bqa Tchumbuli
3118
+ bqo Balo
3119
+ buz Bukwen
3120
+ fak Fang
3121
+ tii Tiene
3122
+ gvp Gavião, Pará
3123
+ kmh Kalam
3124
+ xkc Kho’ini
3125
+ max Malay, North Moluccan
3126
+ phl Palula
3127
+ gbg Gbanziri
3128
+ zag Zaghawa
3129
+ trf Trinidadian English Creole
3130
+ weo Wemale
3131
+ geg Gengle
3132
+ kxb Krobu
3133
+ pru Puragi
3134
+ kie Kibet
3135
+ mpn Mindiri
3136
+ mhz Mor
3137
+ gbh Gbe, Defi
3138
+ gbr Gbagyi
3139
+ tmy Tami
3140
+ rey Reyesano
3141
+ kpj Karajá
3142
+ nap Napoletano-Calabrese
3143
+ lgu Longgu
3144
+ bye Pouye
3145
+ tml Citak, Tamnim
3146
+ kpw Kobon
3147
+ kfa Kodava
3148
+ iyx Yaka
3149
+ twy Tawoyan
3150
+ sed Sedang
3151
+ bdm Buduma
3152
+ plg Pilagá
3153
+ buo Terei
3154
+ aww Awun
3155
+ yyu Yau
3156
+ cld Chaldean Neo-Aramaic
3157
+ xmg Mengaka
3158
+ pku Paku
3159
+ xkg Kagoro
3160
+ caq Nicobarese, Car
3161
+ kmq Gwama
3162
+ lel Lele
3163
+ gqa Ga’anda
3164
+ tfi Gbe, Tofin
3165
+ mml Man Met
3166
+ nxa Nauete
3167
+ tdk Rom
3168
+ kbv Dla
3169
+ bgv Warkay-Bipim
3170
+ bbw Supapya
3171
+ kvm Kendem
3172
+ aku Akum
3173
+ ert Eritai
3174
+ jdg Jadgali
3175
+ gow Gorowa
3176
+ doo Dongo
3177
+ jeb Jebero
3178
+ stf Seta
3179
+ nid Ngandi
3180
+ mqx Mamuju
3181
+ mta Manobo, Cotabato
3182
+ she Sheko
3183
+ mfm Marghi South
3184
+ jei Yei
3185
+ deg Degema
3186
+ gcf Guadeloupean French Creole
3187
+ bxb Belanda Bor
3188
+ mut Muria, Western
3189
+ diw Dinka, Northwestern
3190
+ nqy Naga, Akyaung Ari
3191
+ sop Songe
3192
+ kny Kanyok
3193
+ lse Lusengo
3194
+ ahg Qimant
3195
+ opa Okpamheri
3196
+ hah Hahon
3197
+ daq Maria, Dandami
3198
+ hac Gurani
3199
+ klg Tagakaulo
3200
+ kqi Koita
3201
+ slx Salampasu
3202
+ ots Otomí, Estado de México
3203
+ tru Turoyo
3204
+ sxw Gbe, Saxwe
3205
+ dij Dai
3206
+ aog Angoram
3207
+ kcr Katla
3208
+ agf Arguni
3209
+ alq Algonquin
3210
+ raf Mewahang, Western
3211
+ mij Mungbam
3212
+ gdu Gudu
3213
+ wgi Wahgi
3214
+ bbu Kulung
3215
+ ndo Ndonga
3216
+ mma Mama
3217
+ tal Tal
3218
+ odk Oadki
3219
+ etr Edolo
3220
+ umu Munsee
3221
+ kjs Kewapi, East
3222
+ bvm Bamunka
3223
+ jqr Jaqaru
3224
+ kfm Khunsari
3225
+ tbp Diebroud
3226
+ ems Yupik, Pacific Gulf
3227
+ kcq Kamo
3228
+ ruy Mala
3229
+ nng Naga, Maring
3230
+ jns Jaunsari
3231
+ sbk Safwa
3232
+ wji Warji
3233
+ sbz Sara Kaba
3234
+ bhl Bimin
3235
+ auy Awiyaana
3236
+ txt Citak
3237
+ nof Nomane
3238
+ cll Chala
3239
+ pak Parakanã
3240
+ tli Tlingit
3241
+ kqo Krahn, Eastern
3242
+ kbz Duhwa
3243
+ mbx Mari
3244
+ xrw Karawa
3245
+ crj Cree, Southern East
3246
+ jaq Yaqay
3247
+ pbn Kpasham
3248
+ dbi Doka
3249
+ kod Kodi
3250
+ bjk Barok
3251
+ syb Subanen, Central
3252
+ nyh Nyikina
3253
+ kfd Koraga, Korra
3254
+ mtk Mbo’
3255
+ mbd Manobo, Dibabawon
3256
+ jgk Gwak
3257
+ mmp Siawi
3258
+ uba Ubang
3259
+ kxh Karo
3260
+ tov Taromi, Upper
3261
+ buk Bugawac
3262
+ abn Abua
3263
+ kbh Camsá
3264
+ slc Sáliba
3265
+ knt Katukína, Panoan
3266
+ rwa Rawo
3267
+ kyk Kamayo
3268
+ kli Kalumpang
3269
+ klq Rumu
3270
+ iqu Iquitu
3271
+ jku Labir
3272
+ bga Gwamhi-Wuri
3273
+ amo Amo
3274
+ gou Gavar
3275
+ kdz Kwaja
3276
+ nzm Naga, Zeme
3277
+ mgk Mawes
3278
+ sjr Siar-Lak
3279
+ aqg Arigidi
3280
+ ghl Ghulfan
3281
+ oso Ososo
3282
+ rei Reli
3283
+ tiw Tiwi
3284
+ kdq Koch
3285
+ zbu Bu
3286
+ wem Gbe, Weme
3287
+ gig Goaria
3288
+ tsw Tsishingini
3289
+ gmz Mgbolizhia
3290
+ mfo Mbe
3291
+ anw Anaang
3292
+ mtu Mixtec, Tututepec
3293
+ ahb Axamb
3294
+ bub Bua
3295
+ jru Japreria
3296
+ ryu Okinawan, Central
3297
+ nuo Nguôn
3298
+ kdm Gyong
3299
+ due Agta, Umiray Dumaget
3300
+ boo Bozo, Tiemacèwè
3301
+ vmm Mixtec, Mitlatongo
3302
+ ydg Yadgha
3303
+ adz Adzera
3304
+ yaf Yaka
3305
+ mep Miriwoong
3306
+ kip Kham, Sheshi
3307
+ bvw Boga
3308
+ mqh Mixtec, Tlazoyaltepec
3309
+ bmj Bote
3310
+ dih Kumiai
3311
+ cib Gbe, Ci
3312
+ ggg Gurgula
3313
+ ldq Lufu
3314
+ scv Sheni
3315
+ siy Sivandi
3316
+ ktf Kwami
3317
+ gew Gera
3318
+ lan Laru
3319
+ kks Giiwo
3320
+ fun Iatê
3321
+ dtm Dogon, Tomo Kan
3322
+ thp Thompson
3323
+ gye Gyem
3324
+ zaf Zapotec, Ayoquesco
3325
+ kcs Koenoem
3326
+ yap Yapese
3327
+ bnv Beneraf
3328
+ src Sardinian, Logudorese
3329
+ brq Breri
3330
+ frc French, Cajun
3331
+ elk Elkei
3332
+ aad Amal
3333
+ kqj Koromira
3334
+ ael Ambele
3335
+ mku Maninka, Konyanka
3336
+ otm Otomi, Eastern Highland
3337
+ ldp Tso
3338
+ dbd Dadiya
3339
+ ttm Tutchone, Northern
3340
+ nen Nengone
3341
+ bit Berinomo
3342
+ wca Yanomámi
3343
+ jig Jingulu
3344
+ wss Wasa
3345
+ huh Huilliche
3346
+ xti Mixtec, Sinicahua
3347
+ nhv Nahuatl, Temascaltepec
3348
+ smy Semnani
3349
+ tak Tala
3350
+ hch Huichol
3351
+ kqa Mum
3352
+ spm Akukem
3353
+ kfz Koromfé
3354
+ ank Goemai
3355
+ ruz Ruma
3356
+ koh Koyo
3357
+ pdo Padoe
3358
+ kvd Kui
3359
+ fut Futuna-Aniwa
3360
+ wom Wom
3361
+ sor Soumraye
3362
+ gdx Godwari
3363
+ ttb Gaa
3364
+ iti Itneg, Inlaud
3365
+ tsp Toussian, Northern
3366
+ jkr Koro
3367
+ sct Katang, Southern
3368
+ laa Subanen, Southern
3369
+ auq Anus
3370
+ agy Alta, Southern
3371
+ tuq Tedaga
3372
+ acv Achumawi
3373
+ mbv Mbulungish
3374
+ orh Oroqen
3375
+ def Dezfuli
3376
+ gop Yeretuar
3377
+ nyg Nyindu
3378
+ liz Libinza
3379
+ tay Atayal
3380
+ dil Dilling
3381
+ mtf Murik
3382
+ jup Hupdë
3383
+ uuu U
3384
+ ncf Notsi
3385
+ hum Hungana
3386
+ vum Vumbu
3387
+ mfj Mefele
3388
+ afi Chini
3389
+ meh Mixtec, Southwestern Tlaxiaco
3390
+ tma Tama
3391
+ mkg Mak
3392
+ aik Akye
3393
+ ung Ngarinyin
3394
+ itt Itneg, Maeng
3395
+ akl Aklanon
3396
+ sti Stieng, Bulo
3397
+ gid Gidar
3398
+ ckl Kibaku
3399
+ spu Sapuan
3400
+ enn Engenni
3401
+ ebr Tchaman
3402
+ mcw Mawa
3403
+ ybe Yugur, West
3404
+ kni Kanufi
3405
+ kjr Kurudu
3406
+ bwm Biwat
3407
+ vra Vera’a
3408
+ duq Dusun Malang
3409
+ bpu Bongu
3410
+ mrz Marind
3411
+ sdh Kurdish, Southern
3412
+ cdn Chaudangsi
3413
+ vmp Mazatec, Soyaltepec
3414
+ zsm Malay, Standard
3415
+ szg Sengele
3416
+ yun Bena
3417
+ kcd Kanum, Ngkâlmpw
3418
+ ala Alago
3419
+ ywn Yawanawa
3420
+ nfl Äiwoo
3421
+ pbl Mak
3422
+ pyu Puyuma
3423
+ zrg Mirgan
3424
+ aif Agi
3425
+ kmj Kumarbhag Paharia
3426
+ njj Njen
3427
+ ahs Ashe
3428
+ kwu Kwakum
3429
+ mxh Mvuba
3430
+ chp Dene
3431
+ iko Olulumo-Ikom
3432
+ krh Kurama
3433
+ bux Boghom
3434
+ udl Wuzlam
3435
+ one Oneida
3436
+ akq Ak
3437
+ fla Kalispel-Pend d’Oreille
3438
+ zpr Zapotec, Santiago Xanica
3439
+ tvt Naga, Tutsa
3440
+ awe Awetí
3441
+ bqx Baangi
3442
+ yns Iyansi
3443
+ dya Dyan
3444
+ hkk Hunjara-Kaina Ke
3445
+ clc Chilcotin
3446
+ kpa Kutto
3447
+ ldk Leelau
3448
+ dak Dakota
3449
+ vls West Flemish
3450
+ xnz Mattokki
3451
+ ccj Kasanga
3452
+ kzc Kulango, Bondoukou
3453
+ dkx Mazagway-Hidi
3454
+ leq Lembena
3455
+ saz Saurashtra
3456
+ mqz Pano
3457
+ akr Araki
3458
+ fap Paloor
3459
+ mef Megam
3460
+ rat Razajerdi
3461
+ kmk Kalinga, Limos
3462
+ ike Inuktitut, Eastern Canadian
3463
+ see Seneca
3464
+ nlo Ngul
3465
+ klk Kono
3466
+ rcf Réunion French Creole
3467
+ bof Bolon
3468
+ rwk Rwa
3469
+ smt Simte
3470
+ jma Dima
3471
+ mmn Minamanwa
3472
+ mhk Mungaka
3473
+ whg Yuwei
3474
+ zro Záparo
3475
+ sob Sobei
3476
+ mtp Weenhayek
3477
+ zuy Zumaya
3478
+ ocu Matlatzinca, Atzingo
3479
+ xtt Mixtec, Tacahua
3480
+ mek Mekeo
3481
+ ctt Chetti, Wayanad
3482
+ bni Bangi
3483
+ ogb Ogbia
3484
+ orx Oro
3485
+ kot Lagwan
3486
+ itr Yawuno Teneyo
3487
+ kic Kickapoo
3488
+ skd Miwok, Southern Sierra
3489
+ nhg Nahuatl, Tetelcingo
3490
+ bvi Belanda Viri
3491
+ tny Tongwe
3492
+ rui Rufiji
3493
+ dor Dori’o
3494
+ lmk Lamkang
3495
+ ncb Nicobarese, Central
3496
+ msg Moraid
3497
+ snq Sangu
3498
+ eme Tekó
3499
+ amc Amahuaca
3500
+ msn Vurës
3501
+ hdn Haida, Northern
3502
+ com Comanche
3503
+ sgd Surigaonon
3504
+ cdf Chiru
3505
+ ttj Tooro
3506
+ skv Skou
3507
+ twp Ere
3508
+ gek Ywom
3509
+ cob Chicomuceltec
3510
+ fll Fali, North
3511
+ mne Naba
3512
+ coc Cocopa
3513
+ mph Maung
3514
+ gaf Gende
3515
+ agh Ngelima
3516
+ epi Epie
3517
+ aaw Solong
3518
+ sok Sokoro
3519
+ piu Pintupi-Luritja
3520
+ dyg Agta, Villa Viciosa
3521
+ mla Malo
3522
+ dof Domu
3523
+ klx Koluwawa
3524
+ gab Gabri
3525
+ scn Sicilian
3526
+ mat Matlatzinca, San Francisco
3527
+ bja Budza
3528
+ kcj Kobiana
3529
+ kwb Kwa
3530
+ tsu Tsou
3531
+ kev Kanikkaran
3532
+ ksj Uare
3533
+ zrs Mairasi
3534
+ bcv Shoo-Minda-Nye
3535
+ sug Suganga
3536
+ pcl Pardhi
3537
+ yim Naga, Yimchungru
3538
+ kqk Gbe, Kotafon
3539
+ bzx Bozo, Kelengaxo
3540
+ esh Eshtehardi
3541
+ fay Fars, Southwestern
3542
+ dee Dewoin
3543
+ eze Uzekwe
3544
+ bwt Bafaw-Balong
3545
+ nph Naga, Phom
3546
+ pmm Pol
3547
+ pdc German, Pennsylvania
3548
+ srz Shahmirzadi
3549
+ tug Tunia
3550
+ hux Witoto, Nipode
3551
+ soo Songo
3552
+ bcz Bainouk-Gunyaamolo
3553
+ bva Barein
3554
+ sky Sikaiana
3555
+ blc Bella Coola
3556
+ skq Sininkere
3557
+ yix Axi
3558
+ arx Aruá
3559
+ msl Molof
3560
+ aqt Angaité
3561
+ gcr Guianese French Creole
3562
+ mtb Anyin Morofo
3563
+ lrl Lari
3564
+ tiy Teduray
3565
+ iwm Iwam
3566
+ bhg Binandere
3567
+ pbv Pnar
3568
+ gmm Gbaya-Mbodomo
3569
+ apy Apalaí
3570
+ iow Iowa-Oto
3571
+ cku Koasati
3572
+ sry Sera
3573
+ zcd Zapotec, Las Delicias
3574
+ toj Tojolabal
3575
+ idi Idi
3576
+ kqw Kandas
3577
+ irr Ir
3578
+ bif Biafada
3579
+ akf Akpa
3580
+ arw Arawak
3581
+ lor Téén
3582
+ was Washo
3583
+ nrg Narango
3584
+ knz Kalamsé
3585
+ anf Animere
3586
+ goz Gozarkhani
3587
+ vmh Maraghei
3588
+ arp Arapaho
3589
+ glr Glaro-Twabo
3590
+ big Biangai
3591
+ tou Tho
3592
+ lie Likila
3593
+ hol Holu
3594
+ dbn Duriankere
3595
+ asu Asurini, Tocantins
3596
+ xvi Kamviri
3597
+ aaf Aranadan
3598
+ mii Mixtec, Chigmecatitlán
3599
+ xkj Kajali
3600
+ bez Bena
3601
+ trv Seediq
3602
+ bqs Bosmun
3603
+ yax Yauma
3604
+ ykg Yukaghir, Northern
3605
+ hgm Hai|ǁom
3606
+ sgr Sangisari
3607
+ vaf Vafsi
3608
+ anl Chin, Anu-Khongso
3609
+ mdh Maguindanaon
3610
+ bbv Karnai
3611
+ wbb Wabo
3612
+ shc Sonde
3613
+ nsa Naga, Sangtam
3614
+ rtm Rotuman
3615
+ kvg Kuni-Boazi
3616
+ cgg Chiga
3617
+ mdn Mbati
3618
+ job Joba
3619
+ bxl Jalkunan
3620
+ jrt Jakattoe
3621
+ ilp Iranun
3622
+ njh Naga, Lotha
3623
+ sek Sekani
3624
+ avi Avikam
3625
+ nmh Naga, Monsang
3626
+ cos Corsican
3627
+ ctz Chatino, Zacatepec
3628
+ wbj Alagwa
3629
+ sbg Seget
3630
+ tyy Tiyaa
3631
+ bea Beaver
3632
+ chd Chontal, Highland Oaxaca
3633
+ ado Abu
3634
+ mnv Rennell-Bellona
3635
+ dbb Deno
3636
+ mti Maiwa
3637
+ ekp Ekpeye
3638
+ plr Sénoufo, Palaka
3639
+ nev Nyaheun
3640
+ cra Chara
3641
+ tla Tepehuan, Southwestern
3642
+ xmf Mingrelian
3643
+ nyw Nyaw
3644
+ dis Dimasa
3645
+ zpy Zapotec, Mazaltepec
3646
+ dgx Doghoro
3647
+ ifm Teke-Wuumu
3648
+ ngz Ngungwel
3649
+ yra Yerakai
3650
+ sau Saleman
3651
+ psw Port Sandwich
3652
+ kbm Iwal
3653
+ mye Myene
3654
+ tiq Tiéfo
3655
+ kkh Khün
3656
+ kjt Karen, Phrae Pwo
3657
+ gox Gobu
3658
+ kzm Kais
3659
+ pac Pacoh
3660
+ gua Shiki
3661
+ too Totonac, Xicotepec de Juárez
3662
+ nre Naga, Southern Rengma
3663
+ pqm Malecite-Passamaquoddy
3664
+ gul Sea Island English Creole
3665
+ cte Chinantec, Tepinapa
3666
+ buf Bushoong
3667
+ bws Bomboma
3668
+ tlq Tai Loi
3669
+ asi Buruwai
3670
+ bpv Marind, Bian
3671
+ atk Ati
3672
+ gar Galeya
3673
+ plv Palawano, Southwest
3674
+ sev Sénoufo, Nyarafolo
3675
+ vem Vemgo-Mabas
3676
+ hla Halia
3677
+ mna Mbula
3678
+ pcb Pear
3679
+ lih Lihir
3680
+ ksv Kusu
3681
+ iby Ibani
3682
+ yrb Yareba
3683
+ nge Ngemba
3684
+ ney Neyo
3685
+ keb Kélé
3686
+ nuq Nukumanu
3687
+ okh Koresh-e Rostam
3688
+ ity Itneg, Moyadan
3689
+ van Walman
3690
+ ijs Ijo, Southeast
3691
+ shs Shuswap
3692
+ mkb Mal Paharia
3693
+ kit Agob
3694
+ nyj Nyanga
3695
+ tti Tobati
3696
+ agb Legbo
3697
+ twr Tarahumara, Southwestern
3698
+ cae Laalaa
3699
+ biu Biate
3700
+ grs Gresi
3701
+ brp Barapasi
3702
+ tdv Atoro
3703
+ crv Chaura
3704
+ njs Nisa
3705
+ oke Okpe
3706
+ tdl Kusur-Myet
3707
+ mlx Na’ahai
3708
+ zte Zapotec, Elotepec
3709
+ ivb Ibatan
3710
+ chy Cheyenne
3711
+ mbf Malay, Baba
3712
+ nal Nalik
3713
+ lwl Lawa, Eastern
3714
+ buw Bubi
3715
+ qus Quichua, Santiago del Estero
3716
+ lik Lika
3717
+ lna Langbashe
3718
+ dem Dem
3719
+ ldo Loo
3720
+ pbg Paraujano
3721
+ mic Mi’kmaq
3722
+ wdj Wadjiginy
3723
+ tol Tolowa
3724
+ sns Nahavaq
3725
+ luz Luri, Southern
3726
+ tgy Togoyo
3727
+ sha Shall-Zwall
3728
+ mtl Tehl
3729
+ scw Sya
3730
+ hna Mina
3731
+ moe Innu
3732
+ mae Bo-Rukul
3733
+ avd Alviri-Vidari
3734
+ bsy Bisaya, Sabah
3735
+ kfe Kota
3736
+ dsn Dusner
3737
+ kst Winyé
3738
+ bid Bidiyo
3739
+ erg Sie
3740
+ tls Tambotalo
3741
+ nkw Nkutu
3742
+ zia Zia
3743
+ bdw Baham
3744
+ une Uneme
3745
+ ykk Yakaikeke
3746
+ plu Palikúr
3747
+ pfe Pere
3748
+ blq Paluai
3749
+ sao Sause
3750
+ tsa Tsaangi
3751
+ uni Uni
3752
+ irn Irántxe
3753
+ pos Popoluca, Sayula
3754
+ mot Barí
3755
+ lki Laki
3756
+ gbn Mo’da
3757
+ chk Chuukese
3758
+ kmz Khorasani Turkish
3759
+ orz Ormu
3760
+ bfe Betaf
3761
+ nlj Nyali
3762
+ bnn Bunun
3763
+ aba Abé
3764
+ abu Abure
3765
+ iai Iaai
3766
+ knn Konkani
3767
+ biy Birhor
3768
+ yog Yogad
3769
+ gnb Gangte
3770
+ bou Bondei
3771
+ zmq Mituku
3772
+ tto Ta’oih, Lower
3773
+ abm Abanyom
3774
+ dhv Drehu
3775
+ brl Birwa
3776
+ shh Shoshoni
3777
+ zbc Berawan, Central
3778
+ oyd Oyda
3779
+ pek Penchal
3780
+ tbj Tiang
3781
+ ema Emai-Iuleha-Ora
3782
+ bgi Bagobo-Klata
3783
+ tkq Tee
3784
+ nmm Nyeshangte
3785
+ kkk Kokota
3786
+ djn Djauan
3787
+ tow Jemez
3788
+ nwm Nyamusa-Molo
3789
+ tef Teressa
3790
+ daw Davawenyo
3791
+ kpx Koiali, Mountain
3792
+ mtq Muong
3793
+ mwe Mwera
3794
+ stv Silt’e
3795
+ lum Luimbi
3796
+ phq Phana’
3797
+ tdc Embera Tadó
3798
+ pcj Parenga
3799
+ vnk Lovono
3800
+ kdd Yankunytjatjara
3801
+ aul Aulua
3802
+ mnp Chinese, Min Bei
3803
+ tdo Teme
3804
+ mwg Aiklep
3805
+ dma Duma
3806
+ coz Chocholtec
3807
+ owi Owiniga
3808
+ rji Raji
3809
+ aey Amele
3810
+ dge Degenang
3811
+ nil Nila
3812
+ ler Lenkau
3813
+ agt Agta, Central Cagayan
3814
+ kof Kubi
3815
+ okx Okpe
3816
+ ogg Ogbogolo
3817
+ xes Koromu
3818
+ hur Halkomelem
3819
+ bgx Balkan Gagauz Turkish
3820
+ anx Andra-Hus
3821
+ rwo Rawa
3822
+ caz Canichana
3823
+ kuh Kushi
3824
+ bks Sorsoganon, Northern
3825
+ ztx Zapotec, Zaachila
3826
+ axk Yaka
3827
+ umm Umon
3828
+ mmy Migaama
3829
+ aee Pashai, Northeast
3830
+ lil Lillooet
3831
+ lvk Lavukaleve
3832
+ ibd Iwaidja
3833
+ azt Atta, Faire
3834
+ usa Usarufa
3835
+ saa Saba
3836
+ bar Bavarian
3837
+ mzn Mazandarani
3838
+ unx Munda
3839
+ puo Puoc
3840
+ lek Leipon
3841
+ pkg Pak-Tong
3842
+ niu Niue
3843
+ oni Onin
3844
+ jaf Jara
3845
+ dwa Diri
3846
+ lmg Lamogai
3847
+ tau Tanana, Upper
3848
+ zoh Zoque, Chimalapa
3849
+ cbg Chimila
3850
+ gla Scottish Gaelic
3851
+ yur Yurok
3852
+ peb Pomo, Eastern
3853
+ kbb Kaxuiâna
3854
+ ivv Ivatan
3855
+ oka Okanagan
3856
+ ral Ralte
3857
+ nun Anong
3858
+ soz Temi
3859
+ ndh Ndali
3860
+ kdy Keijar
3861
+ bjo Banda, Mid-Southern
3862
+ env Enwan
3863
+ nds Saxon, Low
3864
+ kyy Asa’a
3865
+ moy Shekkacho
3866
+ mnm Mapena
3867
+ sbh Sori-Harengan
3868
+ bek Bebeli
3869
+ pdn Fedan
3870
+ mxm Meramera
3871
+ moj Monzombo
3872
+ tul Tula
3873
+ oks Oko-Eni-Osayen
3874
+ bjc Bariji
3875
+ hvv Huave, Santa María del Mar
3876
+ dme Dugwor
3877
+ plk Shina, Kohistani
3878
+ lal Lalia
3879
+ sir Siri
3880
+ yhd Arabic, Judeo-Iraqi
3881
+ zmp Mpuono
3882
+ ofu Efutop
3883
+ iki Iko
3884
+ sjg Assangori
3885
+ gae Guarequena
3886
+ hei Heiltsuk
3887
+ dmr Damar, East
3888
+ lti Leti
3889
+ ipo Ipiko
3890
+ dva Duau
3891
+ yaw Yawalapití
3892
+ dgg Doga
3893
+ mdw Mbosi
3894
+ mzv Mandja
3895
+ tkp Tikopia
3896
+ snv Sa’ban
3897
+ bte Gamo-Ningi
3898
+ nqt Nteng
3899
+ etx Iten
3900
+ gwa Mbato
3901
+ aji Ajië
3902
+ gni Gooniyandi
3903
+ blm Beli
3904
+ hid Hidatsa
3905
+ tof Gizrra
3906
+ kos Kosraean
3907
+ cja Cham, Western
3908
+ yki Yoke
3909
+ haa Han
3910
+ gad Gaddang
3911
+ mbs Manobo, Sarangani
3912
+ kkd Kinuku
3913
+ kol Kol
3914
+ noz Nayi
3915
+ kms Kamasau
3916
+ kfu Katkari
3917
+ mps Dadibi
3918
+ sbc Kele
3919
+ pon Pohnpeian
3920
+ sos Seeku
3921
+ kwt Kwesten
3922
+ diz Ding
3923
+ buj Basa-Gurmana
3924
+ onj Onjob
3925
+ cbo Izora
3926
+ pic Pinji
3927
+ zpw Zapotec, Zaniza
3928
+ sro Sardinian, Campidanese
3929
+ psi Pashai, Southeast
3930
+ kyh Karok
3931
+ msk Mansaka
3932
+ kxn Melanau, Kanowit-Tanjong
3933
+ end Ende
3934
+ mgf Maklew
3935
+ wgb Wagawaga
3936
+ mhc Mocho
3937
+ niw Nimo
3938
+ gyd Kayardild
3939
+ crc Lonwolwol
3940
+ bwf Boselewa
3941
+ huf Humene
3942
+ lad Ladino
3943
+ dei Demisa
3944
+ kgb Kawe
3945
+ prc Parachi
3946
+ ttw Long Wat
3947
+ ilu Ili’uun
3948
+ mnu Mer
3949
+ mbo Mbo
3950
+ glo Galambu
3951
+ sys Sinyar
3952
+ sgy Sanglechi
3953
+ poo Pomo, Central
3954
+ tsi Tsimshian
3955
+ svb Ulau-Suain
3956
+ wsa Warembori
3957
+ kkz Kaska
3958
+ gsw German, Swiss
3959
+ skb Saek
3960
+ ano Andoque
3961
+ zun Zuni
3962
+ tnm Tabla
3963
+ sbb Simbo
3964
+ wkd Mo
3965
+ sby Soli
3966
+ xok Xokleng
3967
+ chj Chinantec, Ojitlán
3968
+ jge Judeo-Georgian
3969
+ ugo Ugong
3970
+ lmi Lombi
3971
+ nkh Naga, Khezha
3972
+ huz Hunzib
3973
+ tft Ternate
3974
+ mrp Morouas
3975
+ mrf Elseng
3976
+ yot Yotti
3977
+ gbv Gbanu
3978
+ ayt Ayta, Magbukun
3979
+ hgw Haigwai
3980
+ swr Saweru
3981
+ lcc Legenyem
3982
+ zpe Zapotec, Petapa
3983
+ zpd Zapotec, Southeastern Ixtlán
3984
+ kep Kaikadi
3985
+ vmj Mixtec, Ixtayutla
3986
+ clu Caluyanun
3987
+ cma Maa
3988
+ qun Quinault
3989
+ kcf Ukaan
3990
+ fry Frisian
3991
+ har Harari
3992
+ bta Bata
3993
+ wro Worrorra
3994
+ mwp Kala Lagaw Ya
3995
+ sny Saniyo-Hiyewe
3996
+ nzk Nzakara
3997
+ knw Kung-Ekoka
3998
+ wbk Waigali
3999
+ smn Saami, Inari
4000
+ shq Sala
4001
+ zat Zapotec, Tabaa
4002
+ ngj Ngie
4003
+ psh Pashai, Southwest
4004
+ amt Amto
4005
+ xgu Unggumi
4006
+ qui Quileute
4007
+ gww Kwini
4008
+ agi Agariya
4009
+ caf Carrier, Southern
4010
+ pay Pech
4011
+ cbd Carijona
4012
+ mwa Mwatebu
4013
+ gcn Gaina
4014
+ suq Suri, Tirmaga-Chai
4015
+ djc Daju, Dar Daju
4016
+ aaa Ghotuo
4017
+ etn Eton
data/tts/all_langs.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ tuk-script_latin Turkmen
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ librosa
2
+ git+https://github.com/huggingface/transformers.git
3
+ torch
4
+ Cython==0.29.21
5
+ phonemizer==2.2.1
6
+ scipy
7
+ numpy
8
+ torchvision
9
+ matplotlib
10
+ Unidecode==1.1.1
11
+ monotonic-align
tts.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import re
8
+ import tempfile
9
+ import torch
10
+ import sys
11
+ import gradio as gr
12
+
13
+ from huggingface_hub import hf_hub_download
14
+
15
+ # Setup TTS env
16
+ if "vits" not in sys.path:
17
+ sys.path.append("vits")
18
+
19
+ from vits import commons, utils
20
+ from vits.models import SynthesizerTrn
21
+
22
+
23
class TextMapper(object):
    """Convert text into the symbol-ID sequences consumed by the TTS model.

    The vocabulary file lists one symbol per line; a symbol's ID is its
    zero-based line index.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary from *vocab_file* (UTF-8, one symbol per line).

        Raises:
            ValueError: if the vocabulary contains no space symbol
                (raised by ``list.index``).
        """
        # Context manager so the handle is closed instead of leaked.
        with open(vocab_file, encoding="utf-8") as f:
            self.symbols = [line.replace("\n", "") for line in f]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        """Convert a string of text to the list of IDs of its symbols.

        Args:
            text: string to convert; leading/trailing whitespace is stripped.
            cleaner_names: accepted for interface compatibility; not used here.

        Returns:
            List of integers corresponding to the symbols in the text.

        Raises:
            KeyError: if the text contains a symbol that is not in the
                vocabulary (run ``filter_oov`` first to avoid this).
        """
        return [self._symbol_to_id[symbol] for symbol in text.strip()]

    def uromanize(self, text, uroman_pl):
        """Romanize *text* with the uroman Perl script at *uroman_pl*.

        Returns:
            The first line of uroman's output with runs of whitespace
            collapsed to single spaces ("" if uroman printed nothing).

        Raises:
            subprocess.CalledProcessError: if perl exits with a non-zero status.
        """
        import subprocess

        iso = "xxx"
        # Argument list (no shell) and explicit UTF-8 pipes: no shell quoting
        # issues with the script path, no temp files to re-open by name, and
        # no dependence on the locale's default encoding.
        result = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            check=True,
        )
        first_line = result.stdout.split("\n")[0]
        return re.sub(r"\s+", " ", first_line).strip()

    def get_text(self, text, hps):
        """Return *text* as a LongTensor of symbol IDs.

        When ``hps.data.add_blank`` is set, ID 0 is interspersed through the
        sequence via ``commons.intersperse``.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text, lang=None):
        """Drop every character of *text* that is not in the vocabulary.

        Language-specific character substitutions (see ``preprocess_char``)
        are applied before filtering.
        """
        text = self.preprocess_char(text, lang=lang)
        val_chars = self._symbol_to_id
        return "".join(ch for ch in text if ch in val_chars)

    def preprocess_char(self, text, lang=None):
        """
        Special treatment of characters in certain languages.

        *lang* may be a bare language code ("ron") or a label of the form
        "<code>: <name>"; only the part before the first ":" is compared.
        """
        lang_code = lang.split(":")[0].strip() if lang else lang
        if lang_code == "ron":
            text = text.replace("ț", "ţ")
            print(f"{lang} (ț -> ţ): {text}")
        return text
85
+
86
+
87
def synthesize(text, lang, speed):
    """Synthesize speech for *text* with the MMS TTS model of language *lang*.

    Args:
        text: input text.
        lang: language label; the part before the first ":" is used as the
            model's language code (e.g. "tuk-script_latin: Turkmen").
        speed: speaking-rate multiplier; ``None`` means 1.0.

    Returns:
        A tuple ``(audio_update, text)`` where ``audio_update`` is a
        ``gr.Audio`` update carrying ``(sampling_rate, waveform)`` and
        ``text`` is the lower-cased, vocabulary-filtered text that was
        actually synthesized.

    Raises:
        ValueError: if *speed* is not positive.
        FileNotFoundError: if the downloaded config file is missing.
    """
    if speed is None:
        speed = 1.0
    if speed <= 0:
        # length_scale is 1 / speed, so zero or negative values are meaningless.
        raise ValueError(f"speed must be positive, got {speed}")

    lang_code = lang.split(":")[0].strip()

    vocab_file, config_file, g_pth = (
        hf_hub_download(
            repo_id="facebook/mms-tts",
            filename=filename,
            subfolder=f"models/{lang_code}",
        )
        for filename in ("vocab.txt", "config.json", "G_100000.pth")
    )

    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif (
        hasattr(torch.backends, "mps")
        and torch.backends.mps.is_available()
        and torch.backends.mps.is_built()
    ):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    print(f"Run inference with {device}")

    # Explicit check rather than assert, which is stripped under ``python -O``.
    if not os.path.isfile(config_file):
        raise FileNotFoundError(f"{config_file} doesn't exist")
    hps = utils.get_hparams_from_file(config_file)
    text_mapper = TextMapper(vocab_file)
    net_g = SynthesizerTrn(
        len(text_mapper.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model,
    )
    net_g.to(device)
    net_g.eval()

    utils.load_checkpoint(g_pth, net_g, None)

    # The training-file extension marks models trained on romanized text.
    is_uroman = hps.data.training_files.split(".")[-1] == "uroman"

    if is_uroman:
        uroman_dir = "uroman"
        if not os.path.exists(uroman_dir):
            raise FileNotFoundError(f"{uroman_dir} doesn't exist")
        uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
        text = text_mapper.uromanize(text, uroman_pl)

    text = text.lower()
    # Pass the bare language code, not the full "<code>: <name>" label, so
    # language-specific character handling can match on the code.
    text = text_mapper.filter_oov(text, lang=lang_code)
    stn_tst = text_mapper.get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        hyp = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                noise_scale=0.667,
                noise_scale_w=0.8,
                length_scale=1.0 / speed,
            )[0][0, 0]
            .cpu()
            .float()
            .numpy()
        )

    return gr.Audio.update(value=(hps.data.sampling_rate, hyp)), text
165
+
166
+
167
# Example inputs: each row is [text, language label].
_TURKMEN_LABEL = "tuk-script_latin: Turkmen"

TTS_EXAMPLES = [
    [sentence, _TURKMEN_LABEL]
    for sentence in (
        "Salam. Men indi ýuwaş ýuwaşdan size düşünip başladym",
        "Türkmençe bir bilýäňmow sen?",
        "Iň gowy adamlar, yzyny özüň bilýäň.",
        "Siz bilen tanyşanyma örän şat.",
        "Esasy zat jan saglyk.",
    )
]
uroman/.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !Build/
2
+ .last_cover_stats
3
+ /META.yml
4
+ /META.json
5
+ /MYMETA.*
6
+ *.o
7
+ *.pm.tdy
8
+ *.bs
9
+
10
+ # Devel::Cover
11
+ cover_db/
12
+
13
+ # Devel::NYTProf
14
+ nytprof.out
15
+
16
+ # Dist::Zilla
17
+ /.build/
18
+
19
+ # Module::Build
20
+ _build/
21
+ Build
22
+ Build.bat
23
+
24
+ # Module::Install
25
+ inc/
26
+
27
+ # ExtUtils::MakeMaker
28
+ /blib/
29
+ /_eumm/
30
+ /*.gz
31
+ /Makefile
32
+ /Makefile.old
33
+ /MANIFEST.bak
34
+ /pm_to_blib
35
+ /*.zip
uroman/LICENSE.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (C) 2015-2020 Ulf Hermjakob, USC Information Sciences Institute
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ Any publication of projects using uroman shall acknowledge its use: "This project uses the universal romanizer software 'uroman' written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)".
8
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track.
9
+
10
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11
+
uroman/README.md ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # uroman
2
+
3
+ *uroman* is a *universal romanizer*. It converts text in any script to the Latin alphabet.
4
+
5
+ Version: 1.2.8
6
+ Release date: April 23, 2021
7
+ Author: Ulf Hermjakob, USC Information Sciences Institute
8
+
9
+
10
+ ### Usage
11
+ ```bash
12
+ $ uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
13
+ where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
14
+ grc, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
15
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
16
+ --no-cache disables caching.
17
+ ```
18
+ ### Examples
19
+ ```bash
20
+ $ bin/uroman.pl < text/zho.txt
21
+ $ bin/uroman.pl -l tur < text/tur.txt
22
+ $ bin/uroman.pl -l heb --chart < text/heb.txt
23
+ $ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
24
+ ```
25
+
26
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
27
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
28
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or
29
+ Yiddish will improve romanization for those languages as some letters in those
30
+ languages have different sound values from other languages using the same script
31
+ (French, Russian, Hebrew respectively).
32
+ No effect for other languages in this version.
33
+
34
+ ### Bibliography
35
+ Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. ACL-2018 Best Demo Paper Award. [Paper in ACL Anthology](https://www.aclweb.org/anthology/P18-4003) | [Poster](https://www.isi.edu/~ulf/papers/poster-uroman-acl2018.pdf) | [BibTex](https://www.aclweb.org/anthology/P18-4003.bib)
36
+
37
+ ### Change History
38
+ Changes in version 1.2.8
39
+ * Updated to Unicode 13.0 (2021), which supports several new scripts (10% larger UnicodeData.txt).
40
+ * Improved support for Georgian.
41
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
42
+ * Various small improvements.
43
+
44
+ Changes in version 1.2.7
45
+ * Improved support for Pashto.
46
+
47
+ Changes in version 1.2.6
48
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
49
+ * Added support for English Braille.
50
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
51
+ reflecting a casual style that many native speakers of those languages use
52
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
53
+ rather than phonetically motivated combinations of letters (e.g. "sh").
54
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
55
+ that language for that line. This is used for the new reference test file.
56
+ * Various small improvements.
57
+
58
+ Changes in version 1.2.5
59
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
60
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
61
+ and Latin scripts, uroman will map both official versions to the same
62
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
63
+ properly reflects the pronunciation of the city's name).
64
+ For both Serbian and Macedonian, casual writers often use a simplified
65
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
66
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
67
+ other such pairs. The casual romanization can be simulated by using
68
+ alternative uroman language codes "srp2" and "mkd2", which romanize
69
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
70
+ * Various small improvements.
71
+
72
+ Changes in version 1.2.4
73
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
74
+
75
+ Changes in version 1.2
76
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
77
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
78
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
79
+ large size texts.
80
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
81
+ languages.
82
+ * Richer lattice structure (more alternatives) for "Romanization" of English
83
+ to support better matching to romanizations of other languages.
84
+ Changes output only when --chart option is specified. No change in output for
85
+ default 1-best output, which for ASCII characters is always the input string.
86
+
87
+ Changes in version 1.1 (major upgrade)
88
+ * Offers chart output (in JSON format) to represent alternative romanizations.
89
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
90
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
91
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
92
+ -- Shows corresponding original and romanization text in red
93
+ when hovering over a text segment.
94
+ -- Shows alternative romanizations when hovering over romanized text
95
+ marked by dotted underline.
96
+ -- Added right-to-left script detection and improved display for right-to-left
97
+ script text (as determined line by line).
98
+ -- On-page support for some scripts that are often not pre-installed on users'
99
+ computers (Burmese, Egyptian, Klingon).
100
+
101
+ Changes in version 1.0 (major upgrade)
102
+ * Upgraded principal internal data structure from string to lattice.
103
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
104
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
105
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
106
+ * Japanese Katakana middle dots now mapped to ASCII space.
107
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
108
+ * Some corrections regarding analysis of Chinese numbers.
109
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
110
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
111
+ * Spaces normalized to ASCII space.
112
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
113
+ * Tested against previous version of uroman with a new uroman visual diff tool.
114
+ * Almost an order of magnitude faster.
115
+
116
+ Changes in version 0.7 (minor upgrade)
117
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
118
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
119
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
120
+ or Chinese characters in Uyghur texts.
121
+
122
+ Changes in version 0.6 (minor upgrade)
123
+ * Added support for two letter characters used in Uzbek:
124
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
125
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
126
+ Both are now mapped to "'" (plain ASCII apostrophe).
127
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
128
+ even when they are not preceded by "ئ" (yeh with hamza above).
129
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
130
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
131
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
132
+ However, it is strongly recommended to normalize any presentation form Arabic letters
133
+ to their non-presentation form before calling uroman.
134
+ * Added force flush directive ($|=1;).
135
+
136
+ Changes in version 0.5 (minor upgrade)
137
+ * Improvements for Uyghur (make sure to use language option: -l uig)
138
+
139
+ Changes in version 0.4 (minor upgrade)
140
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
141
+ * Minor change for Arabic (added "alef+fathatan" = "an")
142
+
143
+ New features in version 0.3
144
+ * Covers Mandarin (Chinese)
145
+ * Improved romanization for numerous languages
146
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
147
+ * Maps from native digits to Western numbers
148
+ * Faster for South Asian languages
149
+
150
+ ### Other features
151
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
152
+ * Vowelization is provided when locally computable, e.g. for many South Asian languages and Tibetan.
153
+
154
+ ### Limitations
155
+ * The current version of uroman has a few limitations, some of which we plan to address in future versions.
156
+ For Japanese, *uroman* currently romanizes hiragana and katakana as expected, but kanji are interpreted as Chinese characters and romanized as such.
157
+ For Egyptian hieroglyphs, only single-sound phonetic characters and numbers are currently romanized.
158
+ For Linear B, only phonetic syllabic characters are romanized.
159
+ For some other extinct scripts such as cuneiform, no romanization is provided.
160
+ * A romanizer is not a full transliterator. For example, this version of
161
+ uroman does not vowelize text that lacks explicit vowelization such as
162
+ normal text in Arabic and Hebrew (without diacritics/points).
163
+
164
+ ### Acknowledgments
165
+ This research is based upon work supported in part by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via contract # FA8650-17-C-9116, and by research sponsored by Air Force Research Laboratory (AFRL) under agreement number FA8750-19-1-1000. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies, either expressed or implied, of ODNI, IARPA, Air Force Laboratory, DARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for governmental purposes notwithstanding any copyright annotation therein.
uroman/README.txt ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ uroman version 1.2.8
2
+ Release date: April 23, 2021
3
+ Author: Ulf Hermjakob, USC Information Sciences Institute
4
+
5
+ uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
6
+
7
+ Usage: uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
8
+ where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
9
+ grc, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
10
+ --chart specifies chart output (in JSON format) to represent alternative romanizations.
11
+ --no-cache disables caching.
12
+ Examples: bin/uroman.pl < text/zho.txt
13
+ bin/uroman.pl -l tur < text/tur.txt
14
+ bin/uroman.pl -l heb --chart < text/heb.txt
15
+ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
16
+
17
+ Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
18
+ Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
19
+ Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or Yiddish
20
+ will improve romanization for those languages as some letters in those languages
21
+ have different sound values from other languages using the same script.
22
+ No effect for other languages in this version.
23
+
24
+ Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. [Best Demo Paper Award]
25
+
26
+ Changes in version 1.2.8
27
+ * Improved support for Georgian.
28
+ * Updated UnicodeData.txt to version 13 (2021) with several new scripts (10% larger).
29
+ * Preserve various symbols (as opposed to mapping to the symbols' names).
30
+ * Various small improvements.
31
+ Changes in version 1.2.7
32
+ * Improved support for Pashto.
33
+ Changes in version 1.2.6
34
+ * Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
35
+ * Added support for English Braille.
36
+ * Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
37
+ reflecting a casual style that many native speakers of those languages use
38
+ when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
39
+ rather than phonetically motivated combinations of letters (e.g. "sh").
40
+ * When a line starts with "::lcode xyz ", the new uroman version will switch to
41
+ that language for that line. This is used for the new reference test file.
42
+ * Various small improvements.
43
+ Changes in version 1.2.5
44
+ * Improved support for Armenian and eight languages using Cyrillic scripts.
45
+ -- For Serbian and Macedonian, which are often written in both Cyrillic
46
+ and Latin scripts, uroman will map both official versions to the same
47
+ romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
48
+ properly reflects the pronunciation of the city's name).
49
+ For both Serbian and Macedonian, casual writers often use a simplified
50
+ Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
51
+ and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
52
+ other such pairs. The casual romanization can be simulated by using
53
+ alternative uroman language codes "srp2" and "mkd2", which romanize
54
+ both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
55
+ * Various small improvements.
56
+ Changes in version 1.2.4
57
+ * Added support for Tifinagh (a script used for Berber languages).
58
+ * Bug-fix that generated two empty lines for each empty line in cache mode.
59
+ Changes in version 1.2.3
60
+ * Exclude emojis, dingbats, many other pictographs from being romanized (e.g. to "face")
61
+ Changes in version 1.2
62
+ * Run-time improvement based on (1) token-based caching and (2) shortcut
63
+ romanization (identity) of ASCII strings for default 1-best (non-chart)
64
+ output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
65
+ large size texts.
66
+ * Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
67
+ languages.
68
+ * Richer lattice structure (more alternatives) for "Romanization" of English
69
+ to support better matching to romanizations of other languages.
70
+ Changes output only when --chart option is specified. No change in output for
71
+ default 1-best output, which for ASCII characters is always the input string.
72
+ Changes in version 1.1 (major upgrade)
73
+ * Offers chart output (in JSON format) to represent alternative romanizations.
74
+ -- Location of first character is defined to be "line: 1, start:0, end:0".
75
+ * Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
76
+ * Improved web-interface at http://www.isi.edu/~ulf/uroman.html
77
+ -- Shows corresponding original and romanization text in red
78
+ when hovering over a text segment.
79
+ -- Shows alternative romanizations when hovering over romanized text
80
+ marked by dotted underline.
81
+ -- Added right-to-left script detection and improved display for right-to-left
82
+ script text (as determined line by line).
83
+ -- On-page support for some scripts that are often not pre-installed on users'
84
+ computers (Burmese, Egyptian, Klingon).
85
+ Changes in version 1.0 (major upgrade)
86
+ * Upgraded principal internal data structure from string to lattice.
87
+ * Improvements mostly in vowelization of South and Southeast Asian languages.
88
+ * Vocalic 'r' more consistently treated as vowel (no additional vowel added).
89
+ * Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
90
+ * Japanese Katakana middle dots now mapped to ASCII space.
91
+ * Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
92
+ * Some corrections regarding analysis of Chinese numbers.
93
+ * Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
94
+ * Zero-width characters dropped, except line/sentence-initial byte order marks.
95
+ * Spaces normalized to ASCII space.
96
+ * Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
97
+ * Tested against previous version of uroman with a new uroman visual diff tool.
98
+ * Almost an order of magnitude faster.
99
+ Changes in version 0.7 (minor upgrade)
100
+ * Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
101
+ Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
102
+ Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
103
+ or Chinese characters in Uyghur texts.
104
+ Changes in version 0.6 (minor upgrade)
105
+ * Added support for two letter characters used in Uzbek:
106
+ (1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
107
+ (2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
108
+ Both are now mapped to "'" (plain ASCII apostrophe).
109
+ * Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
110
+ even when they are not preceded by "ئ" (yeh with hamza above).
111
+ * Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
112
+ ("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
113
+ * Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
114
+ However, it is strongly recommended to normalize any presentation form Arabic letters
115
+ to their non-presentation form before calling uroman.
116
+ * Added force flush directive ($|=1;).
117
+ Changes in version 0.5 (minor upgrade)
118
+ * Improvements for Uyghur (make sure to use language option: -l uig)
119
+ Changes in version 0.4 (minor upgrade)
120
+ * Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
121
+ * Minor change for Arabic (added "alef+fathatan" = "an")
122
+ New features in version 0.3
123
+ * Covers Mandarin (Chinese)
124
+ * Improved romanization for numerous languages
125
+ * Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
126
+ * Maps from native digits to Western numbers
127
+ * Faster for South Asian languages
128
+
129
+ Other features
130
+ * Web interface: http://www.isi.edu/~ulf/uroman.html
131
+ * Vowelization is provided when locally computable, e.g. for many South Asian
132
+ languages and Tibetan.
133
+
134
+ Limitations
135
+ * This version of uroman assumes all CJK ideographs to be Mandarin (Chinese).
136
+ This means that Japanese kanji are incorrectly romanized; however, Japanese
137
+ hiragana and katakana are properly romanized.
138
+ * A romanizer is not a full transliterator. For example, this version of
139
+ uroman does not vowelize text that lacks explicit vowelization such as
140
+ normal text in Arabic and Hebrew (without diacritics/points).
141
+
uroman/bin/de-accent.pl ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/perl -w
2
+
3
sub print_version {
    # Write the script name, version, author and last-change date to STDERR.
    my @lines = (
        "$0 version 1.1",
        " Author: Ulf Hermjakob",
        " Last changed: March 14, 2011",
    );
    print STDERR "$_\n" foreach @lines;
}
8
+
9
sub print_usage {
    # Write a short usage summary (invocation plus supported flags) to STDERR.
    my @lines = (
        "$0 [options] < with_accents.txt > without_accents.txt",
        " -h or -help",
        " -v or -version",
    );
    print STDERR "$_\n" foreach @lines;
}
14
+
15
sub de_accent_string {
    # Remove diacritics from a UTF-8 encoded byte string and return the result.
    #
    # The script works on raw bytes (no "use utf8"), so each guard below is a
    # cheap byte-level pre-check that skips a whole group of substitutions
    # when the string contains no character from that Unicode block.
    # Ligatures and special letters are expanded (e.g. AE ligature -> "Ae",
    # sharp s -> "ss"); Greek and Cyrillic letters are mapped to their
    # unaccented base letter in the same script.
    my ($s) = @_;

    # Latin-1 Supplement (U+00C0-U+00FF)
    if ($s =~ /\xC3[\x80-\xBF]/) {
        $s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g;
        $s =~ s/Æ/Ae/g;
        $s =~ s/Ç/C/g;
        $s =~ s/Ð/D/g;
        $s =~ s/(È|É|Ê|Ë)/E/g;
        $s =~ s/(Ì|Í|Î|Ï)/I/g;
        $s =~ s/Ñ/N/g;
        $s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g;
        $s =~ s/(Ù|Ú|Û|Ü)/U/g;
        $s =~ s/Þ/Th/g;
        $s =~ s/Ý/Y/g;
        $s =~ s/(à|á|â|ã|ä|å)/a/g;
        $s =~ s/æ/ae/g;
        $s =~ s/ç/c/g;
        $s =~ s/(è|é|ê|ë)/e/g;
        $s =~ s/(ì|í|î|ï)/i/g;
        $s =~ s/ð/d/g;
        $s =~ s/ñ/n/g;
        # Lower-case o-slash added to mirror the upper-case rule above.
        $s =~ s/(ò|ó|ô|õ|ö|ø)/o/g;
        $s =~ s/ß/ss/g;
        $s =~ s/þ/th/g;
        $s =~ s/(ù|ú|û|ü)/u/g;
        $s =~ s/(ý|ÿ)/y/g;
    }
    # Latin Extended-A (U+0100-U+017F)
    if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
        $s =~ s/(Ā|Ă|Ą)/A/g;
        $s =~ s/(ā|ă|ą)/a/g;
        $s =~ s/(Ć|Ĉ|Ċ|Č)/C/g;
        $s =~ s/(ć|ĉ|ċ|č)/c/g;
        $s =~ s/(Ď|Đ)/D/g;
        $s =~ s/(ď|đ)/d/g;
        $s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g;
        $s =~ s/(ē|ĕ|ė|ę|ě)/e/g;
        $s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g;
        $s =~ s/(ĝ|ğ|ġ|ģ)/g/g;
        $s =~ s/(Ĥ|Ħ)/H/g;
        $s =~ s/(ĥ|ħ)/h/g;
        $s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g;
        $s =~ s/(ĩ|ī|ĭ|į|ı)/i/g;
        # IJ ligatures U+0132 / U+0133, written as byte escapes so the
        # patterns cannot be confused with the plain ASCII letter pairs.
        $s =~ s/\xC4\xB2/Ij/g;
        $s =~ s/\xC4\xB3/ij/g;
        $s =~ s/Ĵ/J/g;
        $s =~ s/ĵ/j/g;
        $s =~ s/Ķ/K/g;
        $s =~ s/(ķ|ĸ)/k/g;
        $s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g;
        # Lower-case l-acute added to mirror the upper-case rule above.
        $s =~ s/(ĺ|ļ|ľ|ŀ|ł)/l/g;
        $s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g;
        # \xC5\x89 is U+0149 (n preceded by apostrophe).
        $s =~ s/(ń|ņ|ň|\xC5\x89|ŋ)/n/g;
        $s =~ s/(Ō|Ŏ|Ő)/O/g;
        $s =~ s/(ō|ŏ|ő)/o/g;
        $s =~ s/Œ/Oe/g;
        $s =~ s/œ/oe/g;
        $s =~ s/(Ŕ|Ŗ|Ř)/R/g;
        $s =~ s/(ŕ|ŗ|ř)/r/g;
        $s =~ s/(Ś|Ŝ|Ş|Š)/S/g;
        $s =~ s/(ś|ŝ|ş|š|ſ)/s/g;
        $s =~ s/(Ţ|Ť|Ŧ)/T/g;
        $s =~ s/(ţ|ť|ŧ)/t/g;
        $s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g;
        $s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g;
        $s =~ s/Ŵ/W/g;
        $s =~ s/ŵ/w/g;
        $s =~ s/(Ŷ|Ÿ)/Y/g;
        $s =~ s/ŷ/y/g;
        $s =~ s/(Ź|Ż|Ž)/Z/g;
        $s =~ s/(ź|ż|ž)/z/g;
    }
    # Latin Extended Additional (U+1E00-U+1EFF)
    if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) {
        $s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g;
        $s =~ s/(ḃ|ḅ|ḇ)/b/g;
        $s =~ s/(ḉ)/c/g;
        $s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g;
        $s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g;
        $s =~ s/(ḟ)/f/g;
        $s =~ s/(ḡ)/g/g;
        $s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g;
        $s =~ s/(ḭ|ḯ|ỉ|ị)/i/g;
        $s =~ s/(ḱ|ḳ|ḵ)/k/g;
        $s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g;
        $s =~ s/(ḿ|ṁ|ṃ)/m/g;
        # Accented n maps to "n" (was mistakenly mapped to "m").
        $s =~ s/(ṅ|ṇ|ṉ|ṋ)/n/g;
        $s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g;
        $s =~ s/(ṕ|ṗ)/p/g;
        $s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g;
        $s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g;
        $s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g;
        $s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g;
        $s =~ s/(ṽ|ṿ)/v/g;
        $s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g;
        $s =~ s/(ẋ|ẍ)/x/g;
        $s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g;
        $s =~ s/(ẑ|ẓ|ẕ)/z/g;
        $s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g;
        $s =~ s/(Ḃ|Ḅ|Ḇ)/B/g;
        $s =~ s/(Ḉ)/C/g;
        $s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g;
        $s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g;
        $s =~ s/(Ḟ)/F/g;
        $s =~ s/(Ḡ)/G/g;
        $s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g;
        $s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g;
        $s =~ s/(Ḱ|Ḳ|Ḵ)/K/g;
        $s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g;
        $s =~ s/(Ḿ|Ṁ|Ṃ)/M/g;
        $s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g;
        $s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g;
        $s =~ s/(Ṕ|Ṗ)/P/g;
        $s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g;
        $s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g;
        $s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g;
        $s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g;
        $s =~ s/(Ṽ|Ṿ)/V/g;
        $s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g;
        # Upper-case X-dot-above added to mirror the lower-case rule above.
        $s =~ s/(Ẋ|Ẍ)/X/g;
        $s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g;
        $s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g;
    }
    # Greek letters. The guard covers upper-case accented letters
    # (\xCE\x86-\xAB) as well as the lower-case ones (\xCE\xAC-\xB0 and
    # \xCF\x8A-\x8E), so lower-case-only text is handled too.
    if ($s =~ /\xCE[\x86-\xB0]|\xCF[\x8A-\x8E]/) {
        $s =~ s/ά/α/g;
        $s =~ s/έ/ε/g;
        $s =~ s/ή/η/g;
        $s =~ s/ί/ι/g;
        $s =~ s/ϊ/ι/g;
        $s =~ s/ΐ/ι/g;
        $s =~ s/ό/ο/g;
        $s =~ s/ύ/υ/g;
        $s =~ s/ϋ/υ/g;
        $s =~ s/ΰ/υ/g;
        $s =~ s/ώ/ω/g;
        $s =~ s/Ά/Α/g;
        $s =~ s/Έ/Ε/g;
        $s =~ s/Ή/Η/g;
        $s =~ s/Ί/Ι/g;
        $s =~ s/Ϊ/Ι/g;
        $s =~ s/Ό/Ο/g;
        $s =~ s/Ύ/Υ/g;
        $s =~ s/Ϋ/Υ/g;
        $s =~ s/Ώ/Ω/g;
    }
    # Cyrillic letters. The guard covers the upper-case forms
    # (\xD0\x80-\x8D, \xD0\x99), lower-case short i (\xD0\xB9) and the
    # lower-case forms in \xD1\x90-\x9D.
    if ($s =~ /\xD0[\x80-\x8D\x99\xB9]|\xD1[\x90-\x9D]/) {
        $s =~ s/Ѐ/Е/g;
        $s =~ s/Ё/Е/g;
        $s =~ s/Ѓ/Г/g;
        $s =~ s/Ќ/К/g;
        $s =~ s/Ѝ/И/g;
        $s =~ s/Й/И/g;
        $s =~ s/ѐ/е/g;
        $s =~ s/ё/е/g;
        $s =~ s/ѓ/г/g;
        $s =~ s/ќ/к/g;
        $s =~ s/ѝ/и/g;
        $s =~ s/й/и/g;
    }
    return $s;
}
181
+
182
# Consume all command-line arguments: help/version flags print their text
# and exit with status 1; anything else is reported and ignored.
while (@ARGV) {
    $arg = shift @ARGV;
    if ($arg =~ /^-*(h|help)$/i) {
        print_usage();
        exit 1;
    }
    if ($arg =~ /^-*(v|version)$/i) {
        print_version();
        exit 1;
    }
    print STDERR "Ignoring unrecognized argument $arg\n";
}

# @ARGV is empty at this point, so <> reads STDIN line by line.
$line_number = 0;
while (my $line = <>) {
    $line_number += 1;
    print de_accent_string($line);
}
exit 0;
201
+
uroman/bin/string-distance.pl ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/perl -w

# Author: Ulf Hermjakob
# Release date: October 13, 2019

# Usage: string-distance.pl {-lc1 <language-code>} {-lc2 <language-code>} {-v <level>} < STDIN > STDOUT
# Example: string-distance.pl -lc1 rus -lc2 ukr < STDIN > STDOUT
# Example: string-distance.pl < ../test/string-similarity-test-input.txt
# Input format: two strings per line (tab-separated, in Latin script)
# Strings in non-Latin scripts should first be romanized. (Recommended script: uroman.pl)
# Output format: repetition of the two input strings, plus the string distance between them (tab-separated).
# Additional output meta info lines at the top are marked with an initial #.
#
# The script uses data from a string-distance-cost-rules file that lists costs,
# where the default cost is "1" with lower costs for differences in vowels,
# duplicate consonants, "f" vs. "ph" etc.
# Language cost rules can be language-specific and context-sensitive.

$|=1;

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# The data directory is located relative to this script: <script dir>/../data
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");

use lib "$FindBin::Bin/../lib";
use List::Util qw(min max);
use NLP::utilities;
use NLP::stringDistance;
$util = NLP::utilities;
$sd = NLP::stringDistance;
$verbose = 0;
$separator = "\t";

$cost_rule_filename = File::Spec->catfile($data_dir, "string-distance-cost-rules.txt");

# Both language codes default to English and are only overridden by a
# well-formed three-letter lowercase code.
$lang_code1 = "eng";
$lang_code2 = "eng";
%ht = ();

while (@ARGV) {
    $arg = shift @ARGV;
    if ($arg =~ /^-+lc1$/) {
        $lang_code_candidate = shift @ARGV;
        # The defined() check avoids an "uninitialized value" warning when
        # the option is the last argument and has no value.
        $lang_code1 = $lang_code_candidate
            if defined($lang_code_candidate) && $lang_code_candidate =~ /^[a-z]{3,3}$/;
    } elsif ($arg =~ /^-+lc2$/) {
        $lang_code_candidate = shift @ARGV;
        $lang_code2 = $lang_code_candidate
            if defined($lang_code_candidate) && $lang_code_candidate =~ /^[a-z]{3,3}$/;
    } elsif ($arg =~ /^-+(v|verbose)$/) {
        # The verbose option consumes the following argument as its value.
        $verbose = shift @ARGV;
    } else {
        print STDERR "Ignoring unrecognized arg $arg\n";
    }
}

$sd->load_string_distance_data($cost_rule_filename, *ht, $verbose);
print STDERR "Loaded resources.\n" if $verbose;

my $line_number = 0;
print "# Lang-code-1: $lang_code1 Lang-code-2: $lang_code2\n";
while (<>) {
    $line_number++;
    # Progress indicator in verbose mode: a dot every 1,000 lines and the
    # line number itself every 10,000 lines.
    if ($verbose) {
        if ($line_number =~ /000$/) {
            if ($line_number =~ /0000$/) {
                print STDERR $line_number;
            } else {
                print STDERR ".";
            }
        }
    }
    my $line = $_;
    $line =~ s/^\xEF\xBB\xBF//;             # strip a leading UTF-8 byte order mark
    next if $line =~ /^\s*(\#.*)?$/;        # skip blank lines and comment lines
    my $s1;
    my $s2;
    # Each field is either a double-quoted string (with \" escapes) or a
    # run of non-whitespace characters.
    if (($s1, $s2) = ($line =~ /^("(?:\\"|[^"])*"|\S+)$separator("(?:\\"|[^"])*"|\S+)\s*$/)) {
        $s1 = $util->dequote_string($s1);
        $s2 = $util->dequote_string($s2);
    } else {
        # Emit an empty line for unparsable input.
        print STDERR "Could not process line $line_number: $line" if $verbose;
        print "\n";
        next;
    }

    $cost = $sd->quick_romanized_string_distance_by_chart($s1, $s2, *ht, "", $lang_code1, $lang_code2);
    print "$s1\t$s2\t$cost\n";
}
print STDERR "\n" if $verbose;

exit 0;

uroman/bin/uroman-quick.pl ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/perl -w

# uroman  Nov. 12, 2015 - July 25, 2016
# version v0.7
# Author: Ulf Hermjakob

# Usage: uroman-quick.pl {-l [tur|uig|ukr|yid]} < STDIN
#        currently only for Arabic script languages, incl. Uyghur

$|=1;

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# Resource directories are located relative to this script.
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");
my $lib_dir = File::Spec->catfile($root_dir, "lib");

use lib "$FindBin::Bin/../lib";
use NLP::Romanizer;
use NLP::UTF8;
$romanizer = NLP::Romanizer;
%ht = ();
$lang_code = "";

# Option parsing: only the language-code option is recognized; its value
# is lowercased, and a missing value yields the empty string.
while (@ARGV) {
    $arg = shift @ARGV;
    unless ($arg =~ /^-+(l|lc|lang-code)$/) {
        print STDERR "Ignoring unrecognized arg $arg\n";
        next;
    }
    $lang_code = lc(shift(@ARGV) || "");
}

$romanization_table_arabic_block_filename = File::Spec->catfile($data_dir, "romanization-table-arabic-block.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");

# The Arabic-block table is loaded before the general table; both go into
# the same hash.
$romanizer->load_romanization_table(*ht, $romanization_table_arabic_block_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);

# Romanize STDIN line by line. Progress goes to STDERR: the line count
# every 10,000 lines, otherwise a dot every 1,000 lines.
$line_number = 0;
while (<>) {
    $line_number++;
    my $line = $_;
    my $romanized = $romanizer->quick_romanize($line, $lang_code, *ht);
    print "$romanized\n";
    if ($line_number % 10000 == 0) {
        print STDERR $line_number;
    } elsif ($line_number % 1000 == 0) {
        print STDERR ".";
    }
}
print STDERR "\n";

exit 0;

uroman/bin/uroman-tsv.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# Created by Thamme Gowda on June 17, 2019
#
# Romanizes the second column of a tab-separated file with uroman.pl and
# pastes it next to the unchanged first column. Only columns 1 and 2 of
# the input appear in the output.
#
# Usage: uroman-tsv.sh <input.tsv> [<output.tsv>]
# Without <output.tsv>, the result is written to STDOUT.

DIR=$(dirname "${BASH_SOURCE[0]}")  # get the directory name
# DIR=$(realpath "${DIR}")    # resolve its full path if need be

if [[ $# -lt 1 || $# -gt 2 ]]; then
    >&2 echo "ERROR: invalid args"
    >&2 echo "Usage: <input.tsv> [<output.tsv>]"
    exit 2
fi

INP=$1
OUT=${2:-}

CMD=$DIR/uroman.pl

# All expansions are quoted so that paths containing spaces or glob
# characters are passed through as single words.
function romanize(){
    paste <(cut -f1 "$INP") <(cut -f2 "$INP" | "$CMD")
}

if [[ -n $OUT ]]; then
    romanize > "$OUT"
else
    romanize
fi

uroman/bin/uroman.pl ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/perl -w

# uroman  Nov. 12, 2015 - Apr. 23, 2021
$version = "v1.2.8";
# Author: Ulf Hermjakob

# Usage: uroman.pl {-l [ara|bel|bul|deu|ell|eng|fas|grc|heb|kaz|kir|lav|lit|mkd|mkd2|oss|pnt|rus|srp|srp2|tur|uig|ukr|yid]} {--chart|--offset-mapping} {--no-cache} {--workset} < STDIN
# Example: cat workset.txt | uroman.pl --offset-mapping --workset
# An alternative UnicodeData file can be supplied with --unicode-data <filename>.

$|=1;

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# The data directory is located relative to this script: <script dir>/../data
my $bin_dir = abs_path(dirname($0));
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catfile($root_dir, "data");

use lib "$FindBin::Bin/../lib";
use NLP::Chinese;
use NLP::Romanizer;
use NLP::UTF8;
use NLP::utilities;
use JSON;
$chinesePM = NLP::Chinese;
$romanizer = NLP::Romanizer;
$util = NLP::utilities;
%ht = ();
%pinyin_ht = ();
$lang_code = "";
$return_chart_p = 0;
$return_offset_mappings_p = 0;
$workset_p = 0;
$cache_rom_tokens_p = 1;

$script_data_filename = File::Spec->catfile($data_dir, "Scripts.txt");
$unicode_data_overwrite_filename = File::Spec->catfile($data_dir, "UnicodeDataOverwrite.txt");
$unicode_data_filename = File::Spec->catfile($data_dir, "UnicodeData.txt");
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
$chinese_tonal_pinyin_filename = File::Spec->catfile($data_dir, "Chinese_to_Pinyin.txt");

# Option parsing. Unknown arguments are reported on STDERR and skipped.
while (@ARGV) {
    $arg = shift @ARGV;
    if ($arg =~ /^-+(l|lc|lang-code)$/) {
        $lang_code = lc (shift @ARGV || "");
    } elsif ($arg =~ /^-+chart$/i) {
        $return_chart_p = 1;
    } elsif ($arg =~ /^-+workset$/i) {
        $workset_p = 1;
    } elsif ($arg =~ /^-+offset[-_]*map/i) {
        $return_offset_mappings_p = 1;
    } elsif ($arg =~ /^-+unicode[-_]?data/i) {
        $filename = shift @ARGV;
        # Guard against the option being the last argument: without the
        # defined() check, -r and the message below would operate on an
        # uninitialized value.
        if (! defined($filename)) {
            print STDERR "Ignoring UnicodeData option without a filename\n";
        } elsif (-r $filename) {
            $unicode_data_filename = $filename;
        } else {
            print STDERR "Ignoring invalid UnicodeData filename $filename\n";
        }
    } elsif ($arg =~ /^-+(no-tok-cach|no-cach)/i) {
        $cache_rom_tokens_p = 0;
    } else {
        print STDERR "Ignoring unrecognized arg $arg\n";
    }
}

$romanizer->load_script_data(*ht, $script_data_filename);
$romanizer->load_unicode_data(*ht, $unicode_data_filename);
$romanizer->load_unicode_overwrite_romanization(*ht, $unicode_data_overwrite_filename);
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
# The Chinese-to-Pinyin table is loaded lazily, on the first input line
# that contains a CJK unified ideograph.
$chinese_to_pinyin_not_yet_loaded_p = 1;
$current_date = $util->datetime("dateTtime");
$lang_code_clause = ($lang_code) ? " \"lang-code\":\"$lang_code\",\n" : "";

print "{\n \"romanizer\":\"uroman $version (Ulf Hermjakob, USC/ISI)\",\n \"date\":\"$current_date\",\n$lang_code_clause \"romanization\": [\n" if $return_chart_p;
my $line_number = 0;
my $chart_result = "";
while (<>) {
    $line_number++;
    my $line = $_;
    my $snt_id = "";
    if ($workset_p) {
        # Workset lines have the form "<id>.<number> <text>"; comment lines
        # and lines that do not match are skipped.
        next if $line =~ /^#/;
        if (($i_value, $s_value) = ($line =~ /^(\S+\.\d+)\s(.*)$/)) {
            $snt_id = $i_value;
            $line = "$s_value\n";
        } else {
            next;
        }
    }
    if ($chinese_to_pinyin_not_yet_loaded_p && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($line)) {
        $chinesePM->read_chinese_tonal_pinyin_files(*pinyin_ht, $chinese_tonal_pinyin_filename);
        $chinese_to_pinyin_not_yet_loaded_p = 0;
    }
    if ($return_chart_p) {
        # Chart output lags one line behind, so that the trailing comma of
        # the very last element can be removed after the loop.
        print $chart_result;
        *chart_ht = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return chart", $line_number);
        $chart_result = $romanizer->chart_to_json_romanization_elements(0, $chart_ht{N_CHARS}, *chart_ht, $line_number);
    } elsif ($return_offset_mappings_p) {
        ($best_romanization, $offset_mappings) = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return offset mappings", $line_number, 0);
        print "::snt-id $snt_id\n" if $workset_p;
        print "::orig $line";
        print "::rom $best_romanization\n";
        print "::align $offset_mappings\n\n";
    } elsif ($cache_rom_tokens_p) {
        print $romanizer->romanize_by_token_with_caching($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
    } else {
        print $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
    }
}
# Flush the pending chart element without its trailing comma and close the
# JSON structure. $chart_result is empty outside chart mode.
$chart_result =~ s/,(\s*)$/$1/;
print $chart_result;
print " ]\n}\n" if $return_chart_p;

# Developer diagnostics (disabled): report code points whose romanization
# was recorded under $ht{SUSPICIOUS_ROMANIZATION}.
$dev_test_p = 0;
if ($dev_test_p) {
    $n_suspicious_code_points = 0;
    $n_instances = 0;
    foreach $char_name (sort { hex($ht{UTF_NAME_TO_UNICODE}->{$a}) <=> hex($ht{UTF_NAME_TO_UNICODE}->{$b}) }
                        keys %{$ht{SUSPICIOUS_ROMANIZATION}}) {
        $unicode_value = $ht{UTF_NAME_TO_UNICODE}->{$char_name};
        $utf8_string = $ht{UTF_NAME_TO_CODE}->{$char_name};
        foreach $romanization (sort keys %{$ht{SUSPICIOUS_ROMANIZATION}->{$char_name}}) {
            $count = $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization};
            print STDERR "*** Suspiciously lengthy romanization:\n" unless $n_suspicious_code_points;
            print STDERR "::s $utf8_string ::t $romanization ::comment $char_name (U+$unicode_value)\n";
            $n_suspicious_code_points++;
            $n_instances += $count;
        }
    }
    # The plural suffix must agree with the total it is printed next to,
    # not with the count of the last romanization visited.
    $s = ($n_instances == 1) ? "" : "s";
    print STDERR " *** Total of $n_suspicious_code_points suspicious code points ($n_instances instance$s)\n" if $n_suspicious_code_points;
}

exit 0;

uroman/data/Chinese_to_Pinyin.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman/data/Scripts.txt ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::script-name Aegean
2
+ ::script-name Ahom
3
+ ::script-name Anatolian Hieroglyph
4
+ ::script-name Arabic ::direction right-to-left
5
+ ::script-name Armenian
6
+ ::script-name Avestan
7
+ ::script-name Balinese
8
+ ::script-name Bamum
9
+ ::script-name Bassa Vah
10
+ ::script-name Batak
11
+ ::script-name Bengali ::abugida-default-vowel a
12
+ ::script-name Bhaiksuki
13
+ ::script-name Bopomofo ::language Chinese
14
+ ::script-name Brahmi ::abugida-default-vowel a
15
+ ::script-name Braille
16
+ ::script-name Buginese
17
+ ::script-name Buhid
18
+ ::script-name Canadian Syllabics
19
+ ::script-name Carian
20
+ ::script-name Caucasian Albanian
21
+ ::script-name Chakma
22
+ ::script-name Cham
23
+ ::script-name Cherokee
24
+ ::script-name Coptic
25
+ ::script-name Cuneiform
26
+ ::script-name Cypriot
27
+ ::script-name Cyrillic
28
+ ::script-name CJK ::alt-script-name Chinese, Kanji ::language Chinese, Japanese, Korean, Mandarin
29
+ ::script-name Deseret
30
+ ::script-name Devanagari ::abugida-default-vowel a
31
+ ::script-name Duployan
32
+ ::script-name Egyptian Hieroglyph
33
+ ::script-name Elbasan
34
+ ::script-name Ethiopic
35
+ ::script-name Georgian
36
+ ::script-name Glagolitic
37
+ ::script-name Gothic
38
+ ::script-name Grantha
39
+ ::script-name Greek
40
+ ::script-name Gujarati ::abugida-default-vowel a
41
+ ::script-name Gurmukhi ::abugida-default-vowel a
42
+ ::script-name Hangul ::language Korean
43
+ ::script-name Hanunoo
44
+ ::script-name Hatran
45
+ ::script-name Hebrew ::direction right-to-left
46
+ ::script-name Hiragana ::language Japanese
47
+ ::script-name Imperial Aramaic
48
+ ::script-name Inscriptional Pahlavi
49
+ ::script-name Inscriptional Parthian
50
+ ::script-name Javanese
51
+ ::script-name Kaithi
52
+ ::script-name Kannada ::abugida-default-vowel a
53
+ ::script-name Katakana ::language Japanese
54
+ ::script-name Kayah Li
55
+ ::script-name Kharoshthi
56
+ ::script-name Khmer ::abugida-default-vowel a, o
57
+ ::script-name Khojki
58
+ ::script-name Khudawadi
59
+ ::script-name Klingon
60
+ ::script-name Lao
61
+ ::script-name Lepcha
62
+ ::script-name Latin
63
+ ::script-name Limbu
64
+ ::script-name Linear A
65
+ ::script-name Linear B
66
+ ::script-name Lycian
67
+ ::script-name Lydian
68
+ ::script-name Mahajani
69
+ ::script-name Malayalam ::abugida-default-vowel a
70
+ ::script-name Mandaic
71
+ ::script-name Manichaean
72
+ ::script-name Marchen
73
+ ::script-name Meetei Mayek
74
+ ::script-name Meroitic Cursive
75
+ ::script-name Meroitic Hieroglyphic
76
+ ::script-name Miao
77
+ ::script-name Modi ::abugida-default-vowel a
78
+ ::script-name Mongolian
79
+ ::script-name Mro
80
+ ::script-name Multani
81
+ ::script-name Myanmar ::alt-script-name Burmese ::abugida-default-vowel a
82
+ ::script-name Nabataean
83
+ ::script-name New Tai Lue
84
+ ::script-name Newa
85
+ ::script-name Nko ::direction right-to-left
86
+ ::script-name Ogham
87
+ ::script-name Ol Chiki
88
+ ::script-name Old Hungarian
89
+ ::script-name Old Italic
90
+ ::script-name Old Permic
91
+ ::script-name Old Persian
92
+ ::script-name Old North Arabian
93
+ ::script-name Old South Arabian
94
+ ::script-name Old Turkic
95
+ ::script-name Oriya ::alt-script-name Odia ::abugida-default-vowel a
96
+ ::script-name Osage
97
+ ::script-name Osmanya
98
+ ::script-name Pahawh Hmong
99
+ ::script-name Palmyrene
100
+ ::script-name Pau Cin Hau
101
+ ::script-name Phags-pa
102
+ ::script-name Phaistos Disc
103
+ ::script-name Phoenician
104
+ ::script-name Psalter Pahlavi
105
+ ::script-name Rejang
106
+ ::script-name Runic
107
+ ::script-name Samaritan
108
+ ::script-name Saurashtra
109
+ ::script-name Sharada
110
+ ::script-name Shavian
111
+ ::script-name Siddham
112
+ ::script-name Sinhala ::abugida-default-vowel a
113
+ ::script-name Sora Sompeng
114
+ ::script-name Sundanese ::abugida-default-vowel a
115
+ ::script-name Syloti Nagri
116
+ ::script-name Syriac
117
+ ::script-name Tagalog
118
+ ::script-name Tagbanwa
119
+ ::script-name Tai Le
120
+ ::script-name Tai Tham
121
+ ::script-name Tai Viet
122
+ ::script-name Takri
123
+ ::script-name Tamil ::abugida-default-vowel a
124
+ ::script-name Tangut
125
+ ::script-name Telugu ::abugida-default-vowel a
126
+ ::script-name Thaana ::direction right-to-left
127
+ ::script-name Thai
128
+ ::script-name Tibetan ::abugida-default-vowel a
129
+ ::script-name Tifinagh
130
+ ::script-name Tirhuta
131
+ ::script-name Ugaritic
132
+ ::script-name Vai
133
+ ::script-name Vedic
134
+ ::script-name Warang Citi
135
+ ::script-name Yi
uroman/data/UnicodeData.txt ADDED
The diff for this file is too large to render. See raw diff
 
uroman/data/UnicodeDataOverwrite.txt ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## UnicodeDataOverwrite.txt
2
+ ::u 00A0 ::r " " ::comment no-break space
3
+ ::u 01BF ::r w ::comment ƿ Latin Character Wynn (Old English)
4
+ ::u 0294 ::r ' ::comment glottal stop
5
+ ::u 0295 ::r ' ::comment ʕ voiced pharyngeal fricative
6
+ ::u 0305 ::r "" ::comment ̅ Combining overline
7
+ ::u 0306 ::r "" ::comment ̆ Combining breve
8
+ ::u 0307 ::r "" ::comment ̇ Combining dot above
9
+ ::u 030A ::r "" ::comment ̊ Combining ring above
10
+ ::u 030C ::r "" ::comment ̌ Combining caron
11
+ ::u 0311 ::r "" ::comment ̑ Combining inverted breve
12
+ ::u 031D ::r "" ::comment ̝ Combining up tack below
13
+ ::u 031E ::r "" ::comment ̞ Combining down tack below
14
+ ::u 031F ::r "" ::comment ̟ Combining plus sign below
15
+ ::u 0323 ::r "" ::comment ̣ Combining dot below
16
+ ::u 0325 ::r "" ::comment ̥ Combining ring below
17
+ ::u 0329 ::r "" ::comment ̩ Combining vertical line below
18
+ ::u 032A ::r "" ::comment ̪ Combining bridge below
19
+ ::u 032F ::r "" ::comment ̯ Combining inverted breve below
20
+ ::u 0342 ::r "" ::comment ͂ Combining Greek perispomeni (circumflex accent)
21
+ ::u 0343 ::r "" ::comment ̓ Combining Greek koronis
22
+ ::u 0361 ::r "" ::comment Combining double inverted breve
23
+ ::u 0384 ::r "" ::comment ΄ Greek tonos
24
+ ::u 0482 ::r 1000· ::comment ҂ Cyrillic thousands sign
25
+ ::u 0483 ::r "" ::comment ҃ Combining Cyrillic Titlo ::annotation titlo
26
+ ::u 0484 ::r "" ::comment ҄ Combining Cyrillic Palatalization ::annotation palatalization
27
+ ::u 055B ::r "" ::comment ՛ Armenian emphasis mark
28
+ ::u 055F ::r "" ::comment ՟ Armenian abbreviation mark ::annotation abbreviation
29
+
30
+ ::u 0901 ::r +m ::comment Devanagari sign candrabindu
31
+ ::u 0902 ::r +m ::comment Devanagari sign anusvara
32
+ ::u 0903 ::r +h ::comment Devanagari sign visarga
33
+ ::u 093D ::r ' ::comment Devanagari sign avagraha
34
+ ::u 0950 ::r om ::comment ॐ Devanagari om symbol
35
+ ::u 0951 ::r "" ::comment ॑ Devanagari stress sign "udatta"
36
+ ::u 0952 ::r "" ::comment ॒ Devanagari stress sign "anudatta"
37
+ ::u 0981 ::r +n ::comment Bengali sign candrabindu ("chôndrôbindu")
38
+ ::u 0982 ::r +ng ::comment Bengali sign anusvara ("ônushar")
39
+ ::u 0983 ::r +h ::comment Bengali sign visarga ("bishôrgô")
40
+ ::u 099A ::r ch ::comment instead of Bengali C(A)
41
+ ::u 099B ::r chh ::comment instead of Bengali CC(A)
42
+ ::u 0A02 ::r +m ::comment Gurmukhi sign bindi
43
+ ::u 0A70 ::r +m ::comment Gurmukhi tippi
44
+ # ::u 0A72 ::r "" ::comment Gurmukhi addak
45
+ ::u 0A72 ::r "" ::comment Gurmukhi iri
46
+ ::u 0A73 ::r "" ::comment Gurmukhi ura
47
+ ::u 0B01 ::r +m ::comment Oriya sign candrabindu
48
+ ::u 0B03 ::r +h ::comment Oriya sign visarga
49
+ ::u 0B5F ::r ya ::comment ୟ Oriya letter yya
50
+ ::u 0B82 ::r +m ::comment Tamil sign anusvara (not to be used?)
51
+ ::u 0B83 ::r +h ::comment Tamil sign visarga ("āytam")
52
+ ::u 0B9F ::r t ::comment instead of Tamil TT(A)
53
+ ::u 0BA3 ::r n ::comment instead of Tamil NN(A)
54
+ ::u 0BA9 ::r n ::comment instead of Tamil NNN(A)
55
+ ::u 0BB1 ::r r ::comment instead of Tamil RR(A)
56
+ ::u 0BB3 ::r l ::comment instead of Tamil LL(A)
57
+ ::u 0BB4 ::r l ::comment instead of Tamil LLL(A)
58
+ ::u 0C03 ::r +h ::comment ః Telugu sign visarga
59
+ ::u 0C83 ::r +h ::comment Kannada sign visarga
60
+ ::u 0D02 ::r +m ::comment Malayalam sign anusvara
61
+ ::u 0D03 ::r +h ::comment Malayalam sign visarga
62
+ ::u 0D82 ::r +n ::comment Sinhala sign anusvaraya
63
+ ::u 0DA4 ::r ny ::comment Sinhala ඤ
64
+ ::u 0DA5 ::r gn ::comment Sinhala ඥ
65
+ ::u 0DCA ::r "" ::comment Sinhala sign al-lakuna (virama = no vowel)
66
+ ::u 0DCF ::r aa ::comment Sinhala ා
67
+ ::u 0DD0 ::r ae ::comment Sinhala ැ
68
+ ::u 0DD1 ::r ae ::comment Sinhala ෑ
69
+ ::u 0DD2 ::r i ::comment Sinhala ි
70
+ ::u 0DD3 ::r ii ::comment Sinhala ී
71
+ ::u 0DD4 ::r u ::comment Sinhala ු
72
+ ::u 0DD6 ::r uu ::comment Sinhala ූ
73
+ ::u 0DD8 ::r r ::comment Sinhala ෘ
74
+ ::u 0DD9 ::r e ::comment Sinhala ෙ
75
+ ::u 0DDA ::r ee ::comment Sinhala ේ
76
+ ::u 0DDB ::r ai ::comment Sinhala ෛ
77
+ ::u 0DDC ::r o ::comment Sinhala ො
78
+ ::u 0DDD ::r oo ::comment Sinhala ෝ
79
+ ::u 0DDE ::r au ::comment Sinhala ෞ
80
+ ::u 0DDF ::r aa ::comment Sinhala ෟ
81
+ ::u 0DF2 ::r rr ::comment Sinhala ෲ
82
+
83
+ ::u 0E02 ::r k ::comment Thai character KHO KHAI
84
+ ::u 0E03 ::r k ::comment Thai character KHO KHUAT
85
+ ::u 0E04 ::r k ::comment Thai character KHO KHWAI
86
+ ::u 0E05 ::r k ::comment Thai character KHO KHON
87
+ ::u 0E06 ::r k ::comment Thai character KHO RAKHANG
88
+ ::u 0E10 ::r t ::comment Thai character THO THAN
89
+ ::u 0E11 ::r t ::comment Thai character THO NANGMONTHO
90
+ ::u 0E12 ::r t ::comment Thai character THO PHUTHAO
91
+ ::u 0E16 ::r t ::comment Thai character THO THUNG
92
+ ::u 0E17 ::r t ::comment Thai character THO THAHAN
93
+ ::u 0E18 ::r t ::comment Thai character THO THONG
94
+ ::u 0E1C ::r p ::comment Thai character PHO PHUNG
95
+ ::u 0E1E ::r p ::comment Thai character PHO PHAN
96
+ ::u 0E20 ::r p ::comment Thai character PHO SAMPHAO
97
+ ::u 0E2D ::r o ::comment Thai character O ANG
98
+ ::u 0E2F ::r ... ::comment ฯ Thai character PAIYANNOI (ellipsis, abbreviation)
99
+ ::u 0E31 ::r a ::comment Thai character MAI HAN-AKAT
100
+ ::u 0E3A ::r "" ::comment Thai character PHINTHU (Pali virama)
101
+ ::u 0E40 ::r e ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA E
102
+ ::u 0E41 ::r ae ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AE
103
+ ::u 0E42 ::r o ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA O
104
+ ::u 0E43 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMUAN
105
+ ::u 0E44 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMALAI
106
+ ::u 0E45 ::r "" ::comment Thai character LAKKHANGYAO vowel lengthener
107
+ ::u 0E47 ::r o ::comment Thai character MAITAIKHU vowel shortener
108
+ ::u 0E48 ::r "" ::tone-mark non-standard ::comment Thai tone mark MAI EK
109
+ ::u 0E49 ::r "" ::tone-mark standard ::comment Thai tone mark MAI THO
110
+ ::u 0E4A ::r "" ::tone-mark high ::comment Thai tone mark MAI TRI
111
+ ::u 0E4B ::r "" ::tone-mark rising ::comment Thai tone mark MAI CHATTAWA
112
+ ::u 0E4C ::r "" ::comment Thai character THANTHAKHAT cancellation mark (cf. virama)
113
+ ::u 0E4D ::r +m ::comment ํ Thai character NIKHAHIT final nasal (cf. anusvara)
114
+ ::u 0ECC ::r "" ::comment ໌ Lao cancellation mark ::annotation cancellation
115
+ ::u 0F0B ::r · ::comment ་ Tibetan mark intersyllabic tsheg
116
+ ::u 0F0C ::r "" ::comment ༌ Tibetan mark delimiter tsheg bstar
117
+ ::u 0F84 ::r "" ::comment ྄ Tibetan halanta
118
+ ::u 1036 ::r +n ::comment Myanmar sign anusvara ("auk myit")
119
+ ::u 1037 ::r "" ::tone-mark creaky ::comment Myanmar sign dot below
120
+ ::u 1038 ::r "" ::tone-mark high ::comment Myanmar sign visarga
121
+
122
+ ::u 16A0 ::r f ::comment ᚠ RUNIC LETTER FEHU FEOH FE F
123
+ ::u 16A1 ::r v ::comment ᚡ RUNIC LETTER V
124
+ ::u 16A2 ::r u ::comment ᚢ RUNIC LETTER URUZ UR U
125
+ ::u 16A3 ::r y ::comment ᚣ RUNIC LETTER YR
126
+ ::u 16A4 ::r y ::comment ᚤ RUNIC LETTER Y
127
+ ::u 16A5 ::r w ::comment ᚥ RUNIC LETTER W
128
+ ::u 16A6 ::r th ::comment ᚦ RUNIC LETTER THURISAZ THURS THORN
129
+ ::u 16A7 ::r th ::comment ᚧ RUNIC LETTER ETH
130
+ ::u 16A8 ::r a ::comment ᚨ RUNIC LETTER ANSUZ A
131
+ ::u 16A9 ::r o ::comment ᚩ RUNIC LETTER OS O
132
+ ::u 16AA ::r a ::comment ᚪ RUNIC LETTER AC A
133
+ ::u 16AB ::r ae ::comment ᚫ RUNIC LETTER AESC
134
+ ::u 16AC ::r o ::comment ᚬ RUNIC LETTER LONG-BRANCH-OSS O
135
+ ::u 16AD ::r o ::comment ᚭ RUNIC LETTER SHORT-TWIG-OSS O
136
+ ::u 16AE ::r o ::comment ᚮ RUNIC LETTER O
137
+ ::u 16AF ::r oe ::comment ᚯ RUNIC LETTER OE
138
+ ::u 16B0 ::r on ::comment ᚰ RUNIC LETTER ON
139
+ ::u 16B1 ::r r ::comment ᚱ RUNIC LETTER RAIDO RAD REID R
140
+ ::u 16B2 ::r k ::comment ᚲ RUNIC LETTER KAUNA
141
+ ::u 16B3 ::r c ::comment ᚳ RUNIC LETTER CEN
142
+ ::u 16B4 ::r k ::comment ᚴ RUNIC LETTER KAUN K
143
+ ::u 16B5 ::r g ::comment ᚵ RUNIC LETTER G
144
+ ::u 16B6 ::r ng ::comment ᚶ RUNIC LETTER ENG
145
+ ::u 16B7 ::r g ::comment ᚷ RUNIC LETTER GEBO GYFU G
146
+ ::u 16B8 ::r g ::comment ᚸ RUNIC LETTER GAR
147
+ ::u 16B9 ::r w ::comment ᚹ RUNIC LETTER WUNJO WYNN W
148
+ ::u 16BA ::r h ::comment ᚺ RUNIC LETTER HAGLAZ H
149
+ ::u 16BB ::r h ::comment ᚻ RUNIC LETTER HAEGL H
150
+ ::u 16BC ::r h ::comment ᚼ RUNIC LETTER LONG-BRANCH-HAGALL H
151
+ ::u 16BD ::r h ::comment ᚽ RUNIC LETTER SHORT-TWIG-HAGALL H
152
+ ::u 16BE ::r n ::comment ᚾ RUNIC LETTER NAUDIZ NYD NAUD N
153
+ ::u 16BF ::r n ::comment ᚿ RUNIC LETTER SHORT-TWIG-NAUD N
154
+ ::u 16C0 ::r n ::comment ᛀ RUNIC LETTER DOTTED-N
155
+ ::u 16C1 ::r i ::comment ᛁ RUNIC LETTER ISAZ IS ISS I
156
+ ::u 16C2 ::r e ::comment ᛂ RUNIC LETTER E
157
+ ::u 16C3 ::r j ::comment ᛃ RUNIC LETTER JERAN J
158
+ ::u 16C4 ::r j ::comment ᛄ RUNIC LETTER GER
159
+ ::u 16C5 ::r ae ::comment ᛅ RUNIC LETTER LONG-BRANCH-AR AE
160
+ ::u 16C6 ::r a ::comment ᛆ RUNIC LETTER SHORT-TWIG-AR A
161
+ ::u 16C7 ::r i ::comment ᛇ RUNIC LETTER IWAZ EOH
162
+ ::u 16C8 ::r p ::comment ᛈ RUNIC LETTER PERTHO PEORTH P
163
+ ::u 16C9 ::r z ::comment ᛉ RUNIC LETTER ALGIZ EOLHX
164
+ ::u 16CA ::r s ::comment ᛊ RUNIC LETTER SOWILO S
165
+ ::u 16CB ::r s ::comment ᛋ RUNIC LETTER SIGEL LONG-BRANCH-SOL S
166
+ ::u 16CC ::r s ::comment ᛌ RUNIC LETTER SHORT-TWIG-SOL S
167
+ ::u 16CD ::r c ::comment ᛍ RUNIC LETTER C
168
+ ::u 16CE ::r z ::comment ᛎ RUNIC LETTER Z
169
+ ::u 16CF ::r t ::comment ᛏ RUNIC LETTER TIWAZ TIR TYR T
170
+ ::u 16D0 ::r t ::comment ᛐ RUNIC LETTER SHORT-TWIG-TYR T
171
+ ::u 16D1 ::r d ::comment ᛑ RUNIC LETTER D
172
+ ::u 16D2 ::r b ::comment ᛒ RUNIC LETTER BERKANAN BEORC BJARKAN B
173
+ ::u 16D3 ::r b ::comment ᛓ RUNIC LETTER SHORT-TWIG-BJARKAN B
174
+ ::u 16D4 ::r p ::comment ᛔ RUNIC LETTER DOTTED-P
175
+ ::u 16D5 ::r p ::comment ᛕ RUNIC LETTER OPEN-P
176
+ ::u 16D6 ::r e ::comment ᛖ RUNIC LETTER EHWAZ EH E
177
+ ::u 16D7 ::r m ::comment ᛗ RUNIC LETTER MANNAZ MAN M
178
+ ::u 16D8 ::r m ::comment ᛘ RUNIC LETTER LONG-BRANCH-MADR M
179
+ ::u 16D9 ::r m ::comment ᛙ RUNIC LETTER SHORT-TWIG-MADR M
180
+ ::u 16DA ::r l ::comment ᛚ RUNIC LETTER LAUKAZ LAGU LOGR L
181
+ ::u 16DB ::r l ::comment ᛛ RUNIC LETTER DOTTED-L
182
+ ::u 16DC ::r ng ::comment ᛜ RUNIC LETTER INGWAZ
183
+ ::u 16DD ::r ng ::comment ᛝ RUNIC LETTER ING
184
+ ::u 16DE ::r d ::comment ᛞ RUNIC LETTER DAGAZ DAEG D
185
+ ::u 16DF ::r o ::comment ᛟ RUNIC LETTER OTHALAN ETHEL O
186
+ ::u 16E0 ::r ea ::comment ᛠ RUNIC LETTER EAR
187
+ ::u 16E1 ::r io ::comment ᛡ RUNIC LETTER IOR
188
+ ::u 16E2 ::r q ::comment ᛢ RUNIC LETTER CWEORTH
189
+ ::u 16E3 ::r k ::comment ᛣ RUNIC LETTER CALC
190
+ ::u 16E4 ::r k ::comment ᛤ RUNIC LETTER CEALC
191
+ ::u 16E5 ::r st ::comment ᛥ RUNIC LETTER STAN
192
+ ::u 16E6 ::r r ::comment ᛦ RUNIC LETTER LONG-BRANCH-YR
193
+ ::u 16E7 ::r r ::comment ᛧ RUNIC LETTER SHORT-TWIG-YR
194
+ ::u 16E8 ::r r ::comment ᛨ RUNIC LETTER ICELANDIC-YR
195
+ ::u 16E9 ::r q ::comment ᛩ RUNIC LETTER Q
196
+ ::u 16EA ::r x ::comment ᛪ RUNIC LETTER X
197
+
198
+ ::u 17B9 ::r oe ::comment Khmer vowel sign y (short)
199
+ ::u 17BA ::r oe ::comment Khmer vowel sign yy (long)
200
+ ::u 17C6 ::r +m ::comment Khmer sign nikahit (cf. anusvara)
201
+ ::u 17C7 ::r +h ::comment Khmer sign reahmuk (cf. visarga)
202
+ ::u 17C8 ::r ' ::comment Khmer sign yuukaleapintu (short vowel and glottal stop)
203
+ ::u 17C9 ::r "" ::comment Khmer sign muusikatoan: changes the second register to the first
204
+ ::u 17CA ::r "" ::comment Khmer sign triisap: changes the first register to the second
205
+ ::u 17CB ::r "" ::comment Khmer sign bantoc (vowel shortener)
206
+ ::u 17D2 ::r "" ::comment Khmer sign coeng (foot/subscript, cf. virama = no vowel)
207
+ ::u 17D5 ::r . ::comment Khmer sign bariyoosan; period ending entire text or chapter
208
+
209
+ ::u 180E ::r ' ::comment ᠎ Mongolian vowel separator
210
+
211
+ ::u 1B80 ::r +ng ::comment ᮀ Sundanese sign panyecek
212
+ ::u 1B81 ::r +r ::comment ᮁ Sundanese sign panglayar
213
+ ::u 1B82 ::r +h ::comment ᮂ Sundanese sign pangwisad
214
+ ::u 1BA1 ::r ya ::comment ᮡ Sundanese consonant sign pamingkal
215
+ ::u 1BA2 ::r ra ::comment ᮢ Sundanese consonant sign panyakra
216
+ ::u 1BA3 ::r la ::comment ᮣ Sundanese consonant sign panyiku
217
+ ::u 1BA4 ::r i ::comment ᮤ Sundanese vowel sign panghulu
218
+ ::u 1BA5 ::r u ::comment ᮥ Sundanese vowel sign panyuku
219
+ ::u 1BA6 ::r e ::comment ᮦ Sundanese vowel sign panaelaeng
220
+ ::u 1BA7 ::r o ::comment ᮧ Sundanese vowel sign panolong
221
+ ::u 1BA8 ::r e ::comment ᮨ Sundanese vowel sign pamepet
222
+ ::u 1BA9 ::r eu ::comment ᮩ Sundanese vowel sign paneuleung
223
+ ::u 1BAA ::r "" ::comment ᮪ Sundanese sign pamaaeh or patén (no vowel/virama)
224
+
225
+ ::u 1FBD ::r "" ::comment ᾽ Greek koronis
226
+ ::u 1FFE ::r "" ::comment Greek dasia (rough breathing)
227
+
228
+ ::u 2002 ::r " " ::comment en space
229
+ ::u 2003 ::r " " ::comment em space
230
+ ::u 2004 ::r " " ::comment three-per-em space
231
+ ::u 2005 ::r " " ::comment four-per-em space
232
+ ::u 2006 ::r " " ::comment six-per-em space
233
+ ::u 2007 ::r " " ::comment figure space
234
+ ::u 2008 ::r " " ::comment punctuation space
235
+ ::u 2009 ::r " " ::comment thin space
236
+ ::u 200A ::r " " ::comment hair space
237
+ ::u 202F ::r " " ::comment narrow no-break space
238
+
239
+ ::u 2D30 ::r a ::comment TIFINAGH LETTER YA ⴰ
240
+ ::u 2D31 ::r b ::comment TIFINAGH LETTER YAB ⴱ
241
+ ::u 2D32 ::r bh ::comment TIFINAGH LETTER YABH ⴲ
242
+ ::u 2D33 ::r g ::comment TIFINAGH LETTER YAG ⴳ
243
+ ::u 2D34 ::r ghh ::comment TIFINAGH LETTER YAGHH ⴴ
244
+ ::u 2D35 ::r j ::comment TIFINAGH LETTER BERBER ACADEMY YAJ ⴵ
245
+ ::u 2D36 ::r j ::comment TIFINAGH LETTER YAJ ⴶ
246
+ ::u 2D37 ::r d ::comment TIFINAGH LETTER YAD ⴷ
247
+ ::u 2D38 ::r dh ::comment TIFINAGH LETTER YADH ⴸ
248
+ ::u 2D39 ::r dd ::comment TIFINAGH LETTER YADD ⴹ
249
+ ::u 2D3A ::r ddh ::comment TIFINAGH LETTER YADDH ⴺ
250
+ ::u 2D3B ::r e ::comment TIFINAGH LETTER YEY ⴻ
251
+ ::u 2D3C ::r f ::comment TIFINAGH LETTER YAF ⴼ
252
+ ::u 2D3D ::r k ::comment TIFINAGH LETTER YAK ⴽ
253
+ ::u 2D3E ::r k ::comment TIFINAGH LETTER TUAREG YAK ⴾ
254
+ ::u 2D3F ::r khh ::comment TIFINAGH LETTER YAKHH ⴿ
255
+ ::u 2D40 ::r h ::comment TIFINAGH LETTER YAH ⵀ
256
+ ::u 2D41 ::r h ::comment TIFINAGH LETTER BERBER ACADEMY YAH ⵁ
257
+ ::u 2D42 ::r h ::comment TIFINAGH LETTER TUAREG YAH ⵂ
258
+ ::u 2D43 ::r hh ::comment TIFINAGH LETTER YAHH ⵃ
259
+ ::u 2D44 ::r ' ::comment TIFINAGH LETTER YAA ⵄ
260
+ ::u 2D45 ::r kh ::comment TIFINAGH LETTER YAKH ⵅ
261
+ ::u 2D46 ::r kh ::comment TIFINAGH LETTER TUAREG YAKH ⵆ
262
+ ::u 2D47 ::r q ::comment TIFINAGH LETTER YAQ ⵇ
263
+ ::u 2D48 ::r q ::comment TIFINAGH LETTER TUAREG YAQ ⵈ
264
+ ::u 2D49 ::r i ::comment TIFINAGH LETTER YI ⵉ
265
+ ::u 2D4A ::r zh ::comment TIFINAGH LETTER YAZH ⵊ
266
+ ::u 2D4B ::r zh ::comment TIFINAGH LETTER AHAGGAR YAZH ⵋ
267
+ ::u 2D4C ::r zh ::comment TIFINAGH LETTER TUAREG YAZH ⵌ
268
+ ::u 2D4D ::r l ::comment TIFINAGH LETTER YAL ⵍ
269
+ ::u 2D4E ::r m ::comment TIFINAGH LETTER YAM ⵎ
270
+ ::u 2D4F ::r n ::comment TIFINAGH LETTER YAN ⵏ
271
+ ::u 2D50 ::r gn ::comment TIFINAGH LETTER TUAREG YAGN ⵐ
272
+ ::u 2D51 ::r ng ::comment TIFINAGH LETTER TUAREG YANG ⵑ
273
+ ::u 2D52 ::r p ::comment TIFINAGH LETTER YAP ⵒ
274
+ ::u 2D53 ::r u ::comment TIFINAGH LETTER YU ⵓ
275
+ ::u 2D54 ::r r ::comment TIFINAGH LETTER YAR ⵔ
276
+ ::u 2D55 ::r rr ::comment TIFINAGH LETTER YARR ⵕ
277
+ ::u 2D56 ::r gh ::comment TIFINAGH LETTER YAGH ⵖ
278
+ ::u 2D57 ::r gh ::comment TIFINAGH LETTER TUAREG YAGH ⵗ
279
+ ::u 2D58 ::r gh ::comment TIFINAGH LETTER AYER YAGH ⵘ
280
+ ::u 2D59 ::r s ::comment TIFINAGH LETTER YAS ⵙ
281
+ ::u 2D5A ::r ss ::comment TIFINAGH LETTER YASS ⵚ
282
+ ::u 2D5B ::r sh ::comment TIFINAGH LETTER YASH ⵛ
283
+ ::u 2D5C ::r t ::comment TIFINAGH LETTER YAT ⵜ
284
+ ::u 2D5D ::r th ::comment TIFINAGH LETTER YATH ⵝ
285
+ ::u 2D5E ::r ch ::comment TIFINAGH LETTER YACH ⵞ
286
+ ::u 2D5F ::r tt ::comment TIFINAGH LETTER YATT ⵟ
287
+ ::u 2D60 ::r v ::comment TIFINAGH LETTER YAV ⵠ
288
+ ::u 2D61 ::r w ::comment TIFINAGH LETTER YAW ⵡ
289
+ ::u 2D62 ::r y ::comment TIFINAGH LETTER YAY ⵢ
290
+ ::u 2D63 ::r z ::comment TIFINAGH LETTER YAZ ⵣ
291
+ ::u 2D64 ::r z ::comment TIFINAGH LETTER TAWELLEMET YAZ ⵤ
292
+ ::u 2D65 ::r zz ::comment TIFINAGH LETTER YAZZ ⵥ
293
+ ::u 2D66 ::r ye ::comment TIFINAGH LETTER YE ⵦ
294
+ ::u 2D67 ::r yo ::comment TIFINAGH LETTER YO ⵧ
295
+ ::u 2D6F ::r "" ::comment TIFINAGH MODIFIER LETTER LABIALIZATION MARK ⵯ
296
+ ::u 2D70 ::r "" ::comment TIFINAGH SEPARATOR MARK ⵰
297
+ ::u 2D7F ::r "" ::comment TIFINAGH CONSONANT JOINER ⵿
298
+
299
+ ::u 3063 ::r tsu ::comment Hiragana letter small tsu
300
+ ::u 30C3 ::r tsu ::comment Katakana letter small tsu
301
+
302
+ ::u ABE3 ::r o ::comment ꯣ Meetei Mayek vowel sign onap
303
+ ::u ABE7 ::r ou ::comment ꯧ Meetei Mayek vowel sign sounap
304
+
305
+ ::u F008 ::r "" ::comment Yoruba diacritic in private use area
306
+ ::u F00F ::r "" ::comment Yoruba diacritic in private use area
307
+ ::u F023 ::r "" ::comment Yoruba diacritic in private use area
308
+ ::u F025 ::r "" ::comment Yoruba diacritic in private use area
309
+
310
+ ::u F8D0 ::r a ::name KLINGON LETTER A
311
+ ::u F8D1 ::r b ::name KLINGON LETTER B
312
+ ::u F8D2 ::r ch ::name KLINGON LETTER CH
313
+ ::u F8D3 ::r D ::name KLINGON LETTER D
314
+ ::u F8D4 ::r e ::name KLINGON LETTER E
315
+ ::u F8D5 ::r gh ::name KLINGON LETTER GH
316
+ ::u F8D6 ::r H ::name KLINGON LETTER H
317
+ ::u F8D7 ::r I ::name KLINGON LETTER I
318
+ ::u F8D8 ::r j ::name KLINGON LETTER J
319
+ ::u F8D9 ::r l ::name KLINGON LETTER L
320
+ ::u F8DA ::r m ::name KLINGON LETTER M
321
+ ::u F8DB ::r n ::name KLINGON LETTER N
322
+ ::u F8DC ::r ng ::name KLINGON LETTER NG
323
+ ::u F8DD ::r o ::name KLINGON LETTER O
324
+ ::u F8DE ::r p ::name KLINGON LETTER P
325
+ ::u F8DF ::r q ::name KLINGON LETTER Q
326
+ ::u F8E0 ::r Q ::name KLINGON LETTER Q
327
+ ::u F8E1 ::r r ::name KLINGON LETTER R
328
+ ::u F8E2 ::r S ::name KLINGON LETTER S
329
+ ::u F8E3 ::r t ::name KLINGON LETTER T
330
+ ::u F8E4 ::r tlh ::name KLINGON LETTER TLH
331
+ ::u F8E5 ::r u ::name KLINGON LETTER U
332
+ ::u F8E6 ::r v ::name KLINGON LETTER V
333
+ ::u F8E7 ::r w ::name KLINGON LETTER W
334
+ ::u F8E8 ::r y ::name KLINGON LETTER Y
335
+ ::u F8E9 ::r ' ::name KLINGON LETTER GLOTTAL STOP
336
+ ::u F8F0 ::num 0 ::name KLINGON DIGIT ZERO
337
+ ::u F8F1 ::num 1 ::name KLINGON DIGIT ONE
338
+ ::u F8F2 ::num 2 ::name KLINGON DIGIT TWO
339
+ ::u F8F3 ::num 3 ::name KLINGON DIGIT THREE
340
+ ::u F8F4 ::num 4 ::name KLINGON DIGIT FOUR
341
+ ::u F8F5 ::num 5 ::name KLINGON DIGIT FIVE
342
+ ::u F8F6 ::num 6 ::name KLINGON DIGIT SIX
343
+ ::u F8F7 ::num 7 ::name KLINGON DIGIT SEVEN
344
+ ::u F8F8 ::num 8 ::name KLINGON DIGIT EIGHT
345
+ ::u F8F9 ::num 9 ::name KLINGON DIGIT NINE
346
+ ::u F8FD ::r , ::name KLINGON COMMA
347
+ ::u F8FE ::r . ::name KLINGON FULL STOP
348
+ ::u F8FF ::name KLINGON MUMMIFICATION GLYPH
349
+
350
+ ::u 1163D ::r +m ::comment Modi sign anusvara
351
+ ::u 1163E ::r +h ::comment Modi sign visarga
352
+
353
+ ::u 13068 ::num 1000000 ::comment Egyptian Hieroglyph
354
+ ::u 1308B ::r r ::comment Egyptian Hieroglyph ::pic mouth
355
+ ::u 1309D ::r ' ::comment Egyptian Hieroglyph (ayn) ::pic forearm
356
+ ::u 130A7 ::r d ::comment Egyptian Hieroglyph ::pic hand
357
+ ::u 130AD ::num 10000 ::comment Egyptian Hieroglyph
358
+ ::u 130AE ::num 20000 ::comment Egyptian Hieroglyph
359
+ ::u 130AF ::num 30000 ::comment Egyptian Hieroglyph
360
+ ::u 130B0 ::num 40000 ::comment Egyptian Hieroglyph
361
+ ::u 130B1 ::num 50000 ::comment Egyptian Hieroglyph
362
+ ::u 130B2 ::num 60000 ::comment Egyptian Hieroglyph
363
+ ::u 130B3 ::num 70000 ::comment Egyptian Hieroglyph
364
+ ::u 130B4 ::num 80000 ::comment Egyptian Hieroglyph
365
+ ::u 130B5 ::num 90000 ::comment Egyptian Hieroglyph
366
+ ::u 130B6 ::num 50000 ::comment Egyptian Hieroglyph
367
+ ::u 130C0 ::r b ::comment Egyptian Hieroglyph ::pic foot
368
+ ::u 130ED ::r l ::comment Egyptian Hieroglyph [also rw] ::pic lion recumbent
369
+ ::u 13121 ::r h ::comment Egyptian Hieroglyph (f-underscore) ::pic animal's belly and udder
370
+ ::u 1313F ::r a ::comment Egyptian Hieroglyph (alef) ::pic vulture
371
+ ::u 13153 ::r m ::comment Egyptian Hieroglyph ::pic owl
372
+ ::u 13171 ::r w ::comment Egyptian Hieroglyph ::pic quail chick
373
+ ::u 13187 ::r ::comment Egyptian Hieroglyph (determinative/son) H8 ::pic egg
374
+ ::u 13190 ::num 100000 ::comment Egyptian Hieroglyph
375
+ ::u 13191 ::r f ::comment Egyptian Hieroglyph ::pic horned viper
376
+ ::u 13193 ::r d ::comment Egyptian Hieroglyph (J) ::pic cobra
377
+ ::u 131BC ::num 1000 ::comment Egyptian Hieroglyph
378
+ ::u 131BD ::num 2000 ::comment Egyptian Hieroglyph
379
+ ::u 131BE ::num 3000 ::comment Egyptian Hieroglyph
380
+ ::u 131BF ::num 4000 ::comment Egyptian Hieroglyph
381
+ ::u 131C0 ::num 5000 ::comment Egyptian Hieroglyph
382
+ ::u 131C1 ::num 6000 ::comment Egyptian Hieroglyph
383
+ ::u 131C2 ::num 7000 ::comment Egyptian Hieroglyph
384
+ ::u 131C3 ::num 8000 ::comment Egyptian Hieroglyph
385
+ ::u 131C4 ::num 9000 ::comment Egyptian Hieroglyph
386
+ ::u 131CB ::r i ::comment Egyptian Hieroglyph (yod) ::pic single reed
387
+ ::u 131CC ::r y ::comment Egyptian Hieroglyph ::pic double reed
388
+ ::u 1320E ::r q ::comment Egyptian Hieroglyph (qaf) ::pic sandy slope
389
+ ::u 13209 ::comment Egyptian Hieroglyph ::pic desert hills
390
+ ::u 13216 ::r n ::comment Egyptian Hieroglyph ::pic ripple of water
391
+ ::u 13219 ::r sh ::comment Egyptian Hieroglyph (š) ::pic basin
392
+ ::u 13254 ::r h ::comment Egyptian Hieroglyph ::pic reed shelter
393
+ ::u 13283 ::r z ::comment Egyptian Hieroglyph [also S?] ::pic door bolt
394
+ ::u 132AA ::r p ::comment Egyptian Hieroglyph ::pic stool
395
+ ::u 132D4 ::r n ::comment Egyptian Hieroglyph ::pic red crown
396
+ ::u 132F4 ::r s ::comment Egyptian Hieroglyph [also Z?] ::pic folded cloth
397
+ ::u 13319 ::comment Egyptian Hieroglyph ::pic throw stick
398
+ ::u 13362 ::num 100 ::comment Egyptian Hieroglyph
399
+ ::u 13363 ::num 200 ::comment Egyptian Hieroglyph
400
+ ::u 13364 ::num 300 ::comment Egyptian Hieroglyph
401
+ ::u 13365 ::num 400 ::comment Egyptian Hieroglyph
402
+ ::u 13366 ::num 500 ::comment Egyptian Hieroglyph
403
+ ::u 13367 ::num 600 ::comment Egyptian Hieroglyph
404
+ ::u 13368 ::num 700 ::comment Egyptian Hieroglyph
405
+ ::u 13369 ::num 800 ::comment Egyptian Hieroglyph
406
+ ::u 1336A ::num 900 ::comment Egyptian Hieroglyph
407
+ ::u 1336B ::num 500 ::comment Egyptian Hieroglyph
408
+ ::u 1336F ::r o ::comment Egyptian Hieroglyph ::pic lasso
409
+ ::u 1337F ::r t ::comment Egyptian Hieroglyph (ṯ) ::pic hobble
410
+ ::u 13386 ::num 10 ::comment Egyptian Hieroglyph
411
+ ::u 13387 ::num 20 ::comment Egyptian Hieroglyph
412
+ ::u 13388 ::num 30 ::comment Egyptian Hieroglyph
413
+ ::u 13389 ::num 40 ::comment Egyptian Hieroglyph
414
+ ::u 1338A ::num 50 ::comment Egyptian Hieroglyph
415
+ ::u 1338B ::num 60 ::comment Egyptian Hieroglyph
416
+ ::u 1338C ::num 70 ::comment Egyptian Hieroglyph
417
+ ::u 1338D ::num 80 ::comment Egyptian Hieroglyph
418
+ ::u 1338E ::num 90 ::comment Egyptian Hieroglyph
419
+ ::u 1338F ::num 20 ::comment Egyptian Hieroglyph
420
+ ::u 13390 ::num 30 ::comment Egyptian Hieroglyph
421
+ ::u 13391 ::num 40 ::comment Egyptian Hieroglyph
422
+ ::u 13392 ::num 50 ::comment Egyptian Hieroglyph
423
+ ::u 1339B ::r h ::comment Egyptian Hieroglyph ::pic twisted flax
424
+ ::u 133A1 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle
425
+ ::u 133A2 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle, variant
426
+ ::u 133A4 ::r g ::comment Egyptian Hieroglyph ::pic bag
427
+ ::u 133BC ::r g ::comment Egyptian Hieroglyph ::pic stand
428
+ ::u 133CF ::r t ::comment Egyptian Hieroglyph ::pic loaf
429
+ ::u 133ED ::r y ::comment Egyptian Hieroglyph ::pic two strokes
430
+ ::u 133F2 ::r w ::comment Egyptian Hieroglyph ::pic quail chick, hieratic variant
431
+ ::u 133FA ::num 1 ::comment Egyptian Hieroglyph
432
+ ::u 133FB ::num 2 ::comment Egyptian Hieroglyph
433
+ ::u 133FC ::num 3 ::comment Egyptian Hieroglyph
434
+ ::u 133FD ::num 4 ::comment Egyptian Hieroglyph
435
+ ::u 133FE ::num 5 ::comment Egyptian Hieroglyph
436
+ ::u 133FF ::num 6 ::comment Egyptian Hieroglyph
437
+ ::u 13400 ::num 7 ::comment Egyptian Hieroglyph
438
+ ::u 13401 ::num 8 ::comment Egyptian Hieroglyph
439
+ ::u 13402 ::num 9 ::comment Egyptian Hieroglyph
440
+ ::u 13403 ::num 5 ::comment Egyptian Hieroglyph
441
+ ::u 1340D ::r kh ::comment Egyptian Hieroglyph (ḫ, khah) ::pic placenta?
442
+ ::u 1341D ::r m ::comment Egyptian Hieroglyph (also jm)
uroman/data/romanization-table-arabic-block.txt ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::s ، ::t , ::comment ARABIC COMMA
2
+ ::s ؛ ::t ; ::comment ARABIC SEMICOLON
3
+ ::s ؟ ::t ? ::comment ARABIC QUESTION MARK
4
+ ::s ء ::t ' ::comment ARABIC LETTER HAMZA
5
+ ::s آ ::t a ::comment ARABIC LETTER ALEF WITH MADDA ABOVE
6
+ ::s أ ::t a ::comment ARABIC LETTER ALEF WITH HAMZA ABOVE
7
+ ::s ؤ ::t w ::comment ARABIC LETTER WAW WITH HAMZA ABOVE
8
+ ::s إ ::t i ::comment ARABIC LETTER ALEF WITH HAMZA BELOW
9
+ ::s ئ ::t ye ::comment ARABIC LETTER YEH WITH HAMZA ABOVE
10
+ ::s ا ::t a ::comment ARABIC LETTER ALEF
11
+ ::s ب ::t b ::comment ARABIC LETTER BEH
12
+ ::s ة ::t a ::comment ARABIC LETTER TEH MARBUTA
13
+ ::s ت ::t t ::comment ARABIC LETTER TEH
14
+ ::s ث ::t th ::comment ARABIC LETTER THEH
15
+ ::s ج ::t j ::comment ARABIC LETTER JEEM
16
+ ::s ح ::t h ::comment ARABIC LETTER HAH
17
+ ::s خ ::t kh ::comment ARABIC LETTER KHAH
18
+ ::s د ::t d ::comment ARABIC LETTER DAL
19
+ ::s ذ ::t th ::comment ARABIC LETTER THAL
20
+ ::s ر ::t r ::comment ARABIC LETTER REH
21
+ ::s ز ::t z ::comment ARABIC LETTER ZAIN
22
+ ::s س ::t s ::comment ARABIC LETTER SEEN
23
+ ::s ش ::t sh ::comment ARABIC LETTER SHEEN
24
+ ::s ص ::t s ::comment ARABIC LETTER SAD
25
+ ::s ض ::t d ::comment ARABIC LETTER DAD
26
+ ::s ط ::t t ::comment ARABIC LETTER TAH
27
+ ::s ظ ::t z ::comment ARABIC LETTER ZAH
28
+ ::s ع ::t ' ::comment ARABIC LETTER AIN
29
+ ::s غ ::t gh ::comment ARABIC LETTER GHAIN
30
+ ::s ـ ::t - ::comment ARABIC TATWEEL
31
+ ::s ف ::t f ::comment ARABIC LETTER FEH
32
+ ::s ق ::t q ::comment ARABIC LETTER QAF
33
+ ::s ك ::t k ::comment ARABIC LETTER KAF
34
+ ::s ل ::t l ::comment ARABIC LETTER LAM
35
+ ::s م ::t m ::comment ARABIC LETTER MEEM
36
+ ::s ن ::t n ::comment ARABIC LETTER NOON
37
+ ::s ه ::t h ::comment ARABIC LETTER HEH
38
+ ::s و ::t w ::comment ARABIC LETTER WAW
39
+ ::s ى ::t a ::comment ARABIC LETTER ALEF MAKSURA
40
+ ::s ي ::t y ::comment ARABIC LETTER YEH
41
+ ::s َ ::t a ::comment ARABIC FATHA
42
+ ::s ُ ::t u ::comment ARABIC DAMMA
43
+ ::s ِ ::t i ::comment ARABIC KASRA
44
+ ::s ْ ::t ::comment ARABIC SUKUN
45
+ ::s ٔ ::t ' ::comment ARABIC HAMZA ABOVE
46
+ ::s ٕ ::t ' ::comment ARABIC HAMZA BELOW
47
+ ::s ٠ ::t 0 ::comment ARABIC-INDIC DIGIT ZERO
48
+ ::s ١ ::t 1 ::comment ARABIC-INDIC DIGIT ONE
49
+ ::s ٢ ::t 2 ::comment ARABIC-INDIC DIGIT TWO
50
+ ::s ٣ ::t 3 ::comment ARABIC-INDIC DIGIT THREE
51
+ ::s ٤ ::t 4 ::comment ARABIC-INDIC DIGIT FOUR
52
+ ::s ٥ ::t 5 ::comment ARABIC-INDIC DIGIT FIVE
53
+ ::s ٦ ::t 6 ::comment ARABIC-INDIC DIGIT SIX
54
+ ::s ٧ ::t 7 ::comment ARABIC-INDIC DIGIT SEVEN
55
+ ::s ٨ ::t 8 ::comment ARABIC-INDIC DIGIT EIGHT
56
+ ::s ٩ ::t 9 ::comment ARABIC-INDIC DIGIT NINE
57
+ ::s ٪ ::t % ::comment ARABIC PERCENT SIGN
58
+ ::s ٫ ::t , ::comment ARABIC DECIMAL SEPARATOR
59
+ ::s ٬ ::t , ::comment ARABIC THOUSANDS SEPARATOR
60
+ ::s ٮ ::t b ::comment ARABIC LETTER DOTLESS BEH
61
+ ::s ٯ ::t q ::comment ARABIC LETTER DOTLESS QAF
62
+ ::s ٰ ::t a ::comment ARABIC LETTER SUPERSCRIPT ALEF
63
+ ::s ٱ ::t a ::comment ARABIC LETTER ALEF WASLA
64
+ ::s ٲ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE
65
+ ::s ٳ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
66
+ ::s ٷ ::t u ::comment ARABIC LETTER U WITH HAMZA ABOVE
67
+ ::s ٹ ::t tt ::comment ARABIC LETTER TTEH
68
+ ::s ٺ ::t tt ::comment ARABIC LETTER TTEHEH
69
+ ::s ٻ ::t b ::comment ARABIC LETTER BEEH
70
+ ::s ټ ::t t ::comment ARABIC LETTER TEH WITH RING
71
+ ::s ٽ ::t t ::comment ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS
72
+ ::s پ ::t p ::comment ARABIC LETTER PEH
73
+ ::s ٿ ::t t ::comment ARABIC LETTER TEHEH
74
+ ::s ڀ ::t b ::comment ARABIC LETTER BEHEH
75
+ ::s ځ ::t h ::comment ARABIC LETTER HAH WITH HAMZA ABOVE
76
+ ::s ڂ ::t h ::comment ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE
77
+ ::s ڃ ::t ny ::comment ARABIC LETTER NYEH
78
+ ::s ڄ ::t dy ::comment ARABIC LETTER DYEH
79
+ ::s څ ::t h ::comment ARABIC LETTER HAH WITH THREE DOTS ABOVE
80
+ ::s چ ::t tch ::comment ARABIC LETTER TCHEH
81
+ ::s ڇ ::t tch ::comment ARABIC LETTER TCHEHEH
82
+ ::s ڈ ::t dd ::comment ARABIC LETTER DDAL
83
+ ::s ډ ::t d ::comment ARABIC LETTER DAL WITH RING
84
+ ::s ڊ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW
85
+ ::s ڋ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH
86
+ ::s ڌ ::t d ::comment ARABIC LETTER DAHAL
87
+ ::s ڍ ::t dd ::comment ARABIC LETTER DDAHAL
88
+ ::s ڎ ::t d ::comment ARABIC LETTER DUL
89
+ ::s ڏ ::t d ::comment ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS
90
+ ::s ڐ ::t d ::comment ARABIC LETTER DAL WITH FOUR DOTS ABOVE
91
+ ::s ڑ ::t rr ::comment ARABIC LETTER RREH
92
+ ::s ڒ ::t r ::comment ARABIC LETTER REH WITH SMALL V
93
+ ::s ړ ::t r ::comment ARABIC LETTER REH WITH RING
94
+ ::s ڔ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW
95
+ ::s ڕ ::t r ::comment ARABIC LETTER REH WITH SMALL V BELOW
96
+ ::s ږ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE
97
+ ::s ڗ ::t r ::comment ARABIC LETTER REH WITH TWO DOTS ABOVE
98
+ ::s ژ ::t j ::comment ARABIC LETTER JEH
99
+ ::s ڙ ::t r ::comment ARABIC LETTER REH WITH FOUR DOTS ABOVE
100
+ ::s ښ ::t s ::comment ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
101
+ ::s ڛ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW
102
+ ::s ڜ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE
103
+ ::s ڝ ::t s ::comment ARABIC LETTER SAD WITH TWO DOTS BELOW
104
+ ::s ڞ ::t s ::comment ARABIC LETTER SAD WITH THREE DOTS ABOVE
105
+ ::s ڟ ::t t ::comment ARABIC LETTER TAH WITH THREE DOTS ABOVE
106
+ ::s ڠ ::t n ::comment ARABIC LETTER AIN WITH THREE DOTS ABOVE
107
+ ::s ڡ ::t f ::comment ARABIC LETTER DOTLESS FEH
108
+ ::s ڢ ::t f ::comment ARABIC LETTER FEH WITH DOT MOVED BELOW
109
+ ::s ڣ ::t f ::comment ARABIC LETTER FEH WITH DOT BELOW
110
+ ::s ڤ ::t v ::comment ARABIC LETTER VEH
111
+ ::s ڥ ::t f ::comment ARABIC LETTER FEH WITH THREE DOTS BELOW
112
+ ::s ڦ ::t p ::comment ARABIC LETTER PEHEH
113
+ ::s ڧ ::t q ::comment ARABIC LETTER QAF WITH DOT ABOVE
114
+ ::s ڨ ::t q ::comment ARABIC LETTER QAF WITH THREE DOTS ABOVE
115
+ ::s ک ::t k ::comment ARABIC LETTER KEHEH
116
+ ::s ڪ ::t k ::comment ARABIC LETTER SWASH KAF
117
+ ::s ګ ::t k ::comment ARABIC LETTER KAF WITH RING
118
+ ::s ڬ ::t k ::comment ARABIC LETTER KAF WITH DOT ABOVE
119
+ ::s ڭ ::t ng ::comment ARABIC LETTER NG
120
+ ::s ڮ ::t k ::comment ARABIC LETTER KAF WITH THREE DOTS BELOW
121
+ ::s گ ::t g ::comment ARABIC LETTER GAF
122
+ ::s ڰ ::t g ::comment ARABIC LETTER GAF WITH RING
123
+ ::s ڱ ::t ng ::comment ARABIC LETTER NGOEH
124
+ ::s ڲ ::t g ::comment ARABIC LETTER GAF WITH TWO DOTS BELOW
125
+ ::s ڳ ::t g ::comment ARABIC LETTER GUEH
126
+ ::s ڴ ::t g ::comment ARABIC LETTER GAF WITH THREE DOTS ABOVE
127
+ ::s ڵ ::t l ::comment ARABIC LETTER LAM WITH SMALL V
128
+ ::s ڶ ::t l ::comment ARABIC LETTER LAM WITH DOT ABOVE
129
+ ::s ڷ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS ABOVE
130
+ ::s ڸ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS BELOW
131
+ ::s ڹ ::t n ::comment ARABIC LETTER NOON WITH DOT BELOW
132
+ ::s ں ::t n ::comment ARABIC LETTER NOON GHUNNA
133
+ ::s ڻ ::t rn ::comment ARABIC LETTER RNOON
134
+ ::s ڼ ::t n ::comment ARABIC LETTER NOON WITH RING
135
+ ::s ڽ ::t n ::comment ARABIC LETTER NOON WITH THREE DOTS ABOVE
136
+ ::s ھ ::t h ::comment ARABIC LETTER HEH DOACHASHMEE
137
+ ::s ڿ ::t tch ::comment ARABIC LETTER TCHEH WITH DOT ABOVE
138
+ ::s ۀ ::t h ::comment ARABIC LETTER HEH WITH YEH ABOVE
139
+ ::s ہ ::t h ::comment ARABIC LETTER HEH GOAL
140
+ ::s ۂ ::t h ::comment ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
141
+ ::s ۃ ::t a ::comment ARABIC LETTER TEH MARBUTA GOAL
142
+ ::s ۄ ::t w ::comment ARABIC LETTER WAW WITH RING
143
+ ::s ۅ ::t oe ::comment ARABIC LETTER KIRGHIZ OE
144
+ ::s ۆ ::t oe ::comment ARABIC LETTER OE
145
+ ::s ۇ ::t u ::comment ARABIC LETTER U
146
+ ::s ۈ ::t yu ::comment ARABIC LETTER YU
147
+ ::s ۉ ::t yu ::comment ARABIC LETTER KIRGHIZ YU
148
+ ::s ۊ ::t w ::comment ARABIC LETTER WAW WITH TWO DOTS ABOVE
149
+ ::s ۋ ::t v ::comment ARABIC LETTER VE
150
+ ::s ی ::t y ::comment ARABIC LETTER FARSI YEH
151
+ ::s ۍ ::t y ::comment ARABIC LETTER YEH WITH TAIL
152
+ ::s ێ ::t y ::comment ARABIC LETTER YEH WITH SMALL V
153
+ ::s ۏ ::t w ::comment ARABIC LETTER WAW WITH DOT ABOVE
154
+ ::s ې ::t e ::comment ARABIC LETTER E
155
+ ::s ۑ ::t y ::comment ARABIC LETTER YEH WITH THREE DOTS BELOW
156
+ ::s ے ::t y ::comment ARABIC LETTER YEH BARREE
157
+ ::s ۓ ::t y ::comment ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
158
+ ::s ۔ ::t . ::comment ARABIC FULL STOP
159
+ ::s ە ::t ae ::comment ARABIC LETTER AE
160
+ ::s ۮ ::t d ::comment ARABIC LETTER DAL WITH INVERTED V
161
+ ::s ۯ ::t r ::comment ARABIC LETTER REH WITH INVERTED V
162
+ ::s ۰ ::t 0 ::comment EXTENDED ARABIC-INDIC DIGIT ZERO
163
+ ::s ۱ ::t 1 ::comment EXTENDED ARABIC-INDIC DIGIT ONE
164
+ ::s ۲ ::t 2 ::comment EXTENDED ARABIC-INDIC DIGIT TWO
165
+ ::s ۳ ::t 3 ::comment EXTENDED ARABIC-INDIC DIGIT THREE
166
+ ::s ۴ ::t 4 ::comment EXTENDED ARABIC-INDIC DIGIT FOUR
167
+ ::s ۵ ::t 5 ::comment EXTENDED ARABIC-INDIC DIGIT FIVE
168
+ ::s ۶ ::t 6 ::comment EXTENDED ARABIC-INDIC DIGIT SIX
169
+ ::s ۷ ::t 7 ::comment EXTENDED ARABIC-INDIC DIGIT SEVEN
170
+ ::s ۸ ::t 8 ::comment EXTENDED ARABIC-INDIC DIGIT EIGHT
171
+ ::s ۹ ::t 9 ::comment EXTENDED ARABIC-INDIC DIGIT NINE
172
+ ::s ۺ ::t sh ::comment ARABIC LETTER SHEEN WITH DOT BELOW
173
+ ::s ۻ ::t d ::comment ARABIC LETTER DAD WITH DOT BELOW
174
+ ::s ۼ ::t gh ::comment ARABIC LETTER GHAIN WITH DOT BELOW
175
+ ::s ۽ ::t & ::comment ARABIC SIGN SINDHI AMPERSAND
176
+ ::s ﷲ ::t allah ::comment ARABIC LIGATURE ALLAH ISOLATED FORM
177
+
178
+ ::s ‌ ::t ::comment ZERO WIDTH NON-JOINER
179
+ ::s ‍ ::t ::comment ZERO WIDTH JOINER
uroman/data/romanization-table.txt ADDED
@@ -0,0 +1,2019 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 
2
+ ## European Latin extensions
3
+ # Vowels
4
+ ::s Ä ::t Ae
5
+ ::s Ö ::t Oe
6
+ ::s Ü ::t Ue
7
+ ::s Å ::t Aa
8
+ ::s Æ ::t Ae
9
+ ::s Ø ::t Oe
10
+ ::s Œ ::t Oe
11
+ ::s ä ::t ae
12
+ ::s ö ::t oe
13
+ ::s ü ::t ue
14
+ ::s å ::t aa
15
+ ::s æ ::t ae
16
+ ::s ø ::t oe
17
+ ::s œ ::t oe
18
+ # Consonants
19
+ ::s Ç ::t S
20
+ ::s ç ::t s
21
+ ::s Ç ::t Ch ::lcode tur
22
+ ::s ç ::t ch ::lcode tur
23
+ ::s Ş ::t Sh
24
+ ::s ş ::t sh
25
+ ::s Ș ::t Sh
26
+ ::s ș ::t sh
27
+ ::s ß ::t ss
28
+ ::s Ț ::t Ts
29
+ ::s ț ::t ts
30
+
31
+ # Digraphs
32
+ # ::s ʣ ::t dz
33
+ ::s ʤ ::t dzh ::comment Latin small letter dezh digraph
34
+ # ::s ʥ ::t dz
35
+ # ::s ʦ ::t ts
36
+ ::s ʧ ::t tsh ::comment Latin small letter tesh digraph
37
+ # ::s ʨ ::t tc
38
+
39
+ # Miscellaneous
40
+ ::s ə ::t e
41
+
42
+ # English
43
+ ::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
44
+ ::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
45
+ ::s eight ::t eight ::t-alt eit ::example eight, weight
46
+ ::s Eight ::t Eight ::t-alt Eit ::example Eighteen
47
+ ::s ight ::t ight ::t-alt ait ::example Knight
48
+ ::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
49
+ ::s high ::t high ::t-alt hai ::example highlight
50
+ ::s High ::t High ::t-alt Hai ::example High School
51
+ ::s Isle ::t Isle ::t-alt Ail ::use-only-for-whole-word ::example Isle
52
+ ::s Island ::t Island ::t-alt Ailand ::use-only-for-whole-word ::example Island
53
+ ::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
54
+ ::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
55
+ ::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
56
+ ::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
57
+ ::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
58
+ ::s ph ::t ph ::t-alt f ::example alpha
59
+ ::s Ph ::t Ph ::t-alt F ::example Philip
60
+ ::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
61
+ ::s tion ::t tion ::t-alt shen ::example
62
+ ::s Sean ::t Sean ::t-alt Shawn ::use-only-for-whole-word
63
+ ::s ssion ::t ssion ::t-alt shen ::example Sessions
64
+ ::s St ::t St ::t-alt Saint ::use-only-for-whole-word
65
+ ::s St. ::t St. ::t-alt Saint ::use-only-for-whole-word
66
+ ::s Wr ::t Wr ::t-alt R ::example Wren
67
+ ::s wr ::t wr ::t-alt r ::example Cartwright
68
+ ::s x ::t x ::t-alt ks ::example Mexico
69
+ ::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
70
+
71
+ # French
72
+ ::s â ::t a ::t-alt as ::example pâte/paste, pastry
73
+ ::s ê ::t e ::t-alt es ::example fête/feast
74
+ ::s î ::t i ::t-alt is ::example île/isle
75
+ ::s ô ::t o ::t-alt os ::example côte/coast
76
+ ::s û ::t u ::t-alt us ::example août/August
77
+ ::s eaux ::t eaux ::t-alt o ::example Bordeaux
78
+ ::s eau ::t eau ::t-alt o ::example Chateau
79
+ ::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
80
+ ::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
81
+ ::s oux ::t oux ::t-alt u
82
+ ::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
83
+
84
+ # German
85
+ ::s Sch ::t Sch ::t-alt Sh
86
+ ::s sch ::t sch ::t-alt sh
87
+ ::s stein ::t stein ::t-alt shtain
88
+ ::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
89
+
90
+ # Dutch
91
+ ::s ij ::t ij ::t-alt ai
92
+ ::s Ij ::t Ij ::t-alt Ai
93
+
94
+ # Latvian
95
+ ::s Ā ::t A ::t-alt Aa ::lcode lav
96
+ ::s ā ::t a ::t-alt aa ::lcode lav
97
+ ::s Ē ::t E ::t-alt Ee ::lcode lav
98
+ ::s ē ::t e ::t-alt ee ::lcode lav
99
+ ::s Ī ::t I ::t-alt Ii ::lcode lav
100
+ ::s ī ::t i ::t-alt ii ::lcode lav
101
+ ::s Ū ::t U ::t-alt Uu ::lcode lav
102
+ ::s ū ::t u ::t-alt uu ::lcode lav
103
+ ::s Ģ ::t G ::t-alt Gj ::lcode lav
104
+ ::s ģ ::t g ::t-alt gj ::lcode lav
105
+ ::s Ķ ::t K ::t-alt Kj ::lcode lav
106
+ ::s ķ ::t k ::t-alt kj ::lcode lav
107
+ ::s Ļ ::t L ::t-alt Lj ::lcode lav
108
+ ::s ļ ::t l ::t-alt lj ::lcode lav
109
+ ::s Ņ ::t N ::t-alt Nj ::lcode lav
110
+ ::s ņ ::t n ::t-alt nj ::lcode lav
111
+ ::s C ::t C ::t-alt Ts ::lcode lav
112
+ ::s c ::t c ::t-alt ts ::lcode lav
113
+ ::s Č ::t C ::t-alt Tsh ::lcode lav
114
+ ::s č ::t c ::t-alt tsh ::lcode lav
115
+ ::s Š ::t Sh ::t-alt s ::lcode lav
116
+ ::s š ::t sh ::t-alt s ::lcode lav
117
+ ::s Ž ::t Z ::t-alt Zh ::lcode lav
118
+ ::s ž ::t z ::t-alt zh ::lcode lav
119
+
120
+ # Lithuanian
121
+ ::s C ::t C ::t-alt Ts ::lcode lit
122
+ ::s c ::t c ::t-alt ts ::lcode lit
123
+ ::s Č ::t C ::t-alt Tsh ::lcode lit
124
+ ::s č ::t c ::t-alt tsh ::lcode lit
125
+ ::s Š ::t Sh ::t-alt s ::lcode lit
126
+ ::s š ::t sh ::t-alt s ::lcode lit
127
+ ::s Ž ::t Z ::t-alt Zh ::lcode lit
128
+ ::s ž ::t z ::t-alt zh ::lcode lit
129
+
130
+ # International Greek (e.g. as used in chemical compounds)
131
+ ::s β ::t b
132
+ ::s Β ::t B
133
+ ::s ϐ ::t b
134
+
135
+ # Ancient Greek
136
+ ::s β ::t b ::lcode grc
137
+ ::s Β ::t B ::lcode grc
138
+ ::s γγ ::t ng ::lcode grc
139
+ ::s γκ ::t nk ::lcode grc
140
+ ::s γξ ::t nx ::lcode grc
141
+ ::s γχ ::t nch ::lcode grc
142
+ ::s ϱ ::t r ::lcode grc
143
+
144
+ # Pontic Greek
145
+ ::s β ::t v ::t-alt b ::lcode pnt
146
+ ::s Β ::t V ::t-alt B ::lcode pnt
147
+ ::s ϐ ::t v ::t-alt b ::lcode pnt
148
+
149
+ # Modern Greek (generally the default)
150
+ ::s β ::t v ::t-alt b ::lcode ell
151
+ ::s Β ::t V ::t-alt B ::lcode ell
152
+ ::s ϐ ::t v ::t-alt b ::lcode ell
153
+ ::s Ι ::t I
154
+ ::s ι ::t i
155
+ ::s ί ::t i
156
+ ::s ἶ ::t i
157
+ ::s Υ ::t Y
158
+ ::s υ ::t y
159
+ ::s Ρ ::t R
160
+ ::s ρ ::t r
161
+ ::s ϱ ::t r
162
+ ::s Χ ::t Ch ::t-alt Kh
163
+ ::s χ ::t ch ::t-alt kh
164
+ ::s φ ::t f ::t-alt ph
165
+ ::s Φ ::t F ::t-alt Ph
166
+ ::s Ντ ::t D
167
+ ::s ντ ::t nd ::t-alt d, nt
168
+ # ::s ντζ ::t ntz
169
+ ::s Μπ ::t B
170
+ ::s μπ ::t b ::use-only-at-start-of-word
171
+ ::s μπ ::t mb ::t-alt b, mp ::dont-use-at-start-of-word
172
+ ::s λμπ ::t lb
173
+ ::s νμπ ::t nb
174
+ ::s ρμπ ::t rb
175
+ ::s γγ ::t ng
176
+ ::s Γκ ::t G
177
+ ::s γκ ::t ng ::t-alt g ::dont-use-at-start-of-word
178
+ ::s γκ ::t g ::use-only-at-start-of-word
179
+ ::s γξ ::t nx ::lcode grc
180
+ ::s γχ ::t nch ::lcode grc
181
+ ::s ει ::t ei ::t-alt i
182
+ ::s Ει ::t Ei ::t-alt I
183
+ ::s ευ ::t eu ::t-alt ev ::comment donated by Constantine
184
+ ::s Ευ ::t Eu ::t-alt Ev ::comment donated by Constantine
185
+ ::s αυ ::t au ::t-alt av
186
+ ::s Αυ ::t Au ::t-alt Av
187
+ ::s ου ::t ou ::t-alt u
188
+ ::s Ου ::t Ou ::t-alt U
189
+ ::s ηυ ::t eu
190
+ ::s Ηυ ::t Eu
191
+ ::s υι ::t ui
192
+ ::s Υι ::t Ui
193
+ ::s ωυ ::t ou
194
+ ::s Ωυ ::t Ou
195
+ ::s ͺ ::t ::comment GREEK YPOGEGRAMMENI (U+037A)
196
+ ::s ϒ ::t Y ::comment GREEK UPSILON WITH HOOK SYMBOL (U+03D2)
197
+ ::s ϓ ::t Y ::comment GREEK UPSILON WITH ACUTE AND HOOK SYMBOL (U+03D3)
198
+ ::s ϔ ::t Y ::comment GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL (U+03D4)
199
+ ::s ι ::t ::comment GREEK PROSGEGRAMMENI (U+1FBE)
200
+ ::s ᾿ ::t ::comment GREEK PSILI (U+1FBF)
201
+ ::s ῀ ::t ::comment GREEK PERISPOMENI (U+1FC0)
202
+ ::s ` ::t ::comment GREEK VARIA (U+1FEF)
203
+ ::s ´ ::t ::comment GREEK OXIA (U+1FFD)
204
+
205
+ # Glagolitic
206
+ ::s Ⰿ ::t M ::comment GLAGOLITIC CAPITAL LETTER MYSLITE (U+2C0F)
207
+ ::s Ⱞ ::t M ::comment GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE (U+2C2E)
208
+ ::s ⰿ ::t m ::comment GLAGOLITIC SMALL LETTER MYSLITE (U+2C3F)
209
+ ::s ⱞ ::t m ::comment GLAGOLITIC SMALL LETTER LATINATE MYSLITE (U+2C5E)
210
+ ::s 𞀏 ::t m ::comment COMBINING GLAGOLITIC LETTER MYSLITE (U+1E00F)
211
+
212
+ # Cyrillic
213
+ ::s Г ::t G ::t-alt H ::comment Cyrillic capital ghe
214
+ ::s г ::t g ::t-alt h ::comment Cyrillic small ghe
215
+ ::s Е ::t E ::t-alt Ye ::comment Cyrillic capital ie
216
+ ::s е ::t e ::t-alt ye ::comment Cyrillic small ie
217
+ ::s Ё ::t E ::t-alt Yo
218
+ ::s ё ::t e ::t-alt yo
219
+ ::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
220
+ ::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
221
+ ::s Щ ::t Shch ::t-alt Sh
222
+ ::s щ ::t shch ::t-alt sh
223
+ ::s Ъ ::t ::comment Cyrillic capital hard sign
224
+ ::s ъ ::t ::comment Cyrillic small hard sign
225
+ ::s ᲆ ::t ::comment CYRILLIC SMALL LETTER TALL HARD SIGN
226
+ ::s Ы ::t Y ::comment Cyrillic capital yeru
227
+ ::s ы ::t y ::comment Cyrillic small yeru
228
+ ::s Ь ::t ::comment Cyrillic capital soft sign
229
+ ::s ь ::t ::comment Cyrillic small soft sign
230
+ ::s Ж ::t Zh ::comment Cyrillic capital letter zhe
231
+ ::s Ш ::t Sh ::comment Cyrillic capital letter sha
232
+ ::s Ч ::t Ch ::comment Cyrillic capital letter che
233
+ ::s Џ ::t Dzh ::comment Cyrillic capital letter dzhe
234
+ ::s Є ::t Ie ::comment Cyrillic capital letter ie
235
+ ::s Ю ::t Yu ::comment Cyrillic capital letter yu
236
+ ::s Я ::t Ya ::comment Cyrillic capital letter ya
237
+
238
+ ::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
239
+ ::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
240
+ ::s Ә ::t e ::comment Cyrillic capital schwa
241
+ ::s ә ::t e ::comment Cyrillic small schwa
242
+ ::s Ӏ ::t ' ::comment Cyrillic palochka
243
+ ::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
244
+ ::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
245
+ ::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
246
+ ::s ӕ ::t ae ::comment Cyrillic small ligature a ie
247
+ ::s ʹ ::t "'" ::comment modifier letter prime
248
+ ::s ʺ ::t '"' ::comment modifier letter double prime
249
+ ::s ий ::t iy ::dont-use-at-end-of-word
250
+ ::s ий ::t y ::use-only-at-end-of-word
251
+
252
+ ::s ᲈ ::t u ::comment CYRILLIC SMALL LETTER UNBLENDED UK ligature ou
253
+
254
+ # Russian
255
+ ::s Г ::t G ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter ghe
256
+ ::s г ::t g ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter ghe
257
+ ::s Й ::t Y ::t-alt I, J ::lcode rus ::comment Cyrillic capital letter short i
258
+ ::s й ::t y ::t-alt i, j ::lcode rus ::comment Cyrillic small letter short i
259
+ ::s Ц ::t Ts ::t-alt C ::lcode rus ::comment Cyrillic capital letter tse
260
+ ::s ц ::t ts ::t-alt c ::lcode rus ::comment Cyrillic small letter tse
261
+ ::s Щ ::t Shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter shcha
262
+ ::s щ ::t shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter shcha
263
+ ::s Ѣ ::t E ::t-alt Ie ::lcode rus ::comment archaic Cyrillic capital letter yat
264
+ ::s ѣ ::t e ::t-alt ie ::lcode rus ::comment archaic Cyrillic small letter yat
265
+ ::s Е ::t E ::t-alt Ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic capital ie
266
+ ::s Е ::t Ye ::t-alt E ::use-only-at-start-of-word ::lcode rus
267
+ ::s е ::t e ::t-alt ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic small ie
268
+ ::s е ::t ye ::t-alt e ::use-only-at-start-of-word ::lcode rus
269
+ ::s ае ::t aye ::lcode rus
270
+ ::s а́е ::t aye ::lcode rus
271
+ ::s ее ::t eye ::lcode rus
272
+ ::s е́е ::t eye ::lcode rus
273
+ ::s ие ::t iye ::lcode rus
274
+ ::s и́е ::t iye ::lcode rus
275
+ ::s ое ::t oye ::lcode rus
276
+ ::s о́е ::t oye ::lcode rus
277
+ ::s уе ::t uye ::lcode rus
278
+ ::s у́е ::t uye ::lcode rus
279
+ ::s ье ::t ye ::lcode rus
280
+ ::s ъе ::t ye ::lcode rus
281
+ ::s Ё ::t Yo ::t-alt E ::lcode rus ::comment Cyrillic capital io
282
+ ::s ё ::t yo ::t-alt e ::lcode rus
283
+ ::s аё ::t ayo ::lcode rus
284
+ ::s а́ё ::t ayo ::lcode rus
285
+ ::s её ::t eyo ::lcode rus
286
+ ::s е́ё ::t eyo ::lcode rus
287
+ ::s иё ::t iyo ::lcode rus
288
+ ::s и́ё ::t iyo ::lcode rus
289
+ ::s оё ::t oyo ::lcode rus
290
+ ::s о́ё ::t oyo ::lcode rus
291
+ ::s уё ::t uyo ::lcode rus
292
+ ::s у́ё ::t uyo ::lcode rus
293
+ ::s ьё ::t yo ::lcode rus
294
+ ::s ъё ::t yo ::lcode rus
295
+ ::s ий ::t y ::lcode rus
296
+
297
+ # Ukrainian
298
+ ::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
299
+ ::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
300
+ ::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
301
+ ::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
302
+ ::s Е ::t E ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital ie
303
+ ::s е ::t e ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small ie
304
+ ::s И ::t Y ::lcode ukr ::comment Ukrainian capital letter i
305
+ ::s и ::t y ::lcode ukr ::comment Ukrainian small letter i
306
+ ::s Ї ::t Yi ::lcode ukr ::comment Ukrainian capital letter yi
307
+ ::s ї ::t yi ::lcode ukr ::comment Ukrainian small letter yi
308
+ ::s Й ::t I ::t-alt Y ::lcode ukr ::comment Cyrillic capital letter short i
309
+ ::s й ::t i ::t-alt y ::lcode ukr ::comment Cyrillic small letter short i
310
+ ::s Ц ::t Ts ::t-alt C ::lcode ukr ::comment Cyrillic capital letter tse
311
+ ::s ц ::t ts ::t-alt c ::lcode ukr ::comment Cyrillic small letter tse
312
+ ::s Щ ::t Shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital letter shcha
313
+ ::s щ ::t shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small letter shcha
314
+ ::s Ѣ ::t E ::t-alt Ie ::lcode ukr ::comment archaic Cyrillic capital letter yat
315
+ ::s ѣ ::t e ::t-alt ie ::lcode ukr ::comment archaic Cyrillic small letter yat
316
+ ::s Иї ::t Yi ::lcode ukr ::comment avoid Yyi
317
+ ::s иї ::t yi ::lcode ukr ::comment avoid yyi
318
+ ::s ій ::t iy ::lcode ukr
319
+ ::s і́й ::t iy ::lcode ukr
320
+ ::s ий ::t y ::lcode ukr ::comment Зеленський/Zelensky
321
+
322
+ # Belarusian
323
+ ::s Г ::t H ::t-alt G ::lcode bel ::comment capital letter he
324
+ ::s г ::t h ::t-alt g ::lcode bel ::comment small letter he
325
+ ::s Ґ ::t G ::lcode bel ::comment capital letter ghe
326
+ ::s ґ ::t g ::lcode bel ::comment small letter ghe
327
+ ::s Й ::t J ::t-alt Y ::lcode bel ::comment Cyrillic capital letter short i
328
+ ::s й ::t j ::t-alt y ::lcode bel ::comment Cyrillic small letter short i
329
+ ::s Ц ::t Ts ::t-alt C ::lcode bel ::comment Cyrillic capital letter tse
330
+ ::s ц ::t ts ::t-alt c ::lcode bel ::comment Cyrillic small letter tse
331
+ ::s Щ ::t Shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic capital letter shcha
332
+ ::s щ ::t shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic small letter shcha
333
+ ::s Ѣ ::t E ::t-alt Ie ::lcode bel ::comment archaic Cyrillic capital letter yat
334
+ ::s ѣ ::t e ::t-alt ie ::lcode bel ::comment archaic Cyrillic small letter yat
335
+ ::s 'я ::t ya ::lcode bel
336
+ ::s ’я ::t ya ::lcode bel
337
+ ::s 'і ::t i ::lcode bel
338
+ ::s ’і ::t i ::lcode bel
339
+ ::s Ё ::t Yo ::t-alt E ::lcode bel ::comment Cyrillic capital io
340
+ ::s ё ::t yo ::t-alt e ::lcode bel
341
+ ::s ёў ::t you ::lcode bel
342
+ ::s ий ::t y ::lcode bel
343
+
344
+ # Serbian
345
+ ::s Г ::t G ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ghe
346
+ ::s г ::t g ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ghe
347
+ ::s Х ::t H ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ha
348
+ ::s х ::t h ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ha
349
+ ::s Е ::t E ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ie
350
+ ::s е ::t e ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ie
351
+ ::s Ђ ::t Dj ::lcode srp ::comment Cyrillic capital dje
352
+ ::s Љ ::t Lj ::lcode srp ::comment Cyrillic capital lje
353
+ ::s Ћ ::t Tsh ::lcode srp ::comment Cyrillic capital tshe
354
+ ::s Ж ::t Zh ::lcode srp ::comment Cyrillic capital zhe
355
+ ::s Ц ::t C ::t-alt Ts ::lcode srp ::comment Cyrillic capital tse
356
+ ::s ц ::t c ::t-alt ts ::lcode srp ::comment Cyrillic small tse
357
+ ::s Đ ::t Dj ::lcode srp ::comment Latin capital d with stroke
358
+ ::s đ ::t dj ::lcode srp ::comment Latin small d with stroke
359
+ ::s Ž ::t Zh ::lcode srp ::comment Latin capital z with caron
360
+ ::s ž ::t zh ::lcode srp ::comment Latin small z with caron
361
+ ::s Ć ::t Tsh ::lcode srp ::comment Latin capital c with acute
362
+ ::s ć ::t tsh ::lcode srp ::comment Latin small c with acute
363
+ ::s Č ::t Ch ::lcode srp ::comment Latin capital c with caron
364
+ ::s č ::t ch ::lcode srp ::comment Latin small c with caron
365
+ ::s Š ::t Sh ::lcode srp ::comment Latin capital s with caron
366
+ ::s š ::t sh ::lcode srp ::comment Latin small s with caron
367
+
368
+ ::s Г ::t G ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ghe
369
+ ::s г ::t g ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ghe
370
+ ::s Х ::t H ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ha
371
+ ::s х ::t h ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ha
372
+ ::s Ц ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter tse
373
+ ::s ц ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter tse
374
+ ::s Ч ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter che
375
+ ::s ч ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter che
376
+ ::s Џ ::t Dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter dzhe
377
+ ::s џ ::t dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter dzhe
378
+ ::s Е ::t E ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ie
379
+ ::s е ::t e ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ie
380
+ ::s Ш ::t S ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital sha
381
+ ::s ш ::t s ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small sha
382
+ ::s Ж ::t Z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital zhe
383
+ ::s ж ::t z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small zhe
384
+ ::s Љ ::t Lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital lje
385
+ ::s љ ::t lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small lje
386
+ ::s Њ ::t Nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital nje
387
+ ::s њ ::t nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small nje
388
+ ::s Ђ ::t Dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital dje
389
+ ::s ђ ::t dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small dje
390
+ ::s Ћ ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital tshe
391
+ ::s ћ ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small tshe
392
+ ::s Đ ::t Dj ::lcode srp2 ::comment Latin capital d with stroke
393
+ ::s đ ::t dj ::lcode srp2 ::comment Latin small d with stroke
394
+
395
+ # Montenegrin extension (controversial)
396
+ ::s З́ ::t Zj ::lcode srp ::comment Cyrillic capital zje
397
+ ::s з́ ::t zj ::lcode srp ::comment Cyrillic small zje
398
+ ::s С́ ::t Sj ::lcode srp ::comment Cyrillic capital sje
399
+ ::s с́ ::t sj ::lcode srp ::comment Cyrillic small sje
400
+ ::s Ź ::t Zj ::lcode srp ::comment Latin capital z with acute
401
+ ::s ź ::t zj ::lcode srp ::comment Latin small z with acute
402
+ ::s Ś ::t Sj ::lcode srp ::comment Latin capital s with acute
403
+ ::s ś ::t sj ::lcode srp ::comment Latin small s with acute
404
+
405
+ ::s З́ ::t Z ::lcode srp2 ::comment Cyrillic capital zje
406
+ ::s з́ ::t z ::lcode srp2 ::comment Cyrillic small zje
407
+ ::s С́ ::t S ::lcode srp2 ::comment Cyrillic capital sje
408
+ ::s с́ ::t s ::lcode srp2 ::comment Cyrillic small sje
409
+ ::s Ź ::t Z ::lcode srp2 ::comment Latin capital z with acute
410
+ ::s ź ::t z ::lcode srp2 ::comment Latin small z with acute
411
+ ::s Ś ::t S ::lcode srp2 ::comment Latin capital s with acute
412
+ ::s ś ::t s ::lcode srp2 ::comment Latin small s with acute
413
+
414
+ # Bulgarian
415
+ ::s Г ::t G ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ghe
416
+ ::s г ::t g ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ghe
417
+ ::s Х ::t H ::t-alt Kh ::lcode bul ::comment Cyrillic capital letter ha
418
+ ::s х ::t h ::t-alt kh ::lcode bul ::comment Cyrillic small letter ha
419
+ ::s Ц ::t C ::t-alt Ts ::lcode bul ::comment Cyrillic capital letter tse
420
+ ::s ц ::t c ::t-alt ts ::lcode bul ::comment Cyrillic small letter tse
421
+ ::s Щ ::t Sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital letter shcha
422
+ ::s щ ::t sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small letter shcha
423
+ ::s Е ::t E ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ie
424
+ ::s е ::t e ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ie
425
+ ::s Ж ::t Zh ::t-alt Z, J ::lcode bul ::comment Cyrillic capital zhe
426
+ ::s ж ::t zh ::t-alt z, j ::lcode bul ::comment Cyrillic small zhe
427
+ ::s Й ::t I ::t-alt Y, J ::lcode bul ::comment Cyrillic capital letter short i
428
+ ::s й ::t i ::t-alt y, j ::lcode bul ::comment Cyrillic small letter short i
429
+ ::s Ю ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Cyrillic capital letter yu
430
+ ::s ю ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Cyrillic small letter yu
431
+ ::s Ъ ::t U ::t-alt A ::lcode bul ::comment Cyrillic capital letter hard sign
432
+ ::s ъ ::t u ::t-alt a ::lcode bul ::comment Cyrillic small letter hard sign
433
+ ::s Ѣ ::t E ::t-alt Ie ::lcode bul ::comment archaic Cyrillic capital letter yat
434
+ ::s ѣ ::t e ::t-alt ie ::lcode bul ::comment archaic Cyrillic small letter yat
435
+ ::s Ѫ ::t U ::lcode bul ::comment archaic Cyrillic capital letter yus
436
+ ::s ѫ ::t u ::lcode bul ::comment archaic Cyrillic small letter yus
437
+ ::s ИЯ ::t IA ::lcode bul ::use-only-at-end-of-word
438
+ ::s ия ::t ia ::lcode bul ::use-only-at-end-of-word
439
+
440
+ ::s Ž ::t Zh ::lcode bul ::comment Latin capital z with caron
441
+ ::s ž ::t zh ::lcode bul ::comment Latin small z with caron
442
+ ::s Č ::t Ch ::lcode bul ::comment Latin capital c with caron
443
+ ::s č ::t ch ::lcode bul ::comment Latin small c with caron
444
+ ::s Š ::t Sh ::lcode bul ::comment Latin capital s with caron
445
+ ::s š ::t sh ::lcode bul ::comment Latin small s with caron
446
+ ::s Ŝ ::t Sht ::lcode bul ::comment Latin capital s with circumflex
447
+ ::s ŝ ::t sht ::lcode bul ::comment Latin small s with circumflex
448
+ ::s Û ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Latin capital u with circumflex
449
+ ::s û ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Latin small u with circumflex
450
+ ::s  ::t Ya ::t-alt _NONE_ ::lcode bul ::comment Latin capital a with circumflex
451
+ ::s â ::t ya ::t-alt _NONE_ ::lcode bul ::comment Latin small a with circumflex
452
+ ::s Ŭ ::t U ::t-alt A ::lcode bul ::comment Latin capital u with breve (for hard sign)
453
+ ::s ŭ ::t u ::t-alt a ::lcode bul ::comment Latin small u with breve (for hard sign)
454
+ ::s Ǎ ::t U ::t-alt A ::lcode bul ::comment Latin capital a with caron (for hard sign)
455
+ ::s ǎ ::t u ::t-alt a ::lcode bul ::comment Latin small a with caron (for hard sign)
456
+
457
+ # Macedonian
458
+ ::s Г ::t G ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ghe
459
+ ::s г ::t g ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ghe
460
+ ::s Х ::t H ::lcode mkd ::comment Cyrillic capital ha
461
+ ::s х ::t h ::lcode mkd ::comment Cyrillic small ha
462
+ ::s Ц ::t C ::t-alt Ts ::lcode mkd ::comment Cyrillic capital letter tse
463
+ ::s ц ::t c ::t-alt ts ::lcode mkd ::comment Cyrillic small letter tse
464
+ ::s Џ ::t Dzh ::t-alt Dj, Dz ::lcode mkd ::comment Cyrillic capital letter dzhe
465
+ ::s џ ::t dzh ::t-alt dj, dz ::lcode mkd ::comment Cyrillic small letter dzhe
466
+ ::s Е ::t E ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ie
467
+ ::s е ::t e ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ie
468
+ ::s Ž ::t Zh ::lcode mkd ::comment Latin capital z with caron
469
+ ::s ž ::t zh ::lcode mkd ::comment Latin small z with caron
470
+ ::s Č ::t Ch ::lcode mkd ::comment Latin capital c with caron
471
+ ::s č ::t ch ::lcode mkd ::comment Latin small c with caron
472
+ ::s Š ::t Sh ::lcode mkd ::comment Latin capital s with caron
473
+ ::s š ::t sh ::lcode mkd ::comment Latin small s with caron
474
+ ::s Ǵ ::t Gj ::lcode mkd
475
+ ::s ǵ ::t gj ::lcode mkd
476
+ ::s Đ ::t Gj ::lcode mkd
477
+ ::s đ ::t gj ::lcode mkd
478
+ ::s Ẑ ::t Dz ::lcode mkd
479
+ ::s ẑ ::t dz ::lcode mkd
480
+ ::s J̌ ::t J ::lcode mkd
481
+ ::s ǰ ::t j ::lcode mkd
482
+ ::s L̂ ::t Lj ::lcode mkd
483
+ ::s l̂ ::t lj ::lcode mkd
484
+ ::s N̂ ::t Nj ::lcode mkd
485
+ ::s n̂ ::t nj ::lcode mkd
486
+ ::s Ḱ ::t Kj ::lcode mkd
487
+ ::s ḱ ::t kj ::lcode mkd
488
+ ::s Ć ::t Kj ::lcode mkd
489
+ ::s ć ::t kj ::lcode mkd
490
+ ::s D̂ ::t Dzh ::lcode mkd
491
+ ::s d̂ ::t dzh ::lcode mkd
492
+
493
+ ::s Г ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ghe
494
+ ::s г ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ghe
495
+ ::s Х ::t H ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ha
496
+ ::s х ::t h ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ha
497
+ ::s Ц ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter tse
498
+ ::s ц ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter tse
499
+ ::s Ч ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter che
500
+ ::s ч ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter che
501
+ ::s Џ ::t D ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter dzhe
502
+ ::s џ ::t d ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter dzhe
503
+ ::s Е ::t E ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ie
504
+ ::s е ::t e ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ie
505
+ ::s Ш ::t S ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital sha
506
+ ::s ш ::t s ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small sha
507
+ ::s Ѓ ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital gje
508
+ ::s ѓ ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small gje
509
+ ::s Ж ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital zhe
510
+ ::s ж ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small zhe
511
+ ::s Ѕ ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital dze
512
+ ::s ѕ ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small dze
513
+ ::s Ќ ::t K ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital kje
514
+ ::s ќ ::t k ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small kje
515
+ ::s Љ ::t L ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital lje
516
+ ::s љ ::t l ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small lje
517
+ ::s Њ ::t N ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital nje
518
+ ::s њ ::t n ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small nje
519
+ ::s Ž ::t Z ::lcode mkd2 ::comment Latin capital z with caron
520
+ ::s ž ::t z ::lcode mkd2 ::comment Latin small z with caron
521
+ ::s Č ::t C ::lcode mkd2 ::comment Latin capital c with caron
522
+ ::s č ::t c ::lcode mkd2 ::comment Latin small c with caron
523
+ ::s Š ::t S ::lcode mkd2 ::comment Latin capital s with caron
524
+ ::s š ::t s ::lcode mkd2 ::comment Latin small s with caron
525
+ ::s Ǵ ::t G ::lcode mkd2
526
+ ::s ǵ ::t g ::lcode mkd2
527
+ ::s Đ ::t G ::lcode mkd2
528
+ ::s đ ::t g ::lcode mkd2
529
+ ::s Ẑ ::t D ::lcode mkd2
530
+ ::s ẑ ::t d ::lcode mkd2
531
+ ::s J̌ ::t J ::lcode mkd2
532
+ ::s ǰ ::t j ::lcode mkd2
533
+ ::s L̂ ::t L ::lcode mkd2
534
+ ::s l̂ ::t l ::lcode mkd2
535
+ ::s N̂ ::t N ::lcode mkd2
536
+ ::s n̂ ::t n ::lcode mkd2
537
+ ::s Ḱ ::t K ::lcode mkd2
538
+ ::s ḱ ::t k ::lcode mkd2
539
+ ::s Ć ::t K ::lcode mkd2
540
+ ::s ć ::t k ::lcode mkd2
541
+ ::s D̂ ::t D ::lcode mkd2
542
+ ::s d̂ ::t d ::lcode mkd2
543
+
544
+ # Kazakh
545
+ ::s Ә ::t A ::lcode kaz
546
+ ::s ә ::t a ::lcode kaz
547
+ ::s Г ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe
548
+ ::s г ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe
549
+ ::s Ғ ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe with stroke
550
+ ::s ғ ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe with stroke
551
+ ::s Е ::t E ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ie
552
+ ::s е ::t e ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ie
553
+ ::s Ё ::t Yo ::t-alt _NONE_ ::lcode kaz
554
+ ::s ё ::t yo ::t-alt _NONE_ ::lcode kaz
555
+ ::s Х ::t H ::t-alt X ::lcode kaz ::comment Cyrillic capital ha
556
+ ::s х ::t h ::t-alt x ::lcode kaz ::comment Cyrillic small ha
557
+ ::s Һ ::t H ::lcode kaz ::comment Cyrillic capital shha
558
+ ::s һ ::t h ::lcode kaz ::comment Cyrillic small shha
559
+ ::s Қ ::t Q ::t-alt K ::lcode kaz
560
+ ::s қ ::t q ::t-alt k ::lcode kaz
561
+ ::s Ц ::t Ts ::t-alt C ::lcode kaz ::comment Cyrillic capital letter tse
562
+ ::s ц ::t ts ::t-alt c ::lcode kaz ::comment Cyrillic small letter tse
563
+ ::s Щ ::t Sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital letter shcha
564
+ ::s щ ::t sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small letter shcha
565
+ ::s У ::t U ::t-alt Y ::lcode kaz
566
+ ::s у ::t u ::t-alt y ::lcode kaz
567
+ ::s уы ::t wy ::lcode kaz
568
+ ::s Ж ::t J ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital zhe
569
+ ::s ж ::t j ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small zhe
570
+ ::s Ю ::t Yw ::t-alt Yuw, Yiw ::lcode kaz ::comment Cyrillic capital letter yu
571
+ ::s ю ::t yw ::t-alt yuw, yiw ::lcode kaz ::comment Cyrillic small letter yu
572
+
573
+ # Kyrgyz
574
+ ::s Г ::t G ::t-alt _NONE_ ::lcode kir ::comment Cyrillic capital ghe
575
+ ::s г ::t g ::t-alt _NONE_ ::lcode kir ::comment Cyrillic small ghe
576
+ ::s Е ::t E ::t-alt Ye ::lcode kir ::comment Cyrillic capital ie
577
+ ::s е ::t e ::t-alt ye ::lcode kir ::comment Cyrillic small ie
578
+ ::s Ё ::t Yo ::t-alt _NONE_ ::lcode kir
579
+ ::s ё ::t yo ::t-alt _NONE_ ::lcode kir
580
+ ::s Х ::t Kh ::t-alt X, H ::lcode kir ::comment Cyrillic capital ha
581
+ ::s х ::t kh ::t-alt x, h ::lcode kir ::comment Cyrillic small ha
582
+ ::s Ж ::t Zh ::t-alt J ::lcode kir ::comment Cyrillic capital zhe
583
+ ::s ж ::t zh ::t-alt j ::lcode kir ::comment Cyrillic small zhe
584
+ ::s Й ::t Y ::t-alt I ::lcode kir ::comment Cyrillic capital letter short i
585
+ ::s й ::t y ::t-alt i ::lcode kir ::comment Cyrillic small letter short i
586
+ ::s Ц ::t Ts ::t-alt C ::lcode kir ::comment Cyrillic capital letter tse
587
+ ::s ц ::t ts ::t-alt c ::lcode kir ::comment Cyrillic small letter tse
588
+ ::s Ң ::t Ng ::lcode kir
589
+ ::s ң ::t ng ::lcode kir
590
+ ::s Ө ::t O ::t-alt Oe ::lcode kir
591
+ ::s ө ::t o ::t-alt oe ::lcode kir
592
+ ::s Ү ::t U ::t-alt Y, Ue ::lcode kir
593
+ ::s ү ::t u ::t-alt y, ue ::lcode kir
594
+ ::s Ы ::t I ::t-alt Y ::lcode kir
595
+ ::s ы ::t i ::t-alt y ::lcode kir
596
+ ::s йы ::t yi ::lcode kir
597
+ ::s ый ::t iy ::lcode kir
598
+
599
+ # Ossetian
600
+ ::s ийы ::t iy ::lcode oss
601
+
602
+ # Gothic
603
+ ::s 𐌴 ::t e ::comment Gothic letter aihvus
604
+ ::s 𐌹 ::t i ::comment Gothic letter eis
605
+ ::s 𐍇 ::t x ::comment Gothic letter iggws
606
+
607
+ # Runic
608
+ ::s ᛫ ::t " " ::comment Runic single punctuation, used as word separator
609
+ ::s ᛬ ::t . ::comment Runic multiple punctuation, used as sentence separator
610
+
611
+ # Ogham
612
+ ::s ᚁ ::t b ::comment Ogham letter Beith
613
+ ::s ᚂ ::t l ::comment Ogham letter Luis
614
+ ::s ᚃ ::t f ::comment Ogham letter Fearn
615
+ ::s ᚄ ::t s ::comment Ogham letter Sail
616
+ ::s ᚅ ::t n ::comment Ogham letter Nion
617
+ ::s ᚋ ::t m ::comment Ogham letter Muin
618
+ ::s ᚌ ::t g ::comment Ogham letter Gort
619
+ ::s ᚍ ::t v ::t-alt ng ::comment Ogham letter nGéadal
620
+ ::s ᚎ ::t z ::comment Ogham letter Straif
621
+ ::s ᚏ ::t r ::comment Ogham letter Ruis
622
+ ::s ᚆ ::t h ::t-alt j ::comment Ogham letter Uath
623
+ ::s ᚇ ::t d ::comment Ogham letter Dair
624
+ ::s ᚈ ::t t ::comment Ogham letter Tinne
625
+ ::s ᚉ ::t k ::comment Ogham letter Coll
626
+ ::s ᚊ ::t q ::t-alt kw ::comment Ogham letter Ceirt
627
+ ::s ᚐ ::t a ::comment Ogham letter Ailm
628
+ ::s ᚑ ::t o ::comment Ogham letter Onn
629
+ ::s ᚒ ::t u ::comment Ogham letter Úr
630
+ ::s ᚓ ::t e ::comment Ogham letter Eadhadh
631
+ ::s ᚔ ::t i ::comment Ogham letter Iodhadh
632
+ ::s ᚚ ::t p ::comment Ogham letter Peith
633
+ # Additional Ogham letters (outside standard alphabet)
634
+ ::s ᚕ ::t eo ::t-alt ea ::comment Ogham additional letter Éabhadh
635
+ ::s ᚖ ::t oi ::t-alt oe ::comment Ogham additional letter Ór
636
+ ::s ᚗ ::t ui ::t-alt ua ::comment Ogham additional letter Uilleann
637
+ ::s ᚘ ::t p ::t-alt io ::comment Ogham additional letter Ifín
638
+ ::s ᚙ ::t ch ::t-alt x, ai ::comment Ogham additional letter Eamhancholl
639
+ ::s   ::t " " ::comment Ogham space mark
640
+ ::s ᚛ ::t "" ::comment Ogham feather mark
641
+ ::s ᚜ ::t "" ::comment Ogham feather mark
642
+
643
+ # Georgian
644
+ ::s ა ::t a ::comment Georgian letter an
645
+ ::s ე ::t e ::comment Georgian letter en
646
+ ::s ი ::t i ::comment Georgian letter in
647
+ ::s ო ::t o ::comment Georgian letter on
648
+ ::s უ ::t u ::comment Georgian letter un
649
+ ::s ჱ ::t ey ::comment archaic Georgian letter he
650
+ ::s ჲ ::t i ::comment archaic Georgian letter hie
651
+ ::s ჳ ::t w ::comment archaic Georgian letter we
652
+ ::s ჴ ::t q ::comment archaic Georgian letter har
653
+ ::s ჵ ::t o ::comment archaic Georgian letter hoe
654
+ ::s ჶ ::t f ::comment Georgian letter fi (Greek phi)
655
+ ::s ჷ ::t e ::comment Georgian letter yn (schwa)
656
+ ::s ჸ ::t a ::comment Georgian letter elifi
657
+ ::s ჹ ::t g ::comment Georgian letter gan
658
+ ::s ჺ ::t ' ::comment Georgian letter ain
659
+ ::s ჼ ::t n ::comment Georgian letter nar
660
+ ::s ჽ ::t e ::comment Georgian letter aen
661
+ ::s ჾ ::t ::comment Georgian letter hard sign
662
+ ::s ჿ ::t w ::comment Georgian letter labial sign
663
+
664
+ ::s Ⴚ ::t TS ::comment GEORGIAN CAPITAL LETTER CAN
665
+ ::s ც ::t ts ::comment GEORGIAN LETTER CAN
666
+ ::s Ც ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CAN
667
+ ::s ⴚ ::t ts ::comment GEORGIAN SMALL LETTER CAN
668
+ ::s Ⴜ ::t TS ::comment GEORGIAN CAPITAL LETTER CIL
669
+ ::s წ ::t ts ::comment GEORGIAN LETTER CIL
670
+ ::s Წ ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CIL
671
+ ::s ⴜ ::t ts ::comment GEORGIAN SMALL LETTER CIL
672
+ ::s Ⴛ ::t DZ ::comment GEORGIAN CAPITAL LETTER JIL
673
+ ::s ძ ::t dz ::comment GEORGIAN LETTER JIL
674
+ ::s Ძ ::t DZ ::comment GEORGIAN MTAVRULI CAPITAL LETTER JIL
675
+ ::s ⴛ ::t dz ::comment GEORGIAN SMALL LETTER JIL
676
+ ::s Ⴟ ::t J ::comment GEORGIAN CAPITAL LETTER JHAN
677
+ ::s ჯ ::t j ::comment GEORGIAN LETTER JHAN
678
+ ::s Ჯ ::t J ::comment GEORGIAN MTAVRULI CAPITAL LETTER JHAN
679
+ ::s ⴟ ::t j ::comment GEORGIAN SMALL LETTER JHAN
680
+
681
+
682
+ ::s Ⴀ ::t A ::comment Georgian capital letter an
683
+ ::s Ⴄ ::t E ::comment Georgian capital letter en
684
+ ::s Ⴈ ::t I ::comment Georgian capital letter in
685
+ ::s Ⴍ ::t O ::comment Georgian capital letter on
686
+ ::s Ⴓ ::t U ::comment Georgian capital letter un
687
+ ::s Ⴡ ::t EY ::comment archaic Georgian capital letter he
688
+ ::s Ⴢ ::t I ::comment archaic Georgian capital letter hie
689
+ ::s Ⴣ ::t W ::comment archaic Georgian capital letter we
690
+ ::s Ⴤ ::t Q ::comment archaic Georgian capital letter har
691
+ ::s Ⴥ ::t O ::comment archaic Georgian capital letter hoe
692
+ ::s Ⴧ ::t E ::comment archaic Georgian capital letter yn (schwa)
693
+ ::s Ⴭ ::t E ::comment archaic Georgian capital letter aen
694
+
695
+ ::s Ა ::t A ::comment Georgian Mtavruli capital letter an
696
+ ::s Ე ::t E ::comment Georgian Mtavruli capital letter en
697
+ ::s Ი ::t I ::comment Georgian Mtavruli capital letter in
698
+ ::s Ო ::t O ::comment Georgian Mtavruli capital letter on
699
+ ::s Უ ::t U ::comment Georgian Mtavruli capital letter un
700
+ ::s Ჱ ::t EY ::comment archaic Georgian Mtavruli capital letter he
701
+ ::s Ჲ ::t I ::comment archaic Georgian Mtavruli capital letter hie
702
+ ::s Ჳ ::t W ::comment archaic Georgian Mtavruli capital letter we
703
+ ::s Ჴ ::t Q ::comment archaic Georgian Mtavruli capital letter har
704
+ ::s Ჵ ::t O ::comment archaic Georgian Mtavruli capital letter hoe
705
+ ::s Ჶ ::t F ::comment Georgian Mtavruli capital letter fi (Greek phi)
706
+ ::s Ჷ ::t E ::comment Georgian Mtavruli capital letter yn (schwa)
707
+ ::s Ჸ ::t A ::comment Georgian Mtavruli capital letter elifi
708
+ ::s Ჹ ::t G ::comment Georgian Mtavruli capital letter gan
709
+ ::s Ჺ ::t ' ::comment Georgian Mtavruli capital letter ain
710
+ ::s Ჽ ::t E ::comment Georgian Mtavruli capital letter aen
711
+ ::s Ჾ ::t ::comment Georgian Mtavruli capital letter hard sign
712
+ ::s Ჿ ::t W ::comment Georgian Mtavruli capital letter labial sign
713
+
714
+ ::s ⴀ ::t a ::comment Georgian small letter an
715
+ ::s ⴄ ::t e ::comment Georgian small letter en
716
+ ::s ⴈ ::t i ::comment Georgian small letter in
717
+ ::s ⴍ ::t o ::comment Georgian small letter on
718
+ ::s ⴓ ::t u ::comment Georgian small letter un
719
+ ::s ⴡ ::t ey ::comment archaic Georgian small letter he
720
+ ::s ⴢ ::t i ::comment archaic Georgian small letter hie
721
+ ::s ⴣ ::t w ::comment archaic Georgian small letter we
722
+ ::s ⴤ ::t q ::comment archaic Georgian small letter har
723
+ ::s ⴥ ::t o ::comment archaic Georgian small letter hoe
724
+ ::s ⴧ ::t e ::comment Georgian small letter yn (schwa)
725
+ ::s ⴭ ::t e ::comment Georgian small letter aen
726
+
727
+ # Armenian
728
+ ::s Ա ::t A ::comment Armenian capital letter ayb
729
+ ::s ա ::t a ::comment Armenian small letter ayb
730
+ ::s ՠ ::t a ::comment ARMENIAN SMALL LETTER TURNED AYB (CHECK)
731
+ ::s Ե ::t E ::comment Armenian capital letter ech ::dont-use-at-start-of-word
732
+ ::s ե ::t e ::comment Armenian small letter ech ::dont-use-at-start-of-word
733
+ ::s Ե ::t Ye ::comment Armenian capital letter ech ::use-only-at-start-of-word
734
+ ::s ե ::t ye ::comment Armenian small letter ech ::use-only-at-start-of-word
735
+ ::s Է ::t E ::comment Armenian capital letter eh
736
+ ::s է ::t e ::comment Armenian small letter eh
737
+ ::s Ը ::t E ::comment Armenian capital letter et
738
+ ::s ը ::t e ::comment Armenian small letter et
739
+ ::s Ի ::t I ::comment Armenian capital letter ini
740
+ ::s ի ::t i ::comment Armenian small letter ini
741
+ ::s Յ ::t Y ::comment Armenian capital letter yi
742
+ ::s յ ::t y ::comment Armenian small letter yi
743
+ ::s ֈ ::t y ::comment ARMENIAN SMALL LETTER YI WITH STROKE (CHECK)
744
+ ::s Ո ::t Vo ::comment Armenian capital letter vo ::use-only-at-start-of-word
745
+ ::s ո ::t vo ::comment Armenian small letter vo ::use-only-at-start-of-word
746
+ ::s Ո ::t O ::comment Armenian capital letter vo ::dont-use-at-start-of-word
747
+ ::s ո ::t o ::comment Armenian small letter vo ::dont-use-at-start-of-word
748
+ ::s Ւ ::t W ::comment Armenian capital letter yiwn
749
+ ::s ւ ::t w ::comment Armenian small letter yiwn
750
+ ::s Օ ::t O ::comment Armenian capital letter oh
751
+ ::s օ ::t o ::comment Armenian small letter oh
752
+ ::s Խ ::t Kh ::comment Armenian capital letter xeh
753
+ ::s խ ::t kh ::comment Armenian small letter xeh
754
+
755
+ ::s Ժ ::t Zh ::comment Armenian capital letter zhe
756
+ ::s Ղ ::t Gh ::comment Armenian capital letter ghad
757
+ ::s Ճ ::t Tch ::comment Armenian capital letter cheh
758
+ ::s ճ ::t tch ::comment Armenian small letter cheh
759
+ ::s Շ ::t Sh ::comment Armenian capital letter sha
760
+ ::s Չ ::t Ch ::comment Armenian capital letter cha
761
+ ::s Ջ ::t J ::comment Armenian capital letter jheh
762
+ ::s ջ ::t j ::comment Armenian small letter jheh
763
+ ::s Վ ::t V ::comment Armenian capital letter vew
764
+ ::s վ ::t v ::comment Armenian small letter vew
765
+ ::s Ձ ::t Dz ::comment Armenian capital letter ja
766
+ ::s ձ ::t dz ::comment Armenian small letter ja
767
+ ::s Ծ ::t Ts ::comment Armenian capital letter ca
768
+ ::s ծ ::t ts ::comment Armenian small letter ca
769
+ ::s Ք ::t K ::t-alt Q ::comment Armenian capital letter keh - sometimes romanized as K' or Q
770
+ ::s ք ::t k ::t-alt q ::comment Armenian small letter keh - sometimes romanized as k' or q
771
+
772
+ ::s են ::t en ::use-only-for-whole-word ::comment exception (auxiliary verb)
773
+ ::s եմ ::t em ::use-only-for-whole-word ::comment exception (auxiliary verb)
774
+ ::s ենք ::t enk ::use-only-for-whole-word ::comment exception (auxiliary verb)
775
+ ::s ես ::t es ::use-only-for-whole-word ::comment exception (auxiliary verb)
776
+ ::s եք ::t ek ::use-only-for-whole-word ::comment exception (auxiliary verb)
777
+
778
+ ::s և ::t ev ::comment Armenian small ligature ech yiwn
779
+ ::s ՈՒ ::t U ::comment Armenian capital vo+yiwn
780
+ ::s Ու ::t U ::comment Armenian capital/small vo+yiwn
781
+ ::s ու ::t u ::comment Armenian small vo+yiwn
782
+
783
+ ::s իւ ::t yu
784
+
785
+ ## Japanese
786
+ # Katakana
787
+ ::s シ ::t shi
788
+ ::s チ ::t chi
789
+ ::s フ ::t fu
790
+ ::s ジ ::t ji
791
+ ::s ヂ ::t ji
792
+ ::s ヅ ::t zu
793
+ ::s シャ ::t sha
794
+ ::s シュ ::t shu
795
+ ::s ショ ::t sho
796
+ ::s チャ ::t cha
797
+ ::s チェ ::t che
798
+ ::s チュ ::t chu
799
+ ::s チョ ::t cho
800
+ ::s ジャ ::t ja
801
+ ::s ジュ ::t ju
802
+ ::s ジョ ::t jo
803
+ ::s ジェ ::t je
804
+ ::s ヂャ ::t ja
805
+ ::s ヂュ ::t ju
806
+ ::s ヂョ ::t jo
807
+ ::s フェ ::t fe
808
+ ::s ヴェ ::t ve
809
+ ::s フィ ::t fi
810
+ ::s ウィ ::t wi
811
+ ::s ヴィ ::t vi
812
+ ::s ティ ::t ti
813
+ ::s ディ ::t di
814
+ ::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
815
+ ::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
816
+ ::s 𛅤 ::t i ::comment KATAKANA LETTER SMALL WI
817
+ ::s 𛅥 ::t e ::comment KATAKANA LETTER SMALL WE
818
+ ::s 𛅦 ::t o ::comment KATAKANA LETTER SMALL WO
819
+ # Hiragana
820
+ ::s し ::t shi
821
+ ::s ち ::t chi
822
+ ::s つ ::t tsu
823
+ ::s ふ ::t fu
824
+ ::s を ::t o
825
+ ::s じ ::t ji
826
+ ::s ぢ ::t ji
827
+ ::s づ ::t zu
828
+ ::s しゃ ::t sha
829
+ ::s しゅ ::t shu
830
+ ::s しょ ::t sho
831
+ ::s ちゃ ::t cha
832
+ ::s ちゅ ::t chu
833
+ ::s ちょ ::t cho
834
+ ::s じゃ ::t ja
835
+ ::s じゅ ::t ju
836
+ ::s じょ ::t jo
837
+ ::s ぢゃ ::t ja
838
+ ::s ぢゅ ::t ju
839
+ ::s ぢょ ::t jo
840
+ ::s 𛅐 ::t i ::comment HIRAGANA LETTER SMALL WI
841
+ ::s 𛅑 ::t e ::comment HIRAGANA LETTER SMALL WE
842
+ ::s 𛅒 ::t o ::comment HIRAGANA LETTER SMALL WO
843
+ ::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
844
+ ::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
845
+
846
+ ::s フ ::t fu ::t-alt f
847
+ ::s キ ::t ki ::t-alt k
848
+ ::s ク ::t ku ::t-alt k
849
+ ::s ラ ::t ra ::t-alt la
850
+ ::s リ ::t ri ::t-alt li
851
+ ::s ル ::t ru ::t-alt lu, l, r
852
+ ::s レ ::t re ::t-alt le
853
+ ::s ロ ::t ro ::t-alt lo
854
+ ::s ム ::t mu ::t-alt m ::example キム = Kim
855
+ ::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
856
+ ::s ス ::t su ::t-alt s
857
+ ::s ト ::t to ::t-alt t
858
+ ::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
859
+
860
+ ::s ㋿ ::t Reiwa ::comment SQUARE ERA NAME REIWA
861
+
862
+ # Chinese
863
+ ::s 邦 ::t bang ::t-alt bon, bum, bun, pon
864
+ ::s 鲍 ::t bao ::t-alt bow
865
+ ::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
866
+ ::s 贝 ::t bei ::t-alt ber
867
+ ::s 本 ::t ben ::t-alt bern, bon, bourn, burn
868
+ ::s 彼得 ::t bide ::t-alt peter, pet
869
+ ::s 伯 ::t bo ::t-alt ber
870
+ ::s 波 ::t bo ::t-alt po
871
+ ::s 布 ::t bu ::t-alt b
872
+ ::s 策 ::t ce ::t-alt tze, tzer
873
+ ::s 曾 ::t ceng ::t-alt tzen, zen
874
+ ::s 彻 ::t che ::t-alt tche
875
+ ::s 茨 ::t ci ::t-alt ts, tz, z
876
+ ::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
877
+ ::s 蒂 ::t di ::t-alt ti, tti
878
+ ::s 丁 ::t ding ::t-alt din, tin
879
+ ::s 顿 ::t dun ::t-alt ton
880
+ ::s 多 ::t duo ::t-alt do, dor, to
881
+ ::s 尔 ::t er ::t-alt l, le, ll, r
882
+ ::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
883
+ ::s 夫 ::t fu ::t-alt f, v, v
884
+ ::s 福 ::t fu ::t-alt faw, for, ford
885
+ ::s 哥 ::t ge ::t-alt go, co
886
+ ::s 戈 ::t ge ::t-alt go
887
+ ::s 各 ::t ge ::t-alt go, co
888
+ ::s 赫 ::t he ::t-alt ch, che, cher, ge
889
+ ::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
890
+ ::s 怀 ::t huai ::t-alt whi, wi, wy
891
+ ::s 惠 ::t hui ::t-alt wha, whea
892
+ ::s 基 ::t ji ::t-alt ki, chi
893
+ ::s 吉 ::t ji ::t-alt gi, gui
894
+ ::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
895
+ ::s 杰 ::t jie ::t-alt ger
896
+ ::s 金 ::t jin ::t-alt kin, gin
897
+ ::s 斤 ::t jin ::t-alt zin
898
+ ::s 康 ::t kang ::t-alt con, corn
899
+ ::s 考 ::t kao ::t-alt cow, cour
900
+ ::s 克 ::t ke ::t-alt k, che, cher
901
+ ::s 科 ::t ke ::t-alt ko
902
+ ::s 拉 ::t la ::t-alt ra ::example Tirana
903
+ ::s 朗 ::t lang ::t-alt lon, ron
904
+ ::s 赖 ::t lai ::t-alt ri
905
+ ::s 劳 ::t lao ::t-alt low
906
+ ::s 勒 ::t lei ::t-alt ler
907
+ ::s 伦 ::t lun ::t-alt lon, ran, ron
908
+ ::s 里 ::t li ::t-alt ri
909
+ ::s 利 ::t li ::t-alt ri ::example Ferrari
910
+ ::s 隆 ::t long ::t-alt lon, lum, lund
911
+ ::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
912
+ ::s 洛 ::t luo ::t-alt lo, low, ro
913
+ ::s 默 ::t mo ::t-alt mer
914
+ ::s 纳 ::t na ::t-alt ne, ner
915
+ ::s 珀 ::t po ::t-alt per
916
+ ::s 奇 ::t qi ::t-alt chi, dge, ge, tch
917
+ ::s 齐 ::t qi ::t-alt tsi, zi
918
+ ::s 乔 ::t qiao ::t-alt jo
919
+ ::s 青 ::t qing ::t-alt tsing
920
+ ::s 琼 ::t qiong ::t-alt jon, jum, jun
921
+ ::s 瑟 ::t se ::t-alt the
922
+ ::s 什 ::t shen ::t-alt sh
923
+ ::s 圣 ::t sheng ::t-alt san, sao, saint
924
+ ::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
925
+ ::s 索 ::t suo ::t-alt tho
926
+ ::s 特 ::t te ::t-alt t
927
+ ::s 翁 ::t weng ::t-alt on
928
+ ::s 沃 ::t wo ::t-alt ver, vo, war, wer
929
+ ::s 乌 ::t wu ::t-alt ou, u
930
+ ::s 希 ::t xi ::t-alt chi, hi, shi
931
+ ::s 西 ::t xi ::t-alt s, si
932
+ ::s 锡 ::t xi ::t-alt ci, si, thi, zi
933
+ ::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
934
+ ::s 香 ::t xiang ::t-alt chan, cham
935
+ ::s 歇 ::t xie ::t-alt she
936
+ ::s 谢 ::t xie ::t-alt che, she
937
+ ::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
938
+ ::s 欣 ::t xin ::t-alt hin, shin
939
+ ::s 休 ::t xiu ::t-alt hu, hue
940
+ ::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
941
+ ::s 许 ::t xu ::t-alt hue, schue
942
+ ::s 逊 ::t xun ::t-alt son
943
+ ::s 耶 ::t ye ::t-alt yer, ier
944
+ ::s 泽 ::t ze ::t-alt ser
945
+ ::s 扎 ::t zha ::t-alt za
946
+ ::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
947
+ ::s 治 ::t zhi ::t-alt ge ::example George
948
+
949
+ ## Numbers
950
+ # Chinese and Japanese numbers
951
+ ::s 零 ::num 0
952
+ ::s 〇 ::num 0
953
+ ::s 一 ::num 1
954
+ ::s 二 ::num 2
955
+ ::s 三 ::num 3
956
+ ::s 四 ::num 4
957
+ ::s 五 ::num 5
958
+ ::s 六 ::num 6
959
+ ::s 七 ::num 7
960
+ ::s 八 ::num 8
961
+ ::s 九 ::num 9
962
+ ::s 十 ::num 10
963
+ ::s 百 ::num 100
964
+ ::s 千 ::num 1000
965
+ ::s 万 ::num 10000
966
+ ::s 萬 ::num 10000
967
+ ::s 亿 ::num 100000000
968
+ ::s 億 ::num 100000000
969
+ ::s 兆 ::num 1000000000000
970
+ ::s 京 ::num 10000000000000000
971
+
972
+ # numbers in non-number words (to be extended)
973
+ ::s 一贯 ::t yiguan ::comment consistent
974
+
975
+ ::s 红十字会 ::t hongshizihui ::comment Red Cross
976
+
977
+ ::s 百度 ::t baidu ::comment Baidu (company)
978
+ ::s 百分 ::t baifen ::comment percent
979
+ ::s 百合 ::t baihe ::comment lily
980
+ ::s 百货 ::t baihuo ::comment general merchandise
981
+ ::s 百科 ::t baike ::comment encyclopedia
982
+ ::s 百老汇 ::t bailaohui
983
+ ::s 百灵 ::t bailing
984
+ ::s 百慕大 ::t baimuda
985
+ ::s 百日咳 ::t bairike
986
+ ::s 百色市 ::t baiseshi
987
+ ::s 百事可乐 ::t baishikele ::comment Pepsi Cola
988
+ ::s 百無 ::t baiwu
989
+ ::s 百香 ::t baixiang
990
+ ::s 百姓 ::t baixing
991
+ ::s 百叶 ::t baiye
992
+ ::s 百色 ::t bose
993
+ ::s 杨百翰 ::t yangbaihan ::comment Brigham Young
994
+
995
+ ::s 北京 ::t beijing
996
+ ::s 京都 ::t jingdu
997
+ ::s 东京 ::t dongjing
998
+ ::s 京胡 ::t jinghu
999
+ ::s 南京 ::t nanjing
1000
+ ::s 普京 ::t pujing ::comment Putin
1001
+ ::s 東京 ::t dongjing ::comment Tokyo
1002
+ ::s 京兆 ::t jingzhao
1003
+
1004
+ ::s ㎢ ::t km²
1005
+ ::s ㎥ ::t m³
1006
+ ::s ㎝ ::t cm
1007
+
1008
+ ## Indian
1009
+ # see mostly under UnicodeDataOverwrite.txt
1010
+
1011
+ # Malayalam
1012
+ ::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
1013
+
1014
+ # Tamil
1015
+ ::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
1016
+ ::s ஃப ::t f ::comment h+p=f
1017
+ ::s ஃஜ ::t z ::comment h+j=z
1018
+
1019
+ # Myanmar/Burmese
1020
+ # ::s ့ ::t ::comment dot below, denotes creaky tone
1021
+ # ::s း ::t ::comment visarga, denotes high tone
1022
+ ::s ၌ ::t -nai ::comment locative
1023
+ ::s ၍ ::t -jwe ::comment completed
1024
+ ::s ၎ ::t legau ::comment aforementioned
1025
+ ::s ၏ ::t -i ::comment genitive
1026
+
1027
+ # Lao
1028
+ ::s ັ ::t a ::comment vowel sign mai kan
1029
+ ::s ົ ::t o ::comment vowel sign mai kon
1030
+ ::s ູ ::t uu ::comment vowel sign uu
1031
+ ::s ຽ ::t y ::comment semivowel sign nyo
1032
+ ::s ຼ ::t l ::comment semivowel sign lo
1033
+ ::s ລ ::t l ::comment lo loot
1034
+ ::s ຣ ::t l ::comment lo ling
1035
+ ::s ໝ ::t m ::comment ho mo
1036
+ ::s ໜ ::t n ::comment ho no
1037
+ ::s ຢ ::t y ::comment yo
1038
+ ::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
1039
+ ::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
1040
+ ::s ຯ ::t ... ::comment Lao ellipsis
1041
+
1042
+ # Thai
1043
+ ::s ออ ::t o
1044
+ ::s อั ::t a
1045
+ ::s อิ ::t i
1046
+ ::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
1047
+
1048
+ # Khmer
1049
+ ::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
1050
+ ::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
1051
+ ::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
1052
+ ::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
1053
+ ::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
1054
+ ::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
1055
+
1056
+ ## Semitic languages
1057
+ # Arabic
1058
+ ::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
1059
+ ::s ء ::t ' ::comment hamza
1060
+ ::s ٔ ::t ' ::comment hamza above
1061
+ ::s ٕ ::t ' ::comment hamza below
1062
+ ::s ع ::t ' ::comment ain
1063
+ ::s آ ::t a ::comment alef madda
1064
+ ::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
1065
+ ::s إ ::t i ::comment alef with hamza below
1066
+ ::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
1067
+ ::s ة ::t a ::comment teh marbuta
1068
+ ::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
1069
+ ::s ي ::t y ::comment Arabic yeh
1070
+ ::s ى ::t a ::comment alef maksura
1071
+ ::s ﻯ ::t a ::comment alef maksura isolated form
1072
+ ::s ﻰ ::t a ::comment alef maksura final form
1073
+ ::s ﯨ ::t a ::comment Uighur Kazach Kirghiz alef maksura initial form
1074
+ ::s ﯩ ::t a ::comment Uighur Kazach Kirghiz alef maksura medial form
1075
+ ::s ٰ ::t a ::comment Arabic letter superscript alef
1076
+ ::s ـ ::t ::comment tatweel (filler)
1077
+ ::s َ ::t a ::comment fatha ("-a")
1078
+ ::s ُ ::t u ::comment damma ("-u")
1079
+ ::s ِ ::t i ::comment kasra ("-i")
1080
+ ::s ْ ::t ::comment sukun (no vowel)
1081
+ ::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
1082
+ ::s ً ::t ::comment fathatan ("-an")
1083
+ ::s اً ::t an ::comment alef + fathatan
1084
+ ::s ٌ ::t ::comment dammatan ("-un")
1085
+ ::s ٍ ::t ::comment kasratan ("-in")
1086
+ ::s ّ ::t ::comment shadda (consonant doubler)
1087
+ ::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
1088
+ ::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
1089
+ ::s ۾ ::t men ::comment Sindhi postposition men
1090
+ ::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
1091
+ ::s ﷴ ::t mohammad ::comment "Mohammad"
1092
+ ::s ﷸ ::t wasallam ::comment "and peace"
1093
+ ::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
1094
+
1095
+ ::s ࣓ ::t waw ::comment ARABIC SMALL LOW WAW
1096
+ ::s ࣔ ::t al-rub ::comment ARABIC SMALL HIGH WORD AR-RUB
1097
+ ::s ࣕ ::t s ::comment ARABIC SMALL HIGH SAD
1098
+ ::s ࣖ ::t ' ::comment ARABIC SMALL HIGH AIN
1099
+ ::s ࣗ ::t q ::comment ARABIC SMALL HIGH QAF
1100
+ ::s ࣘ ::t n ::comment ARABIC SMALL HIGH NOON WITH KASRA
1101
+ ::s ࣙ ::t n ::comment ARABIC SMALL LOW NOON WITH KASRA
1102
+ ::s ࣚ ::t al-thalatha ::comment ARABIC SMALL HIGH WORD ATH-THALATHA
1103
+ ::s ࣛ ::t al-sajda ::comment ARABIC SMALL HIGH WORD AS-SAJDA
1104
+ ::s ࣜ ::t al-nisf ::comment ARABIC SMALL HIGH WORD AN-NISF
1105
+ ::s ࣝ ::t sakta ::comment ARABIC SMALL HIGH WORD SAKTA
1106
+ ::s ࣞ ::t qif ::comment ARABIC SMALL HIGH WORD QIF
1107
+ ::s ࣟ ::t waqfa ::comment ARABIC SMALL HIGH WORD WAQFA
1108
+ ::s ࣠ ::t ::comment ARABIC SMALL HIGH FOOTNOTE MARKER (CHECK)
1109
+ ::s ࣡ ::t ::comment ARABIC SMALL HIGH SIGN SAFHA (CHECK)
1110
+ ::s ࣢ ::t ::comment ARABIC DISPUTED END OF AYAH (CHECK)
1111
+
1112
+ # Farsi
1113
+ ::s ی ::t i ::t-alt y ::comment Contributed by Nima
1114
+ ::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
1115
+ ::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
1116
+ ::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
1117
+ ::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
1118
+ ::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
1119
+ ::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
1120
+ ::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
1121
+ ::s عا ::t a ::lcode fas ::comment Contributed by Nima
1122
+ ::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
1123
+ ::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
1124
+ ::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
1125
+ ::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
1126
+ ::s ‌ ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
1127
+ ::s غ ::t gh ::t-alt g ::lcode fas
1128
+ ::s آئی ::t ai ::t-alt ae ::lcode fas
1129
+ ::s ائی ::t ai ::t-alt ae ::lcode fas
1130
+ ::s آئو ::t au ::t-alt ao ::lcode fas
1131
+ ::s ائو ::t au ::t-alt ao ::lcode fas
1132
+
1133
+ # Kashmiri (so far: educated guesses)
1134
+ ::s ٖ ::t a ::comment Arabic subscript alef U+0656
1135
+ ::s ٗ ::t u ::comment Arabic inverted damma U+0657
1136
+ ::s ۚ ::t j ::comment Arabic small high jeem U+06DA
1137
+ ::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
1138
+ ::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
1139
+
1140
+ # Pashto
1141
+ ::s ٙ ::t e ::comment Arabic zwarakay
1142
+ ::s ځ ::t z ::t-alt dz ::comment Pashto letter zim; Arabic letter "hah with hamza above"
1143
+ ::s څ ::t ts ::t-alt c ::comment Pashto letter tsim; Arabic letter "h with three dots above"
1144
+ ::s ګ ::t g ::comment Pashto letter gaf; Arabic letter "kaf with ring"
1145
+ ::s ڼ ::t n ::comment Arabic letter "noon with ring"
1146
+ ::s ږ ::t g ::t-alt z, zh, j ::comment pronunciation varies regionally
1147
+ ::s ښ ::t kh ::t-alt sh ::comment pronunciation varies regionally
1148
+ ::s ه ::t h ::t-alt a ::lcode pus
1149
+ ::s ۀ ::t e ::lcode pus ::comment Arabic letter "heh with yeh above"
1150
+ ::s و ::t w ::t-alt o, u ::lcode pus
1151
+ ::s ی ::t ay ::t-alt y ::lcode pus
1152
+ ::s وی ::t wy ::t-alt oy, uy ::lcode pus
1153
+ ::s ای ::t ay ::lcode pus
1154
+ ::s ۍ ::t ay ::lcode pus
1155
+ ::s ئ ::t ay ::t-alt y ::lcode pus
1156
+ ::s ژ ::t zh ::t-alt z ::lcode pus ::comment [ʒ]
1157
+ ::s ض ::t z ::t-alt d ::lcode pus
1158
+ ::s ث ::t s ::lcode pus ::t-alt th ::comment Arabic letter theh (unvoiced th/θ)
1159
+ ::s ذ ::t z ::lcode pus ::t-alt th ::comment Arabic letter thal (voiced th/ð)
1160
+
1161
+ # Hebrew
1162
+ ::s ב ::t v ::comment Hebrew letter bet ::t-alt b
1163
+ ::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
1164
+ ::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
1165
+ ::s פ ::t f ::comment Hebrew letter pe ::t-alt p
1166
+ ::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
1167
+ ::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
1168
+ ::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
1169
+ ::s ק ::t q ::t-alt k ::use-alt-in-pointed
1170
+ ::s וֹ ::t o
1171
+ ::s וּ ::t u
1172
+ ::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
1173
+ ::s י ::t y
1174
+ ::s יּ ::t y
1175
+ ::s יָּ ::t ya
1176
+ ::s ײ ::t yy ::comment Hebrew ligature Yiddish double Yod (CHECK)
1177
+ ::s ׯ ::t yyy ::comment HEBREW YOD TRIANGLE (CHECK)
1178
+ ::s ע ::t '
1179
+ ::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
1180
+ ::s ֵי ::t e
1181
+ ::s ִיּ ::t iy
1182
+ ::s ִיָּ ::t iya
1183
+ ::s ױ ::t oy
1184
+ ::s א ::t a ::t-alt '
1185
+ ::s אָ ::t a
1186
+ ::s ֹא ::t o
1187
+ ::s אַ ::t 'a
1188
+ ::s אֲ ::t 'a
1189
+ ::s אֶ ::t e
1190
+ ::s אֱ ::t e
1191
+ ::s פ ::t f
1192
+ ::s פּ ::t p
1193
+ ::s פַּ ::t pa
1194
+ ::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
1195
+ ::s שׁ ::t sh
1196
+ ::s שָׁ ::t sha
1197
+ ::s שָּׁ ::t sha ::comment ?
1198
+ ::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
1199
+ ::s שֶׁ ::t she
1200
+ ::s שִׁ ::t shi
1201
+ ::s שֻׁ ::t shu
1202
+ ::s שׂ ::t s
1203
+ ::s שָׂ ::t sa
1204
+ ::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
1205
+ ::s כּ ::t k
1206
+ ::s כֶּ ::t ke
1207
+ ::s כֹּ ::t ko
1208
+ ::s בּ ::t b
1209
+ ::s בַּ ::t ba
1210
+ ::s בָּ ::t ba
1211
+ ::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
1212
+ ::s בֶּ ::t be
1213
+ ::s תּ ::t t
1214
+ ::s תַּ ::t ta
1215
+ ::s תֵּ ::t te
1216
+ ::s תִּ ::t ti
1217
+ ::s דָּ ::t da
1218
+ ::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
1219
+ ::s גּ ::t g
1220
+ ::s לֵּ ::t le
1221
+ ::s ד׳ ::t dh
1222
+ ::s ג׳ ::t j
1223
+ ::s ת׳ ::t th
1224
+ ::s ז׳ ::t zh
1225
+ ::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
1226
+ ::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
1227
+ ::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
1228
+ ::s ַ ::t a ::comment Hebrew point patah
1229
+ ::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
1230
+ ::s ֳ ::t o ::comment Hebrew point hataf qamats
1231
+ ::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
1232
+ ::s ֶ ::t e ::comment Hebrew point segol
1233
+ ::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
1234
+ ::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
1235
+ ::s ֵ ::t e ::comment Hebrew point tsere
1236
+ ::s ִ ::t i ::comment Hebrew point hiriq
1237
+ ::s ֹ ::t o ::comment Hebrew point holam
1238
+ ::s ֻ ::t u ::comment Hebrew point qubuts
1239
+ # ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
1240
+
1241
+ # Yiddish
1242
+ ::s א ::t a ::lcode yid ::comment called "silent" alef
1243
+ ::s אי ::t y ::lcode yid
1244
+ ::s איי ::t ey ::lcode yid
1245
+ ::s או ::t u ::lcode yid
1246
+ ::s אוי ::t oy ::lcode yid
1247
+ ::s אַ ::t a ::lcode yid
1248
+ ::s אָ ::t o ::lcode yid
1249
+ ::s ב ::t b ::lcode yid
1250
+ ::s בֿ ::t v ::lcode yid
1251
+ ::s דזש ::t dzh ::lcode yid
1252
+ ::s ו ::t u ::lcode yid
1253
+ ::s וּ ::t u ::lcode yid
1254
+ ::s וֹ ::t o ::lcode yid
1255
+ ::s װ ::t v ::lcode yid
1256
+ ::s ווא ::t wa ::lcode yid
1257
+ ::s וואַ ::t wa ::lcode yid
1258
+ ::s ווע ::t we ::lcode yid
1259
+ ::s ווי ::t wi ::lcode yid
1260
+ ::s וואוי ::t wo ::lcode yid
1261
+ ::s וי ::t oy ::lcode yid
1262
+ ::s זש ::t zh ::lcode yid
1263
+ ::s ח ::t ch ::lcode yid
1264
+ ::s טש ::t tsh ::lcode yid
1265
+ ::s יִ ::t i ::lcode yid
1266
+ ::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
1267
+ ::s ײַ ::t ay ::lcode yid
1268
+ ::s כּ ::t k ::lcode yid
1269
+ ::s כ ::t ch ::lcode yid
1270
+ ::s ך ::t ch ::lcode yid
1271
+ ::s ע ::t e ::lcode yid
1272
+ ::s פּ ::t p ::lcode yid
1273
+ ::s פֿ ::t f ::lcode yid
1274
+ ::s ף ::t f ::lcode yid ::comment sometimes p
1275
+ ::s ק ::t k ::lcode yid
1276
+ ::s ת ::t s ::lcode yid
1277
+
1278
+ # Syriac/Aramaic (should be vetted by expert)
1279
+ ::s ܰ ::t a ::comment Syriac pthaha above
1280
+ ::s ܲ ::t a ::comment Syriac pthaha dotted
1281
+ ::s ܳ ::t aa ::comment Syriac zqapha above
1282
+ ::s ܴ ::t aa ::comment Syriac zqapha below
1283
+ ::s ܵ ::t aa ::comment Syriac zqapha dotted
1284
+ ::s ܶ ::t e ::comment Syriac rbasa above
1285
+ ::s ܷ ::t e ::comment Syriac rbasa below
1286
+ ::s ܿ ::t o ::comment Syriac rwaha
1287
+ ::s ܸ ::t e ::comment Syriac dotted zlama horizontal
1288
+ ::s ܹ ::t e ::comment Syriac dotted zlama angular
1289
+ ::s ܺ ::t i ::comment Syriac hbasa above
1290
+ ::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
1291
+ ::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
1292
+ ::s ܽ ::t o ::comment Syriac esasa above
1293
+ ::s ܾ ::t u ::comment Syriac esasa below
1294
+ ::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
1295
+
1296
+ ::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
1297
+ ::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
1298
+ ::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
1299
+ ::s ܒ̥ ::t v ::comment Syriac beth + ring-below
1300
+ ::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
1301
+ ::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
1302
+ ::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
1303
+ ::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
1304
+ ::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
1305
+ ::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
1306
+ ::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
1307
+ ::s ܦ̥ ::t f ::comment Syriac pe + ring-below
1308
+ ::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
1309
+ ::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
1310
+ ::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
1311
+
1312
+ ::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications
1313
+ ::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
1314
+ ::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
1315
+
1316
+ # Uzbek
1317
+ ::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
1318
+ ::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
1319
+
1320
+ # Uyghur
1321
+ ::s ئا ::t a ::lcode uig
1322
+ ::s ە ::t e ::lcode uig
1323
+ ::s ئې ::t e ::lcode uig ::latinplus ë
1324
+ ::s ې ::t e ::lcode uig ::latinplus ë
1325
+ ::s ئە ::t e ::lcode uig
1326
+ ::s يە ::t e ::lcode uig
1327
+ ::s ئى ::t i ::lcode uig
1328
+ ::s ى ::t i ::lcode uig
1329
+ ::s ئو ::t o ::lcode uig
1330
+ ::s و ::t o ::lcode uig
1331
+ ::s ئۇ ::t u ::lcode uig
1332
+ ::s ۇ ::t u ::lcode uig
1333
+ ::s چ ::t ch ::t-alt q ::lcode uig
1334
+ ::s خ ::t x ::lcode uig
1335
+ ::s ژ ::t zh ::lcode uig
1336
+ ::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
1337
+ ::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
1338
+ ::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
1339
+ ::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
1340
+ ::s ۋ ::t w ::lcode uig
1341
+
1342
+ # Maldivian
1343
+ ::s ް ::t ::comment thaana sukun
1344
+ ::s ަ ::t a ::comment thaana abafili
1345
+ ::s ާ ::t aa ::comment thaana aabaafili
1346
+ ::s ި ::t i ::comment thaana ibifili
1347
+ ::s ީ ::t ee ::comment thaana eebeefili
1348
+ ::s ު ::t u ::comment thaana ubufili
1349
+ ::s ޫ ::t oo ::comment thaana ooboofili
1350
+ ::s ެ ::t e ::comment thaana ebefili
1351
+ ::s ޭ ::t ey ::comment thaana eybeyfili
1352
+ ::s ޮ ::t o ::comment thaana obofili
1353
+ ::s ޯ ::t oa ::comment thaana oaboafili
1354
+
1355
+ # Canadian syllabics (Inuktitut)
1356
+ ::s ᑊ ::t p ::comment syllable final
1357
+ ::s ᐟ ::t t ::comment syllable final
1358
+ ::s ᐠ ::t k ::comment syllable final
1359
+ ::s ᐨ ::t c ::comment syllable final
1360
+ ::s ᒼ ::t m ::comment syllable final
1361
+ ::s ᐣ ::t n ::comment syllable final
1362
+ ::s ᐢ ::t s ::comment syllable final
1363
+ ::s ᐧ ::t y ::comment syllable final
1364
+ ::s ᐤ ::t w ::comment syllable final
1365
+ ::s ᐦ ::t h ::comment syllable final
1366
+ ::s ᕽ ::t hk ::comment syllable final
1367
+ ::s ᓫ ::t l ::comment syllable final
1368
+ ::s ᕑ ::t r ::comment syllable final
1369
+
1370
+ # Mongolian
1371
+ ::s ᢅ ::t ::comment MONGOLIAN LETTER ALI GALI BALUDA (CHECK) indicates assimilation
1372
+ ::s ᢆ ::t ::comment MONGOLIAN LETTER ALI GALI THREE BALUDA (CHECK) indicates assimilation
1373
+
1374
+ # Tibetan
1375
+ ::s ྅ ::t ::comment TIBETAN MARK PALUTA (CHECK) indicates assimilation
1376
+
1377
+ ## Punctuation
1378
+ # delete
1379
+ ::s ¿ ::t "" ::comment inverted question mark
1380
+ ::s ¡ ::t "" ::comment inverted exclamation mark
1381
+ # decompose double-punctuation
1382
+ ::s ‼ ::t !!
1383
+ ::s ⁇ ::t ??
1384
+ ::s ⁉ ::t !?
1385
+ ::s ⁈ ::t ?!
1386
+ # preserve
1387
+ ::s ′ ::t ′
1388
+ ::s ∩ ::t ∩
1389
+ ::s ‡ ::t ‡
1390
+ # Cyrillic
1391
+ ::s ⁙ ::t . ::comment five dot punctuation
1392
+ # Amharic/Ethiopian
1393
+ ::s ። ::t .
1394
+ ::s ፣ ::t ,
1395
+ ::s ፤ ::t ;
1396
+ ::s ፥ ::t :
1397
+ ::s ፧ ::t ? ::comment Ethiopic question mark
1398
+ ::s ፡ ::t " " ::comment Ethiopic wordspace
1399
+ ::s ፦ ::t : ::comment Ethiopic preface colon
1400
+ # Ethiopic wordspace often appropriated for other purposes:
1401
+ ::s ፡፡ ::t .
1402
+ ::s ፡- ::t :
1403
+ ::s "፡ " ::t ", "
1404
+ ::s ቸ ::t cha ::comment Ethiopic syllable ca
1405
+ ::s ቹ ::t chu ::comment Ethiopic syllable cu
1406
+ ::s ቺ ::t chi ::comment Ethiopic syllable ci
1407
+ ::s ቻ ::t chaa ::comment Ethiopic syllable caa
1408
+ ::s ቼ ::t chee ::comment Ethiopic syllable cee
1409
+ ::s ች ::t che ::comment Ethiopic syllable ce
1410
+ ::s ቾ ::t cho ::comment Ethiopic syllable co
1411
+ ::s ሠ ::t sa ::comment Ethiopic syllable sza
1412
+ ::s ሡ ::t su ::comment Ethiopic syllable szu
1413
+ ::s ሢ ::t si ::comment Ethiopic syllable szi
1414
+ ::s ሣ ::t saa ::comment Ethiopic syllable szaa
1415
+ ::s ሤ ::t see ::comment Ethiopic syllable szee
1416
+ ::s ሥ ::t se ::comment Ethiopic syllable sze
1417
+ ::s ሦ ::t so ::comment Ethiopic syllable szo
1418
+ ::s ጠ ::t ta ::comment Ethiopic syllable tha with ejective 't'
1419
+ ::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
1420
+ ::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
1421
+ ::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
1422
+ ::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
1423
+ ::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
1424
+ ::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
1425
+
1426
+ # Devanagari (Hindi etc.)
1427
+ ::s । ::t . ::comment danda
1428
+ ::s ॥ ::t . ::comment double danda
1429
+ ::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
1430
+ ::s ॰ ::t . ::comment Devanagari abbreviation sign
1431
+ # Bengali
1432
+ ::s ৽ ::t . ::comment BENGALI ABBREVIATION SIGN
1433
+ ::s ৾ ::t ::comment BENGALI SANDHI MARK (CHECK)
1434
+ # Gurmukhi
1435
+ ::s ੶ ::t . ::comment GURMUKHI ABBREVIATION SIGN
1436
+ # Oriya/Odia (India)
1437
+ ::s ୤ ::t . ::comment danda (deprecated, should use Devanagari danda ।)
1438
+ ::s ୥ ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
1439
+ # Tibetan
1440
+ ::s ། ::t ,
1441
+ ::s །: ::t :
1442
+ ::s ༏ ::t ;
1443
+ ::s ༎ ::t .
1444
+ ::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
1445
+ ::s ༼ ::t ( ::comment Tibetan open roof punctuation
1446
+ ::s ༽ ::t ) ::comment Tibetan close roof punctuation
1447
+ ::s ༈ ::t "" ::comment Tibetan mark srbul shad
1448
+ ::s 【 ::t [ ::comment left black lenticular bracket
1449
+ ::s 】 ::t ] ::comment right black lenticular bracket
1450
+ ::s ༄ ::t "" ::comment Tibetan head mark
1451
+ ::s ༄༅ ::t "" ::comment Tibetan head mark
1452
+ ::s ༆ ::t "" ::comment Tibetan head mark
1453
+ # Myanmar/Burmese
1454
+ ::s ၊ ::t ,
1455
+ ::s ။ ::t .
1456
+ # Khmer
1457
+ ::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
1458
+ ::s ។ ::t . ::comment Khmer sign khan
1459
+ # Arabic
1460
+ ::s ، ::t ,
1461
+ ::s ؛ ::t ;
1462
+ ::s ٬ ::t ,
1463
+ ::s ۔ ::t .
1464
+ ::s ؟ ::t ?
1465
+ ::s ٪ ::t %
1466
+ ::s ٫ ::t , ::comment Arabic decimal separator
1467
+ ::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
1468
+ # Aramaic
1469
+ ::s ܀ ::t .
1470
+ ::s ܂ ::t .
1471
+ # Hebrew
1472
+ ::s ־ ::t - ::comment maqaf
1473
+ # Armenian
1474
+ ::s ։ ::t .
1475
+ ::s ՝ ::t , ::comment Armenian comma
1476
+ # Chinese
1477
+ ::s , ::t ", "
1478
+ ::s 、 ::t ", "
1479
+ ::s 。 ::t ". "
1480
+ ::s ! ::t "! "
1481
+ ::s ? ::t "? "
1482
+ ::s 「 ::t ' "'
1483
+ ::s 」 ::t '" '
1484
+ ::s 《 ::t ' "'
1485
+ ::s 》 ::t '" '
1486
+ ::s ( ::t " ("
1487
+ ::s ) ::t ") "
1488
+ ::s ; ::t ;
1489
+ ::s : ::t ": "
1490
+ ::s ︰ ::t ": "
1491
+ ::s - ::t -
1492
+ ::s / ::t /
1493
+ ::s = ::t =
1494
+ ::s ~ ::t ~
1495
+ ::s & ::t &
1496
+ ::s < ::t <
1497
+ ::s > ::t >
1498
+ ::s % ::t %
1499
+ ::s _ ::t _ ::comment FULLWIDTH LOW LINE (U+FF3F)
1500
+ ::s { ::t { ::comment FULLWIDTH LEFT CURLY BRACKET (U+FF5B)
1501
+ ::s } ::t } ::comment FULLWIDTH RIGHT CURLY BRACKET (U+FF5D)
1502
+ ::s   ::t " " ::comment ideographic space
1503
+ # Japanese
1504
+ ::s 『 ::t ' "'
1505
+ ::s 』 ::t '" '
1506
+ ::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
1507
+ # N'ko
1508
+ ::s ߽ ::t . ::comment NKO DANTAYALAN used to abbreviate units of measure
1509
+ # Medefaidrin
1510
+ ::s 𖺗 ::t , ::comment MEDEFAIDRIN COMMA
1511
+ ::s 𖺘 ::t . ::comment MEDEFAIDRIN FULL STOP
1512
+ # Khitan
1513
+ ::s 𖿤 ::t ::comment KHITAN SMALL SCRIPT FILLER
1514
+
1515
+ # Symbols
1516
+ ::s ∞ ::t ∞ ::comment infinity
1517
+ ::s ­ ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
1518
+ ::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
1519
+ ::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
1520
+ ::s ﹐ ::t , ::comment small comma; map to regular comma
1521
+ ::s ˚ ::t ° ::comment ring above; map to degree sign
1522
+ ::s ⇒ ::t ⇒ ::comment rightwards double arrow
1523
+ ::s † ::t † ::comment dagger
1524
+ ::s • ::t • ::comment bullet
1525
+ ::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
1526
+ ::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
1527
+ ::s ― ::t ― ::comment horizontal bar
1528
+ ::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
1529
+ ::s ″ ::t ″ ::comment double prime
1530
+ ::s ﴾ ::t ( ::comment ornate left parenthesis
1531
+ ::s ﴿ ::t ) ::comment ornate right parenthesis
1532
+ ::s 〔 ::t [ ::comment left tortoise shell bracket
1533
+ ::s 〕 ::t ] ::comment right tortoise shell bracket
1534
+ ::s ﹝ ::t ( ::comment small left tortoise shell bracket
1535
+ ::s ﹞ ::t ) ::comment small right tortoise shell bracket
1536
+ ::s ¦ ::t ¦ ::comment BROKEN BAR (U+00A6)
1537
+ ::s ¨ ::t ::comment DIAERESIS (U+00A8)
1538
+ ::s ¯ ::t ::comment MACRON (U+00AF)
1539
+ ::s ¸ ::t ::comment CEDILLA (U+00B8)
1540
+ ::s Ƿ ::t W ::comment LATIN CAPITAL LETTER WYNN (U+01F7)
1541
+ ::s ˘ ::t ::comment BREVE (U+02D8)
1542
+ ::s ˛ ::t ::comment OGONEK (U+02DB)
1543
+ ::s ˜ ::t ~ ::comment SMALL TILDE (U+02DC)
1544
+ ::s ̒ ::t ::comment COMBINING TURNED COMMA ABOVE (U+0312)
1545
+ ::s ̔ ::t ::comment COMBINING REVERSED COMMA ABOVE (U+0314)
1546
+ ::s ̜ ::t ::comment COMBINING LEFT HALF RING BELOW (U+031C)
1547
+ ::s ̧ ::t ::comment COMBINING CEDILLA (U+0327)
1548
+ ::s ̫ ::t ::comment COMBINING INVERTED DOUBLE ARCH BELOW (U+032B)
1549
+ ::s ̲ ::t ::comment COMBINING LOW LINE (U+0332)
1550
+ ::s ̳ ::t ::comment COMBINING DOUBLE LOW LINE (U+0333)
1551
+ ::s ̹ ::t ::comment COMBINING RIGHT HALF RING BELOW (U+0339)
1552
+ ::s ̺ ::t ::comment COMBINING INVERTED BRIDGE BELOW (U+033A)
1553
+ ::s ̿ ::t ::comment COMBINING DOUBLE OVERLINE (U+033F)
1554
+ ::s ͅ ::t ::comment COMBINING GREEK YPOGEGRAMMENI (U+0345)
1555
+ ::s ͑ ::t ::comment COMBINING LEFT HALF RING ABOVE (U+0351)
1556
+ ::s ͗ ::t ::comment COMBINING RIGHT HALF RING ABOVE (U+0357)
1557
+ ::s ͚ ::t ::comment COMBINING DOUBLE RING BELOW (U+035A)
1558
+ ::s ͜ ::t ::comment COMBINING DOUBLE BREVE BELOW (U+035C)
1559
+ ::s ͝ ::t ::comment COMBINING DOUBLE BREVE (U+035D)
1560
+ ::s ͞ ::t ::comment COMBINING DOUBLE MACRON (U+035E)
1561
+ ::s ͟ ::t ::comment COMBINING DOUBLE MACRON BELOW (U+035F)
1562
+ ::s ͠ ::t ::comment COMBINING DOUBLE TILDE (U+0360)
1563
+
1564
+ ::s ‐ ::t - ::comment HYPHEN (U+2010)
1565
+ ::s ‗ ::t ‗ ::comment DOUBLE LOW LINE (U+2017)
1566
+ ::s ‵ ::t ‵ ::comment REVERSED PRIME (U+2035)
1567
+ ::s ‶ ::t ‶ ::comment REVERSED DOUBLE PRIME (U+2036)
1568
+ ::s ‸ ::t ‸ ::comment CARET (U+2038)
1569
+ ::s ‽ ::t ?! ::comment INTERROBANG (U+203D)
1570
+ ::s ‾ ::t ‾ ::comment OVERLINE (U+203E)
1571
+ ::s ‿ ::t ‿ ::comment UNDERTIE (U+203F)
1572
+ ::s ⁂ ::t ⁂ ::comment ASTERISM (U+2042)
1573
+ ::s ⁎ ::t * ::comment LOW ASTERISK (U+204E)
1574
+ ::s ⁏ ::t ; ::comment REVERSED SEMICOLON (U+204F)
1575
+ ::s ⁔ ::t ⁔ ::comment INVERTED UNDERTIE (U+2054)
1576
+ ::s ⁝ ::t ⁝ ::comment TRICOLON (U+205D)
1577
+ ::s   ::t " " ::comment MEDIUM MATHEMATICAL SPACE (U+205F)
1578
+ ::s ₋ ::t - ::comment SUBSCRIPT MINUS (U+208B)
1579
+ ::s ⃩ ::t ::comment COMBINING WIDE BRIDGE ABOVE (U+20E9)
1580
+
1581
+ ::s ﹔ ::t ; ::comment SMALL SEMICOLON (U+FE54)
1582
+ ::s ﹕ ::t : ::comment SMALL COLON (U+FE55)
1583
+ ::s ﹛ ::t { ::comment SMALL LEFT CURLY BRACKET (U+FE5B)
1584
+ ::s ﹜ ::t } ::comment SMALL RIGHT CURLY BRACKET (U+FE5C)
1585
+ ::s ﹠ ::t & ::comment SMALL AMPERSAND (U+FE60)
1586
+ ::s ﹡ ::t * ::comment SMALL ASTERISK (U+FE61)
1587
+ ::s ﹣ ::t - ::comment SMALL HYPHEN-MINUS (U+FE63)
1588
+
1589
+ ::s ℈ ::t ℈ ::comment SCRUPLE (U+2108)
1590
+ ::s ℟ ::t ℟ ::comment RESPONSE (U+211F)
1591
+ ::s ℣ ::t ℣ ::comment VERSICLE (U+2123)
1592
+ ::s ℽ ::t ℽ ::comment DOUBLE-STRUCK SMALL GAMMA (U+213D)
1593
+ ::s ℾ ::t ℾ ::comment DOUBLE-STRUCK CAPITAL GAMMA (U+213E)
1594
+ ::s ⅋ ::t ⅋ ::comment TURNED AMPERSAND (U+214B)
1595
+ ::s ⅍ ::t A/S ::comment AKTIESELSKAB (U+214D)
1596
+
1597
+ ::s ⑃ ::t ⑃ ::comment OCR INVERTED FORK (U+2443)
1598
+ ::s ⑊ ::t \\ ::comment OCR DOUBLE BACKSLASH (U+244A)
1599
+ ::s ⟮ ::t ( ::comment MATHEMATICAL LEFT FLATTENED PARENTHESIS (U+27EE)
1600
+ ::s ⟯ ::t ) ::comment MATHEMATICAL RIGHT FLATTENED PARENTHESIS (U+27EF)
1601
+ ::s ⸨ ::t (( ::comment LEFT DOUBLE PARENTHESIS (U+2E28)
1602
+ ::s ⸩ ::t )) ::comment RIGHT DOUBLE PARENTHESIS (U+2E29)
1603
+
1604
+ # kavyka indicates alternative reading
1605
+ ::s ᷶ ::t ::comment COMBINING KAVYKA ABOVE RIGHT (U+1DF6)
1606
+ ::s ᷷ ::t ::comment COMBINING KAVYKA ABOVE LEFT (U+1DF7)
1607
+ ::s ⹅ ::t ::comment INVERTED LOW KAVYKA (U+2E45)
1608
+ ::s ⹆ ::t ::comment INVERTED LOW KAVYKA WITH KAVYKA ABOVE (U+2E46)
1609
+ ::s ⹇ ::t ::comment LOW KAVYKA (U+2E47)
1610
+ ::s ⹈ ::t ::comment LOW KAVYKA WITH DOT (U+2E48)
1611
+ ::s ꙾ ::t ::comment CYRILLIC KAVYKA (U+A67E)
1612
+
1613
+ # Braille
1614
+ ::s ⠁ ::t a
1615
+ ::s ⠃ ::t b
1616
+ ::s ⠉ ::t c
1617
+ ::s ⠙ ::t d
1618
+ ::s ⠑ ::t e
1619
+ ::s ⠋ ::t f
1620
+ ::s ⠛ ::t g
1621
+ ::s ⠓ ::t h
1622
+ ::s ⠊ ::t i
1623
+ ::s ⠚ ::t j
1624
+ ::s ⠅ ::t k
1625
+ ::s ⠇ ::t l
1626
+ ::s ⠍ ::t m
1627
+ ::s ⠝ ::t n
1628
+ ::s ⠕ ::t o
1629
+ ::s ⠏ ::t p
1630
+ ::s ⠟ ::t q
1631
+ ::s ⠗ ::t r
1632
+ ::s ⠎ ::t s
1633
+ ::s ⠞ ::t t
1634
+ ::s ⠥ ::t u
1635
+ ::s ⠧ ::t v
1636
+ ::s ⠺ ::t w
1637
+ ::s ⠭ ::t x
1638
+ ::s ⠽ ::t y
1639
+ ::s ⠵ ::t z
1640
+
1641
+ ::s ⠜ ::t ae
1642
+ ::s ⠪ ::t oe
1643
+ ::s ⠳ ::t ue
1644
+ ::s ⠷ ::t a ::comment à
1645
+ ::s ⠡ ::t a ::comment â
1646
+ ::s ⠿ ::t e ::comment é
1647
+ ::s ⠮ ::t e ::comment è
1648
+ ::s ⠣ ::t e ::comment ê
1649
+ ::s ⠫ ::t e ::comment ë
1650
+ ::s ⠩ ::t i ::comment î
1651
+ ::s ⠻ ::t i ::comment ï
1652
+ ::s ⠹ ::t o ::comment ô
1653
+ ::s ⠾ ::t u ::comment ù
1654
+ ::s ⠱ ::t u ::comment û
1655
+
1656
+ ::s ⠡ ::t au ::lcode deu
1657
+ ::s ⠌ ::t aeu ::lcode deu
1658
+ ::s ⠹ ::t ch ::lcode deu
1659
+ ::s ⠩ ::t ei ::lcode deu
1660
+ ::s ⠣ ::t eu ::lcode deu
1661
+ ::s ⠬ ::t ie ::lcode deu
1662
+ ::s ⠱ ::t sch ::lcode deu
1663
+ ::s ⠮ ::t ss ::lcode deu
1664
+ ::s ⠾ ::t st ::lcode deu
1665
+
1666
+ ::s ⠠⠠ ::t "" ::comment start of word all-caps mode
1667
+ # ::s ⠠⠁ ::t A
1668
+ # ::s ⠠⠃ ::t B
1669
+ # ::s ⠠⠉ ::t C
1670
+ # ::s ⠠⠙ ::t D
1671
+ # ::s ⠠⠑ ::t E
1672
+ # ::s ⠠⠋ ::t F
1673
+ # ::s ⠠⠛ ::t G
1674
+ # ::s ⠠⠓ ::t H
1675
+ # ::s ⠠⠊ ::t I
1676
+ # ::s ⠠⠚ ::t J
1677
+ # ::s ⠠⠅ ::t K
1678
+ # ::s ⠠⠇ ::t L
1679
+ # ::s ⠠⠍ ::t M
1680
+ # ::s ⠠⠝ ::t N
1681
+ # ::s ⠠⠕ ::t O
1682
+ # ::s ⠠⠏ ::t P
1683
+ # ::s ⠠⠟ ::t Q
1684
+ # ::s ⠠⠗ ::t R
1685
+ # ::s ⠠⠎ ::t S
1686
+ # ::s ⠠⠞ ::t T
1687
+ # ::s ⠠⠥ ::t U
1688
+ # ::s ⠠⠧ ::t V
1689
+ # ::s ⠠⠺ ::t W
1690
+ # ::s ⠠⠭ ::t X
1691
+ # ::s ⠠⠽ ::t Y
1692
+ # ::s ⠠⠵ ::t Z
1693
+
1694
+ ::s ⠼⠁ ::t 1
1695
+ ::s ⠼⠃ ::t 2
1696
+ ::s ⠼⠉ ::t 3
1697
+ ::s ⠼⠙ ::t 4
1698
+ ::s ⠼⠑ ::t 5
1699
+ ::s ⠼⠋ ::t 6
1700
+ ::s ⠼⠛ ::t 7
1701
+ ::s ⠼⠓ ::t 8
1702
+ ::s ⠼⠊ ::t 9
1703
+ ::s ⠼⠚ ::t 0
1704
+
1705
+ ::s ⠂ ::t ,
1706
+ ::s ⠆ ::t ;
1707
+ ::s ⠒ ::t :
1708
+ ::s ⠲ ::t .
1709
+ ::s ⠦ ::t ?
1710
+ ::s ⠖ ::t !
1711
+ ::s ⠄ ::t '
1712
+ ::s ⠤ ::t -
1713
+ ::s ⠨⠤ ::t _
1714
+
1715
+ ::s ⠀ ::t " " ::comment blank
1716
+ # ::s ⠐ ::t " " ::comment blank in numeric mode
1717
+ ::s ⠈ ::t "" ::comment accent
1718
+ # ::s ⠌ ::t / ::comment in numeric mode only
1719
+ # ::s ⠐ ::comment abbreviation sign
1720
+ # ::s ⠘ ::comment abbreviation sign
1721
+ # ::s ⠠ ::comment capital indicator
1722
+ ::s ⠨ ::t . ::comment decimal point; emphasis
1723
+ ::s ⠰ ::t "" ::comment letter indicator
1724
+ # ::s ⠴ ::t ”
1725
+ # ::s ⠶ ::t ()
1726
+ # ::s ⠸ ::comment abbreviation sign
1727
+ ::s ⠼ ::t "" ::comment number indicator
1728
+ ::s ⠘⠚ ::t ° ::word-external-punctuation
1729
+ ::s ⠘⠚⠠⠉ ::t °C
1730
+ ::s ⠘⠚⠉ ::t °C
1731
+ ::s ⠘⠚⠠⠋ ::t °F
1732
+ ::s ⠘⠚⠋ ::t °F
1733
+
1734
+ ::s ⠠⠶ ::t " ::word-external-punctuation
1735
+ ::s ⠘⠦ ::t “ ::word-external-punctuation
1736
+ ::s ⠘⠴ ::t ” ::word-external-punctuation
1737
+ ::s ⠄⠦ ::t ‘
1738
+ ::s ⠄⠴ ::t ’
1739
+ ::s ⠠⠴ ::t ’
1740
+ ::s ⠐⠣ ::t ( ::word-external-punctuation
1741
+ ::s ⠐⠜ ::t ) ::word-external-punctuation
1742
+ ::s ⠨⠣ ::t [ ::word-external-punctuation
1743
+ ::s ⠨⠜ ::t ] ::word-external-punctuation
1744
+ ::s ⠸⠣ ::t { ::word-external-punctuation
1745
+ ::s ⠸⠜ ::t } ::word-external-punctuation
1746
+ ::s ⠈⠣ ::t < ::word-external-punctuation
1747
+ ::s ⠈⠜ ::t > ::word-external-punctuation
1748
+ ::s ⠸⠌ ::t / ::word-external-punctuation
1749
+ ::s ⠸⠡ ::t \ ::word-external-punctuation
1750
+ ::s ⠠⠤ ::t – ::word-external-punctuation
1751
+ ::s ⠐⠠⠤ ::t — ::word-external-punctuation
1752
+ ::s ⠈⠯ ::t & ::word-external-punctuation
1753
+ ::s ⠐⠔ ::t * ::word-external-punctuation
1754
+ ::s ⠨⠦ ::t ∩ ::word-external-punctuation
1755
+ ::s ⠨⠴ ::t % ::word-external-punctuation
1756
+ ::s ⠐⠖ ::t + ::word-external-punctuation
1757
+ ::s ⠐⠤ ::t − ::word-external-punctuation
1758
+ ::s ⠐⠶ ::t = ::word-external-punctuation
1759
+ ::s ⠈⠎ ::t $ ::word-external-punctuation
1760
+ ::s ⠈⠉ ::t ¢ ::word-external-punctuation
1761
+ ::s ⠈⠇ ::t £ ::word-external-punctuation
1762
+ ::s ⠈⠽ ::t ¥ ::word-external-punctuation
1763
+ ::s ⠈⠁ ::t @ ::word-external-punctuation
1764
+ ::s ⠸⠹ ::t # ::word-external-punctuation
1765
+ ::s ⠸⠲ ::t • ::word-external-punctuation
1766
+ ::s ⠈⠢ ::t ^ ::word-external-punctuation
1767
+ ::s ⠈⠔ ::t ~ ::word-external-punctuation
1768
+ ::s ⠘⠉ ::t © ::word-external-punctuation
1769
+ ::s ⠐⠌ ::t ÷ ::word-external-punctuation
1770
+ ::s ⠐⠦ ::t × ::word-external-punctuation
1771
+ ::s ⠈⠠⠹ ::t † ::word-external-punctuation
1772
+ ::s ⠈⠠⠻ ::t ‡ ::word-external-punctuation
1773
+ ::s ⠘⠏ ::t ¶ ::word-external-punctuation
1774
+ ::s ⠘⠎ ::t § ::word-external-punctuation
1775
+ ::s ⠘⠗ ::t ® ::word-external-punctuation
1776
+ ::s ⠘⠞ ::t ™ ::word-external-punctuation
1777
+
1778
+ # English Braille
1779
+ ::s ⠁⠃ ::t about ::lcode eng ::use-only-for-whole-word
1780
+ ::s ⠁⠃⠧ ::t above ::lcode eng ::use-only-for-whole-word
1781
+ ::s ⠁⠉ ::t according ::lcode eng ::use-only-for-whole-word
1782
+ ::s ⠁⠉⠗ ::t across ::lcode eng ::use-only-for-whole-word
1783
+ ::s ⠁⠋ ::t after ::lcode eng ::use-only-for-whole-word
1784
+ ::s ⠁⠋⠝ ::t afternoon ::lcode eng ::use-only-for-whole-word
1785
+ ::s ⠁⠋⠺ ::t afterward ::lcode eng ::use-only-for-whole-word
1786
+ ::s ⠁⠛ ::t again ::lcode eng ::use-only-for-whole-word
1787
+ ::s ⠁⠛⠌ ::t against ::lcode eng ::use-only-for-whole-word
1788
+ ::s ⠠⠽ ::t ally ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
1789
+ ::s ⠁⠇⠍ ::t almost ::lcode eng ::use-only-for-whole-word
1790
+ ::s ⠁⠇⠗ ::t already ::lcode eng ::use-only-for-whole-word
1791
+ ::s ⠁⠇ ::t also ::lcode eng ::use-only-for-whole-word
1792
+ ::s ⠁⠇⠹ ::t although ::lcode eng ::use-only-for-whole-word
1793
+ ::s ⠁⠇⠞ ::t altogether ::lcode eng ::use-only-for-whole-word
1794
+ ::s ⠁⠇⠺ ::t always ::lcode eng ::use-only-for-whole-word
1795
+ ::s ⠨⠑ ::t ance ::lcode eng
1796
+ ::s ⠯ ::t and ::lcode eng
1797
+ ::s ⠜ ::t ar ::lcode eng
1798
+ ::s ⠵ ::t as ::lcode eng ::use-only-for-whole-word
1799
+ ::s ⠠⠝ ::t ation ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
1800
+ ::s ⠃ ::t b ::lcode eng
1801
+ ::s ⠆ ::t bb ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1802
+ ::s ⠆ ::t be ::lcode eng ::use-only-at-start-of-word
1803
+ ::s ⠆⠉ ::t because ::lcode eng ::use-only-for-whole-word
1804
+ ::s ⠆⠋ ::t before ::lcode eng ::use-only-for-whole-word
1805
+ ::s ⠆⠓ ::t behind ::lcode eng ::use-only-for-whole-word
1806
+ ::s ⠆⠇ ::t below ::lcode eng ::use-only-for-whole-word
1807
+ ::s ⠆⠝ ::t beneath ::lcode eng ::use-only-for-whole-word
1808
+ ::s ⠆⠎ ::t beside ::lcode eng ::use-only-for-whole-word
1809
+ ::s ⠆⠞ ::t between ::lcode eng ::use-only-for-whole-word
1810
+ ::s ⠆⠽ ::t beyond ::lcode eng ::use-only-for-whole-word
1811
+ ::s ⠃⠇ ::t blind ::lcode eng ::use-only-for-whole-word
1812
+ ::s ⠃⠗⠇ ::t Braille ::lcode eng ::use-only-for-whole-word
1813
+ ::s ⠃ ::t but ::lcode eng ::use-only-for-whole-word
1814
+ ::s ⠉ ::t c ::lcode eng
1815
+ ::s ⠉ ::t can ::lcode eng ::use-only-for-whole-word
1816
+ ::s ⠸⠉ ::t cannot ::lcode eng
1817
+ ::s ⠒ ::t cc ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1818
+ ::s ⠉⠧ ::t ceive ::lcode eng ::use-only-at-end-of-word
1819
+ ::s ⠉⠧⠙ ::t ceived ::lcode eng ::use-only-at-end-of-word
1820
+ ::s ⠉⠧⠎ ::t ceives ::lcode eng ::use-only-at-end-of-word
1821
+ ::s ⠉⠧⠛ ::t ceiving ::lcode eng
1822
+ ::s ⠡ ::t ch ::lcode eng
1823
+ ::s ⠐⠡ ::t character ::lcode eng
1824
+ ::s ⠡ ::t child ::lcode eng ::use-only-for-whole-word
1825
+ ::s ⠡⠝ ::t children ::lcode eng ::use-only-for-whole-word
1826
+ ::s ⠒ ::t con ::lcode eng ::use-only-at-start-of-word
1827
+ ::s ⠒ ::t : ::lcode eng ::use-only-at-end-of-word
1828
+ ::s ⠉⠙ ::t could ::lcode eng ::use-only-for-whole-word
1829
+ ::s ⠙ ::t d ::lcode eng
1830
+ ::s ⠙ ::t do ::lcode eng ::use-only-for-whole-word
1831
+ ::s ⠐⠙ ::t day ::lcode eng
1832
+ # ::s ⠲ ::t dd ::t-alt . ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word ::comment abolished; interferes with period in abbreviations such as U.S.
1833
+ ::s ⠙⠉⠇ ::t declare ::lcode eng
1834
+ ::s ⠙⠉⠇⠛ ::t declaring ::lcode eng
1835
+ ::s ⠲ ::t dis ::lcode eng ::use-only-at-start-of-word
1836
+ ::s ⠲ ::t . ::lcode eng ::dont-use-at-start-of-word
1837
+ ::s ⠑ ::t e ::lcode eng
1838
+ ::s ⠂ ::t ea ::lcode eng ::dont-use-at-end-of-word
1839
+ ::s ⠂ ::t , ::lcode eng ::use-only-at-end-of-word
1840
+ ::s ⠫ ::t ed ::lcode eng
1841
+ ::s ⠑⠊ ::t either ::lcode eng ::use-only-for-whole-word
1842
+ ::s ⠢ ::t en ::lcode eng
1843
+ ::s ⠰⠑ ::t ence ::lcode eng ::dont-use-at-start-of-word
1844
+ ::s ⠢ ::t enough ::lcode eng ::use-only-for-whole-word
1845
+ ::s ⠻ ::t er ::lcode eng
1846
+ ::s ⠐⠑ ::t ever ::lcode eng
1847
+ ::s ⠑ ::t every ::lcode eng ::use-only-for-whole-word
1848
+ ::s ⠋ ::t f ::lcode eng
1849
+ ::s ⠐⠋ ::t father ::lcode eng
1850
+ ::s ⠖ ::t ff ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1851
+ ::s ⠋⠌ ::t first ::lcode eng
1852
+ ::s ⠿ ::t for ::lcode eng
1853
+ ::s ⠋⠗ ::t friend ::lcode eng ::use-only-for-whole-word
1854
+ ::s ⠋⠗⠎ ::t friends ::lcode eng ::use-only-for-whole-word
1855
+ ::s ⠋ ::t from ::lcode eng ::use-only-for-whole-word
1856
+ ::s ⠰⠇ ::t ful ::lcode eng ::dont-use-at-start-of-word
1857
+ ::s ⠛ ::t g ::lcode eng
1858
+ ::s ⠶ ::t gg ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
1859
+ ::s ⠣ ::t gh ::lcode eng
1860
+ ::s ⠛ ::t go ::lcode eng ::use-only-for-whole-word
1861
+ ::s ⠛⠙ ::t good ::lcode eng ::use-only-at-start-of-word
1862
+ ::s ⠛⠗⠞ ::t great ::lcode eng
1863
+ ::s ⠓ ::t h ::lcode eng
1864
+ ::s ⠸⠓ ::t had ::lcode eng
1865
+ ::s ⠓ ::t have ::lcode eng ::use-only-for-whole-word
1866
+ ::s ⠐⠓ ::t here ::lcode eng
1867
+ ::s ⠓⠻⠋ ::t herself ::lcode eng ::use-only-for-whole-word
1868
+ ::s ⠓⠍ ::t him ::lcode eng ::use-only-for-whole-word
1869
+ ::s ⠓⠍⠋ ::t himself ::lcode eng ::use-only-for-whole-word
1870
+ ::s ⠦ ::t ? ::lcode eng
1871
+ ::s ⠦ ::t his ::lcode eng ::use-only-for-whole-word
1872
+ ::s ⠊⠍⠍ ::t immediate ::lcode eng ::use-only-for-whole-word
1873
+ ::s ⠊⠍⠍⠇⠽ ::t immediately ::lcode eng ::use-only-for-whole-word
1874
+ ::s ⠔ ::t in ::lcode eng
1875
+ ::s ⠔⠒ ::t incon ::lcode eng ::use-only-at-start-of-word
1876
+ ::s ⠬ ::t ing ::lcode eng
1877
+ ::s ⠭ ::t it ::lcode eng ::use-only-for-whole-word
1878
+ ::s ⠭⠎ ::t its ::lcode eng ::use-only-for-whole-word
1879
+ ::s ⠭⠋ ::t itself ::lcode eng ::use-only-for-whole-word
1880
+ ::s ⠰⠽ ::t ity ::lcode eng ::dont-use-at-start-of-word
1881
+ ::s ⠚ ::t j ::lcode eng
1882
+ ::s ⠚ ::t just ::lcode eng ::use-only-for-whole-word
1883
+ ::s ⠅ ::t k ::lcode eng
1884
+ ::s ⠐⠅ ::t know ::lcode eng
1885
+ ::s ⠅ ::t knowledge ::lcode eng ::use-only-for-whole-word
1886
+ ::s ⠇ ::t l ::lcode eng
1887
+ ::s ⠨⠎ ::t less ::lcode eng ::dont-use-at-start-of-word
1888
+ ::s ⠇⠗ ::t letter ::lcode eng ::use-only-for-whole-word
1889
+ ::s ⠇⠗⠎ ::t letters ::lcode eng ::use-only-for-whole-word
1890
+ ::s ⠇ ::t like ::lcode eng ::use-only-for-whole-word
1891
+ ::s ⠇⠇ ::t little ::lcode eng ::use-only-for-whole-word
1892
+ ::s ⠐⠇ ::t lord ::lcode eng
1893
+ ::s ⠍ ::t m ::lcode eng
1894
+ ::s ⠸⠍ ::t many ::lcode eng
1895
+ ::s ⠰⠞ ::t ment ::lcode eng ::dont-use-at-start-of-word
1896
+ ::s ⠍ ::t more ::lcode eng ::use-only-for-whole-word
1897
+ ::s ⠐⠍ ::t mother ::lcode eng
1898
+ ::s ⠍⠡ ::t much ::lcode eng ::use-only-for-whole-word
1899
+ ::s ⠍⠌ ::t must ::lcode eng ::use-only-for-whole-word
1900
+ ::s ⠍⠽⠋ ::t myself ::lcode eng ::use-only-for-whole-word
1901
+ ::s ⠝ ::t n ::lcode eng
1902
+ ::s ⠐⠝ ::t name ::lcode eng
1903
+ ::s ⠝⠑⠉ ::t necessary ::lcode eng ::use-only-for-whole-word
1904
+ ::s ⠝⠑⠊ ::t neither ::lcode eng ::use-only-for-whole-word
1905
+ ::s ⠰⠎ ::t ness ::lcode eng ::dont-use-at-start-of-word
1906
+ ::s ⠝ ::t not ::lcode eng ::use-only-for-whole-word
1907
+ ::s ⠕⠄⠉ ::t o'clock ::lcode eng ::use-only-for-whole-word
1908
+ ::s ⠷ ::t of ::lcode eng
1909
+ ::s ⠐⠕ ::t one ::lcode eng
1910
+ ::s ⠰⠛ ::t ong ::lcode eng ::dont-use-at-start-of-word
1911
+ ::s ⠳ ::t ou ::lcode eng
1912
+ ::s ⠨⠙ ::t ound ::lcode eng
1913
+ ::s ⠨⠞ ::t ount ::lcode eng
1914
+ ::s ⠐⠳ ::t ought ::lcode eng
1915
+ ::s ⠳⠗⠧⠎ ::t ourselves ::lcode eng ::use-only-for-whole-word
1916
+ ::s ⠳ ::t out ::lcode eng ::use-only-for-whole-word
1917
+ ::s ⠪ ::t ow ::lcode eng
1918
+ ::s ⠏ ::t p ::lcode eng
1919
+ ::s ⠏⠙ ::t paid ::lcode eng ::use-only-for-whole-word
1920
+ ::s ⠐⠏ ::t part ::lcode eng
1921
+ ::s ⠏ ::t people ::lcode eng ::use-only-for-whole-word
1922
+ ::s ⠏⠻⠓ ::t perhaps ::lcode eng ::use-only-for-whole-word
1923
+ ::s ⠟ ::t q ::lcode eng
1924
+ ::s ⠐⠟ ::t question ::lcode eng
1925
+ ::s ⠟⠅ ::t quick ::lcode eng ::use-only-for-whole-word
1926
+ ::s ⠟⠅⠻ ::t quicker ::lcode eng ::use-only-for-whole-word
1927
+ ::s ⠟⠅⠑⠌ ::t quickest ::lcode eng ::use-only-for-whole-word
1928
+ ::s ⠟ ::t quite ::lcode eng ::use-only-for-whole-word
1929
+ ::s ⠗ ::t r ::lcode eng
1930
+ ::s ⠗ ::t rather ::lcode eng ::use-only-for-whole-word
1931
+ ::s ⠐⠗ ::t right ::lcode eng
1932
+ ::s ⠗⠚⠉ ::t rejoice ::lcode eng
1933
+ ::s ⠗⠚⠉⠛ ::t rejoicing ::lcode eng
1934
+ ::s ⠎ ::t s ::lcode eng
1935
+ ::s ⠎⠙ ::t said ::lcode eng ::use-only-for-whole-word
1936
+ ::s ⠩ ::t sh ::lcode eng
1937
+ ::s ⠩ ::t shall ::lcode eng ::use-only-for-whole-word
1938
+ ::s ⠩⠙ ::t should ::lcode eng ::use-only-for-whole-word
1939
+ ::s ⠨⠝ ::t sion ::lcode eng
1940
+ ::s ⠎ ::t so ::lcode eng ::use-only-for-whole-word
1941
+ ::s ⠐⠎ ::t some ::lcode eng
1942
+ ::s ⠸⠎ ::t spirit ::lcode eng
1943
+ ::s ⠌ ::t st ::lcode eng
1944
+ ::s ⠌ ::t still ::lcode eng ::use-only-for-whole-word
1945
+ ::s ⠎⠡ ::t such ::lcode eng ::use-only-for-whole-word
1946
+ ::s ⠞ ::t t ::lcode eng
1947
+ ::s ⠹ ::t th ::lcode eng
1948
+ ::s ⠞ ::t that ::lcode eng ::use-only-for-whole-word
1949
+ ::s ⠹ ::t this ::lcode eng ::use-only-for-whole-word
1950
+ ::s ⠮ ::t the ::lcode eng
1951
+ ::s ⠸⠮ ::t their ::lcode eng
1952
+ ::s ⠮⠍⠧⠎ ::t themselves ::lcode eng ::use-only-for-whole-word
1953
+ ::s ⠐⠮ ::t there ::lcode eng
1954
+ ::s ⠘⠮ ::t these ::lcode eng
1955
+ ::s ⠘⠹ ::t those ::lcode eng
1956
+ ::s ⠐⠹ ::t through ::lcode eng
1957
+ ::s ⠐⠞ ::t time ::lcode eng
1958
+ ::s ⠰⠝ ::t tion ::lcode eng ::dont-use-at-start-of-word
1959
+ ::s ⠖ ::t to ::lcode eng ::use-only-for-whole-word
1960
+ ::s ⠞⠙ ::t today ::lcode eng ::use-only-for-whole-word
1961
+ ::s ⠞⠛⠗ ::t together ::lcode eng ::use-only-for-whole-word
1962
+ ::s ⠞⠍ ::t tomorrow ::lcode eng ::use-only-for-whole-word
1963
+ ::s ⠞⠝ ::t tonight ::lcode eng ::use-only-for-whole-word
1964
+ ::s ⠥ ::t u ::lcode eng
1965
+ ::s ⠥⠝⠒ ::t uncon ::lcode eng ::use-only-at-start-of-word
1966
+ ::s ⠥ ::t us ::lcode eng ::use-only-for-whole-word
1967
+ ::s ⠠⠥⠲⠎⠲ ::t U.S. ::lcode eng
1968
+ ::s ⠐⠥ ::t under ::lcode eng
1969
+ ::s ⠘⠥ ::t upon ::lcode eng
1970
+ ::s ⠧ ::t v ::lcode eng
1971
+ ::s ⠧ ::t very ::lcode eng ::use-only-for-whole-word
1972
+ ::s ⠺ ::t w ::lcode eng
1973
+ ::s ⠴ ::t " ::lcode eng
1974
+ ::s ⠴ ::t was ::lcode eng ::use-only-for-whole-word
1975
+ ::s ⠶ ::t were ::lcode eng ::use-only-for-whole-word
1976
+ ::s ⠱ ::t wh ::lcode eng
1977
+ ::s ⠐⠱ ::t where ::lcode eng
1978
+ ::s ⠱ ::t which ::lcode eng ::use-only-for-whole-word
1979
+ ::s ⠘⠱ ::t whose ::lcode eng
1980
+ ::s ⠺ ::t will ::lcode eng ::use-only-for-whole-word
1981
+ ::s ⠾ ::t with ::lcode eng
1982
+ ::s ⠘⠺ ::t word ::lcode eng
1983
+ ::s ⠐⠺ ::t work ::lcode eng
1984
+ ::s ⠸⠺ ::t world ::lcode eng
1985
+ ::s ⠺⠙ ::t would ::lcode eng ::use-only-for-whole-word
1986
+ ::s ⠭ ::t x ::lcode eng
1987
+ ::s ⠽ ::t y ::lcode eng
1988
+ ::s ⠽ ::t you ::lcode eng ::use-only-for-whole-word
1989
+ ::s ⠽⠗ ::t your ::lcode eng ::use-only-for-whole-word
1990
+ ::s ⠽⠗⠎ ::t yours ::lcode eng ::use-only-for-whole-word
1991
+ ::s ⠽⠗⠋ ::t yourself ::lcode eng ::use-only-for-whole-word
1992
+ ::s ⠽⠗⠧⠎ ::t yourselves ::lcode eng ::use-only-for-whole-word
1993
+ ::s ⠐⠽ ::t young ::lcode eng
1994
+ ::s ⠵ ::t z ::lcode eng
1995
+ ::s ⠠⠴ ::t ’ ::lcode eng
1996
+
1997
+ ::preserve ::from U+2190 ::to U+21FF ::comment Arrows
1998
+ ::preserve ::from U+2200 ::to U+22FF ::comment Mathematical Operators
1999
+ ::preserve ::from U+2300 ::to U+23FF ::comment Miscellaneous Technical
2000
+ ::preserve ::from U+2500 ::to U+257F ::comment Box Drawing
2001
+ ::preserve ::from U+2580 ::to U+259F ::comment Block Elements
2002
+ ::preserve ::from U+25A0 ::to U+25FF ::comment Geometric Shapes
2003
+ ::preserve ::from U+2600 ::to U+26FF ::comment Miscellaneous Symbols
2004
+ ::preserve ::from U+27C0 ::to U+27ED ::comment Miscellaneous Mathematical Symbols-A
2005
+ ::preserve ::from U+27F0 ::to U+27FF ::comment Supplemental Arrows-A
2006
+ ::preserve ::from U+2900 ::to U+297F ::comment Supplemental Arrows-B
2007
+ ::preserve ::from U+2980 ::to U+29FF ::comment Miscellaneous Mathematical Symbols-B
2008
+ ::preserve ::from U+2A00 ::to U+2AFF ::comment Supplemental Mathematical Operators
2009
+ ::preserve ::from U+2B00 ::to U+2BFF ::comment Miscellaneous Symbols and Arrows
2010
+ ::preserve ::from U+2E00 ::to U+2E27 ::comment Supplemental Punctuation (excluding ⸨⸩)
2011
+ ::preserve ::from U+2E2A ::to U+2E7F ::comment Supplemental Punctuation (cont'd)
2012
+ ::preserve ::from U+18B00 ::to U+18CD5 ::comment Khitan Small Script
2013
+ ::preserve ::from U+1D100 ::to U+1D1FF ::comment Musical Symbols
2014
+ ::preserve ::from U+1D6A8 ::to U+1D7CB ::comment Mathematical Alphanumeric Symbols (Greek)
2015
+ ::preserve ::from U+1D800 ::to U+1DAAF ::comment Sutton SignWriting
2016
+ ::preserve ::from U+1F800 ::to U+1F8FF ::comment Supplemental Arrows-C
2017
+ ::preserve ::from U+1FA00 ::to U+1FA6F ::comment Chess Symbols
2018
+ ::preserve ::from U+1FB00 ::to U+1FBCF ::comment Symbols for Legacy Computing
2019
+ ::preserve ::from U+1FA70 ::to U+1FAFF ::comment Symbols and Pictographs Extended-A
uroman/data/romanization-table.v1.2.1.txt ADDED
@@ -0,0 +1,814 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 
2
+ ## European Latin extensions
3
+ # Vowels
4
+ ::s Ä ::t Ae
5
+ ::s Ö ::t Oe
6
+ ::s Ü ::t Ue
7
+ ::s Å ::t Aa
8
+ ::s Æ ::t Ae
9
+ ::s Ø ::t oe
10
+ ::s Œ ::t Oe
11
+ ::s ä ::t ae
12
+ ::s ö ::t oe
13
+ ::s ü ::t ue
14
+ ::s å ::t aa
15
+ ::s æ ::t ae
16
+ ::s ø ::t oe
17
+ ::s œ ::t oe
18
+ # Consonants
19
+ ::s Ç ::t S
20
+ ::s ç ::t s
21
+ ::s Ç ::t Ch ::lcode tur
22
+ ::s ç ::t ch ::lcode tur
23
+ ::s Ş ::t Sh
24
+ ::s ş ::t sh
25
+ ::s Ș ::t Sh
26
+ ::s ș ::t sh
27
+ ::s ß ::t ss
28
+ ::s Ț ::t Ts
29
+ ::s ț ::t ts
30
+
31
+ # Miscellaneous
32
+ ::s ə ::t e
33
+
34
+ # English
35
+ ::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
36
+ ::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
37
+ ::s eight ::t eight ::t-alt eit ::example eight, weight
38
+ ::s Eight ::t Eight ::t-alt Eit ::example Eighteen
39
+ ::s ight ::t ight ::t-alt ait ::example Knight
40
+ ::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
41
+ ::s high ::t high ::t-alt hai ::example highlight
42
+ ::s High ::t High ::t-alt Hai ::example High School
43
+ ::s Isle ::t Isle ::t-alt Ail ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Isle
44
+ ::s Island ::t Island ::t-alt Ailand ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Island
45
+ ::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
46
+ ::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
47
+ ::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
48
+ ::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
49
+ ::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
50
+ ::s ph ::t ph ::t-alt f ::example alpha
51
+ ::s Ph ::t Ph ::t-alt F ::example Philip
52
+ ::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
53
+ ::s tion ::t tion ::t-alt shen ::example
54
+ ::s Sean ::t Sean ::t-alt Shawn ::use-only-at-start-of-word ::use-only-at-end-of-word
55
+ ::s ssion ::t ssion ::t-alt shen ::example Sessions
56
+ ::s St ::t St ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
57
+ ::s St. ::t St. ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
58
+ ::s Wr ::t Wr ::t-alt R ::example Wren
59
+ ::s wr ::t wr ::t-alt r ::example Cartwright
60
+ ::s x ::t x ::t-alt ks ::example Mexico
61
+ ::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
62
+
63
+ # French
64
+ ::s â ::t a ::t-alt as ::example pâte/paste, pastry
65
+ ::s ê ::t e ::t-alt es ::example fête/feast
66
+ ::s î ::t i ::t-alt is ::example île/isle
67
+ ::s ô ::t o ::t-alt os ::example côte/coast
68
+ ::s û ::t u ::t-alt us ::example août/August
69
+ ::s eaux ::t eaux ::t-alt o ::example Bordeaux
70
+ ::s eau ::t eau ::t-alt o ::example Chateau
71
+ ::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
72
+ ::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
73
+ ::s oux ::t oux ::t-alt u
74
+ ::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
75
+
76
+ # German
77
+ ::s Sch ::t Sch ::t-alt Sh
78
+ ::s sch ::t sch ::t-alt sh
79
+ ::s stein ::t stein ::t-alt shtain
80
+ ::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
81
+
82
+ # Dutch
83
+ ::s ij ::t ij ::t-alt ai
84
+ ::s Ij ::t Ij ::t-alt Ai
85
+
86
+ # Greek
87
+ ::s Ι ::t I
88
+ ::s ι ::t i
89
+ ::s ί ::t i
90
+ ::s ἶ ::t i
91
+ ::s Υ ::t Y
92
+ ::s υ ::t y
93
+ ::s Ρ ::t R
94
+ ::s ρ ::t r
95
+ ::s Ντ ::t D
96
+ ::s ντ ::t nd ::t-alt d
97
+ # ::s ντζ ::t ntz
98
+ ::s Μπ ::t B
99
+ ::s μπ ::t mb ::t-alt b
100
+ ::s γγ ::t ng
101
+ ::s γκ ::t ng ::t-alt g
102
+ ::s ει ::t ei ::t-alt i
103
+ ::s ου ::t ou ::t-alt u
104
+ ::s χ ::t ch ::t-alt kh
105
+
106
+ # Cyrillic
107
+ ::s Г ::t G ::t-alt H
108
+ ::s г ::t g ::t-alt h
109
+ ::s Е ::t E ::t-alt Ye
110
+ ::s е ::t e ::t-alt ye
111
+ ::s Ё ::t E ::t-alt Yo
112
+ ::s ё ::t e ::t-alt yo
113
+ ::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
114
+ ::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
115
+ ::s Щ ::t Shch ::t-alt Sh
116
+ ::s щ ::t shch ::t-alt sh
117
+ ::s Ъ ::t ::comment Cyrillic capital hard sign
118
+ ::s ъ ::t ::comment Cyrillic small hard sign
119
+ ::s Ы ::t Y ::comment Cyrillic capital yeru
120
+ ::s ы ::t y ::comment Cyrillic small yeru
121
+ ::s Ь ::t ::comment Cyrillic capital soft sign
122
+ ::s ь ::t ::comment Cyrillic small soft sign
123
+
124
+ ::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
125
+ ::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
126
+ ::s Ә ::t e ::comment Cyrillic capital schwa
127
+ ::s ә ::t e ::comment Cyrillic small schwa
128
+ ::s Ӏ ::t ' ::comment Cyrillic palochka
129
+ ::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
130
+ ::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
131
+ ::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
132
+ ::s ӕ ::t ae ::comment Cyrillic small ligature a ie
133
+ ::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
134
+ ::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
135
+ ::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
136
+ ::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
137
+
138
+ # Gothic
139
+ ::s 𐌴 ::t e ::comment Gothic letter aihvus
140
+ ::s 𐌹 ::t i ::comment Gothic letter eis
141
+ ::s 𐍇 ::t x ::comment Gothic letter iggws
142
+
143
+ # Georgian
144
+ ::s ა ::t a ::comment Georgian letter an
145
+ ::s ე ::t e ::comment Georgian letter en
146
+ ::s ი ::t i ::comment Georgian letter in
147
+ ::s ო ::t o ::comment Georgian letter on
148
+ ::s უ ::t u ::comment Georgian letter un
149
+
150
+ # Armenian
151
+ ::s Ա ::t a ::comment Armenian capital letter ayb
152
+ ::s ա ::t a ::comment Armenian small letter ayb
153
+ ::s Ե ::t e ::comment Armenian capital letter ech
154
+ ::s ե ::t e ::comment Armenian small letter ech
155
+ ::s և ::t ev ::comment Armenian small ligature ech yiwn
156
+ ::s Է ::t e ::comment Armenian capital letter eh
157
+ ::s է ::t e ::comment Armenian small letter eh
158
+ ::s Ի ::t i ::comment Armenian capital letter ini
159
+ ::s ի ::t i ::comment Armenian small letter ini
160
+ ::s Օ ::t o ::comment Armenian capital letter oh
161
+ ::s օ ::t o ::comment Armenian small letter oh
162
+
163
+ ## Japanese
164
+ # Katakana
165
+ ::s シ ::t shi
166
+ ::s チ ::t chi
167
+ ::s フ ::t fu
168
+ ::s ジ ::t ji
169
+ ::s ヂ ::t ji
170
+ ::s ヅ ::t zu
171
+ ::s シャ ::t sha
172
+ ::s シュ ::t shu
173
+ ::s ショ ::t sho
174
+ ::s チャ ::t cha
175
+ ::s チェ ::t che
176
+ ::s チュ ::t chu
177
+ ::s チョ ::t cho
178
+ ::s ジャ ::t ja
179
+ ::s ジュ ::t ju
180
+ ::s ジョ ::t jo
181
+ ::s ジェ ::t je
182
+ ::s ヂャ ::t ja
183
+ ::s ヂュ ::t ju
184
+ ::s ヂョ ::t jo
185
+ ::s フェ ::t fe
186
+ ::s ヴェ ::t ve
187
+ ::s フィ ::t fi
188
+ ::s ウィ ::t wi
189
+ ::s ヴィ ::t vi
190
+ ::s ティ ::t ti
191
+ ::s ディ ::t di
192
+ ::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
193
+ ::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
194
+ # Hiragana
195
+ ::s し ::t shi
196
+ ::s ち ::t chi
197
+ ::s つ ::t tsu
198
+ ::s ふ ::t fu
199
+ ::s を ::t o
200
+ ::s じ ::t ji
201
+ ::s ぢ ::t ji
202
+ ::s づ ::t zu
203
+ ::s しゃ ::t sha
204
+ ::s しゅ ::t shu
205
+ ::s しょ ::t sho
206
+ ::s ちゃ ::t cha
207
+ ::s ちゅ ::t chu
208
+ ::s ちょ ::t cho
209
+ ::s じゃ ::t ja
210
+ ::s じゅ ::t ju
211
+ ::s じょ ::t jo
212
+ ::s ぢゃ ::t ja
213
+ ::s ぢゅ ::t ju
214
+ ::s ぢょ ::t jo
215
+ ::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
216
+ ::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
217
+
218
+ ::s フ ::t fu ::t-alt f
219
+ ::s キ ::t ki ::t-alt k
220
+ ::s ク ::t ku ::t-alt k
221
+ ::s ラ ::t ra ::t-alt la
222
+ ::s リ ::t ri ::t-alt li
223
+ ::s ル ::t ru ::t-alt lu, l, r
224
+ ::s レ ::t re ::t-alt le
225
+ ::s ロ ::t ro ::t-alt lo
226
+ ::s ム ::t mu ::t-alt m ::example キム = Kim
227
+ ::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
228
+ ::s ス ::t su ::t-alt s
229
+ ::s ト ::t to ::t-alt t
230
+ ::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
231
+
232
+ # Chinese
233
+ ::s 邦 ::t bang ::t-alt bon, bum, bun, pon
234
+ ::s 鲍 ::t bao ::t-alt bow
235
+ ::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
236
+ ::s 贝 ::t bei ::t-alt ber
237
+ ::s 本 ::t ben ::t-alt bern, bon, bourn, burn
238
+ ::s 彼得 ::t bide ::t-alt peter, pet
239
+ ::s 伯 ::t bo ::t-alt ber
240
+ ::s 波 ::t bo ::t-alt po
241
+ ::s 布 ::t bu ::t-alt b
242
+ ::s 策 ::t ce ::t-alt tze, tzer
243
+ ::s 曾 ::t ceng ::t-alt tzen, zen
244
+ ::s 彻 ::t che ::t-alt tche
245
+ ::s 茨 ::t ci ::t-alt ts, tz, z
246
+ ::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
247
+ ::s 蒂 ::t di ::t-alt ti, tti
248
+ ::s 丁 ::t ding ::t-alt din, tin
249
+ ::s 顿 ::t dun ::t-alt ton
250
+ ::s 多 ::t duo ::t-alt do, dor, to
251
+ ::s 尔 ::t er ::t-alt l, le, ll, r
252
+ ::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
253
+ ::s 夫 ::t fu ::t-alt f, v, v
254
+ ::s 福 ::t fu ::t-alt faw, for, ford
255
+ ::s 哥 ::t ge ::t-alt go, co
256
+ ::s 戈 ::t ge ::t-alt go
257
+ ::s 各 ::t ge ::t-alt go, co
258
+ ::s 赫 ::t he ::t-alt ch, che, cher, ge
259
+ ::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
260
+ ::s 怀 ::t huai ::t-alt whi, wi, wy
261
+ ::s 惠 ::t hui ::t-alt wha, whea
262
+ ::s 基 ::t ji ::t-alt ki, chi
263
+ ::s 吉 ::t ji ::t-alt gi, gui
264
+ ::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
265
+ ::s 杰 ::t jie ::t-alt ger
266
+ ::s 金 ::t jin ::t-alt kin, gin
267
+ ::s 斤 ::t jin ::t-alt zin
268
+ ::s 康 ::t kang ::t-alt con, corn
269
+ ::s 考 ::t kao ::t-alt cow, cour
270
+ ::s 克 ::t ke ::t-alt k, che, cher
271
+ ::s 科 ::t ke ::t-alt ko
272
+ ::s 拉 ::t la ::t-alt ra ::example Tirana
273
+ ::s 朗 ::t lang ::t-alt lon, ron
274
+ ::s 赖 ::t lai ::t-alt ri
275
+ ::s 劳 ::t lao ::t-alt low
276
+ ::s 勒 ::t lei ::t-alt ler
277
+ ::s 伦 ::t lun ::t-alt lon, ran, ron
278
+ ::s 里 ::t li ::t-alt ri
279
+ ::s 利 ::t li ::t-alt ri ::example Ferrari
280
+ ::s 隆 ::t long ::t-alt lon, lum, lund
281
+ ::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
282
+ ::s 洛 ::t luo ::t-alt lo, low, ro
283
+ ::s 默 ::t mo ::t-alt mer
284
+ ::s 纳 ::t na ::t-alt ne, ner
285
+ ::s 珀 ::t po ::t-alt per
286
+ ::s 奇 ::t qi ::t-alt chi, dge, ge, tch
287
+ ::s 齐 ::t qi ::t-alt tsi, zi
288
+ ::s 乔 ::t qiao ::t-alt jo
289
+ ::s 青 ::t qing ::t-alt tsing
290
+ ::s 琼 ::t qiong ::t-alt jon, jum, jun
291
+ ::s 瑟 ::t se ::t-alt the
292
+ ::s 什 ::t shen ::t-alt sh
293
+ ::s 圣 ::t sheng ::t-alt san, sao, saint
294
+ ::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
295
+ ::s 索 ::t suo ::t-alt tho
296
+ ::s 特 ::t te ::t-alt t
297
+ ::s 翁 ::t weng ::t-alt on
298
+ ::s 沃 ::t wo ::t-alt ver, vo, war, wer
299
+ ::s 乌 ::t wu ::t-alt ou, u
300
+ ::s 希 ::t xi ::t-alt chi, hi, shi
301
+ ::s 西 ::t xi ::t-alt s, si
302
+ ::s 锡 ::t xi ::t-alt ci, si, thi, zi
303
+ ::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
304
+ ::s 香 ::t xiang ::t-alt chan, cham
305
+ ::s 歇 ::t xie ::t-alt she
306
+ ::s 谢 ::t xie ::t-alt che, she
307
+ ::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
308
+ ::s 欣 ::t xin ::t-alt hin, shin
309
+ ::s 休 ::t xiu ::t-alt hu, hue
310
+ ::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
311
+ ::s 许 ::t xu ::t-alt hue, schue
312
+ ::s 逊 ::t xun ::t-alt son
313
+ ::s 耶 ::t ye ::t-alt yer, ier
314
+ ::s 泽 ::t ze ::t-alt ser
315
+ ::s 扎 ::t zha ::t-alt za
316
+ ::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
317
+ ::s 治 ::t zhi ::t-alt ge ::example George
318
+
319
+ ## Numbers
320
+ # Chinese and Japanese numbers
321
+ ::s 零 ::num 0
322
+ ::s 〇 ::num 0
323
+ ::s 一 ::num 1
324
+ ::s 二 ::num 2
325
+ ::s 三 ::num 3
326
+ ::s 四 ::num 4
327
+ ::s 五 ::num 5
328
+ ::s 六 ::num 6
329
+ ::s 七 ::num 7
330
+ ::s 八 ::num 8
331
+ ::s 九 ::num 9
332
+ ::s 十 ::num 10
333
+ ::s 百 ::num 100
334
+ ::s 千 ::num 1000
335
+ ::s 万 ::num 10000
336
+ ::s 萬 ::num 10000
337
+ ::s 亿 ::num 100000000
338
+ ::s 億 ::num 100000000
339
+ ::s 兆 ::num 1000000000000
340
+ ::s 京 ::num 10000000000000000
341
+
342
+ ::s 北京 ::t beijing
343
+ ::s 京都 ::t jingdou
344
+ ::s 东京 ::t dongjing
345
+ ::s 京胡 ::t jinghu
346
+ ::s 南京 ::t nanjing
347
+ ::s 普京 ::t pujing ::comment Putin
348
+ ::s 東京 ::t dongjing ::comment Tokyo
349
+ ::s 京兆 ::t jingzhao
350
+
351
+ ::s ㎢ ::t km²
352
+ ::s ㎥ ::t m³
353
+ ::s ㎝ ::t cm
354
+
355
+ ## Indian
356
+ # see mostly under UnicodeDataOverwrite.txt
357
+
358
+ # Malayalam
359
+ ::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
360
+
361
+ # Tamil
362
+ ::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
363
+ ::s ஃப ::t f ::comment h+p=f
364
+ ::s ஃஜ ::t z ::comment h+j=z
365
+
366
+ # Myanmar/Burmese
367
+ # ::s ့ ::t ::comment dot below, denotes creaky tone
368
+ # ::s း ::t ::comment visarga, denotes high tone
369
+ ::s ၌ ::t -nai ::comment locative
370
+ ::s ၍ ::t -jwe ::comment completed
371
+ ::s ၎ ::t legau ::comment aforementioned
372
+ ::s ၏ ::t -i ::comment genitive
373
+
374
+ # Lao
375
+ ::s ັ ::t a ::comment vowel sign mai kan
376
+ ::s ົ ::t o ::comment vowel sign mai kon
377
+ ::s ູ ::t uu ::comment vowel sign uu
378
+ ::s ຽ ::t y ::comment semivowel sign nyo
379
+ ::s ຼ ::t l ::comment semivowel sign lo
380
+ ::s ລ ::t l ::comment lo loot
381
+ ::s ຣ ::t l ::comment lo ling
382
+ ::s ໝ ::t m ::comment ho mo
383
+ ::s ໜ ::t n ::comment ho no
384
+ ::s ຢ ::t y ::comment yo
385
+ ::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
386
+ ::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
387
+ ::s ຯ ::t ... ::comment Lao ellipsis
388
+
389
+ # Thai
390
+ ::s ออ ::t o
391
+ ::s อั ::t a
392
+ ::s อิ ::t i
393
+ ::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
394
+
395
+ # Khmer
396
+ ::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
397
+ ::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
398
+ ::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
399
+ ::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
400
+ ::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
401
+ ::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
402
+
403
+ ## Semitic languages
404
+ # Arabic
405
+ ::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
406
+ ::s ء ::t ' ::comment hamza
407
+ ::s ٔ ::t ' ::comment hamza above
408
+ ::s ٕ ::t ' ::comment hamza below
409
+ ::s ع ::t ' ::comment ain
410
+ ::s آ ::t a ::comment alef madda
411
+ ::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
412
+ ::s إ ::t i ::comment alef with hamza below
413
+ ::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
414
+ ::s ة ::t a ::comment teh marbuta
415
+ ::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
416
+ ::s ي ::t y ::comment Arabic yeh
417
+ ::s ى ::t a ::comment alef maksura
418
+ ::s ﻯ ::t a ::comment alef maksura isolated form
419
+ ::s ﻰ ::t a ::comment alef maksura final form
420
+ ::s ﯨ ::t a ::comment Uighur Kazach Kirghiz alef maksura initial form
421
+ ::s ﯩ ::t a ::comment Uighur Kazach Kirghiz alef maksura medial form
422
+ ::s ٰ ::t a ::comment Arabic letter superscript alef
423
+ ::s ـ ::t ::comment tatweel (filler)
424
+ ::s َ ::t a ::comment fatha ("-a")
425
+ ::s ُ ::t u ::comment damma ("-u")
426
+ ::s ِ ::t i ::comment kasra ("-i")
427
+ ::s ْ ::t ::comment sukun (no vowel)
428
+ ::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
429
+ ::s ً ::t ::comment fathatan ("-an")
430
+ ::s اً ::t an ::comment alef + fathatan
431
+ ::s ٌ ::t ::comment dammatan ("-un")
432
+ ::s ٍ ::t ::comment kasratan ("-in")
433
+ ::s ّ ::t ::comment shadda (consonant doubler)
434
+ ::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
435
+ ::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
436
+ ::s ۾ ::t men ::comment Sindhi postposition men
437
+ ::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
438
+ ::s ﷴ ::t mohammad ::comment "Mohammad"
439
+ ::s ﷸ ::t wasallam ::comment "and peace"
440
+ ::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
441
+
442
+ # Farsi
443
+ ::s ی ::t i ::t-alt y ::comment Contributed by Nima
444
+ ::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
445
+ ::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
446
+ ::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
447
+ ::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
448
+ ::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
449
+ ::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
450
+ ::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
451
+ ::s عا ::t a ::lcode fas ::comment Contributed by Nima
452
+ ::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
453
+ ::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
454
+ ::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
455
+ ::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
456
+ ::s ‌ ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
457
+ ::s غ ::t gh ::t-alt g ::lcode fas
458
+ ::s آئی ::t ai ::t-alt ae ::lcode fas
459
+ ::s ائی ::t ai ::t-alt ae ::lcode fas
460
+ ::s آئو ::t au ::t-alt ao ::lcode fas
461
+ ::s ائو ::t au ::t-alt ao ::lcode fas
462
+
463
+ # Kashmiri (so far: educated guesses)
464
+ ::s ٖ ::t a ::comment Arabic subscript alef U+0656
465
+ ::s ٗ ::t u ::comment Arabic inverted damma U+0657
466
+ ::s ۚ ::t j ::comment Arabic small high jeem U+06DA
467
+ ::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
468
+ ::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
469
+
470
+ # Pashto
471
+ ::s ٙ ::t e
472
+
473
+ # Hebrew
474
+ ::s ב ::t v ::comment Hebrew letter bet ::t-alt b
475
+ ::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
476
+ ::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
477
+ ::s פ ::t f ::comment Hebrew letter pe ::t-alt p
478
+ ::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
479
+ ::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
480
+ ::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
481
+ ::s ק ::t q ::t-alt k ::use-alt-in-pointed
482
+ ::s וֹ ::t o
483
+ ::s וּ ::t u
484
+ ::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
485
+ ::s י ::t y
486
+ ::s יּ ::t y
487
+ ::s יָּ ::t ya
488
+ ::s ע ::t '
489
+ ::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
490
+ ::s ֵי ::t e
491
+ ::s ִיּ ::t iy
492
+ ::s ִיָּ ::t iya
493
+ ::s ױ ::t oy
494
+ ::s א ::t a ::t-alt '
495
+ ::s אָ ::t a
496
+ ::s ֹא ::t o
497
+ ::s אַ ::t 'a
498
+ ::s אֲ ::t 'a
499
+ ::s אֶ ::t e
500
+ ::s אֱ ::t e
501
+ ::s פ ::t f
502
+ ::s פּ ::t p
503
+ ::s פַּ ::t pa
504
+ ::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
505
+ ::s שׁ ::t sh
506
+ ::s שָׁ ::t sha
507
+ ::s שָּׁ ::t sha ::comment ?
508
+ ::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
509
+ ::s שֶׁ ::t she
510
+ ::s שִׁ ::t shi
511
+ ::s שֻׁ ::t shu
512
+ ::s שׂ ::t s
513
+ ::s שָׂ ::t sa
514
+ ::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
515
+ ::s כּ ::t k
516
+ ::s כֶּ ::t ke
517
+ ::s כֹּ ::t ko
518
+ ::s בּ ::t b
519
+ ::s בַּ ::t ba
520
+ ::s בָּ ::t ba
521
+ ::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
522
+ ::s בֶּ ::t be
523
+ ::s תּ ::t t
524
+ ::s תַּ ::t ta
525
+ ::s תֵּ ::t te
526
+ ::s תִּ ::t ti
527
+ ::s דָּ ::t da
528
+ ::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
529
+ ::s גּ ::t g
530
+ ::s לֵּ ::t le
531
+ ::s ד׳ ::t dh
532
+ ::s ג׳ ::t j
533
+ ::s ת׳ ::t th
534
+ ::s ז׳ ::t zh
535
+ ::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
536
+ ::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
537
+ ::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
538
+ ::s ַ ::t a ::comment Hebrew point patah
539
+ ::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
540
+ ::s ֳ ::t o ::comment Hebrew point hataf qamats
541
+ ::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
542
+ ::s ֶ ::t e ::comment Hebrew point segol
543
+ ::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
544
+ ::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
545
+ ::s ֵ ::t e ::comment Hebrew point tsere
546
+ ::s ִ ::t i ::comment Hebrew point hiriq
547
+ ::s ֹ ::t o ::comment Hebrew point holam
548
+ ::s ֻ ::t u ::comment Hebrew point qubuts
549
+ # ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
550
+
551
+ # Yiddish
552
+ ::s א ::t a ::lcode yid ::comment called "silent" alef
553
+ ::s אי ::t y ::lcode yid
554
+ ::s איי ::t ey ::lcode yid
555
+ ::s או ::t u ::lcode yid
556
+ ::s אוי ::t oy ::lcode yid
557
+ ::s אַ ::t a ::lcode yid
558
+ ::s אָ ::t o ::lcode yid
559
+ ::s ב ::t b ::lcode yid
560
+ ::s בֿ ::t v ::lcode yid
561
+ ::s דזש ::t dzh ::lcode yid
562
+ ::s ו ::t u ::lcode yid
563
+ ::s וּ ::t u ::lcode yid
564
+ ::s וֹ ::t o ::lcode yid
565
+ ::s װ ::t v ::lcode yid
566
+ ::s ווא ::t wa ::lcode yid
567
+ ::s וואַ ::t wa ::lcode yid
568
+ ::s ווע ::t we ::lcode yid
569
+ ::s ווי ::t wi ::lcode yid
570
+ ::s וואוי ::t wo ::lcode yid
571
+ ::s וי ::t oy ::lcode yid
572
+ ::s זש ::t zh ::lcode yid
573
+ ::s ח ::t ch ::lcode yid
574
+ ::s טש ::t tsh ::lcode yid
575
+ ::s יִ ::t i ::lcode yid
576
+ ::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
577
+ ::s ײַ ::t ay ::lcode yid
578
+ ::s כּ ::t k ::lcode yid
579
+ ::s כ ::t ch ::lcode yid
580
+ ::s ך ::t ch ::lcode yid
581
+ ::s ע ::t e ::lcode yid
582
+ ::s פּ ::t p ::lcode yid
583
+ ::s פֿ ::t f ::lcode yid
584
+ ::s ף ::t f ::lcode yid ::comment sometimes p
585
+ ::s ק ::t k ::lcode yid
586
+ ::s ת ::t s ::lcode yid
587
+
588
+ # Syriac/Aramaic (should be vetted by expert)
589
+ ::s ܰ ::t a ::comment Syriac pthaha above
590
+ ::s ܲ ::t a ::comment Syriac pthaha dotted
591
+ ::s ܳ ::t aa ::comment Syriac zqapha above
592
+ ::s ܴ ::t aa ::comment Syriac zqapha below
593
+ ::s ܵ ::t aa ::comment Syriac zqapha dotted
594
+ ::s ܶ ::t e ::comment Syriac rbasa above
595
+ ::s ܷ ::t e ::comment Syriac rbasa below
596
+ ::s ܿ ::t o ::comment Syriac rwaha
597
+ ::s ܸ ::t e ::comment Syriac dotted zlama horizontal
598
+ ::s ܹ ::t e ::comment Syriac dotted zlama angular
599
+ ::s ܺ ::t i ::comment Syriac hbasa above
600
+ ::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
601
+ ::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
602
+ ::s ܽ ::t o ::comment Syriac esasa above
603
+ ::s ܾ ::t u ::comment Syriac esasa below
604
+ ::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
605
+
606
+ ::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
607
+ ::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
608
+ ::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
609
+ ::s ܒ̥ ::t v ::comment Syriac beth + ring-below
610
+ ::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
611
+ ::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
612
+ ::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
613
+ ::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
614
+ ::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
615
+ ::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
616
+ ::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
617
+ ::s ܦ̥ ::t f ::comment Syriac pe + ring-below
618
+ ::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
619
+ ::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
620
+ ::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
621
+
622
+ ::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications
623
+ ::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
624
+ ::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
625
+
626
+ # Uzbek
627
+ ::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
628
+ ::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
629
+
630
+ # Uyghur
631
+ ::s ئا ::t a ::lcode uig
632
+ ::s ە ::t e ::lcode uig
633
+ ::s ئې ::t e ::lcode uig ::latinplus ë
634
+ ::s ې ::t e ::lcode uig ::latinplus ë
635
+ ::s ئە ::t e ::lcode uig
636
+ ::s يە ::t e ::lcode uig
637
+ ::s ئى ::t i ::lcode uig
638
+ ::s ى ::t i ::lcode uig
639
+ ::s ئو ::t o ::lcode uig
640
+ ::s و ::t o ::lcode uig
641
+ ::s ئۇ ::t u ::lcode uig
642
+ ::s ۇ ::t u ::lcode uig
643
+ ::s چ ::t ch ::t-alt q ::lcode uig
644
+ ::s خ ::t x ::lcode uig
645
+ ::s ژ ::t zh ::lcode uig
646
+ ::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
647
+ ::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
648
+ ::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
649
+ ::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
650
+ ::s ۋ ::t w ::lcode uig
651
+
652
+ # Maldivian
653
+ ::s ް ::t ::comment thaana sukun
654
+ ::s ަ ::t a ::comment thaana abafili
655
+ ::s ާ ::t aa ::comment thaana aabaafili
656
+ ::s ި ::t i ::comment thaana ibifili
657
+ ::s ީ ::t ee ::comment thaana eebeefili
658
+ ::s ު ::t u ::comment thaana ubufili
659
+ ::s ޫ ::t oo ::comment thaana ooboofili
660
+ ::s ެ ::t e ::comment thaana ebefili
661
+ ::s ޭ ::t ey ::comment thaana eybeyfili
662
+ ::s ޮ ::t o ::comment thaana obofili
663
+ ::s ޯ ::t oa ::comment thaana oaboafili
664
+
665
+ # Canadian syllabics (Inuktitut)
666
+ ::s ᑊ ::t p ::comment syllable final
667
+ ::s ᐟ ::t t ::comment syllable final
668
+ ::s ᐠ ::t k ::comment syllable final
669
+ ::s ᐨ ::t c ::comment syllable final
670
+ ::s ᒼ ::t m ::comment syllable final
671
+ ::s ᐣ ::t n ::comment syllable final
672
+ ::s ᐢ ::t s ::comment syllable final
673
+ ::s ᐧ ::t y ::comment syllable final
674
+ ::s ᐤ ::t w ::comment syllable final
675
+ ::s ᐦ ::t h ::comment syllable final
676
+ ::s ᕽ ::t hk ::comment syllable final
677
+ ::s ᓫ ::t l ::comment syllable final
678
+ ::s ᕑ ::t r ::comment syllable final
679
+
680
+ ## Punctuation
681
+ # delete
682
+ ::s ¿ ::t "" ::comment inverted question mark
683
+ ::s ¡ ::t "" ::comment inverted exclamation mark
684
+ # preserve
685
+ ::s ′ ::t ′
686
+ # Cyrillic
687
+ ::s ⁙ ::t . ::comment five dot punctuation
688
+ # Amharic/Ethiopian
689
+ ::s ። ::t .
690
+ ::s ፣ ::t ,
691
+ ::s ፤ ::t ;
692
+ ::s ፥ ::t :
693
+ ::s ፡ ::t " " ::comment Ethiopic wordspace
694
+ ::s ፦ ::t : ::comment Ethiopic preface colon
695
+ ::s ቸ ::t cha ::comment Ethiopic syllable ca
696
+ ::s ቹ ::t chu ::comment Ethiopic syllable cu
697
+ ::s ቺ ::t chi ::comment Ethiopic syllable ci
698
+ ::s ቻ ::t chaa ::comment Ethiopic syllable caa
699
+ ::s ቼ ::t chee ::comment Ethiopic syllable cee
700
+ ::s ች ::t che ::comment Ethiopic syllable ce
701
+ ::s ቾ ::t cho ::comment Ethiopic syllable co
702
+ ::s ሠ ::t sa ::comment Ethiopic syllable sza
703
+ ::s ሡ ::t su ::comment Ethiopic syllable szu
704
+ ::s ሢ ::t si ::comment Ethiopic syllable szi
705
+ ::s ሣ ::t saa ::comment Ethiopic syllable szaa
706
+ ::s ሤ ::t see ::comment Ethiopic syllable szee
707
+ ::s ሥ ::t se ::comment Ethiopic syllable sze
708
+ ::s ሦ ::t so ::comment Ethiopic syllable szo
709
+ ::s ጠ ::t ta ::comment Ethiopic syllable tha with ejective 't'
710
+ ::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
711
+ ::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
712
+ ::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
713
+ ::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
714
+ ::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
715
+ ::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
716
+
717
+ # Devanagari (Hindi etc.)
718
+ ::s । ::t . ::comment danda
719
+ ::s ॥ ::t . ::comment double danda
720
+ ::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
721
+ ::s ॰ ::t . ::comment Devanagari abbreviation sign
722
+ # Oriya/Odia (India)
723
+ ::s ୤ ::t . ::comment danda (deprecated, should use Devanagari danda ।)
724
+ ::s ୥ ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
725
+ # Tibetan
726
+ ::s ། ::t ,
727
+ ::s །: ::t :
728
+ ::s ༏ ::t ;
729
+ ::s ༎ ::t .
730
+ ::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
731
+ ::s ༼ ::t ( ::comment Tibetan open roof punctuation
732
+ ::s ༽ ::t ) ::comment Tibetan close roof punctuation
733
+ ::s ༈ ::t "" ::comment Tibetan mark srbul shad
734
+ ::s 【 ::t [ ::comment left black lenticular bracket
735
+ ::s 】 ::t ] ::comment right black lenticular bracket
736
+ ::s ༄ ::t "" ::comment Tibetan head mark
737
+ ::s ༄༅ ::t "" ::comment Tibetan head mark
738
+ ::s ༆ ::t "" ::comment Tibetan head mark
739
+ # Myanmar/Burmese
740
+ ::s ၊ ::t ,
741
+ ::s ။ ::t .
742
+ # Khmer
743
+ ::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
744
+ ::s ។ ::t . ::comment Khmer sign khan
745
+ # Arabic
746
+ ::s ، ::t ,
747
+ ::s ؛ ::t ;
748
+ ::s ٬ ::t ,
749
+ ::s ۔ ::t .
750
+ ::s ؟ ::t ?
751
+ ::s ٪ ::t %
752
+ ::s ٫ ::t , ::comment Arabic decimal separator
753
+ ::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
754
+ # Aramaic
755
+ ::s ܀ ::t .
756
+ ::s ܂ ::t .
757
+ # Hebrew
758
+ ::s ־ ::t - ::comment maqaf
759
+ # Armenian
760
+ ::s ։ ::t .
761
+ ::s ՝ ::t , ::comment Armenian comma
762
+ # Chinese
763
+ ::s , ::t ", "
764
+ ::s 、 ::t ", "
765
+ ::s 。 ::t ". "
766
+ ::s ! ::t "! "
767
+ ::s ? ::t "? "
768
+ ::s 「 ::t ' "'
769
+ ::s 」 ::t '" '
770
+ ::s 《 ::t ' "'
771
+ ::s 》 ::t '" '
772
+ ::s ( ::t " ("
773
+ ::s ) ::t ") "
774
+ ::s ; ::t ;
775
+ ::s : ::t ": "
776
+ ::s ︰ ::t ": "
777
+ ::s - ::t -
778
+ ::s / ::t /
779
+ ::s = ::t =
780
+ ::s ~ ::t ~
781
+ ::s & ::t &
782
+ ::s < ::t <
783
+ ::s > ::t >
784
+ ::s % ::t %
785
+ ::s   ::t " " ::comment ideographic space
786
+ # Japanese
787
+ ::s 『 ::t ' "'
788
+ ::s 』 ::t '" '
789
+ ::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
790
+
791
+ # Symbols
792
+ ::s ∞ ::t ∞ ::comment infinity
793
+ ::s ­ ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
794
+ ::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
795
+ ::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
796
+ ::s ﹐ ::t , ::comment small comma; map to regular comma
797
+ ::s ˚ ::t ° ::comment ring above; map to degree sign
798
+ ::s ⇒ ::t ⇒ ::comment rightwards double arrow
799
+ ::s † ::t † ::comment dagger
800
+ ::s • ::t • ::comment bullet
801
+ ::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
802
+ ::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
803
+ ::s ― ::t ― ::comment horizontal bar
804
+ ::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
805
+ ::s ″ ::t ″ ::comment double prime
806
+ ::s ﴾ ::t ( ::comment ornate left parenthesis
807
+ ::s ﴿ ::t ) ::comment ornate right parenthesis
808
+ ::s 〔 ::t [ ::comment left tortoise shell bracket
809
+ ::s 〕 ::t ] ::comment right tortoise shell bracket
810
+ ::s ﹝ ::t ( ::comment small left tortoise shell bracket
811
+ ::s ﹞ ::t ) ::comment small right tortoise shell bracket
812
+ ::s ♄ ::t ♄ ::comment Saturn
813
+ ::s ♆ ::t ♆ ::comment Neptune
814
+ ::s ♋ ::t ♋ ::comment Cancer
uroman/data/string-distance-cost-rules.txt ADDED
@@ -0,0 +1,896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # String distance
2
+
3
+ ::s1 a ::s2 ::cost 0.1
4
+ ::s1 b ::s2 ::cost 1
5
+ ::s1 b ::s2 ::cost 0.2 ::left1 /[aou]m$/ ::right1 [e] ::lc1 eng ::lc2 zho ::example Balcombe
6
+ ::s1 c ::s2 ::cost 1
7
+ ::s1 c ::s2 ::cost 0.2 ::left1 /[aeou]$/ ::right1 [cgkq] ::lc2 zho
8
+ ::s1 c ::s2 ::cost 0.5 ::left1 /[aeou][lnr]?$/ ::right1 [h] ::lc2 zho
9
+ ::s1 d ::s2 ::cost 1
10
+ ::s1 d ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]$/ ::right1 [-,$ ]
11
+ ::s1 d ::s2 ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [bcfgklmnpqrstvwxz]
12
+ ::s1 e ::s2 ::cost 0.1
13
+ ::s1 é ::s2 ::cost 0.1
14
+ ::s1 e ::s2 ::cost 0.02 ::lc2 fas
15
+ ::s1 e ::s2 ::cost 0.02 ::lc1 amh ::lc2 eng
16
+ ::s1 f ::s2 ::cost 1
17
+ ::s1 g ::s2 ::cost 1
18
+ ::s1 g ::s2 ::cost 0.4 ::right1 [bcdfghklmnpqrstvwxz] ::lc2 zho
19
+ ::s1 g ::s2 ::cost 0.2 ::right1 [k] ::lc2 zho
20
+ ::s1 h ::s2 ::cost 0.5
21
+ ::s1 h ::s2 ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [-,bcdfghklmnpqrstvwxz$ ]
22
+ ::s1 h ::s2 ::cost 0.2 ::left1 /[bdlnr]$/ ::right1 [-,$ aeiouy] ::example Delhi, Minh, Riyadh
23
+ ::s1 i ::s2 ::cost 0.1
24
+ ::s1 j ::s2 ::cost 0.5
25
+ ::s1 k ::s2 ::cost 1
26
+ ::s1 l ::s2 ::cost 1
27
+ ::s1 l ::s2 ::cost 0.3 ::left1 /eui$/ ::right1 [-,$ ] ::example Argenteuil
28
+ ::s1 l ::s2 ::cost 0.3 ::left1 /a$/ ::right1 [km] ::comment walk, palm
29
+ ::s1 l ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [bdfgkmpstvwz] ::lc2 zho
30
+ ::s1 m ::s2 ::cost 1
31
+ ::s1 n ::s2 ::cost 1
32
+ ::s1 n ::s2 ::cost 0.7 ::right1 [-,$ ]
33
+ ::s1 o ::s2 ::cost 0.1
34
+ ::s1 p ::s2 ::cost 1
35
+ ::s1 q ::s2 ::cost 1
36
+ ::s1 r ::s2 ::cost 1
37
+ ::s1 r ::s2 ::cost 0.5 ::left1 /[aou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ]
38
+ ::s1 r ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
39
+ ::s1 re ::s2 ::cost 0.4 ::left1 /[ou]$/ ::right1 [-,$ ] ::lc2 zho
40
+ ::s1 re ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
41
+ ::s1 rr ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
42
+ ::s1 s ::s2 ::cost 1
43
+ ::s1 s ::s2 ::cost 0.6 ::right1 [-,$ ]
44
+ ::s1 t ::s2 ::cost 1
45
+ ::s1 t ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]?$/ ::right1 [-,$ ]
46
+ ::s1 t ::s2 ::cost 0.6 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz]
47
+ ::s1 u ::s2 ::cost 0.1
48
+ ::s1 v ::s2 ::cost 1
49
+ ::s1 w ::s2 ::cost 1
50
+ ::s1 w ::s2 ::cost 0.4 ::lc1 eng ::right1 [i][c][hk][-,$ ] ::example Greenwich, Alnwick
51
+ ::s1 x ::s2 ::cost 1
52
+ ::s1 y ::s2 ::cost 0.3
53
+ ::s1 z ::s2 ::cost 1
54
+ ::s1 ı ::s2 ::cost 0.3
55
+ ::s1 0 ::s2 ::cost 1
56
+ ::s1 1 ::s2 ::cost 1
57
+ ::s1 2 ::s2 ::cost 1
58
+ ::s1 3 ::s2 ::cost 1
59
+ ::s1 4 ::s2 ::cost 1
60
+ ::s1 5 ::s2 ::cost 1
61
+ ::s1 6 ::s2 ::cost 1
62
+ ::s1 7 ::s2 ::cost 1
63
+ ::s1 8 ::s2 ::cost 1
64
+ ::s1 9 ::s2 ::cost 1
65
+ ::s1 ' ::s2 ::cost 0.1
66
+ ::s1 ` ::s2 ::cost 0.1
67
+ ::s1 ( ::s2 ::cost 0.1
68
+ ::s1 ) ::s2 ::cost 0.1
69
+ ::s1 , ::s2 ::cost 0.1
70
+ ::s1 ; ::s2 ::cost 0.1
71
+ ::s1 - ::s2 ::cost 0.1
72
+ ::s1 . ::s2 ::cost 0.1
73
+ ::s1 .. ::s2 ::cost 0.12
74
+ ::s1 ... ::s2 ::cost 0.14
75
+ ::s1 ? ::s2 ::cost 0.2
76
+ ::s1 ! ::s2 ::cost 0.2
77
+ ::s1 ‼ ::s2 ::cost 0.2
78
+ ::s1 ‼ ::s2 !! ::cost 0.02
79
+ ::s1 ‼ ::s2 ! ::cost 0.1
80
+ ::s1 / ::s2 ::cost 0.1
81
+ ::s1 : ::s2 ::cost 0.1
82
+ ::s1 ː ::s2 ::cost 0.1
83
+ ::s1 ː ::s2 : ::cost 0.1
84
+ ::s1 « ::s2 ::cost 0.1
85
+ ::s1 » ::s2 ::cost 0.1
86
+ ::s1 – ::s2 ::cost 0.1
87
+ ::s1 – ::s2 - ::cost 0.05
88
+ ::s1 — ::s2 ::cost 0.15
89
+ ::s1 — ::s2 - ::cost 0.1
90
+ ::s1 — ::s2 – ::cost 0.05
91
+ ::s1 ─ ::s2 ::cost 0.2
92
+ ::s1 ─ ::s2 - ::cost 0.15
93
+ ::s1 ─ ::s2 – ::cost 0.1
94
+ ::s1 ─ ::s2 — ::cost 0.05
95
+ ::s1 ’ ::s2 ::cost 0.1
96
+ ::s1 ʼ ::s2 ::cost 0.1
97
+ ::s1 " " ::s2 ::cost 0.1
98
+ ::s1 “ ::s2 ::cost 0.1
99
+ ::s1 ” ::s2 ::cost 0.1
100
+ ::s1 ″ ::s2 ::cost 0.1
101
+ ::s1 # ::s2 ::cost 0.3
102
+ ::s1 + ::s2 ::cost 0.3
103
+ ::s1 * ::s2 ::cost 0.3
104
+ ::s1 = ::s2 ::cost 0.3
105
+ ::s1 < ::s2 ::cost 0.3
106
+ ::s1 > ::s2 ::cost 0.3
107
+ ::s1 [ ::s2 ::cost 0.3
108
+ ::s1 ] ::s2 ::cost 0.3
109
+ ::s1 { ::s2 ::cost 0.3
110
+ ::s1 } ::s2 ::cost 0.3
111
+ ::s1 | ::s2 ::cost 0.3
112
+ ::s1 & ::s2 ::cost 0.3
113
+ ::s1 _ ::s2 ::cost 0.3
114
+ ::s1 • ::s2 ::cost 0.1
115
+ ::s1 · ::s2 ::cost 0.1
116
+ ::s1 ◦ ::s2 ::cost 0.1
117
+ ::s1 ° ::s2 ::cost 0.1
118
+ ::s1 … ::s2 ::cost 0.1
119
+ ::s1 … ::s2 ... ::cost 0
120
+ ::s1 @ ::s2 ::cost 0.3
121
+ ::s1 © ::s2 ::cost 0.3
122
+ ::s1 © ::s2 (c) ::cost 0.1
123
+
124
+
125
+ ::s1 a ::s2 aa ::cost 0.02
126
+ ::s1 a ::s2 aaa ::cost 0.03
127
+ ::s1 a ::s2 aaaa ::cost 0.03
128
+ ::s1 a ::s2 aaaaa ::cost 0.03
129
+ ::s1 a ::s2 aaaaaa ::cost 0.04
130
+ ::s1 a ::s2 aaaaaaa ::cost 0.04
131
+ ::s1 a ::s2 aaaaaaaa ::cost 0.04
132
+ ::s1 a ::s2 aaaaaaaaa ::cost 0.04
133
+ ::s1 a ::s2 aaaaaaaaaa ::cost 0.04
134
+ ::s1 a ::s2 aaaaaaaaaaa ::cost 0.04
135
+ ::s1 a ::s2 aaaaaaaaaaaa ::cost 0.04
136
+ ::s1 a ::s2 aaaaaaaaaaaaa ::cost 0.04
137
+ ::s1 a ::s2 aaaaaaaaaaaaaa ::cost 0.04
138
+ ::s1 a ::s2 aaaaaaaaaaaaaaa ::cost 0.04
139
+ ::s1 a ::s2 aaaaaaaaaaaaaaaa ::cost 0.04
140
+ ::s1 b ::s2 bb ::cost 0.02
141
+ ::s1 b ::s2 bbb ::cost 0.03
142
+ ::s1 b ::s2 bbbb ::cost 0.03
143
+ ::s1 b ::s2 bbbbb ::cost 0.03
144
+ ::s1 c ::s2 cc ::cost 0.02
145
+ ::s1 c ::s2 ccc ::cost 0.03
146
+ ::s1 c ::s2 cccc ::cost 0.03
147
+ ::s1 c ::s2 ccccc ::cost 0.03
148
+ ::s1 d ::s2 dd ::cost 0.02
149
+ ::s1 d ::s2 ddd ::cost 0.03
150
+ ::s1 d ::s2 dddd ::cost 0.03
151
+ ::s1 d ::s2 ddddd ::cost 0.03
152
+ ::s1 e ::s2 ee ::cost 0.02
153
+ ::s1 e ::s2 eee ::cost 0.03
154
+ ::s1 e ::s2 eeee ::cost 0.03
155
+ ::s1 e ::s2 eeeee ::cost 0.03
156
+ ::s1 e ::s2 eeeeee ::cost 0.04
157
+ ::s1 e ::s2 eeeeeee ::cost 0.04
158
+ ::s1 e ::s2 eeeeeeee ::cost 0.04
159
+ ::s1 e ::s2 eeeeeeeee ::cost 0.04
160
+ ::s1 e ::s2 eeeeeeeeee ::cost 0.04
161
+ ::s1 e ::s2 eeeeeeeeeee ::cost 0.04
162
+ ::s1 e ::s2 eeeeeeeeeeee ::cost 0.04
163
+ ::s1 e ::s2 eeeeeeeeeeeee ::cost 0.04
164
+ ::s1 e ::s2 eeeeeeeeeeeeee ::cost 0.04
165
+ ::s1 e ::s2 eeeeeeeeeeeeeee ::cost 0.04
166
+ ::s1 e ::s2 eeeeeeeeeeeeeeee ::cost 0.04
167
+ ::s1 f ::s2 ff ::cost 0.02
168
+ ::s1 f ::s2 fff ::cost 0.03
169
+ ::s1 f ::s2 ffff ::cost 0.03
170
+ ::s1 f ::s2 fffff ::cost 0.03
171
+ ::s1 g ::s2 gg ::cost 0.02
172
+ ::s1 g ::s2 ggg ::cost 0.03
173
+ ::s1 g ::s2 gggg ::cost 0.03
174
+ ::s1 g ::s2 ggggg ::cost 0.03
175
+ ::s1 h ::s2 hh ::cost 0.02
176
+ ::s1 h ::s2 hhh ::cost 0.03
177
+ ::s1 h ::s2 hhhh ::cost 0.03
178
+ ::s1 h ::s2 hhhhh ::cost 0.03
179
+ ::s1 i ::s2 ii ::cost 0.02
180
+ ::s1 i ::s2 iii ::cost 0.03
181
+ ::s1 i ::s2 iiii ::cost 0.03
182
+ ::s1 i ::s2 iiiii ::cost 0.03
183
+ ::s1 i ::s2 iiiiii ::cost 0.04
184
+ ::s1 i ::s2 iiiiiii ::cost 0.04
185
+ ::s1 i ::s2 iiiiiiii ::cost 0.04
186
+ ::s1 i ::s2 iiiiiiiii ::cost 0.04
187
+ ::s1 i ::s2 iiiiiiiiii ::cost 0.04
188
+ ::s1 i ::s2 iiiiiiiiiii ::cost 0.04
189
+ ::s1 i ::s2 iiiiiiiiiiii ::cost 0.04
190
+ ::s1 i ::s2 iiiiiiiiiiiii ::cost 0.04
191
+ ::s1 i ::s2 iiiiiiiiiiiiii ::cost 0.04
192
+ ::s1 i ::s2 iiiiiiiiiiiiiii ::cost 0.04
193
+ ::s1 i ::s2 iiiiiiiiiiiiiiii ::cost 0.04
194
+ ::s1 j ::s2 jj ::cost 0.02
195
+ ::s1 j ::s2 jjj ::cost 0.03
196
+ ::s1 j ::s2 jjjj ::cost 0.03
197
+ ::s1 j ::s2 jjjjj ::cost 0.03
198
+ ::s1 k ::s2 kk ::cost 0.02
199
+ ::s1 k ::s2 kkk ::cost 0.03
200
+ ::s1 k ::s2 kkkk ::cost 0.03
201
+ ::s1 k ::s2 kkkkk ::cost 0.03
202
+ ::s1 l ::s2 ll ::cost 0.02
203
+ ::s1 l ::s2 lll ::cost 0.03
204
+ ::s1 l ::s2 llll ::cost 0.03
205
+ ::s1 l ::s2 lllll ::cost 0.03
206
+ ::s1 m ::s2 mm ::cost 0.02
207
+ ::s1 m ::s2 mmm ::cost 0.03
208
+ ::s1 m ::s2 mmmm ::cost 0.03
209
+ ::s1 m ::s2 mmmmm ::cost 0.03
210
+ ::s1 n ::s2 nn ::cost 0.02
211
+ ::s1 n ::s2 nnn ::cost 0.03
212
+ ::s1 n ::s2 nnnn ::cost 0.03
213
+ ::s1 n ::s2 nnnnn ::cost 0.03
214
+ ::s1 o ::s2 oo ::cost 0.02
215
+ ::s1 o ::s2 ooo ::cost 0.03
216
+ ::s1 o ::s2 oooo ::cost 0.03
217
+ ::s1 o ::s2 ooooo ::cost 0.03
218
+ ::s1 o ::s2 oooooo ::cost 0.04
219
+ ::s1 o ::s2 ooooooo ::cost 0.04
220
+ ::s1 o ::s2 oooooooo ::cost 0.04
221
+ ::s1 o ::s2 ooooooooo ::cost 0.04
222
+ ::s1 o ::s2 oooooooooo ::cost 0.04
223
+ ::s1 o ::s2 ooooooooooo ::cost 0.04
224
+ ::s1 o ::s2 oooooooooooo ::cost 0.04
225
+ ::s1 o ::s2 ooooooooooooo ::cost 0.04
226
+ ::s1 o ::s2 oooooooooooooo ::cost 0.04
227
+ ::s1 o ::s2 ooooooooooooooo ::cost 0.04
228
+ ::s1 o ::s2 oooooooooooooooo ::cost 0.04
229
+ ::s1 p ::s2 pp ::cost 0.02
230
+ ::s1 p ::s2 ppp ::cost 0.03
231
+ ::s1 p ::s2 pppp ::cost 0.03
232
+ ::s1 p ::s2 ppppp ::cost 0.03
233
+ ::s1 q ::s2 qq ::cost 0.02
234
+ ::s1 q ::s2 qqq ::cost 0.03
235
+ ::s1 q ::s2 qqqq ::cost 0.03
236
+ ::s1 q ::s2 qqqqq ::cost 0.03
237
+ ::s1 r ::s2 rr ::cost 0.02
238
+ ::s1 r ::s2 rrr ::cost 0.03
239
+ ::s1 r ::s2 rrrr ::cost 0.03
240
+ ::s1 r ::s2 rrrrr ::cost 0.03
241
+ ::s1 s ::s2 ss ::cost 0.02
242
+ ::s1 s ::s2 sss ::cost 0.03
243
+ ::s1 s ::s2 ssss ::cost 0.03
244
+ ::s1 s ::s2 sssss ::cost 0.03
245
+ ::s1 t ::s2 tt ::cost 0.02
246
+ ::s1 t ::s2 ttt ::cost 0.03
247
+ ::s1 t ::s2 tttt ::cost 0.03
248
+ ::s1 t ::s2 ttttt ::cost 0.03
249
+ ::s1 u ::s2 uu ::cost 0.02
250
+ ::s1 u ::s2 uuu ::cost 0.03
251
+ ::s1 u ::s2 uuuu ::cost 0.03
252
+ ::s1 u ::s2 uuuuu ::cost 0.03
253
+ ::s1 u ::s2 uuuuuu ::cost 0.04
254
+ ::s1 u ::s2 uuuuuuu ::cost 0.04
255
+ ::s1 u ::s2 uuuuuuuu ::cost 0.04
256
+ ::s1 u ::s2 uuuuuuuuu ::cost 0.04
257
+ ::s1 u ::s2 uuuuuuuuuu ::cost 0.04
258
+ ::s1 u ::s2 uuuuuuuuuuu ::cost 0.04
259
+ ::s1 u ::s2 uuuuuuuuuuuu ::cost 0.04
260
+ ::s1 u ::s2 uuuuuuuuuuuuu ::cost 0.04
261
+ ::s1 u ::s2 uuuuuuuuuuuuuu ::cost 0.04
262
+ ::s1 u ::s2 uuuuuuuuuuuuuuu ::cost 0.04
263
+ ::s1 u ::s2 uuuuuuuuuuuuuuuu ::cost 0.04
264
+ ::s1 v ::s2 vv ::cost 0.02
265
+ ::s1 v ::s2 vvv ::cost 0.03
266
+ ::s1 v ::s2 vvvv ::cost 0.03
267
+ ::s1 v ::s2 vvvvv ::cost 0.03
268
+ ::s1 w ::s2 ww ::cost 0.02
269
+ ::s1 w ::s2 www ::cost 0.03
270
+ ::s1 w ::s2 wwww ::cost 0.03
271
+ ::s1 w ::s2 wwwww ::cost 0.03
272
+ ::s1 x ::s2 xx ::cost 0.02
273
+ ::s1 x ::s2 xxx ::cost 0.03
274
+ ::s1 x ::s2 xxxx ::cost 0.03
275
+ ::s1 x ::s2 xxxxx ::cost 0.03
276
+ ::s1 y ::s2 yy ::cost 0.02
277
+ ::s1 y ::s2 yyy ::cost 0.03
278
+ ::s1 y ::s2 yyyy ::cost 0.03
279
+ ::s1 y ::s2 yyyyy ::cost 0.03
280
+ ::s1 z ::s2 zz ::cost 0.02
281
+ ::s1 z ::s2 zzz ::cost 0.03
282
+ ::s1 z ::s2 zzzz ::cost 0.03
283
+ ::s1 z ::s2 zzzzz ::cost 0.03
284
+ ::s1 " " ::s2 " " ::cost 0
285
+ ::s1 . ::s2 ::left1 /\./ ::left2 /\./ ::cost 0.02
286
+ ::s1 … ::s2 ::left1 /…/ ::left2 /…/ ::cost 0.01
287
+ ::s1 _ ::s2 ::left1 /_/ ::left2 /_/ ::cost 0.01
288
+ ::s1 = ::s2 ::left1 /=/ ::left2 /=/ ::cost 0.01
289
+ ::s1 ! ::s2 ::left1 /!/ ::left2 /!/ ::cost 0.02
290
+ ::s1 ? ::s2 ::left1 /\?/ ::left2 /\?/ ::cost 0.02
291
+ ::s1 aa ::s2 aː ::cost 0.02
292
+ ::s1 ee ::s2 eː ::cost 0.02
293
+ ::s1 ii ::s2 iː ::cost 0.02
294
+ ::s1 oo ::s2 oː ::cost 0.02
295
+ ::s1 uu ::s2 uː ::cost 0.02
296
+
297
+ ::s1 a ::s2 e ::cost 0.1
298
+ ::s1 au ::s2 o ::cost 0.1 ::lc1 eng
299
+ ::s1 aw ::s2 o ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
300
+ ::s1 aw ::s2 o ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
301
+ ::s1 aw ::s2 a ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
302
+ ::s1 ay ::s2 i ::cost 0.02 ::lc1 fas ::lc2 eng
303
+ ::s1 aye ::s2 ae ::cost 0.05 ::lc1 fas
304
+ ::s1 é ::s2 e ::cost 0.05
305
+ ::s1 e ::s2 i ::cost 0.15
306
+ ::s1 e ::s2 i ::cost 0.1 ::lc1 uig ::lc2 uig
307
+ ::s1 e ::s2 y ::cost 0.15
308
+ ::s1 ew ::s2 u ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
309
+ ::s1 ew ::s2 u ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
310
+ ::s1 ew ::s2 u ::cost 0.3 ::right1 [aei][lgnrst] ::lc1 eng
311
+ ::s1 ew ::s2 e ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
312
+ ::s1 i ::s2 a ::cost 0.1 ::right1 [-,$ ] ::lc1 fas
313
+ ::s1 i ::s2 ea ::cost 0.03 ::lc2 eng
314
+ ::s1 i ::s2 ee ::cost 0.03 ::lc2 eng
315
+ ::s1 i ::s2 ei ::cost 0.05 ::lc2 eng
316
+ ::s1 i ::s2 ie ::cost 0.03 ::lc2 eng
317
+ ::s1 i ::s2 ı ::cost 0.05
318
+ ::s1 i ::s2 e ::cost 0.1 ::lc2 eng
319
+ ::s1 i ::s2 y ::cost 0.15
320
+ ::s1 i ::s2 y ::cost 0.1 ::right2 [-,bcdfghklmnpqrstvwxz$ ]
321
+ ::s1 ie ::s2 ei ::cost 0.15
322
+ ::s1 ie ::s2 y ::cost 0.15
323
+ ::s1 ij ::s2 ai ::cost 0.15
324
+ ::s1 o ::s2 u ::cost 0.1
325
+ ::s1 oo ::s2 u ::cost 0.1
326
+ ::s1 ow ::s2 au ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
327
+ ::s1 ow ::s2 o ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
328
+ ::s1 ow ::s2 o ::cost 0.2 ::lc1 eng ::lc2 zho ::right1 [e]
329
+ ::s1 ow ::s2 o ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [iy]
330
+ ::s1 u ::s2 a ::cost 0.1 ::lc1 eng ::right1 [-,bcdfghklmnpqrstvwxz][bcdfghklmnpqrstvwxz$ ]
331
+ ::s1 u ::s2 ou ::cost 0.05
332
+ ::s1 u ::s2 yu ::cost 0.05 ::left1 /^(.*[- ])?$/
333
+ ::s1 yeo ::s2 eo ::cost 0.1 ::lc1 fas
334
+
335
+ # Amharic
336
+ ::s1 a ::s2 e ::cost 0.05 ::lc1 amh
337
+ ::s1 aa ::s2 o ::cost 0.15 ::lc1 amh
338
+ ::s1 aawe ::s2 au ::cost 0.05 ::lc1 amh
339
+ ::s1 aawe ::s2 ao ::cost 0.1 ::lc1 amh
340
+ ::s1 aawe ::s2 ou ::cost 0.1 ::lc1 amh
341
+ ::s1 aawo ::s2 ao ::cost 0.05 ::lc1 amh
342
+ ::s1 aaye ::s2 ai ::cost 0.05 ::lc1 amh
343
+ ::s1 aaye ::s2 i ::cost 0.1 ::lc1 amh
344
+ ::s1 aaye ::s2 ei ::cost 0.1 ::lc1 amh
345
+ ::s1 awe ::s2 au ::cost 0.05 ::lc1 amh
346
+ ::s1 awe ::s2 ao ::cost 0.1 ::lc1 amh
347
+ ::s1 awe ::s2 ou ::cost 0.1 ::lc1 amh
348
+ ::s1 ee ::s2 ai ::cost 0.1 ::lc1 amh
349
+ ::s1 eewo ::s2 eo ::cost 0.05 ::lc1 amh
350
+ ::s1 eeyaa ::s2 ea ::cost 0.1 ::lc1 amh
351
+ ::s1 eeye ::s2 ai ::cost 0.1 ::lc1 amh
352
+ ::s1 ewee ::s2 ue ::cost 0.1 ::lc1 amh
353
+ ::s1 gwaa ::s2 gua ::cost 0.05 ::lc1 amh
354
+ ::s1 iya ::s2 ie ::cost 0.05 ::lc1 amh
355
+ ::s1 iyaa ::s2 ia ::cost 0.05 ::lc1 amh
356
+ ::s1 iyo ::s2 io ::cost 0.05 ::lc1 amh
357
+ ::s1 kxaa ::s2 kha ::cost 0.05 ::lc1 amh
358
+ ::s1 liyaa ::s2 llia ::cost 0.05 ::lc1 amh
359
+ ::s1 qaa ::s2 cca ::cost 0.05 ::lc1 amh
360
+ ::s1 uwaa ::s2 ua ::cost 0.05 ::lc1 amh
361
+ ::s1 uwee ::s2 ue ::cost 0.05 ::lc1 amh
362
+ ::s1 uwi ::s2 oui ::cost 0.05 ::lc1 amh
363
+ ::s1 uwi ::s2 ui ::cost 0.05 ::lc1 amh
364
+ ::s1 xaaye ::s2 hai ::cost 0.1 ::lc1 amh
365
+ ::s1 xwaa ::s2 jua ::cost 0.1 ::lc1 amh
366
+ ::s1 ziyaa ::s2 sia ::cost 0.05 ::lc1 amh
367
+ ::s1 w ::s2 ::cost 0.3 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
368
+ ::s1 y ::s2 ::cost 0.1 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
369
+ # abbreviations
370
+ ::s1 ee. ::s2 a ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
371
+ ::s1 si. ::s2 c ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
372
+ ::s1 di. ::s2 d ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
373
+ ::s1 eefe. ::s2 f ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
374
+ ::s1 are. ::s2 r ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
375
+
376
+ # Arabic
377
+ ::s1 ::s2 a ::cost 0.02 ::lc1 ara
378
+ ::s1 ::s2 e ::cost 0.02 ::lc1 ara
379
+ ::s1 ::s2 i ::cost 0.05 ::lc1 ara
380
+ ::s1 ::s2 o ::cost 0.05 ::lc1 ara
381
+ ::s1 ::s2 p ::cost 0.15 ::lc1 ara ::left2 /m$/ ::right2 [dfgklmnpqrstvwz]
382
+ ::s1 ::s2 u ::cost 0.05 ::lc1 ara
383
+ ::s1 y ::s2 a ::cost 0.15 ::lc1 ara
384
+ ::s1 y ::s2 e ::cost 0.05 ::lc1 ara
385
+ ::s1 y ::s2 ea ::cost 0.02 ::lc1 ara
386
+ ::s1 y ::s2 ee ::cost 0.02 ::lc1 ara
387
+ ::s1 y ::s2 i ::cost 0.02 ::lc1 ara
388
+ ::s1 y ::s2 ie ::cost 0.02 ::lc1 ara
389
+ ::s1 b ::s2 p ::cost 0.02 ::lc1 ara
390
+ ::s1 b ::s2 pp ::cost 0.03 ::lc1 ara
391
+ ::s1 f ::s2 v ::cost 0.02 ::lc1 ara
392
+ ::s1 fyl ::s2 ville ::right2 [-,$ ] ::cost 0.05 ::lc1 ara
393
+ ::s1 gh ::s2 g ::right2 [abcdfgklmnopqrstuvwz] ::cost 0.05 ::lc1 ara
394
+ ::s1 ghz ::s2 gs ::cost 0.05 ::lc1 ara
395
+ ::s1 j ::s2 g ::cost 0.2 ::lc1 ara
396
+ ::s1 kh ::s2 g ::cost 0.3 ::lc1 ara ::right2 [eiy]
397
+ ::s1 q ::s2 g ::cost 0.2 ::lc1 ara ::right2 [arouz]
398
+ ::s1 q ::s2 gg ::cost 0.2 ::lc1 ara ::right2 [arouz]
399
+ ::s1 th ::s2 z ::cost 0.4 ::lc1 ara ::right2 [aou] ::comment Spanish
400
+ ::s1 " (" ::s2 ", " ::cost 0.02 ::lc1 ara
401
+ ::s1 ) ::s2 ::right2 [-,$ ] ::cost 0.02 ::lc1 ara
402
+
403
+ # Bengali
404
+ ::s1 aoyaa ::s2 wa ::cost 0.1 ::lc1 ben
405
+ ::s1 aoye ::s2 way ::cost 0.1 ::lc1 ben
406
+ ::s1 bhaa ::s2 ve ::cost 0.1 ::lc1 ben
407
+ ::s1 bh ::s2 v ::cost 0.2 ::lc1 ben
408
+ ::s1 bh ::s2 w ::cost 0.2 ::lc1 ben
409
+ ::s1 b ::s2 v ::cost 0.3 ::lc1 ben
410
+ ::s1 b ::s2 w ::cost 0.3 ::lc1 ben
411
+ ::s1 dda ::s2 rh ::right2 [-,$ ] ::cost 0.2 ::lc1 ben
412
+ ::s1 dd ::s2 r ::cost 0.4 ::lc1 ben
413
+ ::s1 gk ::s2 k ::cost 0.05 ::lc1 ben
414
+ ::s1 h ::s2 g ::right2 [eiy] ::cost 0.4 ::lc1 ben
415
+ ::s1 h ::s2 j ::cost 0.4 ::lc1 ben
416
+ ::s1 hoyaai ::s2 whi ::cost 0.05 ::lc1 ben
417
+ ::s1 j ::s2 z ::cost 0.1 ::lc1 ben
418
+ ::s1 j ::s2 s ::cost 0.3 ::lc1 ben
419
+ ::s1 myaaka ::s2 mc ::cost 0.1 ::lc1 ben
420
+ ::s1 myaaka ::s2 mac ::cost 0.1 ::lc1 ben
421
+ ::s1 oyaa ::s2 wa ::cost 0.02 ::lc1 ben
422
+ ::s1 oyaa ::s2 wo ::cost 0.1 ::lc1 ben
423
+ ::s1 oyena ::s2 owen ::cost 0.1 ::lc1 ben
424
+ ::s1 ph ::s2 v ::cost 0.1 ::lc1 ben
425
+ ::s1 phana ::s2 von ::cost 0.1 ::lc1 ben
426
+ ::s1 rhio ::s2 gio ::cost 0.2 ::lc1 ben
427
+ ::s1 sh ::s2 s ::cost 0.4 ::lc1 ben
428
+ ::s1 ss ::s2 sh ::left1 /[k]$/ ::cost 0.15 ::lc1 ben
429
+ ::s1 ss ::s2 sh ::cost 0.3 ::lc1 ben
430
+ ::s1 o ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
431
+ ::s1 oye ::s2 we ::cost 0.2 ::lc1 ben
432
+ ::s1 tta ::s2 tho ::cost 0.3 ::lc1 ben
433
+ ::s1 tthaa ::s2 ta ::cost 0.3 ::lc1 ben
434
+ ::s1 u ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
435
+ ::s1 u ::s2 woo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
436
+ ::s1 u ::s2 wu ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ])?$/
437
+ ::s1 ui ::s2 wi ::cost 0.02 ::lc1 ben ::left1 /^(.*[-, ])?$/
438
+ ::s1 yaa ::s2 wa ::cost 0.3 ::lc1 ben
439
+ ::s1 ye ::s2 we ::cost 0.3 ::lc1 ben
440
+
441
+ # Russian
442
+ ::s1 ::s2 os ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
443
+ ::s1 ::s2 us ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
444
+ ::s1 av ::s2 au ::cost 0.05 ::lc1 rus
445
+ ::s1 ch ::s2 cz ::cost 0.1 ::lc1 rus ::comment Polish
446
+ ::s1 chch ::s2 cci ::right2 [aou] ::cost 0.1 ::lc1 rus
447
+ ::s1 chch ::s2 cc ::right2 [eiy] ::cost 0.1 ::lc1 rus
448
+ ::s1 chzh ::s2 zh ::cost 0.1 ::lc1 rus
449
+ ::s1 dz ::s2 zz ::cost 0.1 ::lc1 rus ::right2 [aeiouy]
450
+ ::s1 dz ::s2 j ::cost 0.3 ::lc1 rus ::right2 [aeiouy] ::comment Japanese
451
+ ::s1 dzh ::s2 g ::cost 0.05 ::lc1 rus ::right2 [eiy]
452
+ ::s1 dzh ::s2 gg ::cost 0.05 ::lc1 rus ::right2 [eiy]
453
+ ::s1 dzh ::s2 j ::cost 0.05 ::lc1 rus
454
+ ::s1 ev ::s2 eu ::cost 0.1 ::lc1 rus
455
+ ::s1 f ::s2 th ::cost 0.6 ::lc1 rus
456
+ ::s1 ievye ::s2 iaceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
457
+ ::s1 ii ::s2 ius ::cost 0.2 ::right1 [-,$ ] ::lc1 rus
458
+ ::s1 i ::s2 j ::cost 0.2 ::lc1 rus
459
+ ::s1 naya ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
460
+ ::s1 nyi ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
461
+ ::s1 ovye ::s2 aceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
462
+ ::s1 shsh ::s2 sh ::cost 0 ::lc1 rus
463
+ ::s1 skaya ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
464
+ ::s1 skaya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
465
+ ::s1 skii ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
466
+ ::s1 skii ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
467
+ ::s1 tsian ::s2 tian ::cost 0.05 ::lc1 rus
468
+ ::s1 tsion ::s2 tion ::cost 0.05 ::lc1 rus
469
+ ::s1 ts ::s2 c ::cost 0.3 ::lc1 rus
470
+ ::s1 ts ::s2 c ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
471
+ ::s1 tsz ::s2 z ::cost 0.1 ::lc1 rus
472
+ ::s1 itsa ::s2 ica ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
473
+ ::s1 etski ::s2 ecky ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
474
+ ::s1 tsiya ::s2 tion ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
475
+ ::s1 tsi ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
476
+ ::s1 tsy ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
477
+ ::s1 tszi ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
478
+ ::s1 tszy ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
479
+ ::s1 u ::s2 w ::right2 [aeio] ::cost 0.05 ::lc1 rus
480
+ ::s1 u ::s2 w ::cost 0.2 ::lc1 rus
481
+ ::s1 uo ::s2 wa ::cost 0.2 ::lc1 rus ::right2 [lnrst]
482
+ ::s1 v ::s2 u ::cost 0.05 ::lc1 rus ::left1 /[bcdfghjklmnpqrstvwxz]$/ ::right1 [aeiou]
483
+ ::s1 gva ::s2 gua ::cost 0.02 ::lc1 rus
484
+ ::s1 gvi ::s2 gui ::cost 0.02 ::lc1 rus
485
+ ::s1 x ::s2 sh ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,aouct$-] ::lc1 rus
486
+ ::s1 y ::s2 s ::cost 0.4 ::right2 [-,$-] ::lc1 rus
487
+ ::s1 zh ::s2 rz ::cost 0.1 ::lc1 rus ::comment Polish rz
488
+
489
+ # Russian case endings
490
+ ::s1 em ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
491
+ ::s1 ey ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
492
+ ::s1 om ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
493
+ ::s1 oy ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
494
+ ::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
495
+ ::s1 y ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
496
+ ::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
497
+ ::s1 ye ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
498
+ ::s1 yem ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
499
+ ::s1 ym ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
500
+ ::s1 ymi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
501
+ ::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
502
+ ::s1 ii ::s2 iya ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
503
+ ::s1 ii ::s2 iye ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
504
+
505
+ ::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
506
+ ::s1 ami ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
507
+ ::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
508
+ ::s1 ev ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
509
+ ::s1 eri ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
510
+ ::s1 eryu ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
511
+ ::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
512
+ ::s1 ov ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
513
+ ::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
514
+ ::s1 ykh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
515
+
516
+ # Ukrainian case endings
517
+ ::s1 eyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
518
+ ::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
519
+ ::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
520
+ ::s1 yi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
521
+ ::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
522
+
523
+ ::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
524
+ ::s1 amy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
525
+ ::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
526
+ ::s1 evy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
527
+ ::s1 iv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
528
+ ::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
529
+ ::s1 ovy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
530
+ ::s1 yam ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
531
+ ::s1 yamy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
532
+ ::s1 yiv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
533
+ ::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
534
+ ::s1 yakh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
535
+
536
+ # Uyghur
537
+ ::s1 aw ::s2 ao ::cost 0.05 ::lc1 uig
538
+ ::s1 aw ::s2 au ::cost 0.05 ::lc1 uig
539
+ ::s1 gwi ::s2 gui ::cost 0.05 ::lc1 uig
540
+ ::s1 iye ::s2 ia ::cost 0.05 ::lc1 uig
541
+ ::s1 istan ::s2 ia ::cost 0.1 ::right1 [-,$ ] ::lc1 uig
542
+ ::s1 j ::s2 c ::cost 0.4 ::lc1 uig
543
+ ::s1 q ::s2 h ::cost 0.2 ::lc1 uig
544
+ ::s1 sey ::s2 cai ::cost 0.2 ::lc1 uig
545
+ ::s1 sh ::s2 x ::cost 0.2 ::lc1 uig
546
+
547
+ ::s1 b ::s2 p ::cost 0.3
548
+ ::s1 b ::s2 v ::cost 0.5 ::left2 /^(.*[- ])?$/
549
+ ::s1 b ::s2 v ::cost 0.7
550
+ ::s1 c ::s2 ch ::cost 0.25 ::right1 [eiy]
551
+ ::s1 c ::s2 ck ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
552
+ ::s1 c ::s2 k ::cost 0.4
553
+ ::s1 c ::s2 k ::cost 0.05 ::left1 /^(.* )?ma?$/ ::comment MacIntyre
554
+ ::s1 c ::s2 k ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
555
+ ::s1 c ::s2 kk ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
556
+ ::s1 c ::s2 s ::cost 0.7
557
+ ::s1 c ::s2 s ::cost 0.1 ::right1 [eiy]
558
+ ::s1 c ::s2 ts ::cost 0.15 ::right1 [eiy]
559
+ ::s1 c ::s2 z ::cost 0.3
560
+ ::s1 ch ::s2 ck ::cost 0.2
561
+ ::s1 ch ::s2 g ::cost 0.3 ::right1 [eiy] ::right2 [eiy]
562
+ ::s1 ch ::s2 k ::cost 0.2
563
+ ::s1 ch ::s2 kk ::cost 0.2
564
+ ::s1 ch ::s2 sh ::cost 0.3
565
+ ::s1 ch ::s2 sh ::cost 0.2 ::left1 /[eiy]$/ ::right1 [$ ]
566
+ ::s1 ch ::s2 tch ::cost 0.1
567
+ ::s1 ch ::s2 tsh ::cost 0.1
568
+ ::s1 ch ::s2 z ::cost 0.5
569
+ ::s1 ck ::s2 kk ::cost 0.02
570
+ ::s1 cz ::s2 ch ::cost 0.2 ::left1 /i$/
571
+ ::s1 d ::s2 t ::cost 0.3
572
+ ::s1 de ::s2 dre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
573
+ ::s1 dg ::s2 j ::cost 0.6 ::lc1 eng ::comment Cambridge
574
+ ::s1 dg ::s2 j ::cost 0.3 ::right1 [eiy] ::lc1 eng
575
+ ::s1 dg ::s2 j ::cost 0.1 ::right1 [eiy] ::lc1 eng ::lc2 fas, jpn
576
+ ::s1 dt ::s2 d ::cost 0.3
577
+ ::s1 dt ::s2 t ::cost 0.03
578
+ ::s1 dt ::s2 tt ::cost 0.03
579
+ ::s1 f ::s2 p ::cost 0.8
580
+ ::s1 f ::s2 ph ::cost 0.01
581
+ ::s1 ff ::s2 ph ::cost 0.02
582
+ ::s1 f ::s2 pf ::cost 0.1
583
+ ::s1 f ::s2 v ::cost 0.3
584
+ ::s1 f ::s2 v ::cost 0.1 ::right1 [-,$ ]
585
+ ::s1 ef ::s2 ev ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
586
+ ::s1 f ::s2 w ::cost 0.3
587
+ ::s1 g ::s2 j ::cost 0.6
588
+ ::s1 g ::s2 j ::cost 0.3 ::right1 [eiy]
589
+ ::s1 g ::s2 j ::cost 0.1 ::right1 [eiy] ::lc2 amh, ara, fas, jpn, som
590
+ ::s1 g ::s2 k ::cost 0.3
591
+ ::s1 g ::s2 gh ::cost 0.3
592
+ ::s1 g ::s2 ch ::cost 0.4 ::left1 /[eiy]$/ ::right1 [-,$ ] ::comment German: Ludwig, Braunschweig
593
+ ::s1 gh ::s2 f ::cost 0.2 ::lc1 eng ::comment laughter
594
+ ::s1 gh ::s2 "" ::cost 0.2 ::lc1 eng ::comment daughter
595
+ ::s1 gh ::s2 g ::cost 0.2 ::lc1 eng ::comment Afghanistan
596
+ ::s1 gl ::s2 l ::cost 0.2 ::lc1 eng ::right1 [i]
597
+ ::s1 gn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
598
+ ::s1 gn ::s2 n ::cost 0.2 ::lc1 eng
599
+ ::s1 gz ::s2 ks ::cost 0.2
600
+ ::s1 h ::s2 e ::cost 0.4 ::lc1 fas
601
+ ::s1 ise ::s2 ize ::cost 0.1
602
+ ::s1 j ::s2 y ::cost 0.2
603
+ ::s1 j ::s2 dj ::cost 0.2
604
+ ::s1 j ::s2 h ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Jose
605
+ ::s1 j ::s2 hh ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Tardajos
606
+ ::s1 j ::s2 zh ::cost 0.2
607
+ ::s1 k ::s2 cc ::cost 0.02 ::right2 [aour]
608
+ ::s1 k ::s2 cc ::cost 0.3
609
+ ::s1 k ::s2 cch ::cost 0.15
610
+ ::s1 k ::s2 ck ::cost 0.02
611
+ ::s1 k ::s2 cq ::cost 0.05
612
+ ::s1 k ::s2 cqu ::cost 0.05
613
+ ::s1 k ::s2 cque ::cost 0.1
614
+ ::s1 k ::s2 cque ::cost 0.05 ::right2 [-,$ ]
615
+ ::s1 k ::s2 cques ::cost 0.05 ::right2 [-,$ ]
616
+ ::s1 k ::s2 q ::cost 0.05
617
+ ::s1 k ::s2 qu ::cost 0.05
618
+ ::s1 k ::s2 que ::cost 0.1
619
+ ::s1 k ::s2 que ::cost 0.05 ::right2 [-,$ ]
620
+ ::s1 k ::s2 ques ::cost 0.1 ::right2 [-,$ ]
621
+ ::s1 kh ::s2 j ::cost 0.2
622
+ ::s1 kh ::s2 q ::cost 0.2
623
+ ::s1 kh ::s2 k ::cost 0.25 ::right1 [aeiouy]
624
+ ::s1 kh ::s2 k ::cost 0.1 ::right1 [aeiouys] ::lc2 amh
625
+ ::s1 kn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
626
+ ::s1 kj ::s2 sh ::cost 0.2 ::comment Swedish
627
+ ::s1 l ::s2 r ::cost 0.1 ::lc1 zho
628
+ ::s1 aib ::s2 alb ::cost 0.1 ::lc1 zho
629
+ ::s1 al ::s2 ::cost 0.5 ::left1 /^(.* )?$/
630
+ ::s1 al- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
631
+ ::s1 el ::s2 ::cost 0.5 ::left1 /^(.* )?$/
632
+ ::s1 el- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
633
+ ::s1 ll ::s2 y ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::comment Guillermo, Guillaume
634
+ ::s1 mb ::s2 m ::cost 0.2 ::right1 [-,bcdfghklmnpqstvwxz$ ] ::lc1 eng ::comment bomb
635
+ ::s1 n ::s2 m ::cost 0.5 ::left1 /[aeiou]$/ ::left2 /[aeiou]$/ ::right1 [bcdfghklmnpqrstvwxz$ ] ::right2 [-,bcdfghklmnpqrstvwxz$ ]
636
+ ::s1 ng ::s2 n ::cost 0.1 ::left1 /[aeiou]$/ ::lc1 zho
637
+ ::s1 ng ::s2 m ::cost 0.25 ::left1 /[aeiou]$/ ::lc1 zho
638
+ ::s1 ng ::s2 n ::cost 0.1 ::left2 /[aeiou]$/ ::lc2 ara, ben, rus, zho
639
+ ::s1 nm ::s2 m ::cost 0.25 ::lc1 zho
640
+ ::s1 pn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
641
+ ::s1 ph ::s2 p ::cost 0.3 ::lc1 amh
642
+ ::s1 q ::s2 c ::cost 0.15
643
+ ::s1 q ::s2 ch ::cost 0.2 ::right2 [eiy]
644
+ ::s1 q ::s2 ck ::cost 0.2
645
+ ::s1 q ::s2 kk ::cost 0.2
646
+ ::s1 q ::s2 gh ::cost 0.2 ::lc1 fas ::right2 [aeiouy]
647
+ ::s1 qi ::s2 ch ::cost 0.2 ::lc1 zho ::right1 [aeou]
648
+ ::s1 qi ::s2 cci ::cost 0.1 ::lc1 zho
649
+ ::s1 qi ::s2 chi ::cost 0.1 ::lc1 zho
650
+ ::s1 qi ::s2 tch ::cost 0.2 ::lc1 zho ::right1 [aeou]
651
+ ::s1 qi ::s2 ts ::cost 0.4 ::lc1 zho ::right1 [aeou]
652
+ ::s1 qi ::s2 tsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
653
+ ::s1 qi ::s2 tzsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
654
+ ::s1 qi ::s2 czy ::cost 0.2 ::lc1 zho
655
+ ::s1 qu ::s2 kw ::cost 0.15
656
+ ::s1 qu ::s2 kv ::cost 0.15
657
+ ::s1 e ::s2 er ::cost 0.25 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::lc1 zho
658
+ ::s1 re ::s2 er ::cost 0.1
659
+ ::s1 rh ::s2 r ::cost 0.05 ::left1 /^(.*[- ])?$/ ::example Rhine
660
+ ::s1 s ::s2 sh ::cost 0.03 ::right2 [aeiou] ::lc2 amh
661
+ ::s1 s ::s2 sz ::cost 0.3 ::lc2 eng ::example Liszt (Hungarian)
662
+ ::s1 s ::s2 ts ::cost 0.4 ::lc1 amh, zho
663
+ ::s1 s ::s2 z ::cost 0.4
664
+ ::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::lc1 eng
665
+ ::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy][bdglmnrvw]?$/ ::right1 [-,$ ] ::lc1 eng
666
+ ::s1 s ::s2 z ::cost 0.2 ::lc2 fas
667
+ ::s1 sc ::s2 s ::cost 0.2 ::right1 [i] ::example Nascimento
668
+ ::s1 sci ::s2 sh ::cost 0.2 ::example Brescia
669
+ ::s1 sch ::s2 sh ::cost 0.1
670
+ ::s1 sh ::s2 sz ::cost 0.2 ::example Mariusz (Polish) ::lc2 eng
671
+ ::s1 si ::s2 j ::cost 0.1 ::right2 [a] ::lc1 eng
672
+ ::s1 ss ::s2 z ::cost 0.5
673
+ # ::s1 smith ::s2 mith ::cost 0.75 ::lc2 zho ::comment weird, but several different Xinhua examples
674
+ ::s1 tch ::s2 c ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,e$ ]
675
+ ::s1 te ::s2 tre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
676
+ ::s1 th ::s2 t ::cost 0.2 ::lc2 amh, fas, uig
677
+ ::s1 th ::s2 s ::cost 0.4 ::lc2 zho
678
+ ::s1 th ::s2 sth ::cost 0.4 ::lc1 zho
679
+ ::s1 th ::s2 ths ::cost 0.4 ::lc1 zho
680
+ ::s1 th ::s2 z ::cost 0.3 ::lc2 amh ::right2 [-,$ aeot]
681
+ ::s1 v ::s2 w ::cost 0.02
682
+ ::s1 v ::s2 wh ::cost 0.02 ::left1 /^(.* )?$/
683
+ ::s1 vv ::s2 w ::cost 0.02
684
+ ::s1 w ::s2 u ::cost 0.1 ::lc2 uig
685
+ ::s1 wa ::s2 ua ::cost 0.05
686
+ ::s1 wh ::s2 w ::cost 0.05 ::left1 /^(.* )?$/
687
+ ::s1 wr ::s2 r ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
688
+ ::s1 x ::s2 ks ::cost 0.05
689
+ ::s1 x ::s2 s ::cost 0.2 ::left1 /^(.* )?$/
690
+ ::s1 x ::s2 sh ::cost 0.2 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
691
+ ::s1 x ::s2 z ::cost 0.2 ::left1 /^(.* )?$/ ::right1 [aeiouy]
692
+ ::s1 x ::s2 h ::cost 0.3 ::lc1 uig
693
+ ::s1 x ::s2 h ::cost 0.05 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
694
+ ::s1 x ::s2 kh ::cost 0.1 ::lc1 uig
695
+ ::s1 xi ::s2 sch ::cost 0.2 ::right1 [aeou] ::lc1 zho
696
+ ::s1 xi ::s2 sh ::cost 0.2 ::right1 [aeou] ::lc1 zho
697
+ ::s1 xi ::s2 ch ::cost 0.4 ::right1 [aeou] ::lc1 zho
698
+ ::s1 xi ::s2 sci ::cost 0.4 ::right1 [aeou] ::lc1 zho
699
+ ::s1 xi ::s2 s ::cost 0.6 ::right1 [aeou] ::lc1 zho
700
+ ::s1 z ::s2 dz ::cost 0.1 ::left1 /^(.*[ aeiouy])?[lnr]?$/
701
+ ::s1 z ::s2 ts ::cost 0.15
702
+ ::s1 z ::s2 tz ::cost 0.15
703
+ ::s1 zh ::s2 g ::cost 0.2 ::right2 [eiy]
704
+ ::s1 zh ::s2 g ::cost 0.1 ::right2 [eiy] ::lc2 amh
705
+ ::s1 zz ::s2 ts ::cost 0.15
706
+ ::s1 zz ::s2 tz ::cost 0.1
707
+
708
+ # Oromo
709
+ ::s1 nb ::s2 mb ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
710
+ ::s1 np ::s2 mp ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
711
+ ::s1 ph ::s2 p ::cost 0.3 ::lc1 orm ::lc2 orm
712
+
713
+ # Tigrinya
714
+ ::s1 aaye ::s2 a ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
715
+ ::s1 aaye ::s2 i ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
716
+
717
+ # Somali
718
+ ::s1 ay ::s2 ey ::cost 0.1 ::lc1 som ::lc2 som
719
+ ::s1 ay ::s2 eey ::cost 0.15 ::lc1 som ::lc2 som
720
+ ::s1 aha ::s2 ihii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
721
+ ::s1 aha ::s2 ihi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
722
+ ::s1 aha ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
723
+ ::s1 ihii ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
724
+ ::s1 ihi ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
725
+ ::s1 ha ::s2 hii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
726
+ ::s1 ha ::s2 hi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
727
+ ::s1 ha ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
728
+ ::s1 hii ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
729
+ ::s1 hi ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
730
+ ::s1 aka ::s2 ikii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
731
+ ::s1 aka ::s2 iki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
732
+ ::s1 aka ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
733
+ ::s1 ikii ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
734
+ ::s1 iki ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
735
+ ::s1 ka ::s2 kii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
736
+ ::s1 ka ::s2 ki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
737
+ ::s1 ka ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
738
+ ::s1 kii ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
739
+ ::s1 ki ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
740
+ ::s1 aga ::s2 ugu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
741
+ ::s1 ga ::s2 gu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
742
+ ::s1 ata ::s2 itii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
743
+ ::s1 ata ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
744
+ ::s1 ata ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
745
+ ::s1 itii ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
746
+ ::s1 iti ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
747
+ ::s1 ta ::s2 tii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
748
+ ::s1 ta ::s2 ti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
749
+ ::s1 ta ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
750
+ ::s1 tii ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
751
+ ::s1 ti ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
752
+ ::s1 ata ::s2 ete ::cost 0.15 ::lc1 som ::lc2 som
753
+ ::s1 ata ::s2 iti ::cost 0.2 ::lc1 som ::lc2 som
754
+ ::s1 ete ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som
755
+ ::s1 g ::s2 k ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
756
+ ::s1 g ::s2 k ::cost 0.25 ::lc1 som ::lc2 som
757
+ ::s1 g ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
758
+ ::s1 gh ::s2 kh ::cost 0.1 ::lc1 som ::lc2 som
759
+ ::s1 gh ::s2 k ::cost 0.2 ::lc1 som ::lc2 som
760
+ ::s1 g ::s2 q ::cost 0.25 ::lc1 som ::lc2 som
761
+ ::s1 g ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::right1 [aou] ::right2 [aou]
762
+ ::s1 ga ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::left1 /^(.*[aeiou])?$/ ::left2 /^(.*[aeiou])?$/ ::right1 [bcdfghklmnpqrstvwxz] ::right2 [bcdfghklmnpqrstvwxz]
763
+ ::s1 g ::s2 j ::cost 0.25 ::lc1 som ::lc2 som
764
+ ::s1 g ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right1 [ei] ::right2 [ei]
765
+ ::s1 gi ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right2 [ei]
766
+ ::s1 n ::s2 m ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
767
+ ::s1 n ::s2 mm ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
768
+ ::s1 n ::s2 m ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
769
+ ::s1 n ::s2 mm ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
770
+ ::s1 ii ::s2 a ::cost 0.15 ::lc1 som ::lc2 som
771
+ ::s1 y ::s2 dj ::cost 0.2 ::lc2 som
772
+ ::s1 ca ::s2 a ::cost 0.15 ::left1 /^(.*[-, ])?$/ ::lc1 som
773
+ ::s1 c ::s2 ::cost 0.25 ::left1 /^(.*[-, ])?$/ ::lc1 som
774
+ ::s1 x ::s2 h ::cost 0.25 ::lc1 som
775
+ ::s1 x ::s2 h ::cost 0.05 ::lc1 som ::left1 /^(.* )?$/ ::right1 [aeiou]
776
+ ::s1 x ::s2 h ::cost 0.1 ::lc1 som ::left1 /[aeiou]$/
777
+ ::s1 b ::s2 p ::cost 0.1 ::lc1 som
778
+ ::s1 majm ::s2 mahm ::cost 0.1 ::lc1 som
779
+ ::s1 chalim ::s2 halim ::cost 0.1 ::lc1 som ::lc2 som
780
+ ::s1 chalim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
781
+ ::s1 chalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
782
+ ::s1 halim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
783
+ ::s1 halim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
784
+ ::s1 jalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
785
+ ::s1 dh ::s2 r ::cost 0.25 ::lc1 som ::lc2 som ::left1 /[aeiou]$/
786
+ ::s1 j ::s2 ch ::cost 0.25 ::lc1 som ::lc2 som
787
+ ::s1 j ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
788
+ ::s1 ch ::s2 sh ::cost 0.2 ::lc1 som ::lc2 som
789
+
790
+ # French
791
+ ::s1 aud ::s2 o ::cost 0.3 ::right1 [-,$ ] ::lc1 eng, fra
792
+ ::s1 aux ::s2 o ::cost 0.05 ::right1 [-,$ ]
793
+ ::s1 eaux ::s2 o ::cost 0.05 ::right1 [-,$ ]
794
+ ::s1 eux ::s2 o ::cost 0.05 ::right1 [-,$ ]
795
+ ::s1 eux ::s2 e ::cost 0.15 ::right1 [-,$ ]
796
+
797
+ ::s1 - ::s2 " " ::cost 0.1
798
+ ::s1 : ::s2 , ::cost 0.1 ::lc1 amh
799
+
800
+ # mini dictionary Amharic-English
801
+ ::s1 dabube ::s2 south ::cost 0 ::lc1 amh ::lc2 eng
802
+ ::s1 daseete ::s2 island ::cost 0 ::lc1 amh ::lc2 eng
803
+ ::s1 daseetoche ::s2 islands ::cost 0 ::lc1 amh ::lc2 eng
804
+ ::s1 kaaweneti ::s2 county ::cost 0 ::lc1 amh ::lc2 eng
805
+ ::s1 katamaa ::s2 city ::cost 0 ::lc1 amh ::lc2 eng
806
+ ::s1 kelele ::s2 region ::cost 0 ::lc1 amh ::lc2 eng
807
+ ::s1 meseraaqe ::s2 east ::cost 0 ::lc1 amh ::lc2 eng
808
+ ::s1 sameene ::s2 north ::cost 0 ::lc1 amh ::lc2 eng
809
+ ::s1 setaadiyame ::s2 stadium ::cost 0 ::lc1 amh ::lc2 eng
810
+ ::s1 waneze ::s2 river ::cost 0 ::lc1 amh ::lc2 eng
811
+
812
+ # mini dictionary Arabic-English
813
+ ::s1 " " ::s2 " of " ::cost 0 ::lc1 ara ::lc2 eng
814
+ ::s1 " alawl" ::s2 " i" ::cost 0 ::lc1 ara ::lc2 eng ::right2 [-,$ ]
815
+
816
+ # mini dictionary Bengali-English
817
+ ::s1 anychala ::s2 zone ::cost 0 ::lc1 ben ::lc2 eng
818
+ ::s1 pradesha ::s2 province ::cost 0 ::lc1 ben ::lc2 eng
819
+ ::s1 saamraajya ::s2 empire ::cost 0 ::lc1 ben ::lc2 eng
820
+ ::s1 upajelaa ::s2 upazila ::cost 0 ::lc1 ben ::lc2 eng
821
+ ::s1 uttara ::s2 north ::cost 0 ::lc1 ben ::lc2 eng
822
+ ::s1 "dya " ::s2 "the " ::left1 /^(.*[-, ])?$/ ::cost 0.2 ::lc1 ben ::lc2 eng
823
+ ::s1 " aba " ::s2 " of " ::cost 0 ::lc1 ben ::lc2 eng
824
+
825
+ # mini dictionary Russian-English
826
+ ::s1 akademiya ::s2 academy ::cost 0 ::lc1 rus ::lc2 eng
827
+ ::s1 eparkhiya ::s2 diocese ::cost 0 ::lc1 rus ::lc2 eng
828
+ ::s1 gorod ::s2 city ::cost 0 ::lc1 rus ::lc2 eng
829
+ ::s1 gosudarstvennyi ::s2 state ::cost 0 ::lc1 rus ::lc2 eng
830
+ ::s1 gubernator ::s2 governor ::cost 0 ::lc1 rus ::lc2 eng
831
+ ::s1 guberniya ::s2 governate ::cost 0 ::lc1 rus ::lc2 eng
832
+ ::s1 imperator ::s2 emperor ::cost 0 ::lc1 rus ::lc2 eng
833
+ ::s1 komitet ::s2 committee ::cost 0 ::lc1 rus ::lc2 eng
834
+ ::s1 korolevstvo ::s2 kingdom ::cost 0 ::lc1 rus ::lc2 eng
835
+ ::s1 koroli ::s2 king ::cost 0 ::lc1 rus ::lc2 eng
836
+ ::s1 mezhdunarodnaya ::s2 international ::cost 0 ::lc1 rus ::lc2 eng
837
+ ::s1 natsionalnyi ::s2 national ::cost 0 ::lc1 rus ::lc2 eng
838
+ ::s1 novyi ::s2 new ::cost 0 ::lc1 rus ::lc2 eng
839
+ ::s1 oblast ::s2 province ::cost 0 ::lc1 rus ::lc2 eng
840
+ ::s1 oblast ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
841
+ ::s1 obshchestvo ::s2 society ::cost 0 ::lc1 rus ::lc2 eng
842
+ ::s1 okrug ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
843
+ ::s1 okrug ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
844
+ ::s1 ostrova ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
845
+ ::s1 partiya ::s2 party ::cost 0 ::lc1 rus ::lc2 eng
846
+ ::s1 raion ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
847
+ ::s1 respublika ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
848
+ ::s1 respublik ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
849
+ ::s1 sbornaya ::s2 team ::cost 0 ::lc1 rus ::lc2 eng
850
+ ::s1 severnaya ::s2 north ::cost 0 ::lc1 rus ::lc2 eng
851
+ ::s1 sovet ::s2 council ::cost 0 ::lc1 rus ::lc2 eng
852
+ ::s1 soyuz ::s2 alliance ::cost 0 ::lc1 rus ::lc2 eng
853
+ ::s1 soyuz ::s2 association ::cost 0 ::lc1 rus ::lc2 eng
854
+ ::s1 soyuz ::s2 league ::cost 0 ::lc1 rus ::lc2 eng
855
+ ::s1 soyuz ::s2 union ::cost 0 ::lc1 rus ::lc2 eng
856
+ ::s1 svyataya ::s2 saint ::cost 0 ::lc1 rus ::lc2 eng
857
+ ::s1 svobodnyi ::s2 free ::cost 0 ::lc1 rus ::lc2 eng
858
+ ::s1 tserkov ::s2 church ::cost 0 ::lc1 rus ::lc2 eng
859
+ ::s1 uezd ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
860
+ ::s1 universitet ::s2 university ::cost 0 ::lc1 rus ::lc2 eng
861
+ ::s1 vostochnaya ::s2 east ::cost 0 ::lc1 rus ::lc2 eng
862
+ ::s1 vostochnaya ::s2 eastern ::cost 0 ::lc1 rus ::lc2 eng
863
+ ::s1 yuzhnaya ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
864
+ ::s1 yuzhnaya ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
865
+ ::s1 yuzhnoi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
866
+ ::s1 yuzhnoi ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
867
+ ::s1 yuzhnyi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
868
+ # often dropped in Russian name
869
+ ::s1 ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
870
+ ::s1 ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
871
+ ::s1 ::s2 pope ::cost 0 ::lc1 rus ::lc2 eng
872
+ ::s1 ::s2 river ::cost 0 ::lc1 rus ::lc2 eng
873
+ ::s1 ::s2 "the " ::cost 0 ::lc1 rus ::lc2 eng ::left2 /^(.*[- ])?$/
874
+ ::s1 " " ::s2 " of " ::cost 0 ::lc1 rus ::lc2 eng
875
+
876
+
877
+ # mini dictionary Uyghur-English
878
+ ::s1 aptonom ::s2 autonomous ::cost 0 ::lc1 uig ::lc2 eng
879
+ ::s1 aralliri ::s2 islands ::cost 0 ::lc1 uig ::lc2 eng
880
+ ::s1 aralliri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
881
+ ::s1 arili ::s2 island ::cost 0 ::lc1 uig ::lc2 eng
882
+ ::s1 arili ::s2 ::cost 0 ::lc1 uig ::lc2 eng
883
+ ::s1 nahiyisi ::s2 county ::cost 0 ::lc1 uig ::lc2 eng
884
+ ::s1 oelkisi ::s2 province ::cost 0 ::lc1 uig ::lc2 eng
885
+ ::s1 oelkisi ::s2 ::cost 0 ::lc1 uig ::lc2 eng
886
+ ::s1 ottura ::s2 central ::cost 0 ::lc1 uig ::lc2 eng
887
+ ::s1 rayoni ::s2 region ::cost 0 ::lc1 uig ::lc2 eng
888
+ ::s1 shehiri ::s2 city ::cost 0 ::lc1 uig ::lc2 eng
889
+ ::s1 shehiri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
890
+ ::s1 shitati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
891
+ ::s1 shitati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
892
+ ::s1 shtati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
893
+ ::s1 shtati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
894
+ ::s1 uniwersiteti ::s2 university ::cost 0 ::lc1 uig ::lc2 eng
895
+ ::s1 yengi ::s2 new ::cost 0 ::lc1 uig ::lc2 eng
896
+
uroman/lib/JSON.pm ADDED
@@ -0,0 +1,2317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package JSON;
2
+
3
+
4
+ use strict;
5
+ use Carp ();
6
+ use base qw(Exporter);
7
+ @JSON::EXPORT = qw(from_json to_json jsonToObj objToJson encode_json decode_json);
8
+
9
+ BEGIN {
10
+ $JSON::VERSION = '2.90';
11
+ $JSON::DEBUG = 0 unless (defined $JSON::DEBUG);
12
+ $JSON::DEBUG = $ENV{ PERL_JSON_DEBUG } if exists $ENV{ PERL_JSON_DEBUG };
13
+ }
14
+
15
+ my $Module_XS = 'JSON::XS';
16
+ my $Module_PP = 'JSON::PP';
17
+ my $Module_bp = 'JSON::backportPP'; # included in JSON distribution
18
+ my $PP_Version = '2.27203';
19
+ my $XS_Version = '2.34';
20
+
21
+
22
+ # XS and PP common methods
23
+
24
+ my @PublicMethods = qw/
25
+ ascii latin1 utf8 pretty indent space_before space_after relaxed canonical allow_nonref
26
+ allow_blessed convert_blessed filter_json_object filter_json_single_key_object
27
+ shrink max_depth max_size encode decode decode_prefix allow_unknown
28
+ /;
29
+
30
+ my @Properties = qw/
31
+ ascii latin1 utf8 indent space_before space_after relaxed canonical allow_nonref
32
+ allow_blessed convert_blessed shrink max_depth max_size allow_unknown
33
+ /;
34
+
35
+ my @XSOnlyMethods = qw/allow_tags/; # JSON::XS specific (stubbed with a warning when a PP backend is used)
36
+
37
+ my @PPOnlyMethods = qw/
38
+ indent_length sort_by
39
+ allow_singlequote allow_bignum loose allow_barekey escape_slash as_nonblessed
40
+ /; # JSON::PP specific
41
+
42
+
43
+ # used in _load_xs and _load_pp ($_INSTALL_ONLY is passed only by _set_for_pp)
44
+ my $_INSTALL_DONT_DIE = 1; # When _load_xs fails to load XS, don't die.
45
+ my $_INSTALL_ONLY = 2; # Don't call _set_methods()
46
+ my $_ALLOW_UNSUPPORTED = 0;
47
+ my $_UNIV_CONV_BLESSED = 0;
48
+ my $_USSING_bpPP = 0;
49
+
50
+
51
+ # Check the environment variable to decide worker module.
52
+
53
+ unless ($JSON::Backend) {
54
+ $JSON::DEBUG and Carp::carp("Check used worker module...");
55
+
56
+ my $backend = exists $ENV{PERL_JSON_BACKEND} ? $ENV{PERL_JSON_BACKEND} : 1;
57
+
58
+ if ($backend eq '1' or $backend =~ /JSON::XS\s*,\s*JSON::PP/) {
59
+ _load_xs($_INSTALL_DONT_DIE) or _load_pp();
60
+ }
61
+ elsif ($backend eq '0' or $backend eq 'JSON::PP') {
62
+ _load_pp();
63
+ }
64
+ elsif ($backend eq '2' or $backend eq 'JSON::XS') {
65
+ _load_xs();
66
+ }
67
+ elsif ($backend eq 'JSON::backportPP') {
68
+ $_USSING_bpPP = 1;
69
+ _load_pp();
70
+ }
71
+ else {
72
+ Carp::croak "The value of environmental variable 'PERL_JSON_BACKEND' is invalid.";
73
+ }
74
+ }
75
+
76
+
77
+ sub import {
78
+ my $pkg = shift;
79
+ my @what_to_export;
80
+ my $no_export;
81
+
82
+ for my $tag (@_) {
83
+ if ($tag eq '-support_by_pp') {
84
+ if (!$_ALLOW_UNSUPPORTED++) {
85
+ JSON::Backend::XS
86
+ ->support_by_pp(@PPOnlyMethods) if ($JSON::Backend eq $Module_XS);
87
+ }
88
+ next;
89
+ }
90
+ elsif ($tag eq '-no_export') {
91
+ $no_export++, next;
92
+ }
93
+ elsif ( $tag eq '-convert_blessed_universally' ) {
94
+ eval q|
95
+ require B;
96
+ *UNIVERSAL::TO_JSON = sub {
97
+ my $b_obj = B::svref_2object( $_[0] );
98
+ return $b_obj->isa('B::HV') ? { %{ $_[0] } }
99
+ : $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
100
+ : undef
101
+ ;
102
+ }
103
+ | if ( !$_UNIV_CONV_BLESSED++ );
104
+ next;
105
+ }
106
+ push @what_to_export, $tag;
107
+ }
108
+
109
+ return if ($no_export);
110
+
111
+ __PACKAGE__->export_to_level(1, $pkg, @what_to_export);
112
+ }
113
+
114
+
115
+ # OBSOLETED
116
+
117
+ sub jsonToObj {
118
+ my $alternative = 'from_json';
119
+ if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) {
120
+ shift @_; $alternative = 'decode';
121
+ }
122
+ Carp::carp "'jsonToObj' will be obsoleted. Please use '$alternative' instead.";
123
+ return JSON::from_json(@_);
124
+ };
125
+
126
+ sub objToJson {
127
+ my $alternative = 'to_json';
128
+ if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) {
129
+ shift @_; $alternative = 'encode';
130
+ }
131
+ Carp::carp "'objToJson' will be obsoleted. Please use '$alternative' instead.";
132
+ JSON::to_json(@_);
133
+ };
134
+
135
+
136
+ # INTERFACES
137
+
138
+ sub to_json ($@) {
139
+ if (
140
+ ref($_[0]) eq 'JSON'
141
+ or (@_ > 2 and $_[0] eq 'JSON')
142
+ ) {
143
+ Carp::croak "to_json should not be called as a method.";
144
+ }
145
+ my $json = JSON->new;
146
+
147
+ if (@_ == 2 and ref $_[1] eq 'HASH') {
148
+ my $opt = $_[1];
149
+ for my $method (keys %$opt) {
150
+ $json->$method( $opt->{$method} );
151
+ }
152
+ }
153
+
154
+ $json->encode($_[0]);
155
+ }
156
+
157
+
158
+ sub from_json ($@) {
159
+ if ( ref($_[0]) eq 'JSON' or $_[0] eq 'JSON' ) {
160
+ Carp::croak "from_json should not be called as a method.";
161
+ }
162
+ my $json = JSON->new;
163
+
164
+ if (@_ == 2 and ref $_[1] eq 'HASH') {
165
+ my $opt = $_[1];
166
+ for my $method (keys %$opt) {
167
+ $json->$method( $opt->{$method} );
168
+ }
169
+ }
170
+
171
+ return $json->decode( $_[0] );
172
+ }
173
+
174
+
175
+
176
+ sub true { $JSON::true }
177
+
178
+ sub false { $JSON::false }
179
+
180
+ sub null { undef; }
181
+
182
+
183
+ sub require_xs_version { $XS_Version; }
184
+
185
+ sub backend {
186
+ my $proto = shift;
187
+ $JSON::Backend;
188
+ }
189
+
190
+ #*module = *backend;
191
+
192
+
193
+ sub is_xs {
194
+ return $_[0]->backend eq $Module_XS;
195
+ }
196
+
197
+
198
+ sub is_pp {
199
+ return not $_[0]->is_xs;
200
+ }
201
+
202
+
203
+ sub pureperl_only_methods { @PPOnlyMethods; }
204
+
205
+
206
+ sub property {
207
+ my ($self, $name, $value) = @_;
208
+
209
+ if (@_ == 1) {
210
+ my %props;
211
+ for $name (@Properties) {
212
+ my $method = 'get_' . $name;
213
+ if ($name eq 'max_size') {
214
+ my $value = $self->$method();
215
+ $props{$name} = $value == 1 ? 0 : $value;
216
+ next;
217
+ }
218
+ $props{$name} = $self->$method();
219
+ }
220
+ return \%props;
221
+ }
222
+ elsif (@_ > 3) {
223
+ Carp::croak('property() can take only the option within 2 arguments.');
224
+ }
225
+ elsif (@_ == 2) {
226
+ if ( my $method = $self->can('get_' . $name) ) {
227
+ if ($name eq 'max_size') {
228
+ my $value = $self->$method();
229
+ return $value == 1 ? 0 : $value;
230
+ }
231
+ $self->$method();
232
+ }
233
+ }
234
+ else {
235
+ $self->$name($value);
236
+ }
237
+
238
+ }
239
+
240
+
241
+
242
+ # INTERNAL
243
+
244
+ sub _load_xs {
245
+ my $opt = shift;
246
+
247
+ $JSON::DEBUG and Carp::carp "Load $Module_XS.";
248
+
249
+ # if called after install module, overload is disable.... why?
250
+ JSON::Boolean::_overrride_overload($Module_XS);
251
+ JSON::Boolean::_overrride_overload($Module_PP);
252
+
253
+ eval qq|
254
+ use $Module_XS $XS_Version ();
255
+ |;
256
+
257
+ if ($@) {
258
+ if (defined $opt and $opt & $_INSTALL_DONT_DIE) {
259
+ $JSON::DEBUG and Carp::carp "Can't load $Module_XS...($@)";
260
+ return 0;
261
+ }
262
+ Carp::croak $@;
263
+ }
264
+
265
+ unless (defined $opt and $opt & $_INSTALL_ONLY) {
266
+ _set_module( $JSON::Backend = $Module_XS );
267
+ my $data = join("", <DATA>); # this code is from Jcode 2.xx.
268
+ close(DATA);
269
+ eval $data;
270
+ JSON::Backend::XS->init;
271
+ }
272
+
273
+ return 1;
274
+ };
275
+
276
+
277
+ sub _load_pp {
278
+ my $opt = shift;
279
+ my $backend = $_USSING_bpPP ? $Module_bp : $Module_PP;
280
+
281
+ $JSON::DEBUG and Carp::carp "Load $backend.";
282
+
283
+ # if called after install module, overload is disable.... why?
284
+ JSON::Boolean::_overrride_overload($Module_XS);
285
+ JSON::Boolean::_overrride_overload($backend);
286
+
287
+ if ( $_USSING_bpPP ) {
288
+ eval qq| require $backend |;
289
+ }
290
+ else {
291
+ eval qq| use $backend $PP_Version () |;
292
+ }
293
+
294
+ if ($@) {
295
+ if ( $backend eq $Module_PP ) {
296
+ $JSON::DEBUG and Carp::carp "Can't load $Module_PP ($@), so try to load $Module_bp";
297
+ $_USSING_bpPP++;
298
+ $backend = $Module_bp;
299
+ JSON::Boolean::_overrride_overload($backend);
300
+ local $^W; # if PP installed but invalid version, backportPP redefines methods.
301
+ eval qq| require $Module_bp |;
302
+ }
303
+ Carp::croak $@ if $@;
304
+ }
305
+
306
+ unless (defined $opt and $opt & $_INSTALL_ONLY) {
307
+ _set_module( $JSON::Backend = $Module_PP ); # even if backportPP, set $Backend with 'JSON::PP'
308
+ JSON::Backend::PP->init;
309
+ }
310
+ };
311
+
312
+
313
+ sub _set_module {
314
+ return if defined $JSON::true;
315
+
316
+ my $module = shift;
317
+
318
+ local $^W;
319
+ no strict qw(refs);
320
+
321
+ $JSON::true = ${"$module\::true"};
322
+ $JSON::false = ${"$module\::false"};
323
+
324
+ push @JSON::ISA, $module;
325
+ if ( JSON->is_xs and JSON->backend->VERSION < 3 ) {
326
+ eval 'package JSON::PP::Boolean';
327
+ push @{"$module\::Boolean::ISA"}, qw(JSON::PP::Boolean);
328
+ }
329
+
330
+ *{"JSON::is_bool"} = \&{"$module\::is_bool"};
331
+
332
+ for my $method ($module eq $Module_XS ? @PPOnlyMethods : @XSOnlyMethods) {
333
+ *{"JSON::$method"} = sub {
334
+ Carp::carp("$method is not supported in $module.");
335
+ $_[0];
336
+ };
337
+ }
338
+
339
+ return 1;
340
+ }
341
+
342
+
343
+
344
+ #
345
+ # JSON Boolean
346
+ #
347
+
348
+ package JSON::Boolean;
349
+
350
+ my %Installed;
351
+
352
+ sub _overrride_overload {
353
+ return; # this function is currently disabled.
354
+ return if ($Installed{ $_[0] }++);
355
+
356
+ my $boolean = $_[0] . '::Boolean';
357
+
358
+ eval sprintf(q|
359
+ package %s;
360
+ use overload (
361
+ '""' => sub { ${$_[0]} == 1 ? 'true' : 'false' },
362
+ 'eq' => sub {
363
+ my ($obj, $op) = ref ($_[0]) ? ($_[0], $_[1]) : ($_[1], $_[0]);
364
+ if ($op eq 'true' or $op eq 'false') {
365
+ return "$obj" eq 'true' ? 'true' eq $op : 'false' eq $op;
366
+ }
367
+ else {
368
+ return $obj ? 1 == $op : 0 == $op;
369
+ }
370
+ },
371
+ );
372
+ |, $boolean);
373
+
374
+ if ($@) { Carp::croak $@; }
375
+
376
+ if ( exists $INC{'JSON/XS.pm'} and $boolean eq 'JSON::XS::Boolean' ) {
377
+ local $^W;
378
+ my $true = do { bless \(my $dummy = 1), $boolean };
379
+ my $false = do { bless \(my $dummy = 0), $boolean };
380
+ *JSON::XS::true = sub () { $true };
381
+ *JSON::XS::false = sub () { $false };
382
+ }
383
+ elsif ( exists $INC{'JSON/PP.pm'} and $boolean eq 'JSON::PP::Boolean' ) {
384
+ local $^W;
385
+ my $true = do { bless \(my $dummy = 1), $boolean };
386
+ my $false = do { bless \(my $dummy = 0), $boolean };
387
+ *JSON::PP::true = sub { $true };
388
+ *JSON::PP::false = sub { $false };
389
+ }
390
+
391
+ return 1;
392
+ }
393
+
394
+
395
+ #
396
+ # Helper classes for Backend Module (PP)
397
+ #
398
+
399
+ package JSON::Backend::PP;
400
+
401
+ sub init {
402
+ local $^W;
403
+ no strict qw(refs); # this routine may be called after JSON::Backend::XS init was called.
404
+ *{"JSON::decode_json"} = \&{"JSON::PP::decode_json"};
405
+ *{"JSON::encode_json"} = \&{"JSON::PP::encode_json"};
406
+ *{"JSON::PP::is_xs"} = sub { 0 };
407
+ *{"JSON::PP::is_pp"} = sub { 1 };
408
+ return 1;
409
+ }
410
+
411
+ #
412
+ # To save memory, the below lines are read only when XS backend is used.
413
+ #
414
+
415
+ package JSON;
416
+
417
+ 1;
418
+ __DATA__
419
+
420
+
421
+ #
422
+ # Helper classes for Backend Module (XS)
423
+ #
424
+
425
+ package JSON::Backend::XS;
426
+
427
+ use constant INDENT_LENGTH_FLAG => 15 << 12;
428
+
429
+ use constant UNSUPPORTED_ENCODE_FLAG => {
430
+ ESCAPE_SLASH => 0x00000010,
431
+ ALLOW_BIGNUM => 0x00000020,
432
+ AS_NONBLESSED => 0x00000040,
433
+ EXPANDED => 0x10000000, # for developer's
434
+ };
435
+
436
+ use constant UNSUPPORTED_DECODE_FLAG => {
437
+ LOOSE => 0x00000001,
438
+ ALLOW_BIGNUM => 0x00000002,
439
+ ALLOW_BAREKEY => 0x00000004,
440
+ ALLOW_SINGLEQUOTE => 0x00000008,
441
+ EXPANDED => 0x20000000, # for developer's
442
+ };
443
+
444
+
445
+ sub init {
446
+ local $^W;
447
+ no strict qw(refs);
448
+ *{"JSON::decode_json"} = \&{"JSON::XS::decode_json"};
449
+ *{"JSON::encode_json"} = \&{"JSON::XS::encode_json"};
450
+ *{"JSON::XS::is_xs"} = sub { 1 };
451
+ *{"JSON::XS::is_pp"} = sub { 0 };
452
+ return 1;
453
+ }
454
+
455
+
456
+ sub support_by_pp {
457
+ my ($class, @methods) = @_;
458
+
459
+ local $^W;
460
+ no strict qw(refs);
461
+
462
+ my $JSON_XS_encode_orignal = \&JSON::XS::encode;
463
+ my $JSON_XS_decode_orignal = \&JSON::XS::decode;
464
+ my $JSON_XS_incr_parse_orignal = \&JSON::XS::incr_parse;
465
+
466
+ *JSON::XS::decode = \&JSON::Backend::XS::Supportable::_decode;
467
+ *JSON::XS::encode = \&JSON::Backend::XS::Supportable::_encode;
468
+ *JSON::XS::incr_parse = \&JSON::Backend::XS::Supportable::_incr_parse;
469
+
470
+ *{JSON::XS::_original_decode} = $JSON_XS_decode_orignal;
471
+ *{JSON::XS::_original_encode} = $JSON_XS_encode_orignal;
472
+ *{JSON::XS::_original_incr_parse} = $JSON_XS_incr_parse_orignal;
473
+
474
+ push @JSON::Backend::XS::Supportable::ISA, 'JSON';
475
+
476
+ my $pkg = 'JSON::Backend::XS::Supportable';
477
+
478
+ *{JSON::new} = sub {
479
+ my $proto = JSON::XS->new; $$proto = 0;
480
+ bless $proto, $pkg;
481
+ };
482
+
483
+
484
+ for my $method (@methods) {
485
+ my $flag = uc($method);
486
+ my $type |= (UNSUPPORTED_ENCODE_FLAG->{$flag} || 0);
487
+ $type |= (UNSUPPORTED_DECODE_FLAG->{$flag} || 0);
488
+
489
+ next unless($type);
490
+
491
+ $pkg->_make_unsupported_method($method => $type);
492
+ }
493
+
494
+ # push @{"JSON::XS::Boolean::ISA"}, qw(JSON::PP::Boolean);
495
+ # push @{"JSON::PP::Boolean::ISA"}, qw(JSON::Boolean);
496
+
497
+ $JSON::DEBUG and Carp::carp("set -support_by_pp mode.");
498
+
499
+ return 1;
500
+ }
501
+
502
+
503
+
504
+
505
+ #
506
+ # Helper classes for XS
507
+ #
508
+
509
+ package JSON::Backend::XS::Supportable;
510
+
511
+ $Carp::Internal{'JSON::Backend::XS::Supportable'} = 1;
512
+
513
+ sub _make_unsupported_method {
514
+ my ($pkg, $method, $type) = @_;
515
+
516
+ local $^W;
517
+ no strict qw(refs);
518
+
519
+ *{"$pkg\::$method"} = sub {
520
+ local $^W;
521
+ if (defined $_[1] ? $_[1] : 1) {
522
+ ${$_[0]} |= $type;
523
+ }
524
+ else {
525
+ ${$_[0]} &= ~$type;
526
+ }
527
+ $_[0];
528
+ };
529
+
530
+ *{"$pkg\::get_$method"} = sub {
531
+ ${$_[0]} & $type ? 1 : '';
532
+ };
533
+
534
+ }
535
+
536
+
537
+ sub _set_for_pp {
538
+ JSON::_load_pp( $_INSTALL_ONLY );
539
+
540
+ my $type = shift;
541
+ my $pp = JSON::PP->new;
542
+ my $prop = $_[0]->property;
543
+
544
+ for my $name (keys %$prop) {
545
+ $pp->$name( $prop->{$name} ? $prop->{$name} : 0 );
546
+ }
547
+
548
+ my $unsupported = $type eq 'encode' ? JSON::Backend::XS::UNSUPPORTED_ENCODE_FLAG
549
+ : JSON::Backend::XS::UNSUPPORTED_DECODE_FLAG;
550
+ my $flags = ${$_[0]} || 0;
551
+
552
+ for my $name (keys %$unsupported) {
553
+ next if ($name eq 'EXPANDED'); # for developer's
554
+ my $enable = ($flags & $unsupported->{$name}) ? 1 : 0;
555
+ my $method = lc $name;
556
+ $pp->$method($enable);
557
+ }
558
+
559
+ $pp->indent_length( $_[0]->get_indent_length );
560
+
561
+ return $pp;
562
+ }
563
+
564
+ sub _encode { # using with PP encode
565
+ if (${$_[0]}) {
566
+ _set_for_pp('encode' => @_)->encode($_[1]);
567
+ }
568
+ else {
569
+ $_[0]->_original_encode( $_[1] );
570
+ }
571
+ }
572
+
573
+
574
+ sub _decode { # if unsupported-flag is set, use PP
575
+ if (${$_[0]}) {
576
+ _set_for_pp('decode' => @_)->decode($_[1]);
577
+ }
578
+ else {
579
+ $_[0]->_original_decode( $_[1] );
580
+ }
581
+ }
582
+
583
+
584
+ sub decode_prefix { # if unsupported-flag is set, use PP
585
+ _set_for_pp('decode' => @_)->decode_prefix($_[1]);
586
+ }
587
+
588
+
589
+ sub _incr_parse {
590
+ if (${$_[0]}) {
591
+ _set_for_pp('decode' => @_)->incr_parse($_[1]);
592
+ }
593
+ else {
594
+ $_[0]->_original_incr_parse( $_[1] );
595
+ }
596
+ }
597
+
598
+
599
+ sub get_indent_length {
600
+ ${$_[0]} << 4 >> 16;
601
+ }
602
+
603
+
604
+ sub indent_length {
605
+ my $length = $_[1];
606
+
607
+ if (!defined $length or $length > 15 or $length < 0) {
608
+ Carp::carp "The acceptable range of indent_length() is 0 to 15.";
609
+ }
610
+ else {
611
+ local $^W;
612
+ $length <<= 12;
613
+ ${$_[0]} &= ~ JSON::Backend::XS::INDENT_LENGTH_FLAG;
614
+ ${$_[0]} |= $length;
615
+ *JSON::XS::encode = \&JSON::Backend::XS::Supportable::_encode;
616
+ }
617
+
618
+ $_[0];
619
+ }
620
+
621
+
622
+ 1;
623
+ __END__
624
+
625
+ =head1 NAME
626
+
627
+ JSON - JSON (JavaScript Object Notation) encoder/decoder
628
+
629
+ =head1 SYNOPSIS
630
+
631
+ use JSON; # imports encode_json, decode_json, to_json and from_json.
632
+
633
+ # simple and fast interfaces (expect/generate UTF-8)
634
+
635
+ $utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
636
+ $perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
637
+
638
+ # OO-interface
639
+
640
+ $json = JSON->new->allow_nonref;
641
+
642
+ $json_text = $json->encode( $perl_scalar );
643
+ $perl_scalar = $json->decode( $json_text );
644
+
645
+ $pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
646
+
647
+ # If you want to use PP only support features, call with '-support_by_pp'
648
+ # When an XS-unsupported feature is enabled, the PP (de|en)code is used instead of the XS one.
649
+
650
+ use JSON -support_by_pp;
651
+
652
+ # option-acceptable interfaces (expect/generate UNICODE by default)
653
+
654
+ $json_text = to_json( $perl_scalar, { ascii => 1, pretty => 1 } );
655
+ $perl_scalar = from_json( $json_text, { utf8 => 1 } );
656
+
657
+ # When choosing between (en|de)code_json and (to|from)_json: if you are writing
658
+ # code that communicates with the outside world (encoded in UTF-8),
659
+ # (en|de)code_json is recommended.
660
+
661
+ =head1 VERSION
662
+
663
+ 2.90
664
+
665
+ This version is compatible with JSON::XS B<2.34> and later.
666
+ (Not yet compatible with JSON::XS B<3.0x>.)
667
+
668
+
669
+ =head1 NOTE
670
+
671
+ JSON::PP was earlier included in the C<JSON> distribution, but
672
+ has since Perl 5.14 been a core module. For this reason,
673
+ L<JSON::PP> was removed from the JSON distribution and can now
674
+ be found also in the Perl5 repository at
675
+
676
+ =over
677
+
678
+ =item * L<http://perl5.git.perl.org/perl.git>
679
+
680
+ =back
681
+
682
+ (The newest JSON::PP version still exists in CPAN.)
683
+
684
+ Instead, the C<JSON> distribution will include JSON::backportPP
685
+ for backwards compatibility. JSON.pm should thus work as it did
686
+ before.
687
+
688
+ =head1 DESCRIPTION
689
+
690
+ *************************** CAUTION **************************************
691
+ * *
692
+ * INCOMPATIBLE CHANGE (JSON.pm version 2.90) *
693
+ * *
694
+ * JSON.pm had patched JSON::XS::Boolean and JSON::PP::Boolean internally *
695
+ * on loading time for making these modules inherit JSON::Boolean. *
696
+ * But since JSON::XS v3.0 it uses Types::Serialiser as boolean class. *
697
+ * As a result, JSON.pm now breaks boolean class overload features and *
698
+ * -support_by_pp if JSON::XS v3.0 or later is installed. *
699
+ * *
700
+ * JSON::true and JSON::false returned JSON::Boolean objects. *
701
+ * For workaround, they return JSON::PP::Boolean objects in this version. *
702
+ * *
703
+ * isa_ok(JSON::true, 'JSON::PP::Boolean'); *
704
+ * *
705
+ * And it discards a feature: *
706
+ * *
707
+ * ok(JSON::true eq 'true'); *
708
+ * *
709
+ * In other words, JSON::PP::Boolean overloads numeric operations only. *
710
+ * *
711
+ * ok( JSON::true == 1 ); *
712
+ * *
713
+ **************************************************************************
714
+
715
+ ************************** CAUTION ********************************
716
+ * This is 'JSON module version 2' and there are many differences *
717
+ * to version 1.xx *
718
+ * Please check your applications using old version. *
719
+ * See to 'INCOMPATIBLE CHANGES TO OLD VERSION' *
720
+ *******************************************************************
721
+
722
+ JSON (JavaScript Object Notation) is a simple data format.
723
+ See to L<http://www.json.org/> and C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>).
724
+
725
+ This module converts Perl data structures to JSON and vice versa using either
726
+ L<JSON::XS> or L<JSON::PP>.
727
+
728
+ JSON::XS is the fastest and most proper JSON module on CPAN which must be
729
+ compiled and installed in your environment.
730
+ JSON::PP is a pure-Perl module which is bundled in this distribution and
731
+ has a strong compatibility to JSON::XS.
732
+
733
+ This module tries to use JSON::XS by default and, failing that, uses JSON::PP instead.
734
+ So its features completely depend on JSON::XS or JSON::PP.
735
+
736
+ See to L<BACKEND MODULE DECISION>.
737
+
738
+ To distinguish the module name 'JSON' and the format type JSON,
739
+ the former is quoted by CE<lt>E<gt> (its results vary with your using media),
740
+ and the latter is left just as it is.
741
+
742
+ Module name : C<JSON>
743
+
744
+ Format type : JSON
745
+
746
+ =head2 FEATURES
747
+
748
+ =over
749
+
750
+ =item * correct unicode handling
751
+
752
+ This module (i.e. backend modules) knows how to handle Unicode, documents
753
+ how and when it does so, and even documents what "correct" means.
754
+
755
+ Even though there are limitations, this feature is available since Perl version 5.6.
756
+
757
+ JSON::XS requires Perl 5.8.2 (but works correctly in 5.8.8 or later), so in older versions
758
+ C<JSON> should call JSON::PP as the backend which can be used since Perl 5.005.
759
+
760
+ With Perl 5.8.x JSON::PP works, but from 5.8.0 to 5.8.2, because of a Perl side problem,
761
+ JSON::PP works slower in those versions. And in 5.005, the Unicode handling is not available.
762
+ See to L<JSON::PP/UNICODE HANDLING ON PERLS> for more information.
763
+
764
+ See also to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>
765
+ and L<JSON::XS/ENCODING/CODESET_FLAG_NOTES>.
766
+
767
+
768
+ =item * round-trip integrity
769
+
770
+ When you serialise a perl data structure using only data types supported
771
+ by JSON and Perl, the deserialised data structure is identical on the Perl
772
+ level. (e.g. the string "2.0" doesn't suddenly become "2" just because
773
+ it looks like a number). There I<are> minor exceptions to this, read the
774
+ L</MAPPING> section below to learn about those.
775
+
776
+
777
+ =item * strict checking of JSON correctness
778
+
779
+ There is no guessing, no generating of illegal JSON texts by default,
780
+ and only JSON is accepted as input by default (the latter is a security
781
+ feature).
782
+
783
+ See to L<JSON::XS/FEATURES> and L<JSON::PP/FEATURES>.
784
+
785
+ =item * fast
786
+
787
+ This module returns a JSON::XS object itself if available.
788
+ Compared to other JSON modules and other serialisers such as Storable,
789
+ JSON::XS usually compares favorably in terms of speed, too.
790
+
791
+ If not available, C<JSON> returns a JSON::PP object instead of JSON::XS and
792
+ it is very slow as pure-Perl.
793
+
794
+ =item * simple to use
795
+
796
+ This module has both a simple functional interface as well as an
797
+ object oriented interface.
798
+
799
+ =item * reasonably versatile output formats
800
+
801
+ You can choose between the most compact guaranteed-single-line format possible
802
+ (nice for simple line-based protocols), a pure-ASCII format (for when your transport
803
+ is not 8-bit clean, still supports the whole Unicode range), or a pretty-printed
804
+ format (for when you want to read that stuff). Or you can combine those features
805
+ in whatever way you like.
806
+
807
+ =back
808
+
809
+ =head1 FUNCTIONAL INTERFACE
810
+
811
+ Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
812
+ C<to_json> and C<from_json> are additional functions.
813
+
814
+ =head2 encode_json
815
+
816
+ $json_text = encode_json $perl_scalar
817
+
818
+ Converts the given Perl data structure to a UTF-8 encoded, binary string.
819
+
820
+ This function call is functionally identical to:
821
+
822
+ $json_text = JSON->new->utf8->encode($perl_scalar)
823
+
824
+ =head2 decode_json
825
+
826
+ $perl_scalar = decode_json $json_text
827
+
828
+ The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
829
+ to parse that as an UTF-8 encoded JSON text, returning the resulting
830
+ reference.
831
+
832
+ This function call is functionally identical to:
833
+
834
+ $perl_scalar = JSON->new->utf8->decode($json_text)
835
+
836
+
837
+ =head2 to_json
838
+
839
+ $json_text = to_json($perl_scalar)
840
+
841
+ Converts the given Perl data structure to a json string.
842
+
843
+ This function call is functionally identical to:
844
+
845
+ $json_text = JSON->new->encode($perl_scalar)
846
+
847
+ Takes a hash reference as the optional second argument.
848
+
849
+ $json_text = to_json($perl_scalar, $flag_hashref)
850
+
851
+ So,
852
+
853
+ $json_text = to_json($perl_scalar, {utf8 => 1, pretty => 1})
854
+
855
+ is equivalent to:
856
+
857
+ $json_text = JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
858
+
859
+ If you want to write a modern perl code which communicates to outer world,
860
+ you should use C<encode_json> (supposed that JSON data are encoded in UTF-8).
861
+
862
+ =head2 from_json
863
+
864
+ $perl_scalar = from_json($json_text)
865
+
866
+ The opposite of C<to_json>: expects a json string and tries
867
+ to parse it, returning the resulting reference.
868
+
869
+ This function call is functionally identical to:
870
+
871
+ $perl_scalar = JSON->new->decode($json_text)
872
+
873
+ Takes a hash reference as the optional second argument.
874
+
875
+ $perl_scalar = from_json($json_text, $flag_hashref)
876
+
877
+ So,
878
+
879
+ $perl_scalar = from_json($json_text, {utf8 => 1})
880
+
881
+ is equivalent to:
882
+
883
+ $perl_scalar = JSON->new->utf8(1)->decode($json_text)
884
+
885
+ If you want to write a modern perl code which communicates to outer world,
886
+ you should use C<decode_json> (supposed that JSON data are encoded in UTF-8).
887
+
888
+ =head2 JSON::is_bool
889
+
890
+ $is_boolean = JSON::is_bool($scalar)
891
+
892
+ Returns true if the passed scalar represents either JSON::true or
893
+ JSON::false, two constants that act like C<1> and C<0> respectively
894
+ and are also used to represent JSON C<true> and C<false> in Perl strings.
895
+
896
+ =head2 JSON::true
897
+
898
+ Returns JSON true value which is blessed object.
899
+ It C<isa> JSON::Boolean object.
900
+
901
+ =head2 JSON::false
902
+
903
+ Returns JSON false value which is blessed object.
904
+ It C<isa> JSON::Boolean object.
905
+
906
+ =head2 JSON::null
907
+
908
+ Returns C<undef>.
909
+
910
+ See L<MAPPING>, below, for more information on how JSON values are mapped to
911
+ Perl.
912
+
913
+ =head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
914
+
915
+ This section supposes that your perl version is 5.8 or later.
916
+
917
+ If you know a JSON text from an outer world - a network, a file content, and so on,
918
+ is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
919
+ with C<utf8> enable. And the decoded result will contain UNICODE characters.
920
+
921
+ # from network
922
+ my $json = JSON->new->utf8;
923
+ my $json_text = CGI->new->param( 'json_data' );
924
+ my $perl_scalar = $json->decode( $json_text );
925
+
926
+ # from file content
927
+ local $/;
928
+ open( my $fh, '<', 'json.data' );
929
+ $json_text = <$fh>;
930
+ $perl_scalar = decode_json( $json_text );
931
+
932
+ If an outer data is not encoded in UTF-8, firstly you should C<decode> it.
933
+
934
+ use Encode;
935
+ local $/;
936
+ open( my $fh, '<', 'json.data' );
937
+ my $encoding = 'cp932';
938
+ my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
939
+
940
+ # or you can write the below code.
941
+ #
942
+ # open( my $fh, "<:encoding($encoding)", 'json.data' );
943
+ # $unicode_json_text = <$fh>;
944
+
945
+ In this case, C<$unicode_json_text> is of course UNICODE string.
946
+ So you B<cannot> use C<decode_json> nor C<JSON> module object with C<utf8> enable.
947
+ Instead of them, you use C<JSON> module object with C<utf8> disable or C<from_json>.
948
+
949
+ $perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
950
+ # or
951
+ $perl_scalar = from_json( $unicode_json_text );
952
+
953
+ Or C<encode 'utf8'> and C<decode_json>:
954
+
955
+ $perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
956
+ # this way is not efficient.
957
+
958
+ And now, you want to convert your C<$perl_scalar> into JSON data and
959
+ send it to an outer world - a network or a file content, and so on.
960
+
961
+ Your data usually contains UNICODE strings and you want the converted data to be encoded
962
+ in UTF-8, you should use C<encode_json> or C<JSON> module object with C<utf8> enable.
963
+
964
+ print encode_json( $perl_scalar ); # to a network? file? or display?
965
+ # or
966
+ print $json->utf8->encode( $perl_scalar );
967
+
968
+ If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
969
+ for some reason, then its characters are regarded as B<latin1> for perl
970
+ (because it does not concern with your $encoding).
971
+ You B<cannot> use C<encode_json> nor C<JSON> module object with C<utf8> enable.
972
+ Instead of them, you use C<JSON> module object with C<utf8> disable or C<to_json>.
973
+ Note that the resulted text is a UNICODE string but no problem to print it.
974
+
975
+ # $perl_scalar contains $encoding encoded string values
976
+ $unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
977
+ # or
978
+ $unicode_json_text = to_json( $perl_scalar );
979
+ # $unicode_json_text consists of characters less than 0x100
980
+ print $unicode_json_text;
981
+
982
+ Or C<decode $encoding> all string values and C<encode_json>:
983
+
984
+ $perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
985
+ # ... do it to each string values, then encode_json
986
+ $json_text = encode_json( $perl_scalar );
987
+
988
+ This method is a proper way but probably not efficient.
989
+
990
+ See to L<Encode>, L<perluniintro>.
991
+
992
+
993
+ =head1 COMMON OBJECT-ORIENTED INTERFACE
994
+
995
+ =head2 new
996
+
997
+ $json = JSON->new
998
+
999
+ Returns a new C<JSON> object inherited from either JSON::XS or JSON::PP
1000
+ that can be used to de/encode JSON strings.
1001
+
1002
+ All boolean flags described below are by default I<disabled>.
1003
+
1004
+ The mutators for flags all return the JSON object again and thus calls can
1005
+ be chained:
1006
+
1007
+ my $json = JSON->new->utf8->space_after->encode({a => [1,2]})
1008
+ => {"a": [1, 2]}
1009
+
1010
+ =head2 ascii
1011
+
1012
+ $json = $json->ascii([$enable])
1013
+
1014
+ $enabled = $json->get_ascii
1015
+
1016
+ If $enable is true (or missing), then the encode method will not generate characters outside
1017
+ the code range 0..127. Any Unicode characters outside that range will be escaped using either
1018
+ a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
1019
+
1020
+ If $enable is false, then the encode method will not escape Unicode characters unless
1021
+ required by the JSON syntax or other flags. This results in a faster and more compact format.
1022
+
1023
+ This feature depends on the used Perl version and environment.
1024
+
1025
+ See to L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
1026
+
1027
+ JSON->new->ascii(1)->encode([chr 0x10401])
1028
+ => ["\ud801\udc01"]
1029
+
1030
+ =head2 latin1
1031
+
1032
+ $json = $json->latin1([$enable])
1033
+
1034
+ $enabled = $json->get_latin1
1035
+
1036
+ If $enable is true (or missing), then the encode method will encode the resulting JSON
1037
+ text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
1038
+
1039
+ If $enable is false, then the encode method will not escape Unicode characters
1040
+ unless required by the JSON syntax or other flags.
1041
+
1042
+ JSON->new->latin1->encode (["\x{89}\x{abc}"])
1043
+ => ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
1044
+
1045
+ =head2 utf8
1046
+
1047
+ $json = $json->utf8([$enable])
1048
+
1049
+ $enabled = $json->get_utf8
1050
+
1051
+ If $enable is true (or missing), then the encode method will encode the JSON result
1052
+ into UTF-8, as required by many protocols, while the decode method expects to be handed
1053
+ an UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
1054
+ characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
1055
+
1056
+ In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
1057
+ encoding families, as described in RFC4627.
1058
+
1059
+ If $enable is false, then the encode method will return the JSON string as a (non-encoded)
1060
+ Unicode string, while decode expects thus a Unicode string. Any decoding or encoding
1061
+ (e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
1062
+
1063
+
1064
+ Example, output UTF-16BE-encoded JSON:
1065
+
1066
+ use Encode;
1067
+ $jsontext = encode "UTF-16BE", JSON::XS->new->encode ($object);
1068
+
1069
+ Example, decode UTF-32LE-encoded JSON:
1070
+
1071
+ use Encode;
1072
+ $object = JSON::XS->new->decode (decode "UTF-32LE", $jsontext);
1073
+
1074
+ See to L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
1075
+
1076
+
1077
+ =head2 pretty
1078
+
1079
+ $json = $json->pretty([$enable])
1080
+
1081
+ This enables (or disables) all of the C<indent>, C<space_before> and
1082
+ C<space_after> (and in the future possibly more) flags in one call to
1083
+ generate the most readable (or most compact) form possible.
1084
+
1085
+ Equivalent to:
1086
+
1087
+ $json->indent->space_before->space_after
1088
+
1089
+ The indent space length is three and JSON::XS cannot change the indent
1090
+ space length.
1091
+
1092
+ =head2 indent
1093
+
1094
+ $json = $json->indent([$enable])
1095
+
1096
+ $enabled = $json->get_indent
1097
+
1098
+ If C<$enable> is true (or missing), then the C<encode> method will use a multiline
1099
+ format as output, putting every array member or object/hash key-value pair
1100
+ into its own line, indenting them properly.
1101
+
1102
+ If C<$enable> is false, no newlines or indenting will be produced, and the
1103
+ resulting JSON text is guaranteed not to contain any C<newlines>.
1104
+
1105
+ This setting has no effect when decoding JSON texts.
1106
+
1107
+ The indent space length is three.
1108
+ With JSON::PP, you can also access C<indent_length> to change indent space length.
1109
+
1110
+
1111
+ =head2 space_before
1112
+
1113
+ $json = $json->space_before([$enable])
1114
+
1115
+ $enabled = $json->get_space_before
1116
+
1117
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1118
+ optional space before the C<:> separating keys from values in JSON objects.
1119
+
1120
+ If C<$enable> is false, then the C<encode> method will not add any extra
1121
+ space at those places.
1122
+
1123
+ This setting has no effect when decoding JSON texts.
1124
+
1125
+ Example, space_before enabled, space_after and indent disabled:
1126
+
1127
+ {"key" :"value"}
1128
+
1129
+
1130
+ =head2 space_after
1131
+
1132
+ $json = $json->space_after([$enable])
1133
+
1134
+ $enabled = $json->get_space_after
1135
+
1136
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1137
+ optional space after the C<:> separating keys from values in JSON objects
1138
+ and extra whitespace after the C<,> separating key-value pairs and array
1139
+ members.
1140
+
1141
+ If C<$enable> is false, then the C<encode> method will not add any extra
1142
+ space at those places.
1143
+
1144
+ This setting has no effect when decoding JSON texts.
1145
+
1146
+ Example, space_before and indent disabled, space_after enabled:
1147
+
1148
+ {"key": "value"}
1149
+
1150
+
1151
+ =head2 relaxed
1152
+
1153
+ $json = $json->relaxed([$enable])
1154
+
1155
+ $enabled = $json->get_relaxed
1156
+
1157
+ If C<$enable> is true (or missing), then C<decode> will accept some
1158
+ extensions to normal JSON syntax (see below). C<encode> will not be
1159
+ affected in any way. I<Be aware that this option makes you accept invalid
1160
+ JSON texts as if they were valid!>. I suggest only to use this option to
1161
+ parse application-specific files written by humans (configuration files,
1162
+ resource files etc.)
1163
+
1164
+ If C<$enable> is false (the default), then C<decode> will only accept
1165
+ valid JSON texts.
1166
+
1167
+ Currently accepted extensions are:
1168
+
1169
+ =over 4
1170
+
1171
+ =item * list items can have an end-comma
1172
+
1173
+ JSON I<separates> array elements and key-value pairs with commas. This
1174
+ can be annoying if you write JSON texts manually and want to be able to
1175
+ quickly append elements, so this extension accepts comma at the end of
1176
+ such items not just between them:
1177
+
1178
+ [
1179
+ 1,
1180
+ 2, <- this comma not normally allowed
1181
+ ]
1182
+ {
1183
+ "k1": "v1",
1184
+ "k2": "v2", <- this comma not normally allowed
1185
+ }
1186
+
1187
+ =item * shell-style '#'-comments
1188
+
1189
+ Whenever JSON allows whitespace, shell-style comments are additionally
1190
+ allowed. They are terminated by the first carriage-return or line-feed
1191
+ character, after which more white-space and comments are allowed.
1192
+
1193
+ [
1194
+ 1, # this comment not allowed in JSON
1195
+ # neither this one...
1196
+ ]
1197
+
1198
+ =back
1199
+
1200
+
1201
+ =head2 canonical
1202
+
1203
+ $json = $json->canonical([$enable])
1204
+
1205
+ $enabled = $json->get_canonical
1206
+
1207
+ If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
1208
+ by sorting their keys. This is adding a comparatively high overhead.
1209
+
1210
+ If C<$enable> is false, then the C<encode> method will output key-value
1211
+ pairs in the order Perl stores them (which will likely change between runs
1212
+ of the same script).
1213
+
1214
+ This option is useful if you want the same data structure to be encoded as
1215
+ the same JSON text (given the same overall settings). If it is disabled,
1216
+ the same hash might be encoded differently even if contains the same data,
1217
+ as key-value pairs have no inherent ordering in Perl.
1218
+
1219
+ This setting has no effect when decoding JSON texts.
1220
+
1221
+ =head2 allow_nonref
1222
+
1223
+ $json = $json->allow_nonref([$enable])
1224
+
1225
+ $enabled = $json->get_allow_nonref
1226
+
1227
+ If C<$enable> is true (or missing), then the C<encode> method can convert a
1228
+ non-reference into its corresponding string, number or null JSON value,
1229
+ which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
1230
+ values instead of croaking.
1231
+
1232
+ If C<$enable> is false, then the C<encode> method will croak if it isn't
1233
+ passed an arrayref or hashref, as JSON texts must either be an object
1234
+ or array. Likewise, C<decode> will croak if given something that is not a
1235
+ JSON object or array.
1236
+
1237
+ JSON->new->allow_nonref->encode ("Hello, World!")
1238
+ => "Hello, World!"
1239
+
1240
+ =head2 allow_unknown
1241
+
1242
+ $json = $json->allow_unknown ([$enable])
1243
+
1244
+ $enabled = $json->get_allow_unknown
1245
+
1246
+ If $enable is true (or missing), then "encode" will *not* throw an
1247
+ exception when it encounters values it cannot represent in JSON (for
1248
+ example, filehandles) but instead will encode a JSON "null" value.
1249
+ Note that blessed objects are not included here and are handled
1250
+ separately by C<allow_blessed>.
1251
+
1252
+ If $enable is false (the default), then "encode" will throw an
1253
+ exception when it encounters anything it cannot encode as JSON.
1254
+
1255
+ This option does not affect "decode" in any way, and it is
1256
+ recommended to leave it off unless you know your communications
1257
+ partner.
1258
+
1259
+ =head2 allow_blessed
1260
+
1261
+ $json = $json->allow_blessed([$enable])
1262
+
1263
+ $enabled = $json->get_allow_blessed
1264
+
1265
+ If C<$enable> is true (or missing), then the C<encode> method will not
1266
+ barf when it encounters a blessed reference. Instead, the value of the
1267
+ B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
1268
+ disabled or no C<TO_JSON> method found) or a representation of the
1269
+ object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
1270
+ encoded. Has no effect on C<decode>.
1271
+
1272
+ If C<$enable> is false (the default), then C<encode> will throw an
1273
+ exception when it encounters a blessed object.
1274
+
1275
+
1276
+ =head2 convert_blessed
1277
+
1278
+ $json = $json->convert_blessed([$enable])
1279
+
1280
+ $enabled = $json->get_convert_blessed
1281
+
1282
+ If C<$enable> is true (or missing), then C<encode>, upon encountering a
1283
+ blessed object, will check for the availability of the C<TO_JSON> method
1284
+ on the object's class. If found, it will be called in scalar context
1285
+ and the resulting scalar will be encoded instead of the object. If no
1286
+ C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
1287
+ to do.
1288
+
1289
+ The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
1290
+ returns other blessed objects, those will be handled in the same
1291
+ way. C<TO_JSON> must take care of not causing an endless recursion cycle
1292
+ (== crash) in this case. The name of C<TO_JSON> was chosen because other
1293
+ methods called by the Perl core (== not by the user of the object) are
1294
+ usually in upper case letters and to avoid collisions with the C<to_json>
1295
+ function or method.
1296
+
1297
+ This setting does not yet influence C<decode> in any way.
1298
+
1299
+ If C<$enable> is false, then the C<allow_blessed> setting will decide what
1300
+ to do when a blessed object is found.
1301
+
1302
+ =over
1303
+
1304
+ =item convert_blessed_universally mode
1305
+
1306
+ If you use C<JSON> with C<-convert_blessed_universally>, the C<UNIVERSAL::TO_JSON>
1307
+ subroutine is defined as the below code:
1308
+
1309
+ *UNIVERSAL::TO_JSON = sub {
1310
+ my $b_obj = B::svref_2object( $_[0] );
1311
+ return $b_obj->isa('B::HV') ? { %{ $_[0] } }
1312
+ : $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
1313
+ : undef
1314
+ ;
1315
+ }
1316
+
1317
+ This causes the C<encode> method to convert simple blessed objects into
1318
+ JSON objects as non-blessed object.
1319
+
1320
+ use JSON -convert_blessed_universally;
1321
+ $json->allow_blessed->convert_blessed->encode( $blessed_object )
1322
+
1323
+ This feature is experimental and may be removed in the future.
1324
+
1325
+ =back
1326
+
1327
+ =head2 filter_json_object
1328
+
1329
+ $json = $json->filter_json_object([$coderef])
1330
+
1331
+ When C<$coderef> is specified, it will be called from C<decode> each
1332
+ time it decodes a JSON object. The only argument passed to the coderef
1333
+ is a reference to the newly-created hash. If the code reference returns
1334
+ a single scalar (which need not be a reference), this value
1335
+ (i.e. a copy of that scalar to avoid aliasing) is inserted into the
1336
+ deserialised data structure. If it returns an empty list
1337
+ (NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
1338
+ hash will be inserted. This setting can slow down decoding considerably.
1339
+
1340
+ When C<$coderef> is omitted or undefined, any existing callback will
1341
+ be removed and C<decode> will not change the deserialised hash in any
1342
+ way.
1343
+
1344
+ Example, convert all JSON objects into the integer 5:
1345
+
1346
+ my $js = JSON->new->filter_json_object (sub { 5 });
1347
+ # returns [5]
1348
+ $js->decode ('[{}]'); # the given subroutine takes a hash reference.
1349
+ # throw an exception because allow_nonref is not enabled
1350
+ # so a lone 5 is not allowed.
1351
+ $js->decode ('{"a":1, "b":2}');
1352
+
1353
+
1354
+ =head2 filter_json_single_key_object
1355
+
1356
+ $json = $json->filter_json_single_key_object($key [=> $coderef])
1357
+
1358
+ Works remotely similar to C<filter_json_object>, but is only called for
1359
+ JSON objects having a single key named C<$key>.
1360
+
1361
+ This C<$coderef> is called before the one specified via
1362
+ C<filter_json_object>, if any. It gets passed the single value in the JSON
1363
+ object. If it returns a single value, it will be inserted into the data
1364
+ structure. If it returns nothing (not even C<undef> but the empty list),
1365
+ the callback from C<filter_json_object> will be called next, as if no
1366
+ single-key callback were specified.
1367
+
1368
+ If C<$coderef> is omitted or undefined, the corresponding callback will be
1369
+ disabled. There can only ever be one callback for a given key.
1370
+
1371
+ As this callback gets called less often than the C<filter_json_object>
1372
+ one, decoding speed will not usually suffer as much. Therefore, single-key
1373
+ objects make excellent targets to serialise Perl objects into, especially
1374
+ as single-key JSON objects are as close to the type-tagged value concept
1375
+ as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
1376
+ support this in any way, so you need to make sure your data never looks
1377
+ like a serialised Perl hash.
1378
+
1379
+ Typical names for the single object key are C<__class_whatever__>, or
1380
+ C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
1381
+ things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
1382
+ with real hashes.
1383
+
1384
+ Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
1385
+ into the corresponding C<< $WIDGET{<id>} >> object:
1386
+
1387
+ # return whatever is in $WIDGET{5}:
1388
+ JSON
1389
+ ->new
1390
+ ->filter_json_single_key_object (__widget__ => sub {
1391
+ $WIDGET{ $_[0] }
1392
+ })
1393
+ ->decode ('{"__widget__": 5}')
1394
+
1395
+ # this can be used with a TO_JSON method in some "widget" class
1396
+ # for serialisation to json:
1397
+ sub WidgetBase::TO_JSON {
1398
+ my ($self) = @_;
1399
+
1400
+ unless ($self->{id}) {
1401
+ $self->{id} = ..get..some..id..;
1402
+ $WIDGET{$self->{id}} = $self;
1403
+ }
1404
+
1405
+ { __widget__ => $self->{id} }
1406
+ }
1407
+
1408
+
1409
+ =head2 shrink
1410
+
1411
+ $json = $json->shrink([$enable])
1412
+
1413
+ $enabled = $json->get_shrink
1414
+
1415
+ With JSON::XS, this flag resizes strings generated by either
1416
+ C<encode> or C<decode> to their minimum size possible. This can save
1417
+ memory when your JSON texts are either very very long or you have many
1418
+ short strings. It will also try to downgrade any strings to octet-form
1419
+ if possible: perl stores strings internally either in an encoding called
1420
+ UTF-X or in octet-form. The latter cannot store everything but uses less
1421
+ space in general (and some buggy Perl or C code might even rely on that
1422
+ internal representation being used).
1423
+
1424
+ With JSON::PP, it is noop about resizing strings but tries
1425
+ C<utf8::downgrade> to the returned string by C<encode>. See to L<utf8>.
1426
+
1427
+ See to L<JSON::XS/OBJECT-ORIENTED INTERFACE> and L<JSON::PP/METHODS>.
1428
+
1429
+ =head2 max_depth
1430
+
1431
+ $json = $json->max_depth([$maximum_nesting_depth])
1432
+
1433
+ $max_depth = $json->get_max_depth
1434
+
1435
+ Sets the maximum nesting level (default C<512>) accepted while encoding
1436
+ or decoding. If a higher nesting level is detected in JSON text or a Perl
1437
+ data structure, then the encoder and decoder will stop and croak at that
1438
+ point.
1439
+
1440
+ Nesting level is defined by number of hash- or arrayrefs that the encoder
1441
+ needs to traverse to reach a given point or the number of C<{> or C<[>
1442
+ characters without their matching closing parenthesis crossed to reach a
1443
+ given character in a string.
1444
+
1445
+ If no argument is given, the highest possible setting will be used, which
1446
+ is rarely useful.
1447
+
1448
+ Note that nesting is implemented by recursion in C. The default value has
1449
+ been chosen to be as large as typical operating systems allow without
1450
+ crashing. (JSON::XS)
1451
+
1452
+ With JSON::PP as the backend, when a large value (100 or more) was set and
1453
+ it de/encodes a deep nested object/text, it may raise a warning
1454
+ 'Deep recursion on subroutine' at the perl runtime phase.
1455
+
1456
+ See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
1457
+
1458
+ =head2 max_size
1459
+
1460
+ $json = $json->max_size([$maximum_string_size])
1461
+
1462
+ $max_size = $json->get_max_size
1463
+
1464
+ Set the maximum length a JSON text may have (in bytes) where decoding is
1465
+ being attempted. The default is C<0>, meaning no limit. When C<decode>
1466
+ is called on a string that is longer than this many bytes, it will not
1467
+ attempt to decode the string but throw an exception. This setting has no
1468
+ effect on C<encode> (yet).
1469
+
1470
+ If no argument is given, the limit check will be deactivated (same as when
1471
+ C<0> is specified).
1472
+
1473
+ See L<JSON::XS/SECURITY CONSIDERATIONS>, below, for more info on why this is useful.
1474
+
1475
+ =head2 encode
1476
+
1477
+ $json_text = $json->encode($perl_scalar)
1478
+
1479
+ Converts the given Perl data structure (a simple scalar or a reference
1480
+ to a hash or array) to its JSON representation. Simple scalars will be
1481
+ converted into JSON string or number sequences, while references to arrays
1482
+ become JSON arrays and references to hashes become JSON objects. Undefined
1483
+ Perl values (e.g. C<undef>) become JSON C<null> values.
1484
+ References to the integers C<0> and C<1> are converted into C<false> and C<true>, respectively.
1485
+
1486
+ =head2 decode
1487
+
1488
+ $perl_scalar = $json->decode($json_text)
1489
+
1490
+ The opposite of C<encode>: expects a JSON text and tries to parse it,
1491
+ returning the resulting simple scalar or reference. Croaks on error.
1492
+
1493
+ JSON numbers and strings become simple Perl scalars. JSON arrays become
1494
+ Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
1495
+ C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
1496
+ C<null> becomes C<undef>.
1497
+
1498
+ =head2 decode_prefix
1499
+
1500
+ ($perl_scalar, $characters) = $json->decode_prefix($json_text)
1501
+
1502
+ This works like the C<decode> method, but instead of raising an exception
1503
+ when there is trailing garbage after the first JSON object, it will
1504
+ silently stop parsing there and return the number of characters consumed
1505
+ so far.
1506
+
1507
+ JSON->new->decode_prefix ("[1] the tail")
1508
+ => ([1], 3)
1509
+
1510
+ See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
1511
+
1512
+ =head2 property
1513
+
1514
+ $boolean = $json->property($property_name)
1515
+
1516
+ Returns the boolean value of one of the properties listed below.
1517
+
1518
+ The available properties are C<ascii>, C<latin1>, C<utf8>,
1519
+ C<indent>,C<space_before>, C<space_after>, C<relaxed>, C<canonical>,
1520
+ C<allow_nonref>, C<allow_unknown>, C<allow_blessed>, C<convert_blessed>,
1521
+ C<shrink>, C<max_depth> and C<max_size>.
1522
+
1523
+ $boolean = $json->property('utf8');
1524
+ => 0
1525
+ $json->utf8;
1526
+ $boolean = $json->property('utf8');
1527
+ => 1
1528
+
1529
+ Sets the property with a given boolean value.
1530
+
1531
+ $json = $json->property($property_name => $boolean);
1532
+
1533
+ With no argument, it returns all the above properties as a hash reference.
1534
+
1535
+ $flag_hashref = $json->property();
1536
+
1537
+ =head1 INCREMENTAL PARSING
1538
+
1539
+ Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
1540
+
1541
+ In some cases, there is the need for incremental parsing of JSON texts.
1542
+ This module does allow you to parse a JSON stream incrementally.
1543
+ It does so by accumulating text until it has a full JSON object, which
1544
+ it then can decode. This process is similar to using C<decode_prefix>
1545
+ to see if a full JSON object is available, but is much more efficient
1546
+ (and can be implemented with a minimum of method calls).
1547
+
1548
+ The backend module will only attempt to parse the JSON text once it is sure it
1549
+ has enough text to get a decisive result, using a very simple but
1550
+ truly incremental parser. This means that it sometimes won't stop as
1551
+ early as the full parser, for example, it doesn't detect parenthesis
1552
+ mismatches. The only thing it guarantees is that it starts decoding as
1553
+ soon as a syntactically valid JSON text has been seen. This means you need
1554
+ to set resource limits (e.g. C<max_size>) to ensure the parser will stop
1555
+ parsing in the presence of syntax errors.
1556
+
1557
+ The following methods implement this incremental parser.
1558
+
1559
+ =head2 incr_parse
1560
+
1561
+ $json->incr_parse( [$string] ) # void context
1562
+
1563
+ $obj_or_undef = $json->incr_parse( [$string] ) # scalar context
1564
+
1565
+ @obj_or_empty = $json->incr_parse( [$string] ) # list context
1566
+
1567
+ This is the central parsing function. It can both append new text and
1568
+ extract objects from the stream accumulated so far (both of these
1569
+ functions are optional).
1570
+
1571
+ If C<$string> is given, then this string is appended to the already
1572
+ existing JSON fragment stored in the C<$json> object.
1573
+
1574
+ After that, if the function is called in void context, it will simply
1575
+ return without doing anything further. This can be used to add more text
1576
+ in as many chunks as you want.
1577
+
1578
+ If the method is called in scalar context, then it will try to extract
1579
+ exactly I<one> JSON object. If that is successful, it will return this
1580
+ object, otherwise it will return C<undef>. If there is a parse error,
1581
+ this method will croak just as C<decode> would do (one can then use
1582
+ C<incr_skip> to skip the erroneous part). This is the most common way of
1583
+ using the method.
1584
+
1585
+ And finally, in list context, it will try to extract as many objects
1586
+ from the stream as it can find and return them, or the empty list
1587
+ otherwise. For this to work, there must be no separators between the JSON
1588
+ objects or arrays, instead they must be concatenated back-to-back. If
1589
+ an error occurs, an exception will be raised as in the scalar context
1590
+ case. Note that in this case, any previously-parsed JSON texts will be
1591
+ lost.
1592
+
1593
+ Example: Parse some JSON arrays/objects in a given string and return them.
1594
+
1595
+ my @objs = JSON->new->incr_parse ("[5][7][1,2]");
1596
+
1597
+ =head2 incr_text
1598
+
1599
+ $lvalue_string = $json->incr_text
1600
+
1601
+ This method returns the currently stored JSON fragment as an lvalue, that
1602
+ is, you can manipulate it. This I<only> works when a preceding call to
1603
+ C<incr_parse> in I<scalar context> successfully returned an object. Under
1604
+ all other circumstances you must not call this function (I mean it.
1605
+ although in simple tests it might actually work, it I<will> fail under
1606
+ real world conditions). As a special exception, you can also call this
1607
+ method before having parsed anything.
1608
+
1609
+ This function is useful in two cases: a) finding the trailing text after a
1610
+ JSON object or b) parsing multiple JSON objects separated by non-JSON text
1611
+ (such as commas).
1612
+
1613
+ $json->incr_text =~ s/\s*,\s*//;
1614
+
1615
+ In Perl 5.005, the C<lvalue> attribute is not available.
1616
+ You must write code like the following:
1617
+
1618
+ $string = $json->incr_text;
1619
+ $string =~ s/\s*,\s*//;
1620
+ $json->incr_text( $string );
1621
+
1622
+ =head2 incr_skip
1623
+
1624
+ $json->incr_skip
1625
+
1626
+ This will reset the state of the incremental parser and will remove the
1627
+ parsed text from the input buffer. This is useful after C<incr_parse>
1628
+ died, in which case the input buffer and incremental parser state is left
1629
+ unchanged, to skip the text parsed so far and to reset the parse state.
1630
+
1631
+ =head2 incr_reset
1632
+
1633
+ $json->incr_reset
1634
+
1635
+ This completely resets the incremental parser, that is, after this call,
1636
+ it will be as if the parser had never parsed anything.
1637
+
1638
+ This is useful if you want to repeatedly parse JSON objects and want to
1639
+ ignore any trailing data, which means you have to reset the parser after
1640
+ each successful decode.
1641
+
1642
+ See L<JSON::XS/INCREMENTAL PARSING> for examples.
1643
+
1644
+
1645
+ =head1 JSON::PP SUPPORT METHODS
1646
+
1647
+ The methods below are JSON::PP's own methods, so they are available when C<JSON> works
1648
+ with JSON::PP (i.e. the created object is a JSON::PP object).
1649
+ See to L<JSON::PP/JSON::PP OWN METHODS> in detail.
1650
+
1651
+ If you use C<JSON> with additional C<-support_by_pp>, some methods
1652
+ are available even with JSON::XS. See to L<USE PP FEATURES EVEN THOUGH XS BACKEND>.
1653
+
1654
+ BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::XS' }
1655
+
1656
+ use JSON -support_by_pp;
1657
+
1658
+ my $json = JSON->new;
1659
+ $json->allow_nonref->escape_slash->encode("/");
1660
+
1661
+ # functional interfaces too.
1662
+ print to_json(["/"], {escape_slash => 1});
1663
+ print from_json('["foo"]', {utf8 => 1});
1664
+
1665
+ If you want C<-support_by_pp> but do not want any functions exported,
1666
+ use C<-no_export>.
1667
+
1668
+ use JSON -support_by_pp, -no_export;
1669
+ # functional interfaces are not exported.
1670
+
1671
+ =head2 allow_singlequote
1672
+
1673
+ $json = $json->allow_singlequote([$enable])
1674
+
1675
+ If C<$enable> is true (or missing), then C<decode> will accept
1676
+ any JSON strings quoted by single quotations that are invalid JSON
1677
+ format.
1678
+
1679
+ $json->allow_singlequote->decode(q|{"foo":'bar'}|);
1680
+ $json->allow_singlequote->decode(q|{'foo':"bar"}|);
1681
+ $json->allow_singlequote->decode(q|{'foo':'bar'}|);
1682
+
1683
+ As with the C<relaxed> option, this option may be used to parse
1684
+ application-specific files written by humans.
1685
+
1686
+ =head2 allow_barekey
1687
+
1688
+ $json = $json->allow_barekey([$enable])
1689
+
1690
+ If C<$enable> is true (or missing), then C<decode> will accept
1691
+ bare keys of JSON object that are invalid JSON format.
1692
+
1693
+ As with the C<relaxed> option, this option may be used to parse
1694
+ application-specific files written by humans.
1695
+
1696
+ $json->allow_barekey->decode('{foo:"bar"}');
1697
+
1698
+ =head2 allow_bignum
1699
+
1700
+ $json = $json->allow_bignum([$enable])
1701
+
1702
+ If C<$enable> is true (or missing), then C<decode> will convert
1703
+ the big integer Perl cannot handle as integer into a L<Math::BigInt>
1704
+ object and convert a floating number (any) into a L<Math::BigFloat>.
1705
+
1706
+ On the contrary, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
1707
+ objects into JSON numbers with C<allow_blessed> enabled.
1708
+
1709
+ $json->allow_nonref->allow_blessed->allow_bignum;
1710
+ $bigfloat = $json->decode('2.000000000000000000000000001');
1711
+ print $json->encode($bigfloat);
1712
+ # => 2.000000000000000000000000001
1713
+
1714
+ See to L<MAPPING> about the conversion of JSON number.
1715
+
1716
+ =head2 loose
1717
+
1718
+ $json = $json->loose([$enable])
1719
+
1720
+ The unescaped [\x00-\x1f\x22\x2f\x5c] strings are invalid in JSON strings
1721
+ and the module doesn't allow C<decode> to accept them (except for \x2f).
1722
+ If C<$enable> is true (or missing), then C<decode> will accept these
1723
+ unescaped strings.
1724
+
1725
+ $json->loose->decode(qq|["abc
1726
+ def"]|);
1727
+
1728
+ See to L<JSON::PP/JSON::PP OWN METHODS>.
1729
+
1730
+ =head2 escape_slash
1731
+
1732
+ $json = $json->escape_slash([$enable])
1733
+
1734
+ According to JSON Grammar, I<slash> (U+002F) is escaped. But by default
1735
+ JSON backend modules encode strings without escaping slash.
1736
+
1737
+ If C<$enable> is true (or missing), then C<encode> will escape slashes.
1738
+
1739
+ =head2 indent_length
1740
+
1741
+ $json = $json->indent_length($length)
1742
+
1743
+ With JSON::XS, the indent space length is 3 and cannot be changed.
1744
+ With JSON::PP, it sets the indent space length with the given $length.
1745
+ The default is 3. The acceptable range is 0 to 15.
1746
+
1747
+ =head2 sort_by
1748
+
1749
+ $json = $json->sort_by($function_name)
1750
+ $json = $json->sort_by($subroutine_ref)
1751
+
1752
+ If $function_name or $subroutine_ref is set, that sort routine is used.
1753
+
1754
+ $js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
1755
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
1756
+
1757
+ $js = $pc->sort_by('own_sort')->encode($obj);
1758
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
1759
+
1760
+ sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
1761
+
1762
+ As the sorting routine runs in the JSON::PP scope, the given
1763
+ subroutine name and the special variables C<$a>, C<$b> will begin
1764
+ with 'JSON::PP::'.
1765
+
1766
+ If $integer is set, then the effect is the same as C<canonical> on.
1767
+
1768
+ See to L<JSON::PP/JSON::PP OWN METHODS>.
1769
+
1770
+ =head1 MAPPING
1771
+
1772
+ This section is copied from JSON::XS and modified to C<JSON>.
1773
+ JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
1774
+
1775
+ See to L<JSON::XS/MAPPING>.
1776
+
1777
+ =head2 JSON -> PERL
1778
+
1779
+ =over 4
1780
+
1781
+ =item object
1782
+
1783
+ A JSON object becomes a reference to a hash in Perl. No ordering of object
1784
+ keys is preserved (JSON does not preserve object key ordering itself).
1785
+
1786
+ =item array
1787
+
1788
+ A JSON array becomes a reference to an array in Perl.
1789
+
1790
+ =item string
1791
+
1792
+ A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
1793
+ are represented by the same codepoints in the Perl string, so no manual
1794
+ decoding is necessary.
1795
+
1796
+ =item number
1797
+
1798
+ A JSON number becomes either an integer, numeric (floating point) or
1799
+ string scalar in perl, depending on its range and any fractional parts. On
1800
+ the Perl level, there is no difference between those as Perl handles all
1801
+ the conversion details, but an integer may take slightly less memory and
1802
+ might represent more values exactly than floating point numbers.
1803
+
1804
+ If the number consists of digits only, C<JSON> will try to represent
1805
+ it as an integer value. If that fails, it will try to represent it as
1806
+ a numeric (floating point) value if that is possible without loss of
1807
+ precision. Otherwise it will preserve the number as a string value (in
1808
+ which case you lose roundtripping ability, as the JSON number will be
1809
+ re-encoded to a JSON string).
1810
+
1811
+ Numbers containing a fractional or exponential part will always be
1812
+ represented as numeric (floating point) values, possibly at a loss of
1813
+ precision (in which case you might lose perfect roundtripping ability, but
1814
+ the JSON number will still be re-encoded as a JSON number).
1815
+
1816
+ Note that precision is not accuracy - binary floating point values cannot
1817
+ represent most decimal fractions exactly, and when converting from and to
1818
+ floating point, C<JSON> only guarantees precision up to but not including
1819
+ the least significant bit.
1820
+
1821
+ If the backend is JSON::PP and C<allow_bignum> is enabled, the big integers
1822
+ and the numeric can be optionally converted into L<Math::BigInt> and
1823
+ L<Math::BigFloat> objects.
1824
+
1825
+ =item true, false
1826
+
1827
+ These JSON atoms become C<JSON::true> and C<JSON::false>,
1828
+ respectively. They are overloaded to act almost exactly like the numbers
1829
+ C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
1830
+ the C<JSON::is_bool> function.
1831
+
1832
+ print JSON::true + 1;
1833
+ => 2
1834
+
1835
+ ok(JSON::true eq '1');
1836
+ ok(JSON::true == 1);
1837
+
1838
+ C<JSON> will install these missing overloading features to the backend modules.
1839
+
1840
+
1841
+ =item null
1842
+
1843
+ A JSON null atom becomes C<undef> in Perl.
1844
+
1845
+ C<JSON::null> returns C<undef>.
1846
+
1847
+ =back
1848
+
1849
+
1850
+ =head2 PERL -> JSON
1851
+
1852
+ The mapping from Perl to JSON is slightly more difficult, as Perl is a
1853
+ truly typeless language, so we can only guess which JSON type is meant by
1854
+ a Perl value.
1855
+
1856
+ =over 4
1857
+
1858
+ =item hash references
1859
+
1860
+ Perl hash references become JSON objects. As there is no inherent ordering
1861
+ in hash keys (or JSON objects), they will usually be encoded in a
1862
+ pseudo-random order that can change between runs of the same program but
1863
+ stays generally the same within a single run of a program. C<JSON>
1864
+ can optionally sort the hash keys (determined by the I<canonical> flag), so
1865
+ the same data structure will serialise to the same JSON text (given same
1866
+ settings and version of JSON::XS), but this incurs a runtime overhead
1867
+ and is only rarely useful, e.g. when you want to compare some JSON text
1868
+ against another for equality.
1869
+
1870
+ In future, the ordered object feature will be added to JSON::PP using C<tie> mechanism.
1871
+
1872
+
1873
+ =item array references
1874
+
1875
+ Perl array references become JSON arrays.
1876
+
1877
+ =item other references
1878
+
1879
+ Other unblessed references are generally not allowed and will cause an
1880
+ exception to be thrown, except for references to the integers C<0> and
1881
+ C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
1882
+ also use C<JSON::false> and C<JSON::true> to improve readability.
1883
+
1884
+ to_json [\0,JSON::true] # yields [false,true]
1885
+
1886
+ =item JSON::true, JSON::false, JSON::null
1887
+
1888
+ These special values become JSON true and JSON false values,
1889
+ respectively. You can also use C<\1> and C<\0> directly if you want.
1890
+
1891
+ JSON::null returns C<undef>.
1892
+
1893
+ =item blessed objects
1894
+
1895
+ Blessed objects are not directly representable in JSON. See the
1896
+ C<allow_blessed> and C<convert_blessed> methods on various options on
1897
+ how to deal with this: basically, you can choose between throwing an
1898
+ exception, encoding the reference as if it weren't blessed, or provide
1899
+ your own serialiser method.
1900
+
1901
+ With C<convert_blessed_universally> mode, C<encode> converts blessed
1902
+ hash references or blessed array references (contains other blessed references)
1903
+ into JSON members and arrays.
1904
+
1905
+ use JSON -convert_blessed_universally;
1906
+ JSON->new->allow_blessed->convert_blessed->encode( $blessed_object );
1907
+
1908
+ See to L<convert_blessed>.
1909
+
1910
+ =item simple scalars
1911
+
1912
+ Simple Perl scalars (any scalar that is not a reference) are the most
1913
+ difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
1914
+ JSON C<null> values, scalars that have last been used in a string context
1915
+ before encoding as JSON strings, and anything else as number value:
1916
+
1917
+ # dump as number
1918
+ encode_json [2] # yields [2]
1919
+ encode_json [-3.0e17] # yields [-3e+17]
1920
+ my $value = 5; encode_json [$value] # yields [5]
1921
+
1922
+ # used as string, so dump as string
1923
+ print $value;
1924
+ encode_json [$value] # yields ["5"]
1925
+
1926
+ # undef becomes null
1927
+ encode_json [undef] # yields [null]
1928
+
1929
+ You can force the type to be a string by stringifying it:
1930
+
1931
+ my $x = 3.1; # some variable containing a number
1932
+ "$x"; # stringified
1933
+ $x .= ""; # another, more awkward way to stringify
1934
+ print $x; # perl does it for you, too, quite often
1935
+
1936
+ You can force the type to be a number by numifying it:
1937
+
1938
+ my $x = "3"; # some variable containing a string
1939
+ $x += 0; # numify it, ensuring it will be dumped as a number
1940
+ $x *= 1; # same thing, the choice is yours.
1941
+
1942
+ You can not currently force the type in other, less obscure, ways.
1943
+
1944
+ Note that numerical precision has the same meaning as under Perl (so
1945
+ binary to decimal conversion follows the same rules as in Perl, which
1946
+ can differ to other languages). Also, your perl interpreter might expose
1947
+ extensions to the floating point numbers of your platform, such as
1948
+ infinities or NaN's - these cannot be represented in JSON, and it is an
1949
+ error to pass those in.
1950
+
1951
+ =item Big Number
1952
+
1953
+ If the backend is JSON::PP and C<allow_bignum> is enabled,
1954
+ C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
1955
+ objects into JSON numbers.
1956
+
1957
+
1958
+ =back
1959
+
1960
+ =head1 JSON and ECMAscript
1961
+
1962
+ See to L<JSON::XS/JSON and ECMAscript>.
1963
+
1964
+ =head1 JSON and YAML
1965
+
1966
+ JSON is not a subset of YAML.
1967
+ See to L<JSON::XS/JSON and YAML>.
1968
+
1969
+
1970
+ =head1 BACKEND MODULE DECISION
1971
+
1972
+ When you use C<JSON>, C<JSON> tries to C<use> JSON::XS. If this call fails, it will
1973
+ C<use> JSON::PP. The required JSON::XS version is I<2.2> or later.
1974
+
1975
+ The C<JSON> constructor method returns an object inherited from the backend module,
1976
+ and JSON::XS object is a blessed scalar reference while JSON::PP is a blessed hash
1977
+ reference.
1978
+
1979
+ So, your program should not depend on the backend module, especially
1980
+ returned objects should not be modified.
1981
+
1982
+ my $json = JSON->new; # XS or PP?
1983
+ $json->{stash} = 'this is xs object'; # this code may raise an error!
1984
+
1985
+ To check the backend module, there are some methods - C<backend>, C<is_pp> and C<is_xs>.
1986
+
1987
+ JSON->backend; # 'JSON::XS' or 'JSON::PP'
1988
+
1989
+ JSON->backend->is_pp; # 0 or 1
1990
+
1991
+ JSON->backend->is_xs; # 1 or 0
1992
+
1993
+ $json->is_xs; # 1 or 0
1994
+
1995
+ $json->is_pp; # 0 or 1
1996
+
1997
+
1998
+ If you set an environment variable C<PERL_JSON_BACKEND>, the calling action will be changed.
1999
+
2000
+ =over
2001
+
2002
+ =item PERL_JSON_BACKEND = 0 or PERL_JSON_BACKEND = 'JSON::PP'
2003
+
2004
+ Always use JSON::PP
2005
+
2006
+ =item PERL_JSON_BACKEND = 1 or PERL_JSON_BACKEND = 'JSON::XS,JSON::PP'
2007
+
2008
+ (The default) Use compiled JSON::XS if it is properly compiled & installed,
2009
+ otherwise use JSON::PP.
2010
+
2011
+ =item PERL_JSON_BACKEND = 2 or PERL_JSON_BACKEND = 'JSON::XS'
2012
+
2013
+ Always use compiled JSON::XS, die if it isn't properly compiled & installed.
2014
+
2015
+ =item PERL_JSON_BACKEND = 'JSON::backportPP'
2016
+
2017
+ Always use JSON::backportPP.
2018
+ JSON::backportPP is JSON::PP back port module.
2019
+ C<JSON> includes JSON::backportPP instead of JSON::PP.
2020
+
2021
+ =back
2022
+
2023
+ These ideas come from L<DBI::PurePerl> mechanism.
2024
+
2025
+ example:
2026
+
2027
+ BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::PP' }
2028
+ use JSON; # always uses JSON::PP
2029
+
2030
+ In future, it may be able to specify another module.
2031
+
2032
+ =head1 USE PP FEATURES EVEN THOUGH XS BACKEND
2033
+
2034
+ Many methods are available with either JSON::XS or JSON::PP and
2035
+ when the backend module is JSON::XS, if any JSON::PP specific (i.e. JSON::XS unsupported)
2036
+ method is called, it will C<warn> and be a no-op.
2037
+
2038
+ But if you C<use> C<JSON> passing the optional string C<-support_by_pp>,
2039
+ it makes a part of those unsupported methods available.
2040
+ This feature is achieved by using JSON::PP in C<de/encode>.
2041
+
2042
+ BEGIN { $ENV{PERL_JSON_BACKEND} = 2 } # with JSON::XS
2043
+ use JSON -support_by_pp;
2044
+ my $json = JSON->new;
2045
+ $json->allow_nonref->escape_slash->encode("/");
2046
+
2047
+ At this time, the returned object is a C<JSON::Backend::XS::Supportable>
2048
+ object (re-blessed XS object), and by checking JSON::XS unsupported flags
2049
+ in de/encoding, can support some unsupported methods - C<loose>, C<allow_bignum>,
2050
+ C<allow_barekey>, C<allow_singlequote>, C<escape_slash> and C<indent_length>.
2051
+
2052
+ When none of these unsupported methods are enabled, C<XS de/encode> will be
2053
+ used as is. The switch is achieved by changing the symbolic tables.
2054
+
2055
+ C<-support_by_pp> is effective only when the backend module is JSON::XS
2056
+ and it slows de/encoding down a bit.
2057
+
2058
+ See to L<JSON::PP SUPPORT METHODS>.
2059
+
2060
+ =head1 INCOMPATIBLE CHANGES TO OLD VERSION
2061
+
2062
+ There are big incompatibilities between the new version (2.00) and the old one (1.xx).
2063
+ If you use old C<JSON> 1.xx in your code, please check it.
2064
+
2065
+ See to L<Transition ways from 1.xx to 2.xx.>
2066
+
2067
+ =over
2068
+
2069
+ =item jsonToObj and objToJson are obsoleted.
2070
+
2071
+ Non Perl-style name C<jsonToObj> and C<objToJson> are obsoleted
2072
+ (but not yet deleted from the source).
2073
+ If you use these functions in your code, please replace them
2074
+ with C<from_json> and C<to_json>.
2075
+
2076
+
2077
+ =item Global variables are no longer available.
2078
+
2079
+ C<JSON> class variables - C<$JSON::AUTOCONVERT>, C<$JSON::BareKey>, etc...
2080
+ - are not available any longer.
2081
+ Instead, various features can be used through object methods.
2082
+
2083
+
2084
+ =item Package JSON::Converter and JSON::Parser are deleted.
2085
+
2086
+ Now C<JSON> bundles with JSON::PP which can handle JSON more properly than them.
2087
+
2088
+ =item Package JSON::NotString is deleted.
2089
+
2090
+ There was C<JSON::NotString> class which represents JSON value C<true>, C<false>, C<null>
2091
+ and numbers. It was deleted and replaced by C<JSON::Boolean>.
2092
+
2093
+ C<JSON::Boolean> represents C<true> and C<false>.
2094
+
2095
+ C<JSON::Boolean> does not represent C<null>.
2096
+
2097
+ C<JSON::null> returns C<undef>.
2098
+
2099
+ C<JSON> makes L<JSON::XS::Boolean> and L<JSON::PP::Boolean> is-a relation
2100
+ to L<JSON::Boolean>.
2101
+
2102
+ =item function JSON::Number is obsoleted.
2103
+
2104
+ C<JSON::Number> is now needless because JSON::XS and JSON::PP have
2105
+ round-trip integrity.
2106
+
2107
+ =item JSONRPC modules are deleted.
2108
+
2109
+ Perl implementations of the JSON-RPC protocol - C<JSONRPC>, C<JSONRPC::Transport::HTTP>
2110
+ and C<Apache::JSONRPC> are deleted in this distribution.
2111
+ Instead of them, there is L<JSON::RPC> which supports JSON-RPC protocol version 1.1.
2112
+
2113
+ =back
2114
+
2115
+ =head2 Transition ways from 1.xx to 2.xx.
2116
+
2117
+ You should set C<support_by_pp> mode first, because
2118
+ it is always successful for the below codes even with JSON::XS.
2119
+
2120
+ use JSON -support_by_pp;
2121
+
2122
+ =over
2123
+
2124
+ =item Exported jsonToObj (simple)
2125
+
2126
+ from_json($json_text);
2127
+
2128
+ =item Exported objToJson (simple)
2129
+
2130
+ to_json($perl_scalar);
2131
+
2132
+ =item Exported jsonToObj (advanced)
2133
+
2134
+ $flags = {allow_barekey => 1, allow_singlequote => 1};
2135
+ from_json($json_text, $flags);
2136
+
2137
+ equivalent to:
2138
+
2139
+ $JSON::BareKey = 1;
2140
+ $JSON::QuotApos = 1;
2141
+ jsonToObj($json_text);
2142
+
2143
+ =item Exported objToJson (advanced)
2144
+
2145
+ $flags = {allow_blessed => 1, allow_barekey => 1};
2146
+ to_json($perl_scalar, $flags);
2147
+
2148
+ equivalent to:
2149
+
2150
+ $JSON::BareKey = 1;
2151
+ objToJson($perl_scalar);
2152
+
2153
+ =item jsonToObj as object method
2154
+
2155
+ $json->decode($json_text);
2156
+
2157
+ =item objToJson as object method
2158
+
2159
+ $json->encode($perl_scalar);
2160
+
2161
+ =item new method with parameters
2162
+
2163
+ The C<new> method in 2.x no longer takes any parameters.
2164
+ Set parameters through object methods instead:
2165
+
2166
+ $json = JSON->new->pretty;
2167
+
2168
+ =item $JSON::Pretty, $JSON::Indent, $JSON::Delimiter
2169
+
2170
+ If C<indent> is enabled, that means the C<$JSON::Pretty> flag is set. And
2171
+ C<$JSON::Delimiter> was substituted by C<space_before> and C<space_after>.
2172
+ In conclusion:
2173
+
2174
+ $json->indent->space_before->space_after;
2175
+
2176
+ Equivalent to:
2177
+
2178
+ $json->pretty;
2179
+
2180
+ To change indent length, use C<indent_length>.
2181
+
2182
+ (Only with JSON::PP, if C<-support_by_pp> is not used.)
2183
+
2184
+ $json->pretty->indent_length(2)->encode($perl_scalar);
2185
+
2186
+ =item $JSON::BareKey
2187
+
2188
+ (Only with JSON::PP, if C<-support_by_pp> is not used.)
2189
+
2190
+ $json->allow_barekey->decode($json_text)
2191
+
2192
+ =item $JSON::ConvBlessed
2193
+
2194
+ use C<-convert_blessed_universally>. See to L<convert_blessed>.
2195
+
2196
+ =item $JSON::QuotApos
2197
+
2198
+ (Only with JSON::PP, if C<-support_by_pp> is not used.)
2199
+
2200
+ $json->allow_singlequote->decode($json_text)
2201
+
2202
+ =item $JSON::SingleQuote
2203
+
2204
+ Disabled. C<JSON> no longer produces such an invalid JSON string.
2205
+
2206
+ =item $JSON::KeySort
2207
+
2208
+ $json->canonical->encode($perl_scalar)
2209
+
2210
+ This is the ascii sort.
2211
+
2212
+ If you want to use with your own sort routine, check the C<sort_by> method.
2213
+
2214
+ (Only with JSON::PP, even if C<-support_by_pp> is used currently.)
2215
+
2216
+ $json->sort_by($sort_routine_ref)->encode($perl_scalar)
2217
+
2218
+ $json->sort_by(sub { $JSON::PP::a <=> $JSON::PP::b })->encode($perl_scalar)
2219
+
2220
+ Can't access C<$a> and C<$b> but C<$JSON::PP::a> and C<$JSON::PP::b>.
2221
+
2222
+ =item $JSON::SkipInvalid
2223
+
2224
+ $json->allow_unknown
2225
+
2226
+ =item $JSON::AUTOCONVERT
2227
+
2228
+ Needless. C<JSON> backend modules have the round-trip integrity.
2229
+
2230
+ =item $JSON::UTF8
2231
+
2232
+ Needless because C<JSON> (JSON::XS/JSON::PP) sets
2233
+ the UTF8 flag on properly.
2234
+
2235
+ # With UTF8-flagged strings
2236
+
2237
+ $json->allow_nonref;
2238
+ $str = chr(1000); # UTF8-flagged
2239
+
2240
+ $json_text = $json->utf8(0)->encode($str);
2241
+ utf8::is_utf8($json_text);
2242
+ # true
2243
+ $json_text = $json->utf8(1)->encode($str);
2244
+ utf8::is_utf8($json_text);
2245
+ # false
2246
+
2247
+ $str = '"' . chr(1000) . '"'; # UTF8-flagged
2248
+
2249
+ $perl_scalar = $json->utf8(0)->decode($str);
2250
+ utf8::is_utf8($perl_scalar);
2251
+ # true
2252
+ $perl_scalar = $json->utf8(1)->decode($str);
2253
+ # died because of 'Wide character in subroutine'
2254
+
2255
+ See to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
2256
+
2257
+ =item $JSON::UnMapping
2258
+
2259
+ Disabled. See L<MAPPING>.
2260
+
2261
+ =item $JSON::SelfConvert
2262
+
2263
+ This option was deleted.
2264
+ Instead of it, if a given blessed object has the C<TO_JSON> method,
2265
+ C<TO_JSON> will be executed with C<convert_blessed>.
2266
+
2267
+ $json->convert_blessed->encode($blessed_hashref_or_arrayref)
2268
+ # if need, call allow_blessed
2269
+
2270
+ Note that it was C<toJson> in old version, but now not C<toJson> but C<TO_JSON>.
2271
+
2272
+ =back
2273
+
2274
+ =head1 TODO
2275
+
2276
+ =over
2277
+
2278
+ =item example programs
2279
+
2280
+ =back
2281
+
2282
+ =head1 THREADS
2283
+
2284
+ Not tested with JSON::PP. With JSON::XS, see L<JSON::XS/THREADS>.
2285
+
2286
+
2287
+ =head1 BUGS
2288
+
2289
+ Please report bugs relevant to C<JSON> to E<lt>makamaka[at]cpan.orgE<gt>.
2290
+
2291
+
2292
+ =head1 SEE ALSO
2293
+
2294
+ Most of the document is copied and modified from JSON::XS doc.
2295
+
2296
+ L<JSON::XS>, L<JSON::PP>
2297
+
2298
+ C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>)
2299
+
2300
+ =head1 AUTHOR
2301
+
2302
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
2303
+
2304
+ JSON::XS was written by Marc Lehmann <schmorp[at]schmorp.de>
2305
+
2306
+ The release of this new version owes to the courtesy of Marc Lehmann.
2307
+
2308
+
2309
+ =head1 COPYRIGHT AND LICENSE
2310
+
2311
+ Copyright 2005-2013 by Makamaka Hannyaharamitu
2312
+
2313
+ This library is free software; you can redistribute it and/or modify
2314
+ it under the same terms as Perl itself.
2315
+
2316
+ =cut
2317
+
uroman/lib/JSON/backportPP.pm ADDED
@@ -0,0 +1,2806 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package # This is JSON::backportPP
2
+ JSON::PP;
3
+
4
+ # JSON-2.0
5
+
6
+ use 5.005;
7
+ use strict;
8
+ use base qw(Exporter);
9
+ use overload ();
10
+
11
+ use Carp ();
12
+ use B ();
13
+ #use Devel::Peek;
14
+
15
+ use vars qw($VERSION);
16
+ $VERSION = '2.27204';
17
+
18
+ @JSON::PP::EXPORT = qw(encode_json decode_json from_json to_json);
19
+
20
+ # Instead of hash access, I tried index access for speed,
21
+ # but this method is not as fast as I expected, so it will be changed.
22
+
23
# Slot numbers of the boolean options stored in the $self->{PROPS} array.

# Options generated from the JSON::XS-compatible list.
use constant P_ASCII             => 0;
use constant P_LATIN1            => 1;
use constant P_UTF8              => 2;
use constant P_INDENT            => 3;
use constant P_CANONICAL         => 4;
use constant P_SPACE_BEFORE      => 5;
use constant P_SPACE_AFTER       => 6;
use constant P_ALLOW_NONREF      => 7;
use constant P_SHRINK            => 8;
use constant P_ALLOW_BLESSED     => 9;
use constant P_CONVERT_BLESSED   => 10;
use constant P_RELAXED           => 11;

# Options generated from the pure-Perl-only list.
use constant P_LOOSE             => 12;
use constant P_ALLOW_BIGNUM      => 13;
use constant P_ALLOW_BAREKEY     => 14;
use constant P_ALLOW_SINGLEQUOTE => 15;
use constant P_ESCAPE_SLASH      => 16;
use constant P_AS_NONBLESSED     => 17;

use constant P_ALLOW_UNKNOWN     => 18;

# 1 when running on a perl older than 5.8, otherwise 0.
use constant OLD_PERL => $] < 5.008 ? 1 : 0;
46
+
47
BEGIN {
    # Boolean options generated below: first the JSON::XS-compatible ones,
    # then those that only exist in this pure-Perl implementation.
    my @xs_compati_bit_properties = qw(
        latin1 ascii utf8 indent canonical space_before space_after allow_nonref shrink
        allow_blessed convert_blessed relaxed allow_unknown
    );
    my @pp_bit_properties = qw(
        allow_singlequote allow_bignum loose
        allow_barekey escape_slash as_nonblessed
    );

    # Perls older than 5.8 load a compatibility helper module; a failure
    # to load it is fatal.
    if ( $] < 5.008 ) {
        my $helper = $] >= 5.006 ? 'JSON::backportPP::Compat5006'
                                 : 'JSON::backportPP::Compat5005';
        eval qq| require $helper |;
        Carp::croak $@ if ($@);
    }

    # For every option NAME compile a chainable setter NAME(self, enable = 1)
    # and a getter get_NAME(self) that returns 1 or ''.
    foreach my $name (@xs_compati_bit_properties, @pp_bit_properties) {
        my $flag_name = 'P_' . uc($name);

        eval qq/
            sub $name {
                my \$enable = defined \$_[1] ? \$_[1] : 1;
                \$_[0]->{PROPS}->[$flag_name] = \$enable ? 1 : 0;
                \$_[0];
            }

            sub get_$name {
                \$_[0]->{PROPS}->[$flag_name] ? 1 : '';
            }
        /;
    }

}
89
+
90
+
91
+
92
+ # Functions
93
+
94
# Lookup tables (option name => 1) of the option names listed for
# encoding and for decoding.
my %encode_allow_method;
$encode_allow_method{$_} = 1 foreach qw/
    utf8 pretty allow_nonref latin1 self_encode escape_slash
    allow_blessed convert_blessed indent indent_length allow_bignum
    as_nonblessed
/;

my %decode_allow_method;
$decode_allow_method{$_} = 1 foreach qw/
    utf8 allow_nonref loose allow_singlequote allow_bignum
    allow_barekey max_size relaxed
/;
102
+
103
+
104
+ my $JSON; # cache
105
+
106
# Functional interface: encode a Perl structure to UTF-8 JSON text using
# a lazily created, shared encoder object.
sub encode_json ($) { # encode
    $JSON = __PACKAGE__->new->utf8 unless $JSON;
    return $JSON->encode(@_);
}
109
+
110
+
111
# Functional interface: decode UTF-8 JSON text using the same lazily
# created, shared object as encode_json.
sub decode_json { # decode
    $JSON = __PACKAGE__->new->utf8 unless $JSON;
    return $JSON->decode(@_);
}
114
+
115
+ # Obsoleted
116
+
117
# Obsolete entry point: always dies, pointing the caller at encode_json.
sub to_json($) {
    my $message = "JSON::PP::to_json has been renamed to encode_json.";
    Carp::croak($message);
}
120
+
121
+
122
# Obsolete entry point: always dies, pointing the caller at decode_json.
sub from_json($) {
    my $message = "JSON::PP::from_json has been renamed to decode_json.";
    Carp::croak($message);
}
125
+
126
+
127
+ # Methods
128
+
129
# Constructor: returns a new object blessed into the invoking class and
# populated with the default settings below.
sub new {
    my $class = shift;

    my %defaults = (
        max_depth     => 512,
        max_size      => 0,
        indent        => 0,
        FLAGS         => 0,
        fallback      => sub { encode_error('Invalid value. JSON can only reference.') },
        indent_length => 3,
    );

    return bless { %defaults }, $class;
}
142
+
143
+
144
# Method: encode the given Perl value via PP_encode_json.
sub encode {
    my ($self, $data) = @_;
    return $self->PP_encode_json($data);
}
147
+
148
+
149
# Method: decode a complete JSON text via PP_decode_json (option flags 0).
sub decode {
    my ($self, $json_text) = @_;
    return $self->PP_decode_json($json_text, 0x00000000);
}
152
+
153
+
154
# Method: like decode, but passes the decode_prefix flag (0x00000001) to
# PP_decode_json.
sub decode_prefix {
    my ($self, $json_text) = @_;
    return $self->PP_decode_json($json_text, 0x00000001);
}
157
+
158
+
159
+ # accessor
160
+
161
+
162
+ # pretty printing
163
+
164
# Switch indent, space_before and space_after on (default) or off together.
# Returns the object so calls can be chained.
sub pretty {
    my ($self, $v) = @_;
    my $on = defined $v ? $v : 1;

    unless ($on) {
        $self->indent(0)->space_before(0)->space_after(0);
        return $self;
    }

    # indent_length(3) for JSON::XS compatibility
    $self->indent(1)->indent_length(3)->space_before(1)->space_after(1);
    return $self;
}
177
+
178
+ # etc
179
+
180
# Set the maximum nesting depth; with no (or an undefined) argument the
# limit becomes 0x80000000. Returns the object for chaining.
sub max_depth {
    my $self = $_[0];
    $self->{max_depth} = defined $_[1] ? $_[1] : 0x80000000;
    return $self;
}
185
+
186
+
187
# Return the configured maximum nesting depth.
sub get_max_depth {
    my $self = $_[0];
    return $self->{max_depth};
}
188
+
189
+
190
# Set the max_size setting; with no (or an undefined) argument it
# becomes 0. Returns the object for chaining.
sub max_size {
    my $self = $_[0];
    $self->{max_size} = defined $_[1] ? $_[1] : 0;
    return $self;
}
195
+
196
+
197
# Return the configured max_size setting.
sub get_max_size {
    my $self = $_[0];
    return $self->{max_size};
}
198
+
199
+
200
# Store (or, with an undefined argument, reset to 0) the cb_object
# callback and refresh the F_HOOK flag. Returns the object for chaining.
sub filter_json_object {
    my $self = $_[0];

    $self->{cb_object} = defined $_[1] ? $_[1] : 0;

    my $any_hook = ($self->{cb_object} or $self->{cb_sk_object});
    $self->{F_HOOK} = $any_hook ? 1 : 0;

    return $self;
}
205
+
206
# Register or remove the callback used for decoded objects that consist
# of exactly one key.
#
#   $_[1]  key name the callback is attached to
#   $_[2]  code reference to register; anything else (including undef or
#          a missing argument) removes the callback for that key
#
# When the last callback is removed the whole table is dropped, so F_HOOK
# goes back to 0 unless a cb_object callback is still installed.
# Called with no key at all, only the F_HOOK flag is recomputed.
# Returns the object for chaining.
sub filter_json_single_key_object {
    my $self = $_[0];

    if (@_ > 1) {
        if (defined $_[2] and ref $_[2] eq 'CODE') {
            $self->{cb_sk_object}->{$_[1]} = $_[2];
        }
        elsif ($self->{cb_sk_object}) {
            delete $self->{cb_sk_object}->{$_[1]};
            # an empty table must not keep the hook flag switched on
            delete $self->{cb_sk_object} unless %{ $self->{cb_sk_object} };
        }
    }

    $self->{F_HOOK} = ($self->{cb_object} or $self->{cb_sk_object}) ? 1 : 0;
    return $self;
}
213
+
214
# Set the indent_length setting. Values outside 0..15 (or an undefined
# value) only produce a warning and leave the setting untouched.
# Returns the object for chaining.
sub indent_length {
    my $self = $_[0];
    my $out_of_range = (!defined $_[1] or $_[1] > 15 or $_[1] < 0);

    if ($out_of_range) {
        Carp::carp("The acceptable range of indent_length() is 0 to 15.");
    }
    else {
        $self->{indent_length} = $_[1];
    }

    return $self;
}
223
+
224
# Return the configured indent_length setting.
sub get_indent_length {
    my $self = $_[0];
    return $self->{indent_length};
}
227
+
228
# Store the sort_by setting (1 when called without a defined argument).
# Returns the object for chaining.
sub sort_by {
    my $self = $_[0];
    $self->{sort_by} = defined $_[1] ? $_[1] : 1;
    return $self;
}
232
+
233
# Obsolete alias of allow_bignum(). Warns about the deprecation, then
# delegates to allow_bignum with the same arguments and returns its result
# (the object), so the option really gets set and chaining keeps working.
sub allow_bigint {
    my $self = shift;
    Carp::carp("allow_bigint() is obsoleted. use allow_bignum() instead.");
    return $self->allow_bignum(@_);
}
236
+
237
+ ###############################
238
+
239
+ ###
240
+ ### Perl => JSON
241
+ ###
242
+
243
+
244
+ { # Convert
245
+
246
+ my $max_depth;
247
+ my $indent;
248
+ my $ascii;
249
+ my $latin1;
250
+ my $utf8;
251
+ my $space_before;
252
+ my $space_after;
253
+ my $canonical;
254
+ my $allow_blessed;
255
+ my $convert_blessed;
256
+
257
+ my $indent_length;
258
+ my $escape_slash;
259
+ my $bignum;
260
+ my $as_nonblessed;
261
+
262
+ my $depth;
263
+ my $indent_count;
264
+ my $keysort;
265
+
266
+
267
# Encoder entry point: copies the object's settings into the file-level
# encoder variables, serialises $obj and returns the JSON string.
# Dies via encode_error when $obj is not a reference and allow_nonref
# is off.
sub PP_encode_json {
    my ($self, $obj) = (shift, shift);

    # per-call state
    $indent_count = 0;
    $depth        = 0;

    my $idx = $self->{PROPS};

    (
        $ascii, $latin1, $utf8, $indent, $canonical, $space_before, $space_after,
        $allow_blessed, $convert_blessed, $escape_slash, $bignum, $as_nonblessed
    ) = @{$idx}[
        P_ASCII .. P_SPACE_AFTER, P_ALLOW_BLESSED, P_CONVERT_BLESSED,
        P_ESCAPE_SLASH, P_ALLOW_BIGNUM, P_AS_NONBLESSED
    ];

    ($max_depth, $indent_length) = @{$self}{qw/max_depth indent_length/};

    # canonical gives plain string ordering; sort_by, when set, overrides it
    $keysort = $canonical ? sub { $a cmp $b } : undef;

    if ($self->{sort_by}) {
        if (ref($self->{sort_by}) eq 'CODE') {
            $keysort = $self->{sort_by};
        }
        elsif ($self->{sort_by} =~ /\D+/) {
            $keysort = $self->{sort_by};
        }
        else {
            $keysort = sub { $a cmp $b };
        }
    }

    if (!ref $obj and !$idx->[ P_ALLOW_NONREF ]) {
        encode_error("hash- or arrayref expected (not a simple scalar, use allow_nonref to allow this)");
    }

    my $str = $self->object_to_json($obj);

    $str .= "\n" if ($indent); # JSON::XS 2.26 compatible

    utf8::upgrade($str) unless ($ascii or $latin1 or $utf8);

    utf8::downgrade($str, 1) if ($idx->[ P_SHRINK ]);

    return $str;
}
308
+
309
+
310
# Dispatch on the kind of value: hash refs and array refs go to their
# own serialisers, blessed objects are handled according to the
# convert_blessed / allow_bignum / allow_blessed settings, and everything
# else is passed to value_to_json.
sub object_to_json {
    my ($self, $obj) = @_;
    my $type = ref($obj);

    return $self->hash_to_json($obj)  if ($type eq 'HASH');
    return $self->array_to_json($obj) if ($type eq 'ARRAY');

    # plain scalars and unblessed references of other kinds
    return $self->value_to_json($obj) unless ($type and blessed($obj));

    return $self->value_to_json($obj) if ( $obj->isa('JSON::PP::Boolean') );

    if ( $convert_blessed and $obj->can('TO_JSON') ) {
        my $result = $obj->TO_JSON();

        # returning the very same object would recurse forever
        if ( defined $result and ref( $result ) and refaddr( $obj ) eq refaddr( $result ) ) {
            encode_error( sprintf(
                "%s::TO_JSON method returned same object as was passed instead of a new one",
                ref $obj
            ) );
        }

        return $self->object_to_json( $result );
    }

    return "$obj" if ( $bignum and _is_bignum($obj) );
    return $self->blessed_to_json($obj) if ($allow_blessed and $as_nonblessed); # will be removed.

    unless ($allow_blessed) {
        encode_error( sprintf("encountered object '%s', but neither allow_blessed "
            . "nor convert_blessed settings are enabled", $obj)
        );
    }

    return 'null';
}
356
+
357
+
358
# Serialise a hash reference as a JSON object, honouring the indent,
# space_before/space_after and key-sorting settings. Dies via
# encode_error when the nesting exceeds max_depth.
sub hash_to_json {
    my ($self, $obj) = @_;

    encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)")
        if (++$depth > $max_depth);

    my ($pre, $post) = $indent ? $self->_up_indent() : ('', '');
    my $del = ($space_before ? ' ' : '') . ':' . ($space_after ? ' ' : '');

    my @res;
    for my $k ( _sort( $obj ) ) {
        if ( OLD_PERL ) { utf8::decode($k) } # key for Perl 5.6 / be optimized

        my $key_json   = string_to_json( $self, $k );
        my $value_json = $self->object_to_json( $obj->{$k} ) || $self->value_to_json( $obj->{$k} );

        push @res, $key_json . $del . $value_json;
    }

    --$depth;
    $self->_down_indent() if ($indent);

    return '{}' unless @res;
    return '{' . $pre . join( ",$pre", @res ) . $post . '}';
}
380
+
381
+
382
# Serialise an array reference as a JSON array, honouring the indent
# setting. Dies via encode_error when the nesting exceeds max_depth.
sub array_to_json {
    my ($self, $obj) = @_;

    encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)")
        if (++$depth > $max_depth);

    my ($pre, $post) = $indent ? $self->_up_indent() : ('', '');

    my @res;
    foreach my $v (@$obj) {
        push @res, $self->object_to_json($v) || $self->value_to_json($v);
    }

    --$depth;
    $self->_down_indent() if ($indent);

    return '[]' unless @res;
    return '[' . $pre . join( ",$pre", @res ) . $post . ']';
}
400
+
401
+
402
# Serialise a single value that is neither a hash nor an array:
#   undef                    -> null
#   numeric-only scalar      -> the value itself, unquoted
#   other plain scalar       -> quoted JSON string
#   JSON::PP::Boolean        -> true / false
#   blessed object           -> its stringification, encoded again
#   \1 and \0                -> true / false
#   any other reference      -> null with allow_unknown, otherwise an error
sub value_to_json {
    my ($self, $value) = @_;

    return 'null' if (!defined $value);

    # Inspect the internal flags so that numbers round-trip as numbers.
    my $flags = B::svref_2object(\$value)->FLAGS;

    return $value # as is
        if $flags & ( B::SVp_IOK | B::SVp_NOK ) and !( $flags & B::SVp_POK ); # SvTYPE is IV or NV?

    my $type = ref($value);

    return string_to_json($self, $value) if (!$type);

    if ( blessed($value) and $value->isa('JSON::PP::Boolean') ) {
        return $$value == 1 ? 'true' : 'false';
    }

    # "Class=TYPE(0x...)" means the reference is blessed
    if ((overload::StrVal($value) =~ /=(\w+)/)[0]) {
        return $self->value_to_json("$value");
    }

    my $allow_unknown = $self->{PROPS}->[ P_ALLOW_UNKNOWN ];

    if ($type eq 'SCALAR' and defined $$value) {
        return 'true'  if ($$value eq '1');
        return 'false' if ($$value eq '0');
        return 'null'  if ($allow_unknown);
        return encode_error("cannot encode reference to scalar");
    }

    return 'null' if ($allow_unknown);

    if ( $type eq 'SCALAR' or $type eq 'REF' ) {
        encode_error("cannot encode reference to scalar");
    }
    else {
        encode_error("encountered $value, but JSON can only represent references to arrays or hashes");
    }
}
453
+
454
+
455
# Replacement text for characters that need a backslash escape,
# keyed by the raw character.
my %esc = (
    "\n" => '\n',
    "\r" => '\r',
    "\t" => '\t',
    "\f" => '\f',
    "\b" => '\b',
    "\"" => '\"',
    "\\" => '\\\\',
    "\'" => '\\\'',
);
465
+
466
+
467
# Return $arg as a double-quoted JSON string: special characters are
# backslash-escaped, remaining control characters become \u00XX, and the
# result is post-processed according to the ascii / latin1 / utf8 settings.
sub string_to_json {
    my ($self, $arg) = @_;

    # characters with a short escape form
    $arg =~ s/([\x22\x5c\n\r\t\f\b])/$esc{$1}/g;

    # "/" is only escaped on request
    if ($escape_slash) {
        $arg =~ s/\//\\\//g;
    }

    # all other control characters
    $arg =~ s/([\x00-\x08\x0b\x0e-\x1f])/'\\u00' . unpack('H2', $1)/eg;

    $arg = JSON_PP_encode_ascii($arg)  if ($ascii);
    $arg = JSON_PP_encode_latin1($arg) if ($latin1);
    utf8::encode($arg)                 if ($utf8);

    return '"' . $arg . '"';
}
488
+
489
+
490
# Serialise a blessed reference by its underlying type: hash- and
# array-based objects are encoded like plain hashes/arrays, anything
# else becomes null.
sub blessed_to_json {
    my ($self, $obj) = @_;
    my $kind = reftype($obj) || '';

    return $self->hash_to_json($obj)  if ($kind eq 'HASH');
    return $self->array_to_json($obj) if ($kind eq 'ARRAY');
    return 'null';
}
502
+
503
+
504
# Abort encoding: die (via Carp::croak) with the given message.
sub encode_error {
    my ($error) = @_;
    Carp::croak("$error");
}
508
+
509
+
510
# Return the keys of the given hash reference, ordered with $keysort
# when a sorter is configured and in hash order otherwise.
sub _sort {
    return keys %{$_[0]} unless defined $keysort;
    return sort $keysort (keys %{$_[0]});
}
513
+
514
+
515
# Enter one indentation level. Returns ($pre, $post): the newline plus
# padding to put before each element at the new level, and the newline
# plus padding to put before the closing bracket at the old level.
sub _up_indent {
    my $self = shift;
    my $unit = ' ' x $indent_length;

    my $post = "\n" . $unit x $indent_count;
    $indent_count++;
    my $pre  = "\n" . $unit x $indent_count;

    return ($pre, $post);
}
529
+
530
+
531
# Leave one indentation level.
sub _down_indent {
    $indent_count--;
}
532
+
533
+
534
# Return a hash reference exposing the encoder's current depth and
# indent_count.
sub PP_encode_box {
    my %state = (
        depth        => $depth,
        indent_count => $indent_count,
    );
    return \%state;
}
540
+
541
+ } # Convert
542
+
543
+
544
# Return the string with every character above 127 replaced by a \uXXXX
# escape; characters above 0xFFFF become a pair of escapes built from
# _encode_surrogates.
sub _encode_ascii {
    my @pieces;

    for my $code (unpack('U*', $_[0])) {
        if ($code <= 127) {
            push @pieces, chr($code);
        }
        elsif ($code <= 65535) {
            push @pieces, sprintf('\u%04x', $code);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', _encode_surrogates($code));
        }
    }

    return join('', @pieces);
}
554
+
555
+
556
# Return the string with every character above 255 replaced by a \uXXXX
# escape; characters above 0xFFFF become a pair of escapes built from
# _encode_surrogates.
sub _encode_latin1 {
    my @pieces;

    for my $code (unpack('U*', $_[0])) {
        if ($code <= 255) {
            push @pieces, chr($code);
        }
        elsif ($code <= 65535) {
            push @pieces, sprintf('\u%04x', $code);
        }
        else {
            push @pieces, sprintf('\u%x\u%x', _encode_surrogates($code));
        }
    }

    return join('', @pieces);
}
566
+
567
+
568
# Split a code point above 0xFFFF into its UTF-16 surrogate pair
# (algorithm from perlunicode). Returns (high, low) as integers.
sub _encode_surrogates { # from perlunicode
    my $offset = $_[0] - 0x10000;

    # int(): "/" is floating-point division and would otherwise yield a
    # fractional high surrogate such as 55357.5
    my $high = int($offset / 0x400) + 0xD800;
    my $low  = $offset % 0x400 + 0xDC00;

    return ($high, $low);
}
572
+
573
+
574
# True when the object is a Math::BigInt or a Math::BigFloat.
sub _is_bignum {
    my $obj = $_[0];
    return 1 if $obj->isa('Math::BigInt');
    return $obj->isa('Math::BigFloat');
}
577
+
578
+
579
+
580
+ #
581
+ # JSON => Perl
582
+ #
583
+
584
+ my $max_intsize;
585
+
586
# Probe how many decimal digits an integer literal can have before perl
# prints it in exponent notation, and remember that count in $max_intsize.
BEGIN {
    my $digits = 1111;

    foreach my $length (5 .. 64) {
        $digits .= 1;
        my $evaluated = eval qq| $digits |;

        next unless ($evaluated =~ /[eE]/);

        $max_intsize = $length - 1;
        last;
    }
}
597
+
598
+ { # PARSE
599
+
600
# Character produced by each single-letter backslash escape in a JSON
# string (by Jeremy Muhlich <jmuhlich [at] bitflood.org>).
my %escapes = (
    b    => "\x8",
    t    => "\x9",
    n    => "\xA",
    f    => "\xC",
    r    => "\xD",
    '\\' => '\\',
    '"'  => '"',
    '/'  => '/',
);
610
+
611
+ my $text; # json data
612
+ my $at; # offset
613
+ my $ch; # 1chracter
614
+ my $len; # text length (changed according to UTF8 or NON UTF8)
615
+ # INTERNAL
616
+ my $depth; # nest counter
617
+ my $encoding; # json text encoding
618
+ my $is_valid_utf8; # temp variable
619
+ my $utf8_len; # utf8 byte length
620
+ # FLAGS
621
+ my $utf8; # must be utf8
622
+ my $max_depth; # max nest number of objects and arrays
623
+ my $max_size;
624
+ my $relaxed;
625
+ my $cb_object;
626
+ my $cb_sk_object;
627
+
628
+ my $F_HOOK;
629
+
630
+ my $allow_bigint; # using Math::BigInt
631
+ my $singlequote; # loosely quoting
632
+ my $loose; #
633
+ my $allow_barekey; # bareKey
634
+
635
+ # $opt flag
636
+ # 0x00000001 .... decode_prefix
637
+ # 0x10000000 .... incr_parse
638
+
639
# Decoder entry point.
#
#   $_[0]  the JSON::PP object (source of all settings)
#   $_[1]  the JSON text
#   $_[2]  option bits for this call:
#            0x00000001  decode_prefix: trailing text is allowed and
#                        ($result, $consumed_length) is returned
#            0x10000000  incr_parse: return undef instead of failing when
#                        no value could be read
#
# Returns the decoded value (or the two-element list in prefix mode).
# Dies via decode_error / Carp::croak on malformed input.
sub PP_decode_json {
    my ($self, $opt); # $opt is an effective flag during this decode_json.

    ($self, $text, $opt) = @_;

    ($at, $ch, $depth) = (0, '', 0);

    if ( !defined $text or ref $text ) {
        decode_error("malformed JSON string, neither array, object, number, string or atom");
    }

    my $idx = $self->{PROPS};

    ($utf8, $relaxed, $loose, $allow_bigint, $allow_barekey, $singlequote)
        = @{$idx}[P_UTF8, P_RELAXED, P_LOOSE .. P_ALLOW_SINGLEQUOTE];

    if ( $utf8 ) {
        utf8::downgrade( $text, 1 ) or Carp::croak("Wide character in subroutine entry");
    }
    else {
        utf8::upgrade( $text );
    }

    $len = length $text;

    ($max_depth, $max_size, $cb_object, $cb_sk_object, $F_HOOK)
        = @{$self}{qw/max_depth max_size cb_object cb_sk_object F_HOOK/};

    if ($max_size > 1) {
        use bytes;
        my $bytes = length $text;
        decode_error(
            sprintf("attempted decode of JSON text of %s bytes size, but max_size is set to %s"
                , $bytes, $max_size), 1
        ) if ($bytes > $max_size);
    }

    # Currently no effect
    # should use regexp
    my @octets = unpack('C4', $text);
    $encoding = ( $octets[0] and $octets[1]) ? 'UTF-8'
              : (!$octets[0] and $octets[1]) ? 'UTF-16BE'
              : (!$octets[0] and !$octets[1]) ? 'UTF-32BE'
              : ( $octets[2] ) ? 'UTF-16LE'
              : (!$octets[2] ) ? 'UTF-32LE'
              : 'unknown';

    white(); # remove head white space

    my $valid_start = defined $ch; # Is there a first character for JSON structure?

    my $result = value();

    return undef if ( !$result && ( $opt & 0x10000000 ) ); # for incr_parse

    decode_error("malformed JSON string, neither array, object, number, string or atom") unless $valid_start;

    if ( !$idx->[ P_ALLOW_NONREF ] and !ref $result ) {
        decode_error(
            'JSON text must be an object or array (but found number, string, true, false or null,'
            . ' use allow_nonref to allow this)', 1);
    }

    Carp::croak('something wrong.') if $len < $at; # we won't arrive here.

    my $consumed = defined $ch ? $at - 1 : $at; # consumed JSON text length

    white(); # remove tail white space

    # Test definedness, not truth: a trailing "0" is a false value but is
    # still garbage after the JSON text.
    if ( defined $ch ) {
        return ( $result, $consumed ) if ($opt & 0x00000001); # all right if decode_prefix
        decode_error("garbage after JSON object");
    }

    ( $opt & 0x00000001 ) ? ( $result, $consumed ) : $result;
}
715
+
716
+
717
# Advance to the next character of $text: stores it in $ch (undef at the
# end of the text), moves $at past it and returns it.
sub next_chr {
    if ($at >= $len) {
        $ch = undef;
        return $ch;
    }

    $ch = substr($text, $at, 1);
    $at++;
    return $ch;
}
721
+
722
+
723
# Parse one JSON value starting at the current character, choosing the
# sub-parser from the first non-blank character. Returns nothing when
# the text is exhausted.
sub value {
    white();

    return unless defined $ch;

    if    ($ch eq '{') { return object(); }
    elsif ($ch eq '[') { return array(); }
    elsif ($ch eq '"' or ($singlequote and $ch eq "'")) { return string(); }
    elsif ($ch =~ /[0-9]/ or $ch eq '-') { return number(); }

    return word();
}
732
+
733
# Parse a quoted string starting at the current quote character and
# return its decoded content. Handles backslash escapes, \uXXXX escapes
# including surrogate pairs, and (in utf8 mode) validation of multi-byte
# sequences. Dies via decode_error on malformed input.
sub string {
    my $s = '';     # result built so far; basically UTF8 flag on
    my $utf16;      # hex digits of a pending high surrogate
    my $is_utf8;    # set once $s holds bytes that need utf8::decode

    ($is_valid_utf8, $utf8_len) = ('', 0);

    if ($ch eq '"' or ($singlequote and $ch eq "'")) {
        my $quote = $ch;

        OUTER: while ( defined(next_chr()) ) {

            if ($ch eq $quote) {
                # closing quote: finish up and return
                next_chr();

                decode_error("missing low surrogate character in surrogate pair") if ($utf16);

                utf8::decode($s) if ($is_utf8);

                return $s;
            }
            elsif ($ch eq '\\') {
                next_chr();

                if (exists $escapes{$ch}) {
                    $s .= $escapes{$ch};
                }
                elsif ($ch eq 'u') { # UNICODE handling
                    my $hex4 = '';

                    for (1 .. 4) {
                        $ch = next_chr();
                        last OUTER if ($ch !~ /[0-9a-fA-F]/);
                        $hex4 .= $ch;
                    }

                    # U+D800 - U+DBFF
                    if ($hex4 =~ /^[dD][89abAB][0-9a-fA-F]{2}/) { # UTF-16 high surrogate?
                        $utf16 = $hex4;
                    }
                    # U+DC00 - U+DFFF
                    elsif ($hex4 =~ /^[dD][c-fC-F][0-9a-fA-F]{2}/) { # UTF-16 low surrogate?
                        decode_error("missing high surrogate character in surrogate pair")
                            unless (defined $utf16);

                        $is_utf8 = 1;
                        $s .= JSON_PP_decode_surrogates($utf16, $hex4) || next;
                        $utf16 = undef;
                    }
                    else {
                        decode_error("surrogate pair expected") if (defined $utf16);

                        if ( ( my $hex = hex( $hex4 ) ) > 127 ) {
                            $is_utf8 = 1;
                            $s .= JSON_PP_decode_unicode($hex4) || next;
                        }
                        else {
                            $s .= chr $hex;
                        }
                    }
                }
                else {
                    unless ($loose) {
                        $at -= 2;
                        decode_error('illegal backslash escape sequence in string');
                    }
                    $s .= $ch;
                }
            }
            else {
                if ( ord($ch) > 127 ) {
                    if ( $utf8 ) {
                        if ( $ch = is_valid_utf8($ch) ) {
                            # skip the continuation bytes just validated
                            $at += $utf8_len - 1;
                        }
                        else {
                            $at -= 1;
                            decode_error("malformed UTF-8 character in JSON string");
                        }
                    }
                    else {
                        utf8::encode( $ch );
                    }

                    $is_utf8 = 1;
                }

                if (!$loose and $ch =~ /[\x00-\x1f\x22\x5c]/) { # '/' ok
                    $at--;
                    decode_error('invalid character encountered while parsing JSON string');
                }

                $s .= $ch;
            }
        }
    }

    decode_error("unexpected end of string while parsing JSON string");
}
841
+
842
+
843
# Skip blanks and comments: characters not greater than a space,
# "//" line comments, "/* */" block comments and, in relaxed mode,
# "#" line comments. Stops at the first other character (or at the end
# of the text). Dies via decode_error on a lone "/" or an unterminated
# block comment.
sub white {
    while ( defined $ch ) {
        if ($ch le ' ') {
            next_chr();
        }
        elsif ($ch eq '/') {
            next_chr();

            if (defined $ch and $ch eq '/') {
                # line comment: skip to the end of the line
                1 while (defined(next_chr()) and $ch ne "\n" and $ch ne "\r");
            }
            elsif (defined $ch and $ch eq '*') {
                # block comment: skip to the closing "*/"
                next_chr();

                for (;;) {
                    decode_error("Unterminated comment") unless (defined $ch);

                    if ($ch ne '*') {
                        next_chr();
                        next;
                    }

                    if (defined(next_chr()) and $ch eq '/') {
                        next_chr();
                        last;
                    }
                }

                next;
            }
            else {
                $at--;
                decode_error("malformed JSON string, neither array, object, number, string or atom");
            }
        }
        else {
            last unless ($relaxed and $ch eq '#'); # correctly?

            # shell-style comment: jump past the end of the line
            pos($text) = $at;
            $text =~ /\G([^\n]*(?:\r\n|\r|\n|$))/g;
            $at = pos($text);
            next_chr();
            next;
        }
    }
}
891
+
892
+
893
# Parse a JSON array starting at "[" and return an array reference.
# An existing array reference may be passed in to be filled instead.
# Relaxed mode accepts a trailing comma. Dies via decode_error on
# malformed input or when the nesting exceeds max_depth.
sub array {
    my $a = $_[0] || []; # you can use this code to use another array ref object.

    decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)')
        if (++$depth > $max_depth);

    next_chr();
    white();

    # empty array
    if (defined $ch and $ch eq ']') {
        --$depth;
        next_chr();
        return $a;
    }

    while (defined($ch)) {
        push @$a, value();

        white();

        last if (!defined $ch);

        if ($ch eq ']') {
            --$depth;
            next_chr();
            return $a;
        }

        last if ($ch ne ',');

        next_chr();
        white();

        # trailing comma
        if ($relaxed and $ch eq ']') {
            --$depth;
            next_chr();
            return $a;
        }
    }

    decode_error(", or ] expected while parsing array");
}
941
+
942
+
943
# Parse a JSON object starting at "{" and return a hash reference, or
# the result of _json_object_hook when filter callbacks are installed.
# An existing hash reference may be passed in to be filled instead.
# Relaxed mode accepts a trailing comma; allow_barekey accepts unquoted
# keys. Dies via decode_error on malformed input or when the nesting
# exceeds max_depth.
sub object {
    my $o = $_[0] || {}; # you can use this code to use another hash ref object.
    my $k;

    decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)')
        if (++$depth > $max_depth);
    next_chr();
    white();

    # empty object
    if (defined $ch and $ch eq '}') {
        --$depth;
        next_chr();
        return $F_HOOK ? _json_object_hook($o) : $o;
    }

    while (defined $ch) {
        $k = ($allow_barekey and $ch ne '"' and $ch ne "'") ? bareKey() : string();
        white();

        if (!defined $ch or $ch ne ':') {
            $at--;
            decode_error("':' expected");
        }

        next_chr();
        $o->{$k} = value();
        white();

        last if (!defined $ch);

        if ($ch eq '}') {
            --$depth;
            next_chr();
            return $F_HOOK ? _json_object_hook($o) : $o;
        }

        last if ($ch ne ',');

        next_chr();
        white();

        # trailing comma
        if ($relaxed and $ch eq '}') {
            --$depth;
            next_chr();
            return $F_HOOK ? _json_object_hook($o) : $o;
        }
    }

    $at--;
    decode_error(", or } expected while parsing object/hash");
}
1008
+
1009
+
1010
# Read an unquoted object key: collects characters for as long as they
# are outside the excluded ranges below. Returns undef when nothing was
# collected.
sub bareKey { # doesn't strictly follow Standard ECMA-262 3rd Edition
    my $key;

    for (; $ch =~ /[^\x00-\x23\x25-\x2F\x3A-\x40\x5B-\x5E\x60\x7B-\x7F]/; next_chr()) {
        $key .= $ch;
    }

    return $key;
}
1018
+
1019
+
1020
# Parse one of the literals true, false or null at the current position.
# Returns $JSON::PP::true, $JSON::PP::false or undef; dies via
# decode_error for anything else.
sub word {
    my $word = substr($text, $at - 1, 4);

    if ($word eq 'true') {
        $at += 3;
        next_chr();
        return $JSON::PP::true;
    }

    if ($word eq 'null') {
        $at += 3;
        next_chr();
        return undef;
    }

    if ($word eq 'fals') {
        $at += 3;
        # the fifth letter decides whether this really is "false"
        if (substr($text, $at, 1) eq 'e') {
            $at++;
            next_chr();
            return $JSON::PP::false;
        }
    }

    $at--; # for decode_error report

    decode_error("'null' expected")  if ($word =~ /^n/);
    decode_error("'true' expected")  if ($word =~ /^t/);
    decode_error("'false' expected") if ($word =~ /^f/);
    decode_error("malformed JSON string, neither array, object, number, string or atom");
}
1049
+
1050
+
1051
# Parse a JSON number at the current position.
#
# Returns a plain Perl number, or
#   - a Math::BigInt for integers with more digits than $max_intsize
#     when allow_bignum is on (the digit string itself when it is off),
#   - a Math::BigFloat for every other number when allow_bignum is on.
#
# Dies via decode_error on a malformed number. A leading zero followed
# by any digit or by x/X is rejected, also after a minus sign, so "08",
# "-012" and "0x1f" are all errors.
sub number {
    my $n = '';
    my $v;

    # The sign comes first so that the leading-zero check below also
    # covers negative numbers.
    if ($ch eq '-') {
        $n = '-';
        next_chr();
        if (!defined $ch or $ch !~ /\d/) {
            decode_error("malformed number (no digits after initial minus)");
        }
    }

    # According to RFC4627, hex or oct digits are invalid.
    if ($ch eq '0') {
        my $peek = substr($text, $at, 1);
        if ($peek =~ /^[0-9xX]/) {
            decode_error("malformed number (leading zero must not be followed by another digit)");
        }
    }

    # integer part
    while (defined $ch and $ch =~ /\d/) {
        $n .= $ch;
        next_chr();
    }

    # fraction
    if (defined $ch and $ch eq '.') {
        $n .= '.';

        next_chr();
        if (!defined $ch or $ch !~ /\d/) {
            decode_error("malformed number (no digits after decimal point)");
        }
        else {
            $n .= $ch;
        }

        while (defined(next_chr()) and $ch =~ /\d/) {
            $n .= $ch;
        }
    }

    # exponent
    if (defined $ch and ($ch eq 'e' or $ch eq 'E')) {
        $n .= $ch;
        next_chr();

        if (defined($ch) and ($ch eq '+' or $ch eq '-')) {
            $n .= $ch;
            next_chr();
            if (!defined $ch or $ch =~ /\D/) {
                decode_error("malformed number (no digits after exp sign)");
            }
            $n .= $ch;
        }
        elsif (defined($ch) and $ch =~ /\d/) {
            $n .= $ch;
        }
        else {
            decode_error("malformed number (no digits after exp sign)");
        }

        while (defined(next_chr()) and $ch =~ /\d/) {
            $n .= $ch;
        }

    }

    $v .= $n;

    if ($v !~ /[.eE]/ and length $v > $max_intsize) {
        if ($allow_bigint) { # from Adam Sussman
            require Math::BigInt;
            return Math::BigInt->new($v);
        }
        else {
            return "$v";
        }
    }
    elsif ($allow_bigint) {
        require Math::BigFloat;
        return Math::BigFloat->new($v);
    }

    return 0+$v;
}
1153
+
1154
+
1155
# Check the UTF-8 sequence that starts with the lead byte given in $_[0]
# and sits at position $at - 1 of $text. Sets $utf8_len to the expected
# sequence length (0 for an invalid lead byte). Returns the complete
# byte sequence when it is well-formed, '' when it is not, and nothing
# for an invalid lead byte.
sub is_valid_utf8 {
    my $lead = $_[0];

    if    ($lead =~ /[\x00-\x7F]/) { $utf8_len = 1; }
    elsif ($lead =~ /[\xC2-\xDF]/) { $utf8_len = 2; }
    elsif ($lead =~ /[\xE0-\xEF]/) { $utf8_len = 3; }
    elsif ($lead =~ /[\xF0-\xF4]/) { $utf8_len = 4; }
    else                           { $utf8_len = 0; }

    return unless $utf8_len;

    my $sequence = substr($text, $at - 1, $utf8_len);

    my $well_formed = $sequence =~ /^(?:
         [\x00-\x7F]
        |[\xC2-\xDF][\x80-\xBF]
        |[\xE0][\xA0-\xBF][\x80-\xBF]
        |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
        |[\xED][\x80-\x9F][\x80-\xBF]
        |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
        |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
    )$/x;

    return $well_formed ? $sequence : '';
}
1180
+
1181
+
1182
# Abort decoding with Carp::croak.
#
#   $_[0]  error message
#   $_[1]  when true, the message is used as is; otherwise the current
#          character offset and a printable excerpt (about 20 characters)
#          of the remaining text are appended
sub decode_error {
    my ($error, $no_rep) = @_;

    my $rest = defined $text ? substr($text, $at) : '';

    # template for unpack: characters on perl 5.8+, bytes on old perls
    my $template;
    if    ($] >= 5.008)            { $template = 'U*'; }
    elsif ($] < 5.006)             { $template = 'C*'; }
    elsif (utf8::is_utf8( $rest )) { $template = 'U*'; } # 5.6
    else                           { $template = 'C*'; }

    my %named = (
        0x07 => '\a',
        0x09 => '\t',
        0x0a => '\n',
        0x0d => '\r',
        0x0c => '\f',
    );

    my $mess = '';
    for my $c ( unpack( $template, $rest ) ) { # emulate pv_uni_display() ?
        if    (exists $named{$c}) { $mess .= $named{$c}; }
        elsif ($c < 0x20)         { $mess .= sprintf('\x{%x}', $c); }
        elsif ($c == 0x5c)        { $mess .= '\\\\'; }
        elsif ($c < 0x80)         { $mess .= chr($c); }
        else                      { $mess .= sprintf('\x{%x}', $c); }

        if ( length $mess >= 20 ) {
            $mess .= '...';
            last;
        }
    }

    $mess = '(end of string)' unless ( length $mess );

    Carp::croak (
        $no_rep ? "$error" : "$error, at character offset $at (before \"$mess\")"
    );

}
1219
+
1220
+
1221
# Apply the filter callbacks to a freshly decoded object (hash ref).
#
# If the object has exactly one key and a single-key callback is
# registered for it, that callback receives the key's value; when it
# returns exactly one value, that value replaces the object. Otherwise
# the general cb_object callback (if any) receives the object; again
# exactly one return value replaces it. In all other cases the object
# itself is returned.
sub _json_object_hook {
    my $o  = $_[0];
    my @ks = keys %{$o};

    if ( $cb_sk_object and @ks == 1 and exists $cb_sk_object->{ $ks[0] } and ref $cb_sk_object->{ $ks[0] } ) {
        my @val = $cb_sk_object->{ $ks[0] }->( $o->{$ks[0]} );
        return $val[0] if (@val == 1);
    }

    # Declare first, assign conditionally: "my ... if ..." has undefined
    # behaviour in Perl.
    my @val;
    @val = $cb_object->($o) if ($cb_object);

    return @val == 1 ? $val[0] : $o;
}
1240
+
1241
+
1242
# Return a hash reference exposing the decoder's current internal state.
sub PP_decode_box {
    my %state = (
        text          => $text,
        at            => $at,
        ch            => $ch,
        len           => $len,
        depth         => $depth,
        encoding      => $encoding,
        is_valid_utf8 => $is_valid_utf8,
    );
    return \%state;
}
1253
+
1254
+ } # PARSE
1255
+
1256
+
1257
# Combine a UTF-16 surrogate pair, given as two hex strings (high, low),
# into one character and return it as UTF-8 encoded bytes.
sub _decode_surrogates { # from perlunicode
    my ($high, $low) = (hex($_[0]), hex($_[1]));

    my $code_point = 0x10000 + ($high - 0xD800) * 0x400 + ($low - 0xDC00);

    my $bytes = pack('U*', $code_point);
    utf8::encode( $bytes );
    return $bytes;
}
1263
+
1264
+
1265
# Turn the hex digits of a \uXXXX escape into the corresponding
# character, returned as UTF-8 encoded bytes.
sub _decode_unicode {
    my $code_point = hex shift;

    my $bytes = pack('U', $code_point);
    utf8::encode( $bytes );
    return $bytes;
}
1270
+
1271
+ #
1272
+ # Setup for various Perl versions (the code from JSON::PP58)
1273
+ #
1274
+
1275
BEGIN {

    # Use Encode's is_utf8 when perl does not provide utf8::is_utf8.
    unless ( defined &utf8::is_utf8 ) {
        require Encode;
        *utf8::is_utf8 = *Encode::is_utf8;
    }

    # On perl 5.8 and later the JSON_PP_* hooks are the subs of this file.
    if ( $] >= 5.008 ) {
        *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
        *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
        *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
        *JSON::PP::JSON_PP_decode_unicode    = \&_decode_unicode;
    }

    if ($] >= 5.008 and $] < 5.008003) { # join() in 5.8.0 - 5.8.2 is broken.
        package # hide from PAUSE
            JSON::PP;
        require subs;
        subs->import('join');
        eval q|
            sub join {
                return '' if (@_ < 2);
                my $j = shift;
                my $str = shift;
                for (@_) { $str .= $j . $_; }
                return $str;
            }
        |;
    }


    # Incremental-parsing methods: each one creates the object's
    # JSON::PP::IncrParser on first use and forwards the call to it.

    sub JSON::PP::incr_parse {
        local $Carp::CarpLevel = 1;
        my $parser = ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new );
        return $parser->incr_parse( @_ );
    }


    sub JSON::PP::incr_skip {
        my $parser = ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new );
        return $parser->incr_skip;
    }


    sub JSON::PP::incr_reset {
        my $parser = ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new );
        return $parser->incr_reset;
    }

    # incr_text is an lvalue sub and is only compiled on perl 5.6 and later.
    eval q{
        sub JSON::PP::incr_text : lvalue {
            $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;

            if ( $_[0]->{_incr_parser}->{incr_parsing} ) {
                Carp::croak("incr_text can not be called when the incremental parser already started parsing");
            }
            $_[0]->{_incr_parser}->{incr_text};
        }
    } if ( $] >= 5.006 );

} # Setup for various Perl versions (the code from JSON::PP58)
1333
+
1334
+
1335
+ ###############################
1336
+ # Utilities
1337
+ #
1338
+
1339
+ BEGIN { # provides JSON::PP::blessed / reftype / refaddr: Scalar::Util's versions when it loads, pure-Perl fallbacks otherwise
1340
+ eval 'require Scalar::Util';
1341
+ unless($@){ # Scalar::Util loaded without error: alias its functions
1342
+ *JSON::PP::blessed = \&Scalar::Util::blessed;
1343
+ *JSON::PP::reftype = \&Scalar::Util::reftype;
1344
+ *JSON::PP::refaddr = \&Scalar::Util::refaddr;
1345
+ }
1346
+ else{ # This code is from Scalar::Util.
1347
+ # warn $@;
1348
+ eval 'sub UNIVERSAL::a_sub_not_likely_to_be_here { ref($_[0]) }';
1349
+ *JSON::PP::blessed = sub { # fallback: for a reference, tries a method call that only blessed refs can dispatch; yields the class name or undef
1350
+ local($@, $SIG{__DIE__}, $SIG{__WARN__}); # keep the failed method call on unblessed refs from leaking errors or triggering handlers
1351
+ ref($_[0]) ? eval { $_[0]->a_sub_not_likely_to_be_here } : undef;
1352
+ };
1353
+ my %tmap = qw(
1354
+ B::NULL SCALAR
1355
+ B::HV HASH
1356
+ B::AV ARRAY
1357
+ B::CV CODE
1358
+ B::IO IO
1359
+ B::GV GLOB
1360
+ B::REGEXP REGEXP
1361
+ );
1362
+ *JSON::PP::reftype = sub { # fallback: maps the B object class of the referent to a type name via %tmap
1363
+ my $r = shift;
1364
+
1365
+ return undef unless length(ref($r)); # not a reference
1366
+
1367
+ my $t = ref(B::svref_2object($r)); # NOTE(review): B is not loaded anywhere in the visible code - confirm it is required elsewhere
1368
+
1369
+ return
1370
+ exists $tmap{$t} ? $tmap{$t}
1371
+ : length(ref($$r)) ? 'REF'
1372
+ : 'SCALAR';
1373
+ };
1374
+ *JSON::PP::refaddr = sub { # fallback: extracts the address from the reference's default stringification
1375
+ return undef unless length(ref($_[0])); # not a reference
1376
+
1377
+ my $addr;
1378
+ if(defined(my $pkg = blessed($_[0]))) { # blessed: rebless temporarily into a dummy class to stringify, then restore the class
1379
+ $addr .= bless $_[0], 'Scalar::Util::Fake';
1380
+ bless $_[0], $pkg;
1381
+ }
1382
+ else {
1383
+ $addr .= $_[0]
1384
+ }
1385
+
1386
+ $addr =~ /0x(\w+)/; # capture the hex address part of the stringified reference
1387
+ local $^W; # warnings off for the hex() call below
1388
+ #no warnings 'portable';
1389
+ hex($1); # returned value: the address as a number
1390
+ }
1391
+ }
1392
+ }
1393
+
1394
+
1395
+ # shamelessly copied and modified from JSON::XS code.
1396
+
1397
+ unless ( $INC{'JSON/PP.pm'} ) { # define the Boolean overloads here only when %INC has no entry for JSON/PP.pm
1398
+ eval q|
1399
+ package
1400
+ JSON::PP::Boolean;
1401
+
1402
+ use overload (
1403
+ "0+" => sub { ${$_[0]} },
1404
+ "++" => sub { $_[0] = ${$_[0]} + 1 },
1405
+ "--" => sub { $_[0] = ${$_[0]} - 1 },
1406
+ fallback => 1,
1407
+ );
1408
+ |;
1409
+ }
1410
+
1411
+ $JSON::PP::true = do { bless \(my $dummy = 1), "JSON::PP::Boolean" }; # shared true object: blessed reference to the scalar 1
1412
+ $JSON::PP::false = do { bless \(my $dummy = 0), "JSON::PP::Boolean" }; # shared false object: blessed reference to the scalar 0
1413
+
1414
+ sub is_bool { defined $_[0] and UNIVERSAL::isa($_[0], "JSON::PP::Boolean"); } # true when the argument is defined and isa JSON::PP::Boolean
1415
+
1416
+ sub true { $JSON::PP::true } # returns the shared true object
1417
+ sub false { $JSON::PP::false } # returns the shared false object
1418
+ sub null { undef; } # JSON null is represented as undef
1419
+
1420
+ ###############################
1421
+
1422
+ ###############################
1423
+
1424
+ package # hide from PAUSE
1425
+ JSON::PP::IncrParser;
1426
+
1427
+ use strict;
1428
+
1429
+ use constant INCR_M_WS => 0; # initial whitespace skipping
1430
+ use constant INCR_M_STR => 1; # inside string
1431
+ use constant INCR_M_BS => 2; # inside backslash (not referenced by the parser code visible in this section)
1432
+ use constant INCR_M_JSON => 3; # outside anything, count nesting
1433
+ use constant INCR_M_C0 => 4; # not referenced by the parser code visible in this section
1434
+ use constant INCR_M_C1 => 5; # not referenced by the parser code visible in this section
1435
+
1436
+ use vars qw($VERSION);
1437
+ $VERSION = '1.01';
1438
+
1439
+ my $unpack_format = $] < 5.006 ? 'C*' : 'U*'; # unpack template used by _incr_parse to get a character's ordinal: 'C*' before Perl 5.6, 'U*' otherwise
1440
+
1441
+ sub new { # constructor: returns a parser object with no buffered text and zeroed nesting, position and parsing flag
1442
+ my ( $class ) = @_;
1443
+
1444
+ bless {
1445
+ incr_nest => 0,
1446
+ incr_text => undef,
1447
+ incr_parsing => 0,
1448
+ incr_p => 0,
1449
+ }, $class;
1450
+ }
1451
+
1452
+
1453
+ sub incr_parse { # appends $text (if defined) to the buffer; in void context does nothing more, in scalar/list context tries to extract decoded objects
1454
+ my ( $self, $coder, $text ) = @_;
1455
+
1456
+ $self->{incr_text} = '' unless ( defined $self->{incr_text} );
1457
+
1458
+ if ( defined $text ) {
1459
+ if ( utf8::is_utf8( $text ) and !utf8::is_utf8( $self->{incr_text} ) ) { # new chunk is character data but the buffer is not: convert the buffer first
1460
+ utf8::upgrade( $self->{incr_text} ) ;
1461
+ utf8::decode( $self->{incr_text} ) ;
1462
+ }
1463
+ $self->{incr_text} .= $text;
1464
+ }
1465
+
1466
+
1467
+ my $max_size = $coder->get_max_size; # NOTE(review): $max_size is not used anywhere else in this sub
1468
+
1469
+ if ( defined wantarray ) { # only parse when the caller wants a result
1470
+
1471
+ $self->{incr_mode} = INCR_M_WS unless defined $self->{incr_mode};
1472
+
1473
+ if ( wantarray ) { # list context: collect results into @ret
1474
+ my @ret;
1475
+
1476
+ $self->{incr_parsing} = 1;
1477
+
1478
+ do {
1479
+ push @ret, $self->_incr_parse( $coder, $self->{incr_text} );
1480
+
1481
+ unless ( !$self->{incr_nest} and $self->{incr_mode} == INCR_M_JSON ) {
1482
+ $self->{incr_mode} = INCR_M_WS if $self->{incr_mode} != INCR_M_STR;
1483
+ }
1484
+
1485
+ } until ( length $self->{incr_text} >= $self->{incr_p} ); # NOTE(review): _incr_parse never leaves incr_p beyond the buffer length, so this looks true after the first pass - confirm intended termination
1486
+
1487
+ $self->{incr_parsing} = 0;
1488
+
1489
+ return @ret;
1490
+ }
1491
+ else { # in scalar context
1492
+ $self->{incr_parsing} = 1;
1493
+ my $obj = $self->_incr_parse( $coder, $self->{incr_text} );
1494
+ $self->{incr_parsing} = 0 if defined $obj; # pointed by Martin J. Evans
1495
+ return $obj ? $obj : undef; # $obj is an empty string, parsing was completed.
1496
+ }
1497
+
1498
+ }
1499
+
1500
+ }
1501
+
1502
+
1503
+ sub _incr_parse { # scans $text from the saved position for the end of one JSON value, then decodes that prefix of the buffer; returns nothing while the text looks incomplete
1504
+ my ( $self, $coder, $text, $skip ) = @_; # NOTE(review): $skip is never used in this sub
1505
+ my $p = $self->{incr_p}; # scan position, resumed from the previous call
1506
+ my $restore = $p; # position to restore before decoding (see below)
1507
+
1508
+ my @obj; # NOTE(review): unused
1509
+ my $len = length $text;
1510
+
1511
+ if ( $self->{incr_mode} == INCR_M_WS ) { # skip leading characters with ordinal <= 0x20
1512
+ while ( $len > $p ) {
1513
+ my $s = substr( $text, $p, 1 );
1514
+ $p++ and next if ( 0x20 >= unpack($unpack_format, $s) ); # when $p is 0 the post-increment yields 0 (false), so 'next' is skipped and the lines below run
1515
+ $self->{incr_mode} = INCR_M_JSON;
1516
+ last;
1517
+ }
1518
+ }
1519
+
1520
+ while ( $len > $p ) { # main scan: track string state and bracket nesting one character at a time
1521
+ my $s = substr( $text, $p++, 1 );
1522
+
1523
+ if ( $s eq '"' ) {
1524
+ if (substr( $text, $p - 2, 1 ) eq '\\' ) { # quote preceded by a backslash is treated as escaped; NOTE(review): only one character of look-behind, so a quote after an escaped backslash also matches - confirm
1525
+ next;
1526
+ }
1527
+
1528
+ if ( $self->{incr_mode} != INCR_M_STR ) { # opening quote
1529
+ $self->{incr_mode} = INCR_M_STR;
1530
+ }
1531
+ else { # closing quote
1532
+ $self->{incr_mode} = INCR_M_JSON;
1533
+ unless ( $self->{incr_nest} ) { # a string at nesting level 0 is a complete value: stop scanning
1534
+ last;
1535
+ }
1536
+ }
1537
+ }
1538
+
1539
+ if ( $self->{incr_mode} == INCR_M_JSON ) {
1540
+
1541
+ if ( $s eq '[' or $s eq '{' ) {
1542
+ if ( ++$self->{incr_nest} > $coder->get_max_depth ) {
1543
+ Carp::croak('json text or perl structure exceeds maximum nesting level (max_depth set too low?)');
1544
+ }
1545
+ }
1546
+ elsif ( $s eq ']' or $s eq '}' ) {
1547
+ last if ( --$self->{incr_nest} <= 0 ); # outermost bracket closed: stop scanning
1548
+ }
1549
+ elsif ( $s eq '#' ) {
1550
+ while ( $len > $p ) { # skip the rest of the comment up to and including "\n"
1551
+ last if substr( $text, $p++, 1 ) eq "\n";
1552
+ }
1553
+ }
1554
+
1555
+ }
1556
+
1557
+ }
1558
+
1559
+ $self->{incr_p} = $p; # remember how far the scan got
1560
+
1561
+ return if ( $self->{incr_mode} == INCR_M_STR and not $self->{incr_nest} ); # unterminated top-level string: wait for more text; NOTE(review): a string left open inside an array/object is not covered by either guard - confirm
1562
+ return if ( $self->{incr_mode} == INCR_M_JSON and $self->{incr_nest} > 0 ); # unbalanced brackets: wait for more text
1563
+
1564
+ return '' unless ( length substr( $self->{incr_text}, 0, $p ) ); # nothing scanned
1565
+
1566
+ local $Carp::CarpLevel = 2;
1567
+
1568
+ $self->{incr_p} = $restore; # set before decoding, so these are the values left behind if PP_decode_json croaks
1569
+ $self->{incr_c} = $p; # end offset of the scanned text; read by incr_skip
1570
+
1571
+ my ( $obj, $tail ) = $coder->PP_decode_json( substr( $self->{incr_text}, 0, $p ), 0x10000001 ); # second argument's meaning is defined by PP_decode_json (not visible here); $tail is unused
1572
+
1573
+ $self->{incr_text} = substr( $self->{incr_text}, $p ); # drop the decoded prefix from the buffer
1574
+ $self->{incr_p} = 0;
1575
+
1576
+ return $obj || ''; # a false decoded value is returned as the empty string
1577
+ }
1578
+
1579
+
1580
+ sub incr_text { # returns the buffered, not yet parsed text; croaks while the incr_parsing flag is set
1581
+ if ( $_[0]->{incr_parsing} ) {
1582
+ Carp::croak("incr_text can not be called when the incremental parser already started parsing");
1583
+ }
1584
+ $_[0]->{incr_text};
1585
+ }
1586
+
1587
+
1588
+ sub incr_skip { # discards the text scanned by the last parse attempt and resets the parser state, so parsing can resume after incr_parse died
1589
+ my $self = shift;
1590
+ $self->{incr_text} = '' unless defined $self->{incr_text}; # nothing buffered yet: avoid substr on undef
+ $self->{incr_text} = substr( $self->{incr_text}, $self->{incr_c} || 0 ); # incr_c is the end offset _incr_parse records before decoding; undef means nothing was scanned
1591
+ $self->{incr_p} = 0;
+ $self->{incr_mode} = 0; # back to initial whitespace skipping (INCR_M_WS), same value incr_reset uses
+ $self->{incr_nest} = 0; # forget bracket nesting counted in the skipped text
+ $self->{incr_parsing} = 0; # a parse that died leaves this flag set; clear it so incr_text works again
1592
+ }
1593
+
1594
+
1595
+ sub incr_reset { # drops the buffered text and returns position, mode, nesting and parsing flag to their initial values
1596
+ my $self = shift;
1597
+ $self->{incr_text} = undef;
1598
+ $self->{incr_p} = 0;
1599
+ $self->{incr_mode} = 0; # 0 is INCR_M_WS
1600
+ $self->{incr_nest} = 0;
1601
+ $self->{incr_parsing} = 0;
1602
+ }
1603
+
1604
+ ###############################
1605
+
1606
+
1607
+ 1;
1608
+ __END__
1609
+ =pod
1610
+
1611
+ =head1 NAME
1612
+
1613
+ JSON::PP - JSON::XS compatible pure-Perl module.
1614
+
1615
+ =head1 SYNOPSIS
1616
+
1617
+ use JSON::PP;
1618
+
1619
+ # exported functions, they croak on error
1620
+ # and expect/generate UTF-8
1621
+
1622
+ $utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
1623
+ $perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
1624
+
1625
+ # OO-interface
1626
+
1627
+ $json = JSON::PP->new->ascii->pretty->allow_nonref;
1628
+
1629
+ $json_text = $json->encode( $perl_scalar );
1630
+ $perl_scalar = $json->decode( $json_text );
1631
+
1632
+ $pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
1633
+
1634
+ # Note that JSON version 2.0 and above will automatically use
1635
+ # JSON::XS or JSON::PP, so you should be able to just:
1636
+
1637
+ use JSON;
1638
+
1639
+
1640
+ =head1 VERSION
1641
+
1642
+ 2.27200
1643
+
1644
+ L<JSON::XS> 2.27 (~2.30) compatible.
1645
+
1646
+ =head1 DESCRIPTION
1647
+
1648
+ This module is a L<JSON::XS>-compatible pure-Perl module.
1649
+ (Perl 5.8 or later is recommended)
1650
+
1651
+ JSON::XS is the fastest and most proper JSON module on CPAN.
1652
+ It is written by Marc Lehmann in C, so must be compiled and
1653
+ installed in the used environment.
1654
+
1655
+ JSON::PP is a pure-Perl module and has compatibility to JSON::XS.
1656
+
1657
+
1658
+ =head2 FEATURES
1659
+
1660
+ =over
1661
+
1662
+ =item * correct unicode handling
1663
+
1664
+ This module knows how to handle Unicode (depending on Perl version).
1665
+
1666
+ See L<JSON::XS/A FEW NOTES ON UNICODE AND PERL> and
1667
+ L<UNICODE HANDLING ON PERLS>.
1668
+
1669
+
1670
+ =item * round-trip integrity
1671
+
1672
+ When you serialise a perl data structure using only data types
1673
+ supported by JSON and Perl, the deserialised data structure is
1674
+ identical on the Perl level. (e.g. the string "2.0" doesn't suddenly
1675
+ become "2" just because it looks like a number). There I<are> minor
1676
+ exceptions to this, read the MAPPING section below to learn about
1677
+ those.
1678
+
1679
+
1680
+ =item * strict checking of JSON correctness
1681
+
1682
+ There is no guessing, no generating of illegal JSON texts by default,
1683
+ and only JSON is accepted as input by default (the latter is a
1684
+ security feature). But when some options are set, loose checking
1685
+ features are available.
1686
+
1687
+ =back
1688
+
1689
+ =head1 FUNCTIONAL INTERFACE
1690
+
1691
+ Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
1692
+
1693
+ =head2 encode_json
1694
+
1695
+ $json_text = encode_json $perl_scalar
1696
+
1697
+ Converts the given Perl data structure to a UTF-8 encoded, binary string.
1698
+
1699
+ This function call is functionally identical to:
1700
+
1701
+ $json_text = JSON::PP->new->utf8->encode($perl_scalar)
1702
+
1703
+ =head2 decode_json
1704
+
1705
+ $perl_scalar = decode_json $json_text
1706
+
1707
+ The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
1708
+ to parse that as an UTF-8 encoded JSON text, returning the resulting
1709
+ reference.
1710
+
1711
+ This function call is functionally identical to:
1712
+
1713
+ $perl_scalar = JSON::PP->new->utf8->decode($json_text)
1714
+
1715
+ =head2 JSON::PP::is_bool
1716
+
1717
+ $is_boolean = JSON::PP::is_bool($scalar)
1718
+
1719
+ Returns true if the passed scalar represents either JSON::PP::true or
1720
+ JSON::PP::false, two constants that act like C<1> and C<0> respectively
1721
+ and are also used to represent JSON C<true> and C<false> in Perl strings.
1722
+
1723
+ =head2 JSON::PP::true
1724
+
1725
+ Returns the JSON true value, which is a blessed object.
1726
+ It C<isa> JSON::PP::Boolean object.
1727
+
1728
+ =head2 JSON::PP::false
1729
+
1730
+ Returns the JSON false value, which is a blessed object.
1731
+ It C<isa> JSON::PP::Boolean object.
1732
+
1733
+ =head2 JSON::PP::null
1734
+
1735
+ Returns C<undef>.
1736
+
1737
+ See L<MAPPING>, below, for more information on how JSON values are mapped to
1738
+ Perl.
1739
+
1740
+
1741
+ =head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
1742
+
1743
+ This section supposes that your perl version is 5.8 or later.
1744
+
1745
+ If you know a JSON text from an outer world - a network, a file content, and so on,
1746
+ is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
1747
+ with C<utf8> enable. And the decoded result will contain UNICODE characters.
1748
+
1749
+ # from network
1750
+ my $json = JSON::PP->new->utf8;
1751
+ my $json_text = CGI->new->param( 'json_data' );
1752
+ my $perl_scalar = $json->decode( $json_text );
1753
+
1754
+ # from file content
1755
+ local $/;
1756
+ open( my $fh, '<', 'json.data' );
1757
+ $json_text = <$fh>;
1758
+ $perl_scalar = decode_json( $json_text );
1759
+
1760
+ If an outer data is not encoded in UTF-8, firstly you should C<decode> it.
1761
+
1762
+ use Encode;
1763
+ local $/;
1764
+ open( my $fh, '<', 'json.data' );
1765
+ my $encoding = 'cp932';
1766
+ my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
1767
+
1768
+ # or you can write the below code.
1769
+ #
1770
+ # open( my $fh, "<:encoding($encoding)", 'json.data' );
1771
+ # $unicode_json_text = <$fh>;
1772
+
1773
+ In this case, C<$unicode_json_text> is of course UNICODE string.
1774
+ So you B<cannot> use C<decode_json> nor C<JSON> module object with C<utf8> enable.
1775
+ Instead of them, you use C<JSON> module object with C<utf8> disable.
1776
+
1777
+ $perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
1778
+
1779
+ Or C<encode 'utf8'> and C<decode_json>:
1780
+
1781
+ $perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
1782
+ # this way is not efficient.
1783
+
1784
+ And now, you want to convert your C<$perl_scalar> into JSON data and
1785
+ send it to an outer world - a network or a file content, and so on.
1786
+
1787
+ Your data usually contains UNICODE strings and you want the converted data to be encoded
1788
+ in UTF-8, you should use C<encode_json> or C<JSON> module object with C<utf8> enable.
1789
+
1790
+ print encode_json( $perl_scalar ); # to a network? file? or display?
1791
+ # or
1792
+ print $json->utf8->encode( $perl_scalar );
1793
+
1794
+ If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
1795
+ for some reason, then its characters are regarded as B<latin1> for perl
1796
+ (because it does not concern with your $encoding).
1797
+ You B<cannot> use C<encode_json> nor C<JSON> module object with C<utf8> enable.
1798
+ Instead of them, you use C<JSON> module object with C<utf8> disable.
1799
+ Note that the resulting text is a UNICODE string, but there is no problem printing it.
1800
+
1801
+ # $perl_scalar contains $encoding encoded string values
1802
+ $unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
1803
+ # $unicode_json_text consists of characters less than 0x100
1804
+ print $unicode_json_text;
1805
+
1806
+ Or C<decode $encoding> all string values and C<encode_json>:
1807
+
1808
+ $perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
1809
+ # ... do it to each string values, then encode_json
1810
+ $json_text = encode_json( $perl_scalar );
1811
+
1812
+ This method is a proper way but probably not efficient.
1813
+
1814
+ See to L<Encode>, L<perluniintro>.
1815
+
1816
+
1817
+ =head1 METHODS
1818
+
1819
+ Basically, see L<JSON> or L<JSON::XS>.
1820
+
1821
+ =head2 new
1822
+
1823
+ $json = JSON::PP->new
1824
+
1825
+ Returns a new JSON::PP object that can be used to de/encode JSON
1826
+ strings.
1827
+
1828
+ All boolean flags described below are by default I<disabled>.
1829
+
1830
+ The mutators for flags all return the JSON object again and thus calls can
1831
+ be chained:
1832
+
1833
+ my $json = JSON::PP->new->utf8->space_after->encode({a => [1,2]})
1834
+ => {"a": [1, 2]}
1835
+
1836
+ =head2 ascii
1837
+
1838
+ $json = $json->ascii([$enable])
1839
+
1840
+ $enabled = $json->get_ascii
1841
+
1842
+ If $enable is true (or missing), then the encode method will not generate characters outside
1843
+ the code range 0..127. Any Unicode characters outside that range will be escaped using either
1844
+ a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
1845
+ (See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>).
1846
+
1847
+ In Perl 5.005, there is no character having high value (more than 255).
1848
+ See to L<UNICODE HANDLING ON PERLS>.
1849
+
1850
+ If $enable is false, then the encode method will not escape Unicode characters unless
1851
+ required by the JSON syntax or other flags. This results in a faster and more compact format.
1852
+
1853
+ JSON::PP->new->ascii(1)->encode([chr 0x10401])
1854
+ => ["\ud801\udc01"]
1855
+
1856
+ =head2 latin1
1857
+
1858
+ $json = $json->latin1([$enable])
1859
+
1860
+ $enabled = $json->get_latin1
1861
+
1862
+ If $enable is true (or missing), then the encode method will encode the resulting JSON
1863
+ text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
1864
+
1865
+ If $enable is false, then the encode method will not escape Unicode characters
1866
+ unless required by the JSON syntax or other flags.
1867
+
1868
+ JSON::XS->new->latin1->encode (["\x{89}\x{abc}"])
1869
+ => ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
1870
+
1871
+ See to L<UNICODE HANDLING ON PERLS>.
1872
+
1873
+ =head2 utf8
1874
+
1875
+ $json = $json->utf8([$enable])
1876
+
1877
+ $enabled = $json->get_utf8
1878
+
1879
+ If $enable is true (or missing), then the encode method will encode the JSON result
1880
+ into UTF-8, as required by many protocols, while the decode method expects to be handed
1881
+ an UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
1882
+ characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
1883
+
1884
+ (In Perl 5.005, any character outside the range 0..255 does not exist.
1885
+ See to L<UNICODE HANDLING ON PERLS>.)
1886
+
1887
+ In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
1888
+ encoding families, as described in RFC4627.
1889
+
1890
+ If $enable is false, then the encode method will return the JSON string as a (non-encoded)
1891
+ Unicode string, while decode expects thus a Unicode string. Any decoding or encoding
1892
+ (e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
1893
+
1894
+ Example, output UTF-16BE-encoded JSON:
1895
+
1896
+ use Encode;
1897
+ $jsontext = encode "UTF-16BE", JSON::PP->new->encode ($object);
1898
+
1899
+ Example, decode UTF-32LE-encoded JSON:
1900
+
1901
+ use Encode;
1902
+ $object = JSON::PP->new->decode (decode "UTF-32LE", $jsontext);
1903
+
1904
+
1905
+ =head2 pretty
1906
+
1907
+ $json = $json->pretty([$enable])
1908
+
1909
+ This enables (or disables) all of the C<indent>, C<space_before> and
1910
+ C<space_after> flags in one call to generate the most readable
1911
+ (or most compact) form possible.
1912
+
1913
+ Equivalent to:
1914
+
1915
+ $json->indent->space_before->space_after
1916
+
1917
+ =head2 indent
1918
+
1919
+ $json = $json->indent([$enable])
1920
+
1921
+ $enabled = $json->get_indent
1922
+
1923
+ The default indent space length is three.
1924
+ You can use C<indent_length> to change the length.
1925
+
1926
+ =head2 space_before
1927
+
1928
+ $json = $json->space_before([$enable])
1929
+
1930
+ $enabled = $json->get_space_before
1931
+
1932
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1933
+ optional space before the C<:> separating keys from values in JSON objects.
1934
+
1935
+ If C<$enable> is false, then the C<encode> method will not add any extra
1936
+ space at those places.
1937
+
1938
+ This setting has no effect when decoding JSON texts.
1939
+
1940
+ Example, space_before enabled, space_after and indent disabled:
1941
+
1942
+ {"key" :"value"}
1943
+
1944
+ =head2 space_after
1945
+
1946
+ $json = $json->space_after([$enable])
1947
+
1948
+ $enabled = $json->get_space_after
1949
+
1950
+ If C<$enable> is true (or missing), then the C<encode> method will add an extra
1951
+ optional space after the C<:> separating keys from values in JSON objects
1952
+ and extra whitespace after the C<,> separating key-value pairs and array
1953
+ members.
1954
+
1955
+ If C<$enable> is false, then the C<encode> method will not add any extra
1956
+ space at those places.
1957
+
1958
+ This setting has no effect when decoding JSON texts.
1959
+
1960
+ Example, space_before and indent disabled, space_after enabled:
1961
+
1962
+ {"key": "value"}
1963
+
1964
+ =head2 relaxed
1965
+
1966
+ $json = $json->relaxed([$enable])
1967
+
1968
+ $enabled = $json->get_relaxed
1969
+
1970
+ If C<$enable> is true (or missing), then C<decode> will accept some
1971
+ extensions to normal JSON syntax (see below). C<encode> will not be
1972
+ affected in any way. I<Be aware that this option makes you accept invalid
1973
+ JSON texts as if they were valid!>. I suggest only to use this option to
1974
+ parse application-specific files written by humans (configuration files,
1975
+ resource files etc.)
1976
+
1977
+ If C<$enable> is false (the default), then C<decode> will only accept
1978
+ valid JSON texts.
1979
+
1980
+ Currently accepted extensions are:
1981
+
1982
+ =over 4
1983
+
1984
+ =item * list items can have an end-comma
1985
+
1986
+ JSON I<separates> array elements and key-value pairs with commas. This
1987
+ can be annoying if you write JSON texts manually and want to be able to
1988
+ quickly append elements, so this extension accepts comma at the end of
1989
+ such items not just between them:
1990
+
1991
+ [
1992
+ 1,
1993
+ 2, <- this comma not normally allowed
1994
+ ]
1995
+ {
1996
+ "k1": "v1",
1997
+ "k2": "v2", <- this comma not normally allowed
1998
+ }
1999
+
2000
+ =item * shell-style '#'-comments
2001
+
2002
+ Whenever JSON allows whitespace, shell-style comments are additionally
2003
+ allowed. They are terminated by the first carriage-return or line-feed
2004
+ character, after which more white-space and comments are allowed.
2005
+
2006
+ [
2007
+ 1, # this comment not allowed in JSON
2008
+ # neither this one...
2009
+ ]
2010
+
2011
+ =back
2012
+
2013
+ =head2 canonical
2014
+
2015
+ $json = $json->canonical([$enable])
2016
+
2017
+ $enabled = $json->get_canonical
2018
+
2019
+ If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
2020
+ by sorting their keys. This is adding a comparatively high overhead.
2021
+
2022
+ If C<$enable> is false, then the C<encode> method will output key-value
2023
+ pairs in the order Perl stores them (which will likely change between runs
2024
+ of the same script).
2025
+
2026
+ This option is useful if you want the same data structure to be encoded as
2027
+ the same JSON text (given the same overall settings). If it is disabled,
2028
+ the same hash might be encoded differently even if it contains the same data,
2029
+ as key-value pairs have no inherent ordering in Perl.
2030
+
2031
+ This setting has no effect when decoding JSON texts.
2032
+
2033
+ If you want your own sorting routine, you can give a code reference
2034
+ or a subroutine name to C<sort_by>. See to C<JSON::PP OWN METHODS>.
2035
+
2036
+ =head2 allow_nonref
2037
+
2038
+ $json = $json->allow_nonref([$enable])
2039
+
2040
+ $enabled = $json->get_allow_nonref
2041
+
2042
+ If C<$enable> is true (or missing), then the C<encode> method can convert a
2043
+ non-reference into its corresponding string, number or null JSON value,
2044
+ which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
2045
+ values instead of croaking.
2046
+
2047
+ If C<$enable> is false, then the C<encode> method will croak if it isn't
2048
+ passed an arrayref or hashref, as JSON texts must either be an object
2049
+ or array. Likewise, C<decode> will croak if given something that is not a
2050
+ JSON object or array.
2051
+
2052
+ JSON::PP->new->allow_nonref->encode ("Hello, World!")
2053
+ => "Hello, World!"
2054
+
2055
+ =head2 allow_unknown
2056
+
2057
+ $json = $json->allow_unknown ([$enable])
2058
+
2059
+ $enabled = $json->get_allow_unknown
2060
+
2061
+ If $enable is true (or missing), then "encode" will *not* throw an
2062
+ exception when it encounters values it cannot represent in JSON (for
2063
+ example, filehandles) but instead will encode a JSON "null" value.
2064
+ Note that blessed objects are not included here and are handled
2065
+ separately by C<allow_blessed>.
2066
+
2067
+ If $enable is false (the default), then "encode" will throw an
2068
+ exception when it encounters anything it cannot encode as JSON.
2069
+
2070
+ This option does not affect "decode" in any way, and it is
2071
+ recommended to leave it off unless you know your communications
2072
+ partner.
2073
+
2074
+ =head2 allow_blessed
2075
+
2076
+ $json = $json->allow_blessed([$enable])
2077
+
2078
+ $enabled = $json->get_allow_blessed
2079
+
2080
+ If C<$enable> is true (or missing), then the C<encode> method will not
2081
+ barf when it encounters a blessed reference. Instead, the value of the
2082
+ B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
2083
+ disabled or no C<TO_JSON> method found) or a representation of the
2084
+ object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
2085
+ encoded. Has no effect on C<decode>.
2086
+
2087
+ If C<$enable> is false (the default), then C<encode> will throw an
2088
+ exception when it encounters a blessed object.
2089
+
2090
+ =head2 convert_blessed
2091
+
2092
+ $json = $json->convert_blessed([$enable])
2093
+
2094
+ $enabled = $json->get_convert_blessed
2095
+
2096
+ If C<$enable> is true (or missing), then C<encode>, upon encountering a
2097
+ blessed object, will check for the availability of the C<TO_JSON> method
2098
+ on the object's class. If found, it will be called in scalar context
2099
+ and the resulting scalar will be encoded instead of the object. If no
2100
+ C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
2101
+ to do.
2102
+
2103
+ The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
2104
+ returns other blessed objects, those will be handled in the same
2105
+ way. C<TO_JSON> must take care of not causing an endless recursion cycle
2106
+ (== crash) in this case. The name of C<TO_JSON> was chosen because other
2107
+ methods called by the Perl core (== not by the user of the object) are
2108
+ usually in upper case letters and to avoid collisions with the C<to_json>
2109
+ function or method.
2110
+
2111
+ This setting does not yet influence C<decode> in any way.
2112
+
2113
+ If C<$enable> is false, then the C<allow_blessed> setting will decide what
2114
+ to do when a blessed object is found.
2115
+
2116
+ =head2 filter_json_object
2117
+
2118
+ $json = $json->filter_json_object([$coderef])
2119
+
2120
+ When C<$coderef> is specified, it will be called from C<decode> each
2121
+ time it decodes a JSON object. The only argument passed to the coderef
2122
+ is a reference to the newly-created hash. If the code reference returns
2123
+ a single scalar (which need not be a reference), this value
2124
+ (i.e. a copy of that scalar to avoid aliasing) is inserted into the
2125
+ deserialised data structure. If it returns an empty list
2126
+ (NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
2127
+ hash will be inserted. This setting can slow down decoding considerably.
2128
+
2129
+ When C<$coderef> is omitted or undefined, any existing callback will
2130
+ be removed and C<decode> will not change the deserialised hash in any
2131
+ way.
2132
+
2133
+ Example, convert all JSON objects into the integer 5:
2134
+
2135
+ my $js = JSON::PP->new->filter_json_object (sub { 5 });
2136
+ # returns [5]
2137
+ $js->decode ('[{}]'); # the given subroutine takes a hash reference.
2138
+ # throw an exception because allow_nonref is not enabled
2139
+ # so a lone 5 is not allowed.
2140
+ $js->decode ('{"a":1, "b":2}');
2141
+
2142
+ =head2 filter_json_single_key_object
2143
+
2144
+ $json = $json->filter_json_single_key_object($key [=> $coderef])
2145
+
2146
+ Works remotely similar to C<filter_json_object>, but is only called for
2147
+ JSON objects having a single key named C<$key>.
2148
+
2149
+ This C<$coderef> is called before the one specified via
2150
+ C<filter_json_object>, if any. It gets passed the single value in the JSON
2151
+ object. If it returns a single value, it will be inserted into the data
2152
+ structure. If it returns nothing (not even C<undef> but the empty list),
2153
+ the callback from C<filter_json_object> will be called next, as if no
2154
+ single-key callback were specified.
2155
+
2156
+ If C<$coderef> is omitted or undefined, the corresponding callback will be
2157
+ disabled. There can only ever be one callback for a given key.
2158
+
2159
+ As this callback gets called less often than the C<filter_json_object>
2160
+ one, decoding speed will not usually suffer as much. Therefore, single-key
2161
+ objects make excellent targets to serialise Perl objects into, especially
2162
+ as single-key JSON objects are as close to the type-tagged value concept
2163
+ as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
2164
+ support this in any way, so you need to make sure your data never looks
2165
+ like a serialised Perl hash.
2166
+
2167
+ Typical names for the single object key are C<__class_whatever__>, or
2168
+ C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
2169
+ things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
2170
+ with real hashes.
2171
+
2172
+ Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
2173
+ into the corresponding C<< $WIDGET{<id>} >> object:
2174
+
2175
+ # return whatever is in $WIDGET{5}:
2176
+ JSON::PP
2177
+ ->new
2178
+ ->filter_json_single_key_object (__widget__ => sub {
2179
+ $WIDGET{ $_[0] }
2180
+ })
2181
+ ->decode ('{"__widget__": 5}')
2182
+
2183
+ # this can be used with a TO_JSON method in some "widget" class
2184
+ # for serialisation to json:
2185
+ sub WidgetBase::TO_JSON {
2186
+ my ($self) = @_;
2187
+
2188
+ unless ($self->{id}) {
2189
+ $self->{id} = ..get..some..id..;
2190
+ $WIDGET{$self->{id}} = $self;
2191
+ }
2192
+
2193
+ { __widget__ => $self->{id} }
2194
+ }
2195
+
2196
+ =head2 shrink
2197
+
2198
+ $json = $json->shrink([$enable])
2199
+
2200
+ $enabled = $json->get_shrink
2201
+
2202
+ In JSON::XS, this flag resizes strings generated by either
2203
+ C<encode> or C<decode> to their minimum size possible.
2204
+ It will also try to downgrade any strings to octet-form if possible.
2205
+
2206
+ In JSON::PP, it is noop about resizing strings but tries
2207
+ C<utf8::downgrade> to the returned string by C<encode>.
2208
+ See to L<utf8>.
2209
+
2210
+ See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
2211
+
2212
+ =head2 max_depth
2213
+
2214
+ $json = $json->max_depth([$maximum_nesting_depth])
2215
+
2216
+ $max_depth = $json->get_max_depth
2217
+
2218
+ Sets the maximum nesting level (default C<512>) accepted while encoding
2219
+ or decoding. If a higher nesting level is detected in JSON text or a Perl
2220
+ data structure, then the encoder and decoder will stop and croak at that
2221
+ point.
2222
+
2223
+ Nesting level is defined by number of hash- or arrayrefs that the encoder
2224
+ needs to traverse to reach a given point or the number of C<{> or C<[>
2225
+ characters without their matching closing parenthesis crossed to reach a
2226
+ given character in a string.
2227
+
2228
+ If no argument is given, the highest possible setting will be used, which
2229
+ is rarely useful.
2230
+
2231
+ See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
2232
+
2233
+ When a large value (100 or more) was set and it de/encodes a deep nested object/text,
2234
+ it may raise a warning 'Deep recursion on subroutine' at the perl runtime phase.
2235
+
2236
+ =head2 max_size
2237
+
2238
+ $json = $json->max_size([$maximum_string_size])
2239
+
2240
+ $max_size = $json->get_max_size
2241
+
2242
+ Set the maximum length a JSON text may have (in bytes) where decoding is
2243
+ being attempted. The default is C<0>, meaning no limit. When C<decode>
2244
+ is called on a string that is longer than this many bytes, it will not
2245
+ attempt to decode the string but throw an exception. This setting has no
2246
+ effect on C<encode> (yet).
2247
+
2248
+ If no argument is given, the limit check will be deactivated (same as when
2249
+ C<0> is specified).
2250
+
2251
+ See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
2252
+
2253
+ =head2 encode
2254
+
2255
+ $json_text = $json->encode($perl_scalar)
2256
+
2257
+ Converts the given Perl data structure (a simple scalar or a reference
2258
+ to a hash or array) to its JSON representation. Simple scalars will be
2259
+ converted into JSON string or number sequences, while references to arrays
2260
+ become JSON arrays and references to hashes become JSON objects. Undefined
2261
+ Perl values (e.g. C<undef>) become JSON C<null> values.
2262
+ References to the integers C<0> and C<1> are converted into C<false> and C<true>.
2263
+
2264
+ =head2 decode
2265
+
2266
+ $perl_scalar = $json->decode($json_text)
2267
+
2268
+ The opposite of C<encode>: expects a JSON text and tries to parse it,
2269
+ returning the resulting simple scalar or reference. Croaks on error.
2270
+
2271
+ JSON numbers and strings become simple Perl scalars. JSON arrays become
2272
+ Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
2273
+ C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
2274
+ C<null> becomes C<undef>.
2275
+
2276
+ =head2 decode_prefix
2277
+
2278
+ ($perl_scalar, $characters) = $json->decode_prefix($json_text)
2279
+
2280
+ This works like the C<decode> method, but instead of raising an exception
2281
+ when there is trailing garbage after the first JSON object, it will
2282
+ silently stop parsing there and return the number of characters consumed
2283
+ so far.
2284
+
2285
+ JSON->new->decode_prefix ("[1] the tail")
2286
+ => ([1], 3)
2287
+
2288
+ =head1 INCREMENTAL PARSING
2289
+
2290
+ Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
2291
+
2292
+ In some cases, there is the need for incremental parsing of JSON texts.
2293
+ This module does allow you to parse a JSON stream incrementally.
2294
+ It does so by accumulating text until it has a full JSON object, which
2295
+ it then can decode. This process is similar to using C<decode_prefix>
2296
+ to see if a full JSON object is available, but is much more efficient
2297
+ (and can be implemented with a minimum of method calls).
2298
+
2299
+ This module will only attempt to parse the JSON text once it is sure it
2300
+ has enough text to get a decisive result, using a very simple but
2301
+ truly incremental parser. This means that it sometimes won't stop as
2302
+ early as the full parser, for example, it doesn't detect parenthesis
2303
+ mismatches. The only thing it guarantees is that it starts decoding as
2304
+ soon as a syntactically valid JSON text has been seen. This means you need
2305
+ to set resource limits (e.g. C<max_size>) to ensure the parser will stop
2306
+ parsing in the presence of syntax errors.
2307
+
2308
+ The following methods implement this incremental parser.
2309
+
2310
+ =head2 incr_parse
2311
+
2312
+ $json->incr_parse( [$string] ) # void context
2313
+
2314
+ $obj_or_undef = $json->incr_parse( [$string] ) # scalar context
2315
+
2316
+ @obj_or_empty = $json->incr_parse( [$string] ) # list context
2317
+
2318
+ This is the central parsing function. It can both append new text and
2319
+ extract objects from the stream accumulated so far (both of these
2320
+ functions are optional).
2321
+
2322
+ If C<$string> is given, then this string is appended to the already
2323
+ existing JSON fragment stored in the C<$json> object.
2324
+
2325
+ After that, if the function is called in void context, it will simply
2326
+ return without doing anything further. This can be used to add more text
2327
+ in as many chunks as you want.
2328
+
2329
+ If the method is called in scalar context, then it will try to extract
2330
+ exactly I<one> JSON object. If that is successful, it will return this
2331
+ object, otherwise it will return C<undef>. If there is a parse error,
2332
+ this method will croak just as C<decode> would do (one can then use
2333
+ C<incr_skip> to skip the erroneous part). This is the most common way of
2334
+ using the method.
2335
+
2336
+ And finally, in list context, it will try to extract as many objects
2337
+ from the stream as it can find and return them, or the empty list
2338
+ otherwise. For this to work, there must be no separators between the JSON
2339
+ objects or arrays, instead they must be concatenated back-to-back. If
2340
+ an error occurs, an exception will be raised as in the scalar context
2341
+ case. Note that in this case, any previously-parsed JSON texts will be
2342
+ lost.
2343
+
2344
+ Example: Parse some JSON arrays/objects in a given string and return them.
2345
+
2346
+ my @objs = JSON->new->incr_parse ("[5][7][1,2]");
2347
+
2348
+ =head2 incr_text
2349
+
2350
+ $lvalue_string = $json->incr_text
2351
+
2352
+ This method returns the currently stored JSON fragment as an lvalue, that
2353
+ is, you can manipulate it. This I<only> works when a preceding call to
2354
+ C<incr_parse> in I<scalar context> successfully returned an object. Under
2355
+ all other circumstances you must not call this function (I mean it.
2356
+ although in simple tests it might actually work, it I<will> fail under
2357
+ real world conditions). As a special exception, you can also call this
2358
+ method before having parsed anything.
2359
+
2360
+ This function is useful in two cases: a) finding the trailing text after a
2361
+ JSON object or b) parsing multiple JSON objects separated by non-JSON text
2362
+ (such as commas).
2363
+
2364
+ $json->incr_text =~ s/\s*,\s*//;
2365
+
2366
+ In Perl 5.005, C<lvalue> attribute is not available.
2367
+ You must write codes like the below:
2368
+
2369
+ $string = $json->incr_text;
2370
+ $string =~ s/\s*,\s*//;
2371
+ $json->incr_text( $string );
2372
+
2373
+ =head2 incr_skip
2374
+
2375
+ $json->incr_skip
2376
+
2377
+ This will reset the state of the incremental parser and will remove the
2378
+ parsed text from the input buffer. This is useful after C<incr_parse>
2379
+ died, in which case the input buffer and incremental parser state is left
2380
+ unchanged, to skip the text parsed so far and to reset the parse state.
2381
+
2382
+ =head2 incr_reset
2383
+
2384
+ $json->incr_reset
2385
+
2386
+ This completely resets the incremental parser, that is, after this call,
2387
+ it will be as if the parser had never parsed anything.
2388
+
2389
+ This is useful if you want to repeatedly parse JSON objects and want to
2390
+ ignore any trailing data, which means you have to reset the parser after
2391
+ each successful decode.
2392
+
2393
+ See to L<JSON::XS/INCREMENTAL PARSING> for examples.
2394
+
2395
+
2396
+ =head1 JSON::PP OWN METHODS
2397
+
2398
+ =head2 allow_singlequote
2399
+
2400
+ $json = $json->allow_singlequote([$enable])
2401
+
2402
+ If C<$enable> is true (or missing), then C<decode> will accept
2403
+ JSON strings quoted by single quotations that are invalid JSON
2404
+ format.
2405
+
2406
+ $json->allow_singlequote->decode(q|{"foo":'bar'}|);
2407
+ $json->allow_singlequote->decode(q|{'foo':"bar"}|);
2408
+ $json->allow_singlequote->decode(q|{'foo':'bar'}|);
2409
+
2410
+ As with the C<relaxed> option, this option may be used to parse
2411
+ application-specific files written by humans.
2412
+
2413
+
2414
+ =head2 allow_barekey
2415
+
2416
+ $json = $json->allow_barekey([$enable])
2417
+
2418
+ If C<$enable> is true (or missing), then C<decode> will accept
2419
+ bare keys of JSON object that are invalid JSON format.
2420
+
2421
+ As with the C<relaxed> option, this option may be used to parse
2422
+ application-specific files written by humans.
2423
+
2424
+ $json->allow_barekey->decode('{foo:"bar"}');
2425
+
2426
+ =head2 allow_bignum
2427
+
2428
+ $json = $json->allow_bignum([$enable])
2429
+
2430
+ If C<$enable> is true (or missing), then C<decode> will convert
2431
+ the big integer Perl cannot handle as integer into a L<Math::BigInt>
2432
+ object and convert a floating number (any) into a L<Math::BigFloat>.
2433
+
2434
+ On the contrary, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
2435
+ objects into JSON numbers with C<allow_blessed> enabled.
2436
+
2437
+ $json->allow_nonref->allow_blessed->allow_bignum;
2438
+ $bigfloat = $json->decode('2.000000000000000000000000001');
2439
+ print $json->encode($bigfloat);
2440
+ # => 2.000000000000000000000000001
2441
+
2442
+ See to L<JSON::XS/MAPPING> about the normal conversion of JSON number.
2443
+
2444
+ =head2 loose
2445
+
2446
+ $json = $json->loose([$enable])
2447
+
2448
+ Unescaped [\x00-\x1f\x22\x2f\x5c] characters are invalid in JSON strings
2449
+ and the module refuses to C<decode> them (except for \x2f).
2450
+ If C<$enable> is true (or missing), then C<decode> will accept these
2451
+ unescaped strings.
2452
+
2453
+ $json->loose->decode(qq|["abc
2454
+ def"]|);
2455
+
2456
+ See L<JSON::XS/SECURITY CONSIDERATIONS>.
2457
+
2458
+ =head2 escape_slash
2459
+
2460
+ $json = $json->escape_slash([$enable])
2461
+
2462
+ According to the JSON grammar, I<slash> (U+002F) may be escaped. But by default
2463
+ JSON::PP (the same as JSON::XS) encodes strings without escaping slash.
2464
+
2465
+ If C<$enable> is true (or missing), then C<encode> will escape slashes.
2466
+
2467
+ =head2 indent_length
2468
+
2469
+ $json = $json->indent_length($length)
2470
+
2471
+ JSON::XS indent space length is 3 and cannot be changed.
2472
+ JSON::PP set the indent space length with the given $length.
2473
+ The default is 3. The acceptable range is 0 to 15.
2474
+
2475
+ =head2 sort_by
2476
+
2477
+ $json = $json->sort_by($function_name)
2478
+ $json = $json->sort_by($subroutine_ref)
2479
+
2480
+ If $function_name or $subroutine_ref is set, that sort routine is used
2481
+ in encoding JSON objects.
2482
+
2483
+ $js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
2484
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
2485
+
2486
+ $js = $pc->sort_by('own_sort')->encode($obj);
2487
+ # is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
2488
+
2489
+ sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
2490
+
2491
+ As the sorting routine runs in the JSON::PP scope, the given
2492
+ subroutine name and the special variables C<$a>, C<$b> will begin with
2493
+ 'JSON::PP::'.
2494
+
2495
+ If $integer is set, then the effect is same as C<canonical> on.
2496
+
2497
+ =head1 INTERNAL
2498
+
2499
+ For developers.
2500
+
2501
+ =over
2502
+
2503
+ =item PP_encode_box
2504
+
2505
+ Returns
2506
+
2507
+ {
2508
+ depth => $depth,
2509
+ indent_count => $indent_count,
2510
+ }
2511
+
2512
+
2513
+ =item PP_decode_box
2514
+
2515
+ Returns
2516
+
2517
+ {
2518
+ text => $text,
2519
+ at => $at,
2520
+ ch => $ch,
2521
+ len => $len,
2522
+ depth => $depth,
2523
+ encoding => $encoding,
2524
+ is_valid_utf8 => $is_valid_utf8,
2525
+ };
2526
+
2527
+ =back
2528
+
2529
+ =head1 MAPPING
2530
+
2531
+ This section is copied from JSON::XS and modified to C<JSON::PP>.
2532
+ JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
2533
+
2534
+ See to L<JSON::XS/MAPPING>.
2535
+
2536
+ =head2 JSON -> PERL
2537
+
2538
+ =over 4
2539
+
2540
+ =item object
2541
+
2542
+ A JSON object becomes a reference to a hash in Perl. No ordering of object
2543
+ keys is preserved (JSON does not preserve object key ordering itself).
2544
+
2545
+ =item array
2546
+
2547
+ A JSON array becomes a reference to an array in Perl.
2548
+
2549
+ =item string
2550
+
2551
+ A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
2552
+ are represented by the same codepoints in the Perl string, so no manual
2553
+ decoding is necessary.
2554
+
2555
+ =item number
2556
+
2557
+ A JSON number becomes either an integer, numeric (floating point) or
2558
+ string scalar in perl, depending on its range and any fractional parts. On
2559
+ the Perl level, there is no difference between those as Perl handles all
2560
+ the conversion details, but an integer may take slightly less memory and
2561
+ might represent more values exactly than floating point numbers.
2562
+
2563
+ If the number consists of digits only, C<JSON> will try to represent
2564
+ it as an integer value. If that fails, it will try to represent it as
2565
+ a numeric (floating point) value if that is possible without loss of
2566
+ precision. Otherwise it will preserve the number as a string value (in
2567
+ which case you lose roundtripping ability, as the JSON number will be
2568
+ re-encoded to a JSON string).
2569
+
2570
+ Numbers containing a fractional or exponential part will always be
2571
+ represented as numeric (floating point) values, possibly at a loss of
2572
+ precision (in which case you might lose perfect roundtripping ability, but
2573
+ the JSON number will still be re-encoded as a JSON number).
2574
+
2575
+ Note that precision is not accuracy - binary floating point values cannot
2576
+ represent most decimal fractions exactly, and when converting from and to
2577
+ floating point, C<JSON> only guarantees precision up to but not including
2578
+ the least significant bit.
2579
+
2580
+ When C<allow_bignum> is enabled, the big integers
2581
+ and the numeric can be optionally converted into L<Math::BigInt> and
2582
+ L<Math::BigFloat> objects.
2583
+
2584
+ =item true, false
2585
+
2586
+ These JSON atoms become C<JSON::PP::true> and C<JSON::PP::false>,
2587
+ respectively. They are overloaded to act almost exactly like the numbers
2588
+ C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
2589
+ the C<JSON::is_bool> function.
2590
+
2591
+ print JSON::PP::true . "\n";
2592
+ => true
2593
+ print JSON::PP::true + 1;
2594
+ => 2
2595
+
2596
+ ok(JSON::true eq '1');
2597
+ ok(JSON::true == 1);
2598
+
2599
+ C<JSON> will install these missing overloading features to the backend modules.
2600
+
2601
+
2602
+ =item null
2603
+
2604
+ A JSON null atom becomes C<undef> in Perl.
2605
+
2606
+ C<JSON::PP::null> returns C<undef>.
2607
+
2608
+ =back
2609
+
2610
+
2611
+ =head2 PERL -> JSON
2612
+
2613
+ The mapping from Perl to JSON is slightly more difficult, as Perl is a
2614
+ truly typeless language, so we can only guess which JSON type is meant by
2615
+ a Perl value.
2616
+
2617
+ =over 4
2618
+
2619
+ =item hash references
2620
+
2621
+ Perl hash references become JSON objects. As there is no inherent ordering
2622
+ in hash keys (or JSON objects), they will usually be encoded in a
2623
+ pseudo-random order that can change between runs of the same program but
2624
+ stays generally the same within a single run of a program. C<JSON>
2625
+ can optionally sort the hash keys (determined by the I<canonical> flag), so
2626
+ the same data structure will serialise to the same JSON text (given same
2627
+ settings and version of JSON::XS), but this incurs a runtime overhead
2628
+ and is only rarely useful, e.g. when you want to compare some JSON text
2629
+ against another for equality.
2630
+
2631
+
2632
+ =item array references
2633
+
2634
+ Perl array references become JSON arrays.
2635
+
2636
+ =item other references
2637
+
2638
+ Other unblessed references are generally not allowed and will cause an
2639
+ exception to be thrown, except for references to the integers C<0> and
2640
+ C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
2641
+ also use C<JSON::false> and C<JSON::true> to improve readability.
2642
+
2643
+ to_json [\0,JSON::PP::true] # yields [false,true]
2644
+
2645
+ =item JSON::PP::true, JSON::PP::false, JSON::PP::null
2646
+
2647
+ These special values become JSON true and JSON false values,
2648
+ respectively. You can also use C<\1> and C<\0> directly if you want.
2649
+
2650
+ JSON::PP::null returns C<undef>.
2651
+
2652
+ =item blessed objects
2653
+
2654
+ Blessed objects are not directly representable in JSON. See the
2655
+ C<allow_blessed> and C<convert_blessed> methods on various options on
2656
+ how to deal with this: basically, you can choose between throwing an
2657
+ exception, encoding the reference as if it weren't blessed, or provide
2658
+ your own serialiser method.
2659
+
2660
+ See to L<convert_blessed>.
2661
+
2662
+ =item simple scalars
2663
+
2664
+ Simple Perl scalars (any scalar that is not a reference) are the most
2665
+ difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
2666
+ JSON C<null> values, scalars that have last been used in a string context
2667
+ before encoding as JSON strings, and anything else as number value:
2668
+
2669
+ # dump as number
2670
+ encode_json [2] # yields [2]
2671
+ encode_json [-3.0e17] # yields [-3e+17]
2672
+ my $value = 5; encode_json [$value] # yields [5]
2673
+
2674
+ # used as string, so dump as string
2675
+ print $value;
2676
+ encode_json [$value] # yields ["5"]
2677
+
2678
+ # undef becomes null
2679
+ encode_json [undef] # yields [null]
2680
+
2681
+ You can force the type to be a string by stringifying it:
2682
+
2683
+ my $x = 3.1; # some variable containing a number
2684
+ "$x"; # stringified
2685
+ $x .= ""; # another, more awkward way to stringify
2686
+ print $x; # perl does it for you, too, quite often
2687
+
2688
+ You can force the type to be a number by numifying it:
2689
+
2690
+ my $x = "3"; # some variable containing a string
2691
+ $x += 0; # numify it, ensuring it will be dumped as a number
2692
+ $x *= 1; # same thing, the choice is yours.
2693
+
2694
+ You can not currently force the type in other, less obscure, ways.
2695
+
2696
+ Note that numerical precision has the same meaning as under Perl (so
2697
+ binary to decimal conversion follows the same rules as in Perl, which
2698
+ can differ to other languages). Also, your perl interpreter might expose
2699
+ extensions to the floating point numbers of your platform, such as
2700
+ infinities or NaN's - these cannot be represented in JSON, and it is an
2701
+ error to pass those in.
2702
+
2703
+ =item Big Number
2704
+
2705
+ When C<allow_bignum> is enabled,
2706
+ C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
2707
+ objects into JSON numbers.
2708
+
2709
+
2710
+ =back
2711
+
2712
+ =head1 UNICODE HANDLING ON PERLS
2713
+
2714
+ If you do not know about Unicode on Perl well,
2715
+ please check L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
2716
+
2717
+ =head2 Perl 5.8 and later
2718
+
2719
+ Perl can handle Unicode and the JSON::PP de/encode methods also work properly.
2720
+
2721
+ $json->allow_nonref->encode(chr hex 3042);
2722
+ $json->allow_nonref->encode(chr hex 12345);
2723
+
2724
+ Returns C<"\u3042"> and C<"\ud808\udf45"> respectively.
2725
+
2726
+ $json->allow_nonref->decode('"\u3042"');
2727
+ $json->allow_nonref->decode('"\ud808\udf45"');
2728
+
2729
+ Returns UTF-8 encoded strings with UTF8 flag, regarded as C<U+3042> and C<U+12345>.
2730
+
2731
+ Note that in Perl versions 5.8.0 to 5.8.2 the built-in C<join> was broken,
2732
+ so JSON::PP wraps C<join> with a subroutine. As a result, JSON::PP runs slowly on those versions.
2733
+
2734
+
2735
+ =head2 Perl 5.6
2736
+
2737
+ Perl can handle Unicode and the JSON::PP de/encode methods also work.
2738
+
2739
+ =head2 Perl 5.005
2740
+
2741
+ Perl 5.005 is a byte semantics world -- all strings are sequences of bytes.
2742
+ That means the unicode handling is not available.
2743
+
2744
+ In encoding,
2745
+
2746
+ $json->allow_nonref->encode(chr hex 3042); # hex 3042 is 12354.
2747
+ $json->allow_nonref->encode(chr hex 12345); # hex 12345 is 74565.
2748
+
2749
+ Returns C<B> and C<E>, as C<chr> takes a value more than 255, it treats
2750
+ as C<$value % 256>, so the above codes are equivalent to :
2751
+
2752
+ $json->allow_nonref->encode(chr 66);
2753
+ $json->allow_nonref->encode(chr 69);
2754
+
2755
+ In decoding,
2756
+
2757
+ $json->decode('"\u00e3\u0081\u0082"');
2758
+
2759
+ The returned is a byte sequence C<0xE3 0x81 0x82> for UTF-8 encoded
2760
+ japanese character (C<HIRAGANA LETTER A>).
2761
+ And if it is represented in Unicode code point, C<U+3042>.
2762
+
2763
+ Next,
2764
+
2765
+ $json->decode('"\u3042"');
2766
+
2767
+ We would ordinarily expect the returned value to be the Unicode character C<U+3042>.
2768
+ But this is the Perl 5.005 world: the result is the byte sequence C<0xE3 0x81 0x82>.
2769
+
2770
+ $json->decode('"\ud808\udf45"');
2771
+
2772
+ This is not a character C<U+12345> but bytes - C<0xf0 0x92 0x8d 0x85>.
2773
+
2774
+
2775
+ =head1 TODO
2776
+
2777
+ =over
2778
+
2779
+ =item speed
2780
+
2781
+ =item memory saving
2782
+
2783
+ =back
2784
+
2785
+
2786
+ =head1 SEE ALSO
2787
+
2788
+ Most of the document is copied and modified from the JSON::XS doc.
2789
+
2790
+ L<JSON::XS>
2791
+
2792
+ RFC4627 (L<http://www.ietf.org/rfc/rfc4627.txt>)
2793
+
2794
+ =head1 AUTHOR
2795
+
2796
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
2797
+
2798
+
2799
+ =head1 COPYRIGHT AND LICENSE
2800
+
2801
+ Copyright 2007-2012 by Makamaka Hannyaharamitu
2802
+
2803
+ This library is free software; you can redistribute it and/or modify
2804
+ it under the same terms as Perl itself.
2805
+
2806
+ =cut
uroman/lib/JSON/backportPP/Boolean.pm ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ =head1 NAME
2
+
3
+ JSON::PP::Boolean - dummy module providing JSON::PP::Boolean
4
+
5
+ =head1 SYNOPSIS
6
+
7
+ # do not "use" yourself
8
+
9
+ =head1 DESCRIPTION
10
+
11
+ This module exists only to provide overload resolution for Storable
12
+ and similar modules. See L<JSON::PP> for more info about this class.
13
+
14
+ =cut
15
+
16
+ use JSON::backportPP ();
17
+ use strict;
18
+
19
+ 1;
20
+
21
+ =head1 AUTHOR
22
+
23
+ This idea is from L<JSON::XS::Boolean> written by
24
+ Marc Lehmann <schmorp[at]schmorp.de>
25
+
26
+ =cut
27
+
uroman/lib/JSON/backportPP/Compat5005.pm ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package # This is JSON::backportPP
2
+ JSON::backportPP5005;
3
+
4
+ use 5.005;
5
+ use strict;
6
+
7
+ my @properties;
8
+
9
+ $JSON::PP5005::VERSION = '1.10';
10
+
11
BEGIN {
    # Stand-in definitions of the utf8:: functions for Perl 5.005, installed
    # at compile time together with the B constants and the JSON::PP hooks.

    # It is considered that UTF8 flag off for Perl 5.005.
    sub utf8::is_utf8 { 0; }

    # Empty stand-ins: they leave their argument untouched.
    sub utf8::upgrade { }
    sub utf8::encode  { }
    sub utf8::decode  { }

    # must always return true.
    sub utf8::downgrade { 1; }

    # missing in B module.
    sub B::SVp_IOK () { 0x01000000; }
    sub B::SVp_NOK () { 0x02000000; }
    sub B::SVp_POK () { 0x04000000; }

    # Alias the JSON::PP encode/decode hooks to this package's subs.
    *JSON::PP::JSON_PP_decode_unicode    = \&_decode_unicode;
    *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
    *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
    *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;

    # Register bytes.pm in %INC as already loaded.
    $INC{'bytes.pm'} = 1; # dummy
}
42
+
43
+
44
+
45
# Returns the argument with every byte above 127 replaced by a literal
# backslash-u escape of four lowercase hex digits; other bytes pass through.
sub _encode_ascii {
    my @pieces;
    for my $byte ( unpack( 'C*', $_[0] ) ) {
        if ( $byte > 127 ) {
            push @pieces, sprintf( '\u%04x', $byte );
        }
        else {
            push @pieces, chr($byte);
        }
    }
    return join( '', @pieces );
}
48
+
49
+
50
# Rebuilds the argument one byte at a time; every byte value (0-255) is
# emitted unchanged via chr().
sub _encode_latin1 {
    my $out = '';
    foreach my $byte ( unpack( 'C*', $_[0] ) ) {
        $out .= chr($byte);
    }
    return $out;
}
53
+
54
+
55
# Combines a UTF-16 surrogate pair, given as two hex strings (high, low),
# into one code point and returns its four-byte UTF-8 encoding.
# Croaks with "Invalid surrogate pair" when the code point does not fit
# in 21 bits.
# from http://homepage1.nifty.com/nomenclator/unicode/ucs_utf.htm
sub _decode_surrogates {
    my $high = hex( $_[0] );
    my $low  = hex( $_[1] );

    # from perlunicode
    my $codepoint = 0x10000 + ( $high - 0xD800 ) * 0x400 + ( $low - 0xDC00 );
    my $bits = unpack( 'B32', pack( 'N', $codepoint ) );

    # The top 11 bits must be zero; the remaining 21 split into 3+6+6+6.
    Carp::croak("Invalid surrogate pair")
        unless $bits =~ /^00000000000(...)(......)(......)(......)$/;

    return pack( 'B*', sprintf( '11110%s10%s10%s10%s', $1, $2, $3, $4 ) );
}
67
+
68
+
69
# Converts the four hex digits of a \uXXXX escape into a byte string.
#
#   0x0000-0x00ff  -> the single byte with that value
#   0x0100-0x07ff  -> two-byte UTF-8 sequence
#   0x0800-0xffff  -> three-byte UTF-8 sequence
#
# Croaks with "Invalid escaped unicode" when the argument does not unpack
# to exactly 16 bits.
sub _decode_unicode {
    my ($u) = @_;
    my ($utf8bit);

    # 0x00-0xff: return the byte itself. Values below 0x80 used to fall
    # through to the two-byte branch and produce an overlong (invalid)
    # UTF-8 sequence such as C1 81 for "0041".
    if ( $u =~ /^00([0-9a-f][0-9a-f])$/i ) {
        return pack( 'H2', $1 );
    }

    my $bit = unpack("B*", pack("H*", $u));

    if ( $bit =~ /^00000(.....)(......)$/ ) {
        # 11 significant bits: 110xxxxx 10xxxxxx
        $utf8bit = sprintf('110%s10%s', $1, $2);
    }
    elsif ( $bit =~ /^(....)(......)(......)$/ ) {
        # 16 significant bits: 1110xxxx 10xxxxxx 10xxxxxx
        $utf8bit = sprintf('1110%s10%s10%s', $1, $2, $3);
    }
    else {
        Carp::croak("Invalid escaped unicode");
    }

    return pack('B*', $utf8bit);
}
91
+
92
+
93
# Accessor for the incremental parser's buffered text, written without the
# lvalue attribute. With a second argument the buffer is replaced first;
# the current buffer is returned either way.
# Croaks if the incremental parser has already started parsing.
sub JSON::PP::incr_text {
    my $self = $_[0];

    # Create the incremental parser on first use.
    my $parser = ( $self->{_incr_parser} ||= JSON::PP::IncrParser->new );

    Carp::croak("incr_text can not be called when the incremental parser already started parsing")
        if $parser->{incr_parsing};

    if ( @_ > 1 ) {
        $parser->{incr_text} = $_[1];
    }

    return $parser->{incr_text};
}
103
+
104
+
105
+ 1;
106
+ __END__
107
+
108
+ =pod
109
+
110
+ =head1 NAME
111
+
112
+ JSON::PP5005 - Helper module in using JSON::PP in Perl 5.005
113
+
114
+ =head1 DESCRIPTION
115
+
116
+ Called internally by JSON::PP.
117
+
118
+ =head1 AUTHOR
119
+
120
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
121
+
122
+
123
+ =head1 COPYRIGHT AND LICENSE
124
+
125
+ Copyright 2007-2012 by Makamaka Hannyaharamitu
126
+
127
+ This library is free software; you can redistribute it and/or modify
128
+ it under the same terms as Perl itself.
129
+
130
+ =cut
131
+
uroman/lib/JSON/backportPP/Compat5006.pm ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package # This is JSON::backportPP
2
+ JSON::backportPP56;
3
+
4
+ use 5.006;
5
+ use strict;
6
+
7
+ my @properties;
8
+
9
+ $JSON::PP56::VERSION = '1.08';
10
+
11
BEGIN {
    # Emulations of the utf8:: functions for Perl 5.6, plus the JSON::PP
    # encode/decode hooks and a B constant, all installed at compile time.

    # True when the character length and the byte length of the argument
    # differ.
    sub utf8::is_utf8 {
        my $char_count = length $_[0]; # char length
        {
            use bytes; # byte length;
            return $char_count != length $_[0]; # if !=, UTF8-flagged on.
        }
    }


    sub utf8::upgrade {
        ; # noop;
    }


    # Rewrites the argument in place, packing code points below 256 as
    # single bytes. Returns 1 on success. When the argument is flagged but
    # fails _is_valid_utf8, returns 0 if the second argument is true and
    # croaks otherwise.
    sub utf8::downgrade ($;$) {
        return 1 unless ( utf8::is_utf8( $_[0] ) );

        unless ( _is_valid_utf8( $_[0] ) ) {
            Carp::croak("Wide character in subroutine entry") unless ( $_[1] );
            return 0;
        }

        my $rebuilt;
        for my $code ( unpack( "U*", $_[0] ) ) {
            $rebuilt .= $code < 256 ? pack( "C", $code ) : pack( "U", $code );
        }
        $_[0] = $rebuilt;
        return 1;
    }


    sub utf8::encode ($) { # UTF8 flag off
        # An unflagged argument is first repacked as code points; the
        # final byte-wise repack is applied in both cases.
        $_[0] = pack( "U*", unpack( "C*", $_[0] ) ) unless utf8::is_utf8( $_[0] );
        $_[0] = pack( "C*", unpack( "C*", $_[0] ) );
    }


    sub utf8::decode ($) { # UTF8 flag on
        if ( _is_valid_utf8( $_[0] ) ) {
            utf8::downgrade( $_[0] );
            $_[0] = pack( "U*", unpack( "U*", $_[0] ) );
        }
    }


    # The encoders come from this package, the decoders from JSON::PP.
    *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
    *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
    *JSON::PP::JSON_PP_decode_surrogates = \&JSON::PP::_decode_surrogates;
    *JSON::PP::JSON_PP_decode_unicode    = \&JSON::PP::_decode_unicode;

    # missing in B module.
    eval q{ sub B::SVp_NOK () { 0x02000000; } } unless ( defined &B::SVp_NOK );

}
79
+
80
+
81
+
82
# Returns the argument with every code point above 127 replaced by a
# literal backslash-u escape: one escape up to 65535, otherwise the two
# values returned by JSON::PP::_encode_surrogates.
sub _encode_ascii {
    my @chunks;
    for my $code ( _unpack_emu( $_[0] ) ) {
        if ( $code <= 127 ) {
            push @chunks, chr($code);
        }
        elsif ( $code <= 65535 ) {
            push @chunks, sprintf( '\u%04x', $code );
        }
        else {
            push @chunks, sprintf( '\u%x\u%x', JSON::PP::_encode_surrogates($code) );
        }
    }
    return join( '', @chunks );
}
92
+
93
+
94
# Returns the argument with every code point above 255 replaced by a
# literal backslash-u escape: one escape up to 65535, otherwise the two
# values returned by JSON::PP::_encode_surrogates.
sub _encode_latin1 {
    my @chunks;
    for my $code ( _unpack_emu( $_[0] ) ) {
        if ( $code <= 255 ) {
            push @chunks, chr($code);
        }
        elsif ( $code <= 65535 ) {
            push @chunks, sprintf( '\u%04x', $code );
        }
        else {
            push @chunks, sprintf( '\u%x\u%x', JSON::PP::_encode_surrogates($code) );
        }
    }
    return join( '', @chunks );
}
104
+
105
+
106
# for Perl 5.6 unpack warnings
# Unpacks the argument as code points ('U*') only when it is UTF8-flagged
# and passes _is_valid_utf8; in every other case unpacks it with 'C*'.
sub _unpack_emu {
    return unpack( 'C*', $_[0] ) unless utf8::is_utf8( $_[0] );
    return unpack( 'U*', $_[0] ) if _is_valid_utf8( $_[0] );
    return unpack( 'C*', $_[0] );
}
111
+
112
+
113
# Scans a copy of the argument one unit at a time, where a unit is either
# a well-formed UTF-8 sequence or any other single character.
#
# The first unit fixes the verdict: 1 if well-formed, 0 otherwise.
# Returns that verdict, except:
#   - a verdict of 1 followed later by a malformed unit gives an empty
#     return (undef in scalar context);
#   - an empty argument gives undef.
sub _is_valid_utf8 {
    my $text = $_[0];
    my $verdict;

    while ($text =~ /(?:
          (
             [\x00-\x7F]
            |[\xC2-\xDF][\x80-\xBF]
            |[\xE0][\xA0-\xBF][\x80-\xBF]
            |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
            |[\xED][\x80-\x9F][\x80-\xBF]
            |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
            |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
          )
        | (.)
    )/xg)
    {
        my $well_formed = defined $1;

        # Only the first unit sets the verdict.
        unless ( defined $verdict ) {
            $verdict = $well_formed ? 1 : 0;
        }

        # eventually, not utf8
        return if ( !$well_formed && $verdict );
    }

    return $verdict;
}
145
+
146
+
147
+ 1;
148
+ __END__
149
+
150
+ =pod
151
+
152
+ =head1 NAME
153
+
154
+ JSON::PP56 - Helper module in using JSON::PP in Perl 5.6
155
+
156
+ =head1 DESCRIPTION
157
+
158
+ Called internally by JSON::PP.
159
+
160
+ =head1 AUTHOR
161
+
162
+ Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
163
+
164
+
165
+ =head1 COPYRIGHT AND LICENSE
166
+
167
+ Copyright 2007-2012 by Makamaka Hannyaharamitu
168
+
169
+ This library is free software; you can redistribute it and/or modify
170
+ it under the same terms as Perl itself.
171
+
172
+ =cut
173
+
uroman/lib/NLP/Chinese.pm ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # Chinese #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::Chinese;
8
+
9
# Class-name handles for the helper modules; their methods are invoked as
# class methods through these variables ($utf8->..., $util->...).
# The modules themselves are not loaded here -- presumably the caller has
# already loaded them (NLP::Romanizer does "use NLP::UTF8; use NLP::utilities;").
$utf8 = NLP::UTF8;
# read_chinese_tonal_pinyin_files and tonal_pinyin call $util->trim(...) and
# $util->member(...). Without this assignment $util is undefined in package
# NLP::Chinese and those calls die with "Can't call method ... on an undefined
# value". NLP::Romanizer binds $util to the same module.
$util = NLP::utilities;
%empty_ht = ();
11
+
12
# Read Chinese pronunciation files into the hash passed as *ht.
#
# Arguments:
#   $caller     object/class on which number_to_accent_tone is invoked
#   *ht         typeglob of the hash to fill in place
#   @filenames  files to read; the handling is chosen from the file NAME:
#     /unihan/i             tab-separated "code <TAB> type <TAB> value" lines;
#                           types kHanyuPinlu, kXHC1983, kHanyuPinyin and
#                           kMandarin are stored under $ht{<type>}->{char}
#     /cedict/i             lines whose second whitespace-separated field is
#                           the entry and whose [bracketed] part is the
#                           numbered pinyin; the first reading goes to
#                           $ht{cedict}, all distinct readings (space
#                           separated) to $ht{cedicts}
#     /chinese_to_pinyin/i  "entry <TAB> pinyin" lines, stored under
#                           $ht{simple_pinyin}
#   Any other file name is reported on STDERR and skipped.
#
# Lines starting with "#" are ignored in all formats.
sub read_chinese_tonal_pinyin_files {
    local($caller, *ht, @filenames) = @_;

    $n_kHanyuPinlu = 0;
    $n_kXHC1983 = 0;
    $n_kHanyuPinyin = 0;
    $n_kMandarin = 0;
    $n_cedict = 0;
    $n_simple_pinyin = 0;

    foreach $filename (@filenames) {
        if ($filename =~ /unihan/i) {
            if (open(IN, $filename)) {
                while (<IN>) {
                    next if /^#/;
                    s/\s*$//;
                    if (($u, $type, $value) = split(/\t/, $_)) {
                        if ($type =~ /^(kHanyuPinlu|kXHC1983|kHanyuPinyin|kMandarin)$/) {
                            $u = $util->trim($u);
                            $type = $util->trim($type);
                            $value = $util->trim($value);
                            $f = $utf8->unicode_string2string($u);

                            if ($type eq "kHanyuPinlu") {
                                # drop the parenthesized parts, then convert tone digits to accents
                                $value =~ s/\(.*?\)//g;
                                $value = $util->trim($value);
                                $translit = $caller->number_to_accent_tone($value);
                                $ht{"kHanyuPinlu"}->{$f} = $translit;
                                $n_kHanyuPinlu++;
                            } elsif ($type eq "kXHC1983") {
                                # keep only what follows each ":" in the value
                                @translits = ($value =~ /:(\S+)/g);
                                $translit = join(" ", @translits);
                                $ht{"kXHC1983"}->{$f} = $translit;
                                $n_kXHC1983++;
                            } elsif ($type eq "kHanyuPinyin") {
                                # strip everything up to the last ":" and turn commas into spaces
                                $value =~ s/^.*://;
                                $value =~ s/,/ /g;
                                $ht{"kHanyuPinyin"}->{$f} = $value;
                                $n_kHanyuPinyin++;
                            } elsif ($type eq "kMandarin") {
                                $ht{"kMandarin"}->{$f} = $value;
                                $n_kMandarin++;
                            }
                        }
                    }
                }
                close(IN);
                print "Read in $n_kHanyuPinlu kHanyuPinlu, $n_kXHC1983 n_kXHC1983, $n_kHanyuPinyin n_kHanyuPinyin $n_kMandarin n_kMandarin\n";
            } else {
                print STDERR "Can't open $filename\n";
            }
        } elsif ($filename =~ /cedict/i) {
            if (open(IN, $filename)) {
                while (<IN>) {
                    next if /^#/;
                    s/\s*$//;
                    if (($f, $translit) = ($_ =~ /^\S+\s+(\S+)\s+\[([^\[\]]+)\]/)) {
                        $translit = $utf8->extended_lower_case($translit);
                        $translit = $caller->number_to_accent_tone($translit);
                        $translit =~ s/\s//g;
                        if ($old_translit = $ht{"cedict"}->{$f}) {
                            # $ht{CONFLICT}->{("DUPLICATE " . $f)} = "CEDICT($f): $old_translit\nCEDICT($f): $translit (duplicate)\n" unless $translit eq $old_translit;
                            # a further reading for a known entry: append it to the "cedicts" list only
                            $ht{"cedicts"}->{$f} = join(" ", $ht{"cedicts"}->{$f}, $translit) unless $old_translit eq $translit;
                        } else {
                            $ht{"cedict"}->{$f} = $translit;
                            $ht{"cedicts"}->{$f} = $translit;
                        }
                        $n_cedict++;
                    }
                }
                close(IN);
                # print "Read in $n_cedict n_cedict\n";
            } else {
                # newline added so consecutive diagnostics no longer run together
                print STDERR "Can't open $filename\n";
            }
        } elsif ($filename =~ /chinese_to_pinyin/i) {
            if (open(IN, $filename)) {
                while (<IN>) {
                    next if /^#/;
                    if (($f, $translit) = ($_ =~ /^(\S+)\t(\S+)\s*$/)) {
                        $ht{"simple_pinyin"}->{$f} = $translit;
                        $n_simple_pinyin++;
                    }
                }
                close(IN);
                # print "Read in $n_simple_pinyin n_simple_pinyin\n";
            } else {
                # newline added so consecutive diagnostics no longer run together
                print STDERR "Can't open $filename\n";
            }
        } else {
            print STDERR "Don't know what to do with file $filename (in read_chinese_tonal_pinyin_files)\n";
        }
    }
}
112
+
113
# Return the pinyin for string $s, using the tables in the hash passed as *ht.
#
# Side effects on %ht:
#   $ht{COMBINED}->{$s}      the returned result (also used as a cache)
#   $ht{PINYIN_TITLE}->{$s}  per-entry / per-character list of readings with
#                            their sources, lines joined by "&#xA;"
#   $ht{CONFLICT}->{$s}      a log text, written only for multi-character
#                            strings whose CEDICT reading differs from the
#                            character-by-character reading
# $gloss is only used inside that conflict log.
sub tonal_pinyin {
    local($caller, $s, *ht, $gloss) = @_;

    # Answer from the cache when this string has been handled before.
    $result = $ht{COMBINED}->{$s};
    return $result if defined($result);

    $cedict_pinyin = $ht{"cedict"}->{$s} || "";
    $cedicts_pinyin = $ht{"cedicts"}->{$s} || "";
    $unihan_pinyin = "";
    @characters = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    my $multi_char_p = ($#characters >= 1);

    # Character-by-character reading; the tables are consulted in a fixed order.
    foreach $c (@characters) {
        if ($pinyin = $ht{"simple_pinyin"}->{$c}) {
            $unihan_pinyin .= $pinyin;
        } elsif (($pinyin = $ht{"kHanyuPinlu"}->{$c})
              || ($pinyin = $ht{"kXHC1983"}->{$c})
              || ($pinyin = $ht{"kHanyuPinyin"}->{$c})
              || ($pinyin = $ht{"cedicts"}->{$c})) {
            # these tables may hold several space-separated readings: keep the first
            $pinyin =~ s/^(\S+)\s.*$/$1/;
            $unihan_pinyin .= $pinyin;
        # middle dot, katakana middle dot, multiplication sign
        } elsif ($c =~ /^(\xC2\xB7|\xE3\x83\xBB|\xC3\x97)$/) {
            $unihan_pinyin .= $c;
        # ASCII
        } elsif ($c =~ /^([\x21-\x7E])$/) {
            $unihan_pinyin .= $c;
        } else {
            $unihan_pinyin .= "?";
            $hex = $utf8->utf8_to_hex($c);
            $unicode = uc $utf8->utf8_to_4hex_unicode($c);
            # print STDERR "Tonal pinyin: Unknown character $c ($hex/U+$unicode) -> ?\n";
        }
    }

    # Build the title text: whole-string CEDICT readings first ...
    $pinyin_title = "";
    if ($multi_char_p && $cedicts_pinyin) {
        foreach $pinyin (split(/\s+/, $cedicts_pinyin)) {
            $pinyin_title .= "$s $pinyin (CEDICT)\n";
        }
        $pinyin_title .= "\n";
    }
    # ... then, per character, every distinct reading with the sources listing it.
    foreach $c (@characters) {
        my %local_ht = ();
        @pinyins = ();
        foreach $type (("kHanyuPinlu", "kXHC1983", "kHanyuPinyin", "cedicts")) {
            next unless $pinyin_s = $ht{$type}->{$c};
            foreach $pinyin (split(/\s+/, $pinyin_s)) {
                push(@pinyins, $pinyin) unless $util->member($pinyin, @pinyins);
                $type2 = ($type eq "cedicts") ? "CEDICT" : $type;
                if ($local_ht{$pinyin}) {
                    $local_ht{$pinyin} = join(", ", $local_ht{$pinyin}, $type2);
                } else {
                    $local_ht{$pinyin} = $type2;
                }
            }
        }
        foreach $pinyin (@pinyins) {
            $type_s = $local_ht{$pinyin};
            $pinyin_title .= "$c $pinyin ($type_s)\n";
        }
    }
    $pinyin_title =~ s/\n$//;
    $pinyin_title =~ s/\n/&#xA;/g;

    # A reading made up only of "?" placeholders counts as no reading at all.
    $unihan_pinyin = "" if $unihan_pinyin =~ /^\?+$/;

    # Record a conflict log when the two sources disagree on a multi-character string.
    if ($multi_char_p && $cedict_pinyin && $unihan_pinyin && ($unihan_pinyin ne $cedict_pinyin)) {
        $log = "Gloss($s): $gloss\nCEdict($s): $cedicts_pinyin\nUnihan($s): $unihan_pinyin\n";
        foreach $type (("kHanyuPinlu", "kXHC1983", "kHanyuPinyin")) {
            $log_line = "$type($s): ";
            foreach $c (@characters) {
                $pinyin = $ht{$type}->{$c} || "";
                if ($pinyin =~ / /) {
                    $log_line .= "($pinyin)";
                } elsif ($pinyin) {
                    $log_line .= $pinyin;
                } else {
                    $log_line .= "?";
                }
            }
            $log .= "$log_line\n";
        }
        $ht{CONFLICT}->{$s} = $log;
    }

    # For multi-character strings a CEDICT reading wins; otherwise the
    # character-based reading is preferred and CEDICT is the fallback.
    $result = $unihan_pinyin || $cedict_pinyin;
    $result = $cedict_pinyin if $multi_char_p && $cedict_pinyin;
    $ht{COMBINED}->{$s} = $result;
    $ht{PINYIN_TITLE}->{$s} = $pinyin_title;
    return $result;
}
201
+
202
# Maps "<vowel><tone digit 1-4>" to the UTF-8 bytes of the accented vowel.
# Both "u:" and the UTF-8 u-umlaut (\xC3\xBC) are accepted as the base letter.
%number_to_accent_tone_ht = (
    "a1" => "\xC4\x81",  "a2" => "\xC3\xA1",  "a3" => "\xC7\x8E",  "a4" => "\xC3\xA0",
    "e1" => "\xC4\x93",  "e2" => "\xC3\xA9",  "e3" => "\xC4\x9B",  "e4" => "\xC3\xA8",
    "i1" => "\xC4\xAB",  "i2" => "\xC3\xAD",  "i3" => "\xC7\x90",  "i4" => "\xC3\xAC",
    "o1" => "\xC5\x8D",  "o2" => "\xC3\xB3",  "o3" => "\xC7\x92",  "o4" => "\xC3\xB2",
    "u1" => "\xC5\xAB",  "u2" => "\xC3\xBA",  "u3" => "\xC7\x94",  "u4" => "\xC3\xB9",
    "u:1" => "\xC7\x96", "u:2" => "\xC7\x98", "u:3" => "\xC7\x9A", "u:4" => "\xC7\x9C",
    "\xC3\xBC1" => "\xC7\x96", "\xC3\xBC2" => "\xC7\x98",
    "\xC3\xBC3" => "\xC7\x9A", "\xC3\xBC4" => "\xC7\x9C"
);

# Convert numbered pinyin (e.g. "hao3") in $s to accented pinyin.
# Each "<letters><digit 1-5>" group is rewritten in turn:
#   - tone 5: the digit is simply dropped;
#   - otherwise the accent goes on an "a"/"e" if present, else on the "o" of
#     "ou", else on the last of i/o/u/u-umlaut when no vowel follows it;
#   - groups with no such letter are copied unchanged, digit included.
# Any "u:" left in the result is finally replaced by the UTF-8 u-umlaut.
sub number_to_accent_tone {
    local($caller, $s) = @_;

    my $result = "";
    while (($pre,$alpha,$tone_number,$rest) = ($s =~ /^(.*?)((?:[a-z]|u:|\xC3\xBC)+)([1-5])(.*)$/i)) {
        if ($tone_number eq "5") {
            $result .= "$pre$alpha";
        } else {
            # Try the three placement rules in priority order; a failed match
            # leaves $acc_letter undefined.
            ($pre_acc,$acc_letter,$post_acc) = ($alpha =~ /^(.*)([ae])(.*)$/);
            unless (defined($acc_letter)) {
                ($pre_acc,$acc_letter,$post_acc) = ($alpha =~ /^(.*)(o)(u.*)$/);
            }
            unless (defined($acc_letter)) {
                ($pre_acc,$acc_letter,$post_acc) = ($alpha =~ /^(.*)(u:|[iou]|\xC3\xBC)([^aeiou]*)$/);
            }

            if (defined($acc_letter)) {
                my $tone_key = $acc_letter . $tone_number;
                $result .= "$pre$pre_acc" . ($number_to_accent_tone_ht{$tone_key} || $tone_key) . $post_acc;
            } else {
                $result .= "$pre$alpha$tone_number";
            }
        }
        $s = $rest;
    }
    $result .= $s;
    $result =~ s/u:/\xC3\xBC/g;
    return $result;
}
232
+
233
# True if $s contains a byte pattern that starts a UTF-8 encoded character in
# one of these ranges (derived from the lead bytes tested):
#   \xE4-\xE9          U+4000  .. U+9FFF
#   \xE3 [\x90-\xBF]   U+3400  .. U+3FFF
#   \xF0 [\xA0-\xAC]   U+20000 .. U+2CFFF
# The match result is returned as is (captured bytes in list context).
sub string_contains_utf8_cjk_unified_ideograph_p {
    local($caller, $s) = @_;

    my $cjk_lead_bytes = qr/([\xE4-\xE9]|\xE3[\x90-\xBF]|\xF0[\xA0-\xAC])/;
    return ($s =~ $cjk_lead_bytes);
}
238
+
239
+ 1;
uroman/lib/NLP/English.pm ADDED
The diff for this file is too large to render. See raw diff
 
uroman/lib/NLP/Romanizer.pm ADDED
@@ -0,0 +1,2020 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # Romanizer #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::Romanizer;
8
+
9
+ use NLP::Chinese;
10
+ use NLP::UTF8;
11
+ use NLP::utilities;
12
+ use JSON;
13
# Class-name handles: methods of the helper modules loaded above are invoked
# as class methods through these variables.
$utf8      = NLP::UTF8;
$util      = NLP::utilities;
$chinesePM = NLP::Chinese;

my $verbosePM = 0;
%empty_ht = ();

# UTF-8 byte sequences of the Braille cells that act as indicators or
# punctuation (code points derived from the bytes).
my $braille_capital_letter_indicator = "\xE2\xA0\xA0";  # U+2820
my $braille_number_indicator         = "\xE2\xA0\xBC";  # U+283C
my $braille_decimal_point            = "\xE2\xA0\xA8";  # U+2828
my $braille_comma                    = "\xE2\xA0\x82";  # U+2802
my $braille_solidus                  = "\xE2\xA0\x8C";  # U+280C
my $braille_numeric_space            = "\xE2\xA0\x90";  # U+2810
my $braille_letter_indicator         = "\xE2\xA0\xB0";  # U+2830
my $braille_period                   = "\xE2\xA0\xB2";  # U+2832
28
+
29
# Constructor: returns an empty hash blessed into the class of $caller
# ($caller may be a class name or an existing object).
sub new {
    local($caller) = @_;

    my $class = ref( $caller ) || $caller;
    return bless({}, $class);
}
37
+
38
# Load a semicolon-separated Unicode data file into the hash passed as *ht.
# For every line the following tables are filled, keyed as shown:
#   UTF_TO_CHAR_NAME     UTF-8 string    -> character name
#   UTF_NAME_TO_UNICODE  character name  -> code value as written in the file
#   UTF_NAME_TO_CODE     character name  -> UTF-8 string
#   UTF_TO_CAT           UTF-8 string    -> general category
#   UTF_TO_NUMERIC       UTF-8 string    -> numeric value (only if non-empty)
# A file that cannot be opened is reported on STDERR.
sub load_unicode_data {
    local($this, *ht, $filename) = @_;
    # ../../data/UnicodeData.txt

    $n = 0;
    if (! open(IN, $filename)) {
        print STDERR "Can't open $filename\n";
    } else {
        while (<IN>) {
            if (($unicode_value, $char_name, $general_category, $canon_comb_classes, $bidir_category, $char_decomp_mapping, $decimal_digit_value, $digit_value, $numeric_value, $mirrored, $unicode_1_0_name, $comment_field, $uc_mapping, $lc_mapping, $title_case_mapping) = split(";", $_)) {
                $utf8_code = $utf8->unicode_hex_string2string($unicode_value);

                # tables keyed by the character itself
                $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name;
                # tables keyed by the character name
                $ht{UTF_NAME_TO_UNICODE}->{$char_name} = $unicode_value;
                $ht{UTF_NAME_TO_CODE}->{$char_name} = $utf8_code;
                $ht{UTF_TO_CAT}->{$utf8_code} = $general_category;
                unless ($numeric_value eq "") {
                    $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric_value;
                }
                $n++;
            }
        }
        close(IN);
        # print STDERR "Loaded $n entries from $filename\n";
    }
}
61
+
62
# Load per-character overrides into the hash passed as *ht.
# Each non-comment line is queried for the slots u, r, num, pic,
# syllable-info, tone-mark and name; when the u slot is present, every
# non-empty slot value is stored under the character's UTF-8 string in
# UTF_TO_CHAR_ROMANIZATION, UTF_TO_NUMERIC, UTF_TO_PICTURE_DESCR,
# UTF_TO_SYLLABLE_INFO, UTF_TO_TONE_MARK and UTF_TO_CHAR_NAME respectively.
# A file that cannot be opened is reported on STDERR.
sub load_unicode_overwrite_romanization {
    local($this, *ht, $filename) = @_;
    # ../../data/UnicodeDataOverwrite.txt

    $n = 0;
    if (! open(IN, $filename)) {
        print STDERR "Can't open $filename\n";
    } else {
        while (<IN>) {
            next if /^#/;
            $unicode_value = $util->slot_value_in_double_colon_del_list($_, "u");
            $romanization = $util->slot_value_in_double_colon_del_list($_, "r");
            $numeric = $util->slot_value_in_double_colon_del_list($_, "num");
            $picture = $util->slot_value_in_double_colon_del_list($_, "pic");
            $syllable_info = $util->slot_value_in_double_colon_del_list($_, "syllable-info");
            $tone_mark = $util->slot_value_in_double_colon_del_list($_, "tone-mark");
            $char_name = $util->slot_value_in_double_colon_del_list($_, "name");
            $entry_processed_p = 0;
            $utf8_code = $utf8->unicode_hex_string2string($unicode_value);

            # lines without a code value contribute nothing
            next unless $unicode_value;

            if ($romanization) {
                $ht{UTF_TO_CHAR_ROMANIZATION}->{$utf8_code} = $romanization;
            }
            # a numeric value of 0 is legitimate, hence the explicit emptiness test
            if (defined($numeric) && ($numeric ne "")) {
                $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric;
            }
            if ($picture) {
                $ht{UTF_TO_PICTURE_DESCR}->{$utf8_code} = $picture;
            }
            if ($syllable_info) {
                $ht{UTF_TO_SYLLABLE_INFO}->{$utf8_code} = $syllable_info;
            }
            if ($tone_mark) {
                $ht{UTF_TO_TONE_MARK}->{$utf8_code} = $tone_mark;
            }
            if ($char_name) {
                $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name;
            }
            $entry_processed_p = 1 if $romanization || $numeric || $picture || $syllable_info || $tone_mark;
            $n++ if $entry_processed_p;
        }
        close(IN);
    }
}
95
+
96
# Load script descriptions into the hash passed as *ht.
# Lines without a script-name slot are skipped. For the others these tables
# are filled:
#   SCRIPT_P                      script name -> 1
#   SCRIPT_NORM                   upper-cased script name, and each alternative
#                                 name (as written and upper-cased) -> script name
#   DIRECTION                     script name -> direction (if given)
#   SCRIPT_LANGUAGE / LANGUAGE_SCRIPT   script <-> language, both directions
#   SCRIPT_ABUDIGA_DEFAULT_VOWEL  script name -> { vowel -> 1 }
#   SCRIPT_FONT                   script name -> { font family -> 1 }
# List-valued slots are split on commas. A file that cannot be opened is
# reported on STDERR.
sub load_script_data {
    local($this, *ht, $filename) = @_;
    # ../../data/Scripts.txt

    $n = 0;
    if (! open(IN, $filename)) {
        print STDERR "Can't open $filename\n";
    } else {
        while (<IN>) {
            next unless $script_name = $util->slot_value_in_double_colon_del_list($_, "script-name");
            $abugida_default_vowel_s = $util->slot_value_in_double_colon_del_list($_, "abugida-default-vowel");
            $alt_script_name_s = $util->slot_value_in_double_colon_del_list($_, "alt-script-name");
            $language_s = $util->slot_value_in_double_colon_del_list($_, "language");
            $direction = $util->slot_value_in_double_colon_del_list($_, "direction"); # right-to-left
            $font_family_s = $util->slot_value_in_double_colon_del_list($_, "font-family");

            $ht{SCRIPT_P}->{$script_name} = 1;
            $ht{SCRIPT_NORM}->{(uc $script_name)} = $script_name;
            if ($direction) {
                $ht{DIRECTION}->{$script_name} = $direction;
            }
            foreach $language (split(/,\s*/, $language_s)) {
                $ht{SCRIPT_LANGUAGE}->{$script_name}->{$language} = 1;
                $ht{LANGUAGE_SCRIPT}->{$language}->{$script_name} = 1;
            }
            foreach $alt_script_name (split(/,\s*/, $alt_script_name_s)) {
                $ht{SCRIPT_NORM}->{$alt_script_name} = $script_name;
                $ht{SCRIPT_NORM}->{(uc $alt_script_name)} = $script_name;
            }
            foreach $abugida_default_vowel (split(/,\s*/, $abugida_default_vowel_s)) {
                next unless $abugida_default_vowel;
                $ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$script_name}->{$abugida_default_vowel} = 1;
            }
            foreach $font_family (split(/,\s*/, $font_family_s)) {
                next unless $font_family;
                $ht{SCRIPT_FONT}->{$script_name}->{$font_family} = 1;
            }
            $n++;
        }
        close(IN);
        # print STDERR "Loaded $n entries from $filename\n";
    }
}
134
+
135
# Romanize the precomposed Hangul syllables (U+AC00 .. U+D7A3) in $s.
# A syllable's offset from U+AC00 is split into a lead index
# (offset / (28*21)), a vowel index ((offset / 28) % 21) and a tail index
# (offset % 28); the three table entries are concatenated and the "-"
# placeholders removed.
# Characters outside that range are copied to the result only when
# $pass_through_p is true (default 0); otherwise they are dropped.
sub unicode_hangul_romanization {
    local($this, $s, $pass_through_p) = @_;

    $pass_through_p = 0 unless defined($pass_through_p);
    @leads = split(/\s+/, "g gg n d dd r m b bb s ss - j jj c k t p h");
    # @vowels = split(/\s+/, "a ae ya yai e ei ye yei o oa oai oi yo u ue uei ui yu w wi i");
    @vowels = split(/\s+/, "a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i");
    @tails = split(/\s+/, "- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h");
    $result = "";
    @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    foreach $char (@chars) {
        $unicode = $utf8->utf8_to_unicode($char);
        unless (($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
            # not a precomposed syllable
            $result .= $char if $pass_through_p;
            next;
        }
        $code = $unicode - 0xAC00;
        $lead_index = int($code / (28*21));
        $vowel_index = int($code/28) % 21;
        $tail_index = $code % 28;
        $rom = join("", $leads[$lead_index], $vowels[$vowel_index], $tails[$tail_index]);
        $rom =~ s/-//g;
        $result .= $rom;
    }
    return $result;
}
161
+
162
# Split a comma-separated string into a list of elements.
# An element is a double-quoted string, a single-quoted string (backslash
# escapes of the quote character are allowed inside), or a run of characters
# containing no quote, comma or space. Every element is passed through
# $util->dequote_string. A string without any non-space character yields the
# empty list.
sub listify_comma_sep_string {
    local($this, $s) = @_;

    @result_list = ();
    return @result_list unless $s =~ /\S/;
    $s = $util->trim2($s);
    my $elem;

    # Peel off one "element," prefix per iteration.
    while (($elem, $rest) = ($s =~ /^("(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^"', ]+),\s*(.*)$/)) {
        push(@result_list, $util->dequote_string($elem));
        $s = $rest;
    }
    # Whatever remains is the last element.
    if ($s =~ /\S/) {
        push(@result_list, $util->dequote_string($s));
    }

    return @result_list;
}
178
+
179
# True if $s is non-empty and consists solely of three-byte sequences
# \xE2 [\xA0-\xA3] [\x80-\xBF], i.e. UTF-8 encodings of U+2800 .. U+28FF.
# The match result is returned as is (captured bytes in list context).
sub braille_string_p {
    local($this, $s) = @_;

    my $braille_only = qr/^(\xE2[\xA0-\xA3][\x80-\xBF])+$/;
    return ($s =~ $braille_only);
}
184
+
185
# Record word-boundary restrictions for one source -> target mapping in the
# hash passed as *ht.
# For every flag that is true, the entry {source}->{target} = 1 is written to
# the table named after the flag; with a language code the table name gets the
# suffix _LANG_SPEC and the entry is nested under {lang_code}.
sub register_word_boundary_info {
    local($this, *ht, $lang_code, $utf8_source_string, $utf8_target_string, $use_only_for_whole_word_p,
          $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
          $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p) = @_;

    # flag value paired with the table it controls
    my @flag_tables = (
        [$use_only_for_whole_word_p,   "USE_ONLY_FOR_WHOLE_WORD"],
        [$use_only_at_start_of_word_p, "USE_ONLY_AT_START_OF_WORD"],
        [$use_only_at_end_of_word_p,   "USE_ONLY_AT_END_OF_WORD"],
        [$dont_use_at_start_of_word_p, "DONT_USE_AT_START_OF_WORD"],
        [$dont_use_at_end_of_word_p,   "DONT_USE_AT_END_OF_WORD"],
    );

    foreach my $flag_table (@flag_tables) {
        my ($flag_p, $table_name) = @$flag_table;
        next unless $flag_p;
        if ($lang_code) {
            $ht{$table_name . "_LANG_SPEC"}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
        } else {
            $ht{$table_name}->{$utf8_source_string}->{$utf8_target_string} = 1;
        }
    }
}
226
+
227
# Load a romanization table into the hash passed as *ht.
#
# Two kinds of non-comment lines are handled:
#   "::preserve" lines   every code point from the "from" slot to the "to"
#                        slot (default: just "from") is mapped to itself in
#                        UTF_CHAR_MAPPING; code points are written U+XXXX or
#                        \uXXXX
#   all other lines      a mapping from the "s" slot to the "t" slot, with
#                        optional alternatives ("t-alt"), a numeric value
#                        ("num"), an annotation, a language code ("lcode"),
#                        a weight ("p", default 1) and "::flag" markers
#                        restricting where the mapping may be used
# Language-specific entries go to the *_LANG_SPEC variants of the tables,
# nested under the language code.
# A file that cannot be opened is reported on STDERR.
sub load_romanization_table {
    local($this, *ht, $filename) = @_;
    # ../../data/romanization-table.txt

    $n = 0;
    $line_number = 0;
    if (! open(IN, $filename)) {
        print STDERR "Can't open $filename\n";
    } else {
        while (<IN>) {
            $line_number++;
            next if /^#/;

            # ---- ::preserve lines: identity mapping for a code-point range ----
            if (/^::preserve\s/) {
                $from_unicode = $util->slot_value_in_double_colon_del_list($_, "from");
                $to_unicode = $util->slot_value_in_double_colon_del_list($_, "to");
                if ($from_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
                    $from_unicode =~ s/^(?:U\+|\\u)//;
                    $from_code_point = hex($from_unicode);
                } else {
                    $from_code_point = "";
                }
                if ($to_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
                    $to_unicode =~ s/^(?:U\+|\\u)//;
                    $to_code_point = hex($to_unicode);
                } else {
                    # no usable upper bound: the range is the single "from" code point
                    $to_code_point = $from_code_point;
                }
                if ($from_code_point ne "") {
                    # print STDERR "Preserve code-points $from_unicode--$to_unicode = $from_code_point--$to_code_point\n";
                    foreach $code_point (($from_code_point .. $to_code_point)) {
                        $utf8_string = $utf8->unicode2string($code_point);
                        $ht{UTF_CHAR_MAPPING}->{$utf8_string}->{$utf8_string} = 1;
                    }
                    $n++;
                }
                next;
            }

            # ---- regular mapping lines: read slots and flags ----
            $utf8_source_string = $util->slot_value_in_double_colon_del_list($_, "s");
            $utf8_target_string = $util->slot_value_in_double_colon_del_list($_, "t");
            $utf8_alt_target_string_s = $util->slot_value_in_double_colon_del_list($_, "t-alt");
            $use_alt_in_pointed_p = /::use-alt-in-pointed\b/;
            $use_only_for_whole_word_p = /::use-only-for-whole-word\b/;
            $use_only_at_start_of_word_p = /::use-only-at-start-of-word\b/;
            $use_only_at_end_of_word_p = /::use-only-at-end-of-word\b/;
            $dont_use_at_start_of_word_p = /::dont-use-at-start-of-word\b/;
            $dont_use_at_end_of_word_p = /::dont-use-at-end-of-word\b/;
            $use_only_in_lower_case_enviroment_p = /::use-only-in-lower-case-enviroment\b/;
            $word_external_punctuation_p = /::word-external-punctuation\b/;
            $utf8_source_string =~ s/\s*$//;
            $utf8_target_string =~ s/\s*$//;
            $utf8_alt_target_string_s =~ s/\s*$//;
            # strip one pair of surrounding quotes from the target
            $utf8_target_string =~ s/^"(.*)"$/$1/;
            $utf8_target_string =~ s/^'(.*)'$/$1/;
            @utf8_alt_targets = $this->listify_comma_sep_string($utf8_alt_target_string_s);
            $numeric = $util->slot_value_in_double_colon_del_list($_, "num");
            $numeric =~ s/\s*$//;
            $annotation = $util->slot_value_in_double_colon_del_list($_, "annotation");
            $annotation =~ s/\s*$//;
            $lang_code = $util->slot_value_in_double_colon_del_list($_, "lcode");
            $prob = $util->slot_value_in_double_colon_del_list($_, "p") || 1;

            # ---- primary mapping (skipped for purely numeric entries) ----
            unless (($utf8_target_string eq "") && ($numeric =~ /\d/)) {
                if ($lang_code) {
                    $ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                } else {
                    $ht{UTF_CHAR_MAPPING}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                }
                if ($word_external_punctuation_p) {
                    if ($lang_code) {
                        $ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                    } else {
                        $ht{WORD_EXTERNAL_PUNCTUATION}->{$utf8_source_string}->{$utf8_target_string} = $prob;
                    }
                }
                if ($this->braille_string_p($utf8_source_string)) {
                    # For a lower-case Braille mapping, also register the
                    # capitalized form: capital indicator + source -> ucfirst target.
                    if (($utf8_target_string =~ /^[a-z]+$/)
                     && (! ($utf8_source_string =~ /^$braille_capital_letter_indicator/))) {
                        my $uc_utf8_source_string = "$braille_capital_letter_indicator$utf8_source_string";
                        my $uc_utf8_target_string = ucfirst $utf8_target_string;
                        if ($lang_code) {
                            $ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
                        } else {
                            $ht{UTF_CHAR_MAPPING}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
                        }
                        $this->register_word_boundary_info(*ht, $lang_code, $uc_utf8_source_string, $uc_utf8_target_string,
                            $use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
                            $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);
                    }
                    # For "number indicator + cell -> digit", remember the bare cell's digit.
                    if (($utf8_target_string =~ /^[0-9]$/)
                     && ($utf8_source_string =~ /^$braille_number_indicator./)) {
                        my $core_number_char = $utf8_source_string;
                        $core_number_char =~ s/$braille_number_indicator//;
                        $ht{BRAILLE_TO_DIGIT}->{$core_number_char} = $utf8_target_string;
                    }
                }
            }

            # ---- usage restrictions for the primary mapping ----
            if ($use_only_in_lower_case_enviroment_p) {
                if ($lang_code) {
                    $ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
                } else {
                    $ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT}->{$utf8_source_string}->{$utf8_target_string} = 1;
                }
            }
            $this->register_word_boundary_info(*ht, $lang_code, $utf8_source_string, $utf8_target_string,
                $use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
                $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);

            # ---- alternative targets and their restrictions ----
            foreach $utf8_alt_target (@utf8_alt_targets) {
                if ($lang_code) {
                    $ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
                    $ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
                } else {
                    $ht{UTF_CHAR_ALT_MAPPING}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
                    $ht{USE_ALT_IN_POINTED}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
                }
                if ($use_only_for_whole_word_p) {
                    if ($lang_code) {
                        $ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    } else {
                        $ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    }
                }
                if ($use_only_at_start_of_word_p) {
                    if ($lang_code) {
                        $ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    } else {
                        $ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    }
                }
                if ($use_only_at_end_of_word_p) {
                    if ($lang_code) {
                        $ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    } else {
                        $ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
                    }
                }
            }

            # ---- numeric value and annotation ----
            if ($numeric =~ /\d/) {
                $ht{UTF_TO_NUMERIC}->{$utf8_source_string} = $numeric;
            }
            if ($annotation =~ /\S/) {
                $ht{UTF_ANNOTATION}->{$utf8_source_string} = $annotation;
            }
            $n++;
        }
        close(IN);
        # print STDERR "Loaded $n entries from $filename\n";
    }
}
374
+
375
# Derive the script name from a Unicode character name, using the
# SCRIPT_NORM table of the hash passed as *ht.
#
# Everything from the first CONSONANT/LETTER/LIGATURE/SIGN/SYLLABLE/
# SYLLABICS/VOWEL keyword on is cut off; the remainder is looked up
# (upper-cased) in SCRIPT_NORM, dropping its last word after every failed
# lookup. Returns the script name, or "" if no prefix is a known script.
#
# Results are cached in $ht{CHAR_NAME_TO_SCRIPT} under the ORIGINAL character
# name. (The result used to be stored under the stripped name, so a repeated
# call with the same name never found it; "" results are now served from the
# cache as well.)
sub char_name_to_script {
    local($this, $char_name, *ht) = @_;

    my $cached_result = $ht{CHAR_NAME_TO_SCRIPT}->{$char_name};
    return $cached_result if defined($cached_result);

    my $orig_char_name = $char_name;
    $char_name =~ s/\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL)\b.*$//;
    my $script_name;
    while ($char_name) {
        last if $script_name = $ht{SCRIPT_NORM}->{(uc $char_name)};
        # no script of that name: retry without the last word
        $char_name =~ s/\s*\S+\s*$//;
    }
    $script_name = "" unless defined($script_name);
    $ht{CHAR_NAME_TO_SCRIPT}->{$orig_char_name} = $script_name;
    return $script_name;
}
390
+
391
# Return 1 if the character name contains one of the keywords below (LETTER,
# VOWEL SIGN, SIGN VIRAMA, ...) as whole words, else 0.
#
# Results are cached in $ht{CHAR_NAME_LETTER_PLUS}. %ht is not a parameter of
# this sub: it is whichever %ht is visible in the package / dynamic scope at
# call time.
# The cache test uses defined() so that 0 results are also served from the
# cache; with the former truthiness test they were recomputed on every call.
sub letter_plus_char_p {
    local($this, $char_name) = @_;

    my $cached_result = $ht{CHAR_NAME_LETTER_PLUS}->{$char_name};
    return $cached_result if defined($cached_result);

    my $letter_plus_p = ($char_name =~ /\b(?:LETTER|VOWEL SIGN|AU LENGTH MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN AL-LAKUNA|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN NUKTA|SIGN DOT BELOW|HEBREW POINT)\b/) ? 1 : 0;
    $ht{CHAR_NAME_LETTER_PLUS}->{$char_name} = $letter_plus_p;
    return $letter_plus_p;
}
399
+
400
# Return 1 if the character name contains one of the keywords below
# (SUBJOINED LETTER, VOWEL SIGN, HEBREW POINT, ARABIC FATHA, ...) as whole
# words, else 0.
#
# Results are cached in $ht{CHAR_NAME_SUBJOINED}. %ht is not a parameter of
# this sub: it is whichever %ht is visible in the package / dynamic scope at
# call time.
# The cache test uses defined() so that 0 results are also served from the
# cache; with the former truthiness test they were recomputed on every call.
sub subjoined_char_p {
    local($this, $char_name) = @_;

    my $cached_result = $ht{CHAR_NAME_SUBJOINED}->{$char_name};
    return $cached_result if defined($cached_result);

    my $subjoined_p = (($char_name =~ /\b(?:SUBJOINED LETTER|VOWEL SIGN|AU LENGTH MARK|EMPHASIS MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN DOT BELOW|HEBREW (POINT|PUNCTUATION GERESH)|ARABIC (?:DAMMA|DAMMATAN|FATHA|FATHATAN|HAMZA|KASRA|KASRATAN|MADDAH|SHADDA|SUKUN))\b/)) ? 1 : 0;
    $ht{CHAR_NAME_SUBJOINED}->{$char_name} = $subjoined_p;
    return $subjoined_p;
}
408
+
409
# Allocate the next node id: increments the N_NODES counter of the chart hash
# passed as *chart_ht and returns the new value (1 for the first node).
sub new_node_id {
    local($this, *chart_ht) = @_;

    return ++$chart_ht{N_NODES};
}
417
+
418
# Create a chart node with romanization $s covering the span $start-$end and
# register it in the chart hash passed as *chart_ht. Returns the new node id.
# The node is indexed by start, by end and by (start, end); its type, comment
# and romanization are stored under NODE_TYPE, NODE_COMMENT and NODE_ROMAN.
sub add_node {
    local($this, $s, $start, $end, *chart_ht, $type, $comment) = @_;

    my $node_id = $this->new_node_id(*chart_ht);
    # print STDERR "add_node($node_id, $start-$end): $s [$comment]\n" if $comment =~ /number/;
    # print STDERR "add_node($node_id, $start-$end): $s [$comment]\n" if ($start >= 0) && ($start < 50);

    # node properties
    $chart_ht{NODE_ROMAN}->{$node_id} = $s;
    $chart_ht{NODE_TYPE}->{$node_id} = $type;
    $chart_ht{NODE_COMMENT}->{$node_id} = $comment;
    $chart_ht{NODE_START}->{$node_id} = $start;
    $chart_ht{NODE_END}->{$node_id} = $end;

    # position indexes
    $chart_ht{NODES_STARTING_AT}->{$start}->{$node_id} = 1;
    $chart_ht{NODES_ENDING_AT}->{$end}->{$node_id} = 1;
    $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}->{$node_id} = 1;

    return $node_id;
}
434
+
435
# Return the numerically smallest node id registered for exactly the span
# $start-$end in the chart hash passed as *chart_ht, or "" if there is none.
sub get_node_for_span {
    local($this, $start, $end, *chart_ht) = @_;

    my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    return "" unless defined($span_nodes);

    my ($lowest_id) = sort { $a <=> $b } keys %{$span_nodes};
    return defined($lowest_id) ? $lowest_id : "";
}
443
+
444
sub get_node_for_span_and_type {
    # Returns the numerically smallest node id for span $start..$end whose
    # NODE_TYPE equals $type (string comparison), or "" if there is none.
    # The chart is passed as typeglob *chart_ht.
    local($this, $start, $end, *chart_ht, $type) = @_;

    my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    if (defined($span_nodes)) {
        foreach my $candidate_id (sort { $a <=> $b } keys %{$span_nodes}) {
            return $candidate_id if $chart_ht{NODE_TYPE}->{$candidate_id} eq $type;
        }
    }
    return "";
}
455
+
456
sub get_node_roman {
    # Returns the romanization stored for $node_id, or $default ("" when no
    # default is given) if the node has none.
    # NOTE(review): the chart typeglob argument is bound to *chart_id, yet the
    # lookup below reads %chart_ht, i.e. whichever %chart_ht is in effect at
    # call time rather than the argument. Kept as is; confirm before changing.
    local($this, $node_id, *chart_id, $default) = @_;

    my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
    return $roman if defined($roman);
    return defined($default) ? $default : "";
}
463
+
464
sub set_node_id_slot_value {
    # Stores $value under slot name $slot for node $node_id (table NODE_SLOT)
    # and returns the stored value.
    # NOTE(review): the chart typeglob argument is bound to *chart_id, yet the
    # write below goes to %chart_ht, i.e. whichever %chart_ht is in effect at
    # call time rather than the argument. Kept as is; confirm before changing.
    local($this, $node_id, $slot, $value, *chart_id) = @_;

    return $chart_ht{NODE_SLOT}{$node_id}{$slot} = $value;
}
469
+
470
sub copy_slot_values {
    # Copies slot values from node $old_node_id to node $new_node_id.
    # @slots names the slots to copy; if its first element is "all", every
    # slot of the old node is copied. Undefined values are not copied, and
    # nothing happens when @slots is empty.
    # NOTE(review): the chart typeglob argument is bound to *chart_id, yet the
    # body reads and writes %chart_ht, i.e. whichever %chart_ht is in effect at
    # call time rather than the argument. Kept as is; confirm before changing.
    local($this, $old_node_id, $new_node_id, *chart_id, @slots) = @_;

    if (@slots) {
        my $copy_all_p = ($slots[0] eq "all");
        foreach my $slot_name (keys %{$chart_ht{NODE_SLOT}->{$old_node_id}}) {
            # $util->member is consulted only when not copying everything
            next unless $copy_all_p || $util->member($slot_name, @slots);
            my $slot_value = $chart_ht{NODE_SLOT}->{$old_node_id}->{$slot_name};
            next unless defined($slot_value);
            $chart_ht{NODE_SLOT}->{$new_node_id}->{$slot_name} = $slot_value;
        }
    }
}
482
+
483
sub get_node_id_slot_value {
    # Returns the value stored under slot $slot for node $node_id, or $default
    # ("" when no default is given) if that slot has no defined value.
    # NOTE(review): the chart typeglob argument is bound to *chart_id, yet the
    # lookup below reads %chart_ht, i.e. whichever %chart_ht is in effect at
    # call time rather than the argument. Kept as is; confirm before changing.
    local($this, $node_id, $slot, *chart_id, $default) = @_;

    my $slot_value = $chart_ht{NODE_SLOT}->{$node_id}->{$slot};
    return $slot_value if defined($slot_value);
    return defined($default) ? $default : "";
}
490
+
491
sub get_node_for_span_with_slot_value {
    # Scans the nodes covering exactly $start..$end in ascending id order and
    # returns the first defined value found for slot $slot. Returns $default
    # ("" when no default is given) if the span has no nodes or none of them
    # has the slot.
    # NOTE(review): the chart typeglob argument is bound to *chart_id, yet the
    # lookups below read %chart_ht, i.e. whichever %chart_ht is in effect at
    # call time rather than the argument. Kept as is; confirm before changing.
    local($this, $start, $end, $slot, *chart_id, $default) = @_;

    my $fallback = defined($default) ? $default : "";
    my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    return $fallback unless defined($span_nodes);

    foreach my $candidate_id (sort { $a <=> $b } keys %{$span_nodes}) {
        my $slot_value = $chart_ht{NODE_SLOT}->{$candidate_id}->{$slot};
        return $slot_value if defined($slot_value);
    }
    return $fallback;
}
503
+
504
sub get_node_for_span_with_slot {
    # Scans the nodes covering exactly $start..$end in ascending id order and
    # returns the id of the first node that has a defined value for slot $slot.
    # Returns $default ("" when no default is given) if there is no such node.
    # NOTE(review): the chart typeglob argument is bound to *chart_id, yet the
    # lookups below read %chart_ht, i.e. whichever %chart_ht is in effect at
    # call time rather than the argument. Kept as is; confirm before changing.
    local($this, $start, $end, $slot, *chart_id, $default) = @_;

    my $fallback = defined($default) ? $default : "";
    my $span_nodes = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
    return $fallback unless defined($span_nodes);

    foreach my $candidate_id (sort { $a <=> $b } keys %{$span_nodes}) {
        return $candidate_id if defined($chart_ht{NODE_SLOT}->{$candidate_id}->{$slot});
    }
    return $fallback;
}
516
+
517
sub register_new_complex_number_span_segment {
    # Records that positions $start..$end form a complex-number span made of
    # two parts meeting at $mid, e.g. 4 10 (= 40); 20 5 (= 25).
    # If a previously registered span already ends at $mid, that span is
    # extended to $end instead, so segments can chain into a larger complex
    # number, e.g. 4 1000 3 100 20 1.
    # The tables COMPLEX_NUMERIC_START_END and COMPLEX_NUMERIC_END_START hold
    # the two directions of the mapping.
    # NOTE(review): the chart typeglob argument is bound to *chart_id, yet the
    # body reads and writes %chart_ht, i.e. whichever %chart_ht is in effect at
    # call time rather than the argument. Kept as is; confirm before changing.
    local($this, $start, $mid, $end, *chart_id, $line_number) = @_;

    $old_start = $chart_ht{COMPLEX_NUMERIC_END_START}->{$mid};
    my $span_start = $start;
    if (defined($old_start)) {
        # An earlier span ends at $mid: blank its end entry (the key stays,
        # with an undefined value) and grow that span.
        $chart_ht{COMPLEX_NUMERIC_END_START}->{$mid} = undef;
        $span_start = $old_start;
    }
    $chart_ht{COMPLEX_NUMERIC_START_END}->{$span_start} = $end;
    $chart_ht{COMPLEX_NUMERIC_END_START}->{$end} = $span_start;
}
532
+
533
sub romanize_by_token_with_caching {
    # Romanizes $s one whitespace-delimited token at a time, keeping the
    # original whitespace between tokens.
    #  - Tokens consisting only of ASCII characters are copied unchanged.
    #  - Other tokens are looked up in $ht{CACHED_ROMANIZATION}->{$lang_code};
    #    on a miss they are passed to $this->romanize and a defined result is
    #    stored in that cache.
    #  - If $control asks for "return chart" or "return offset mappings", the
    #    whole string is handed to $this->romanize directly, bypassing both
    #    the tokenization and the cache.
    # One trailing newline of $s is dropped before tokenizing.
    local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number) = @_;

    $control = "" unless defined($control);
    if (($control =~ /return chart/i) || ($control =~ /return offset mappings/i)) {
        return $this->romanize($s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
    }

    # Phase 1: split into alternating separators and tokens.
    # $separators[$n] precedes $tokens[$n]; one extra final element holds
    # whatever is left over after the last token.
    my @separators = ();
    my @tokens = ();
    $s =~ s/\n$//; # Added May 2, 2019 as bug-fix (duplicate empty lines)
    while (($sep, $token, $rest) = ($s =~ /^(\s*)(\S+)(.*)$/)) {
        push(@separators, $sep);
        push(@tokens, $token);
        $s = $rest;
    }
    push(@separators, $s);

    # Phase 2: rebuild the line from separators and (romanized) tokens.
    my @pieces = ();
    foreach my $index (0 .. $#tokens) {
        my $word = $tokens[$index];
        push(@pieces, $separators[$index]);
        if ($word =~ /^[\x00-\x7F]*$/) { # all ASCII
            push(@pieces, $word);
            next;
        }
        my $rom_word = $ht{CACHED_ROMANIZATION}->{$lang_code}->{$word};
        if (! defined($rom_word)) {
            $rom_word = $this->romanize($word, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
            $ht{CACHED_ROMANIZATION}->{$lang_code}->{$word} = $rom_word if defined($rom_word);
        }
        push(@pieces, $rom_word);
    }
    my $trailing = $separators[$#tokens + 1];
    push(@pieces, $trailing) if defined($trailing);

    return join("", @pieces);
}
571
+
572
+ sub romanize {
573
+ local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number, $initial_rom_char_offset) = @_;
574
+
575
+ my $orig_lang_code = $lang_code;
576
+ # Check whether the text (to be romanized) starts with a language code directive.
577
+ if (($line_lang_code) = ($s =~ /^::lcode\s+([a-z][a-z][a-z])\s/)) {
578
+ $lang_code = $line_lang_code;
579
+ }
580
+ $initial_char_offset = 0 unless defined($initial_char_offset);
581
+ $initial_rom_char_offset = 0 unless defined($initial_rom_char_offset);
582
+ $control = "" unless defined($control);
583
+ my $return_chart_p = ($control =~ /return chart/i);
584
+ my $return_offset_mappings_p = ($control =~ /return offset mappings/i);
585
+ $line_number = "" unless defined($line_number);
586
+ my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
587
+ my $n_characters = $#chars + 1;
588
+ %chart_ht = ();
589
+ $chart_ht{N_CHARS} = $n_characters;
590
+ $chart_ht{N_NODES} = 0;
591
+ my $char = "";
592
+ my $char_name = "";
593
+ my $prev_script = "";
594
+ my $current_script = "";
595
+ my $script_start = 0;
596
+ my $script_end = 0;
597
+ my $prev_letter_plus_script = "";
598
+ my $current_letter_plus_script = "";
599
+ my $letter_plus_script_start = 0;
600
+ my $letter_plus_script_end = 0;
601
+ my $log ="";
602
+ my $n_right_to_left_chars = 0;
603
+ my $n_left_to_right_chars = 0;
604
+ my $hebrew_word_start = ""; # used to identify Hebrew words with points
605
+ my $hebrew_word_contains_point = 0;
606
+ my $current_word_start = "";
607
+ my $current_word_script = "";
608
+ my $braille_all_caps_p = 0;
609
+
610
+ # prep
611
+ foreach $i ((0 .. ($#chars + 1))) {
612
+ if ($i <= $#chars) {
613
+ $char = $chars[$i];
614
+ $chart_ht{ORIG_CHAR}->{$i} = $char;
615
+ $char_name = $ht{UTF_TO_CHAR_NAME}->{$char} || "";
616
+ $chart_ht{CHAR_NAME}->{$i} = $char_name;
617
+ $current_script = $this->char_name_to_script($char_name, *ht);
618
+ $current_script_direction = $ht{DIRECTION}->{$current_script} || '';
619
+ if ($current_script_direction eq 'right-to-left') {
620
+ $n_right_to_left_chars++;
621
+ } elsif (($char =~ /^[a-z]$/i) || ! ($char =~ /^[\x00-\x7F]$/)) {
622
+ $n_left_to_right_chars++;
623
+ }
624
+ $chart_ht{CHAR_SCRIPT}->{$i} = $current_script;
625
+ $chart_ht{SCRIPT_SEGMENT_START}->{$i} = ""; # default value, to be updated later
626
+ $chart_ht{SCRIPT_SEGMENT_END}->{$i} = ""; # default value, to be updated later
627
+ $chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = ""; # default value, to be updated later
628
+ $chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = ""; # default value, to be updated later
629
+ $subjoined_char_p = $this->subjoined_char_p($char_name);
630
+ $chart_ht{CHAR_SUBJOINED}->{$i} = $subjoined_char_p;
631
+ $letter_plus_char_p = $this->letter_plus_char_p($char_name);
632
+ $chart_ht{CHAR_LETTER_PLUS}->{$i} = $letter_plus_char_p;
633
+ $current_letter_plus_script = ($letter_plus_char_p) ? $current_script : "";
634
+ $numeric_value = $ht{UTF_TO_NUMERIC}->{$char};
635
+ $numeric_value = "" unless defined($numeric_value);
636
+ $annotation = $ht{UTF_ANNOTATION}->{$char};
637
+ $annotation = "" unless defined($annotation);
638
+ $chart_ht{CHAR_NUMERIC_VALUE}->{$i} = $numeric_value;
639
+ $chart_ht{CHAR_ANNOTATION}->{$i} = $annotation;
640
+ $syllable_info = $ht{UTF_TO_SYLLABLE_INFO}->{$char} || "";
641
+ $chart_ht{CHAR_SYLLABLE_INFO}->{$i} = $syllable_info;
642
+ $tone_mark = $ht{UTF_TO_TONE_MARK}->{$char} || "";
643
+ $chart_ht{CHAR_TONE_MARK}->{$i} = $tone_mark;
644
+ } else {
645
+ $char = "";
646
+ $char_name = "";
647
+ $current_script = "";
648
+ $current_letter_plus_script = "";
649
+ }
650
+ if ($char_name =~ /^HEBREW (LETTER|POINT|PUNCTUATION GERESH) /) {
651
+ $hebrew_word_start = $i if $hebrew_word_start eq "";
652
+ $hebrew_word_contains_point = 1 if $char_name =~ /^HEBREW POINT /;
653
+ } elsif ($hebrew_word_start ne "") {
654
+ if ($hebrew_word_contains_point) {
655
+ foreach $j (($hebrew_word_start .. ($i-1))) {
656
+ $chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$j} = 1;
657
+ }
658
+ $chart_ht{CHAR_START_OF_WORD}->{$hebrew_word_start} = 1;
659
+ $chart_ht{CHAR_END_OF_WORD}->{($i-1)} = 1;
660
+ }
661
+ $hebrew_word_start = "";
662
+ $hebrew_word_contains_point = 0;
663
+ }
664
+ my $part_of_word_p = $current_script
665
+ && ($this->letter_plus_char_p($char_name)
666
+ || $this->subjoined_char_p($char_name)
667
+ || ($char_name =~ /\b(LETTER|SYLLABLE|SYLLABICS|LIGATURE)\b/));
668
+
669
+ # Braille punctuation
670
+ my $end_offset = 0;
671
+ if ($char_name =~ /^Braille\b/i) {
672
+ if (($char =~ /^\s*$/) || ($char_name =~ /BLANK/)) {
673
+ $part_of_word_p = 0;
674
+ $braille_all_caps_p = 0;
675
+ } elsif ($chart_ht{NOT_PART_OF_WORD_P}->{$i}) {
676
+ $part_of_word_p = 0;
677
+ $braille_all_caps_p = 0;
678
+ } elsif ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$char}})
679
+ || (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$char}})) {
680
+ $part_of_word_p = 0;
681
+ $braille_all_caps_p = 0;
682
+ } elsif (($i+1 <= $#chars)
683
+ && ($s1 = $char . $chars[$i+1])
684
+ && ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s1}})
685
+ || (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s1}}))) {
686
+ $part_of_word_p = 0;
687
+ $braille_all_caps_p = 0;
688
+ $chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
689
+ } elsif (($i+2 <= $#chars)
690
+ && ($s2 = $char . $chars[$i+1] . $chars[$i+2])
691
+ && ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s2}})
692
+ || (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s2}}))) {
693
+ $part_of_word_p = 0;
694
+ $braille_all_caps_p = 0;
695
+ $chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
696
+ $chart_ht{NOT_PART_OF_WORD_P}->{($i+2)} = 1;
697
+ } elsif (($i+1 <= $#chars)
698
+ && ($char eq $braille_capital_letter_indicator)
699
+ && ($chars[$i+1] eq $braille_capital_letter_indicator)) {
700
+ $braille_all_caps_p = 1;
701
+ } else {
702
+ $part_of_word_p = 1;
703
+ }
704
+ # last period in Braille text is also not part_of_word_p
705
+ if (($char eq $braille_period)
706
+ && (($i == $#chars)
707
+ || (($i < $#chars)
708
+ && (! $this->braille_string_p($chars[$i+1]))))) {
709
+ $part_of_word_p = 0;
710
+ }
711
+ # period before other word-external punctuation is also not part_of_word_p
712
+ if (($i > 0)
713
+ && ($chars[$i-1] eq $braille_period)
714
+ && (! $part_of_word_p)
715
+ && ($current_word_start ne "")) {
716
+ $end_offset = -1;
717
+ }
718
+ } else {
719
+ $braille_all_caps_p = 0;
720
+ }
721
+ $chart_ht{BRAILLE_ALL_CAPS_P}->{$i} = $braille_all_caps_p;
722
+
723
+ if (($current_word_start ne "")
724
+ && ((! $part_of_word_p)
725
+ || ($current_script ne $current_word_script))) {
726
+ # END OF WORD
727
+ $chart_ht{CHAR_START_OF_WORD}->{$current_word_start} = 1;
728
+ $chart_ht{CHAR_END_OF_WORD}->{($i-1+$end_offset)} = 1;
729
+ my $word = join("", @chars[$current_word_start .. ($i-1+$end_offset)]);
730
+ $chart_ht{WORD_START_END}->{$current_word_start}->{$i} = $word;
731
+ $chart_ht{WORD_END_START}->{$i+$end_offset}->{$current_word_start} = $word;
732
+ # print STDERR "Word ($current_word_start-$i+$end_offset): $word ($current_word_script)\n";
733
+ $current_word_start = "";
734
+ $current_word_script = "";
735
+ }
736
+ if ($part_of_word_p && ($current_word_start eq "")) {
737
+ # START OF WORD
738
+ $current_word_start = $i;
739
+ $current_word_script = $current_script;
740
+ }
741
+ # print STDERR "$i char: $char ($current_script)\n";
742
+ unless ($current_script eq $prev_script) {
743
+ if ($prev_script && ($i-1 >= $script_start)) {
744
+ my $script_end = $i;
745
+ $chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start} = $script_end;
746
+ $chart_ht{SCRIPT_SEGMENT_END_TO_START}->{$script_end} = $script_start;
747
+ foreach $i (($script_start .. $script_end)) {
748
+ $chart_ht{SCRIPT_SEGMENT_START}->{$i} = $script_start;
749
+ $chart_ht{SCRIPT_SEGMENT_END}->{$i} = $script_end;
750
+ }
751
+ # print STDERR "Script segment $script_start-$script_end: $prev_script\n";
752
+ }
753
+ $script_start = $i;
754
+ }
755
+ unless ($current_letter_plus_script eq $prev_letter_plus_script) {
756
+ if ($prev_letter_plus_script && ($i-1 >= $letter_plus_script_start)) {
757
+ my $letter_plus_script_end = $i;
758
+ $chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$letter_plus_script_start} = $letter_plus_script_end;
759
+ $chart_ht{LETTER_TOKEN_SEGMENT_END_TO_START}->{$letter_plus_script_end} = $letter_plus_script_start;
760
+ foreach $i (($letter_plus_script_start .. $letter_plus_script_end)) {
761
+ $chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = $letter_plus_script_start;
762
+ $chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = $letter_plus_script_end;
763
+ }
764
+ # print STDERR "Script token segment $letter_plus_script_start-$letter_plus_script_end: $prev_letter_plus_script\n";
765
+ }
766
+ $letter_plus_script_start = $i;
767
+ }
768
+ $prev_script = $current_script;
769
+ $prev_letter_plus_script = $current_letter_plus_script;
770
+ }
771
+ $ht{STRING_IS_DOMINANTLY_RIGHT_TO_LEFT}->{$s} = 1 if $n_right_to_left_chars > $n_left_to_right_chars;
772
+
773
+ # main
774
+ my $i = 0;
775
+ while ($i <= $#chars) {
776
+ my $char = $chart_ht{ORIG_CHAR}->{$i};
777
+ my $current_script = $chart_ht{CHAR_SCRIPT}->{$i};
778
+ $chart_ht{CHART_CONTAINS_SCRIPT}->{$current_script} = 1;
779
+ my $script_segment_start = $chart_ht{SCRIPT_SEGMENT_START}->{$i};
780
+ my $script_segment_end = $chart_ht{SCRIPT_SEGMENT_END}->{$i};
781
+ my $char_name = $chart_ht{CHAR_NAME}->{$i};
782
+ my $subjoined_char_p = $chart_ht{CHAR_SUBJOINED}->{$i};
783
+ my $letter_plus_char_p = $chart_ht{CHAR_LETTER_PLUS}->{$i};
784
+ my $numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{$i};
785
+ my $annotation = $chart_ht{CHAR_ANNOTATION}->{$i};
786
+ # print STDERR " $char_name annotation: $annotation\n" if $annotation;
787
+ my $tone_mark = $chart_ht{CHAR_TONE_MARK}->{$i};
788
+ my $found_char_mapping_p = 0;
789
+ my $prev_char_name = ($i >= 1) ? $chart_ht{CHAR_NAME}->{($i-1)} : "";
790
+ my $prev2_script = ($i >= 2) ? $chart_ht{CHAR_SCRIPT}->{($i-2)} : "";
791
+ my $prev_script = ($i >= 1) ? $chart_ht{CHAR_SCRIPT}->{($i-1)} : "";
792
+ my $next_script = ($i < $#chars) ? $chart_ht{CHAR_SCRIPT}->{($i+1)} : "";
793
+ my $next_char = ($i < $#chars) ? $chart_ht{ORIG_CHAR}->{($i+1)} : "";
794
+ my $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char} || "";
795
+ my $prev2_letter_plus_char_p = ($i >= 2) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-2)} : 0;
796
+ my $prev_letter_plus_char_p = ($i >= 1) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-1)} : 0;
797
+ my $next_letter_plus_char_p = ($i < $#chars) ? $chart_ht{CHAR_LETTER_PLUS}->{($i+1)} : 0;
798
+ my $next_index = $i + 1;
799
+
800
+ # Braille numeric mode
801
+ if ($char eq $braille_number_indicator) {
802
+ my $offset = 0;
803
+ my $numeric_value = "";
804
+ my $digit;
805
+ while ($i+$offset < $#chars) {
806
+ $offset++;
807
+ my $offset_char = $chart_ht{ORIG_CHAR}->{$i+$offset};
808
+ if (defined($digit = $ht{BRAILLE_TO_DIGIT}->{$offset_char})) {
809
+ $numeric_value .= $digit;
810
+ } elsif (($offset_char eq $braille_decimal_point)
811
+ || ($ht{UTF_CHAR_MAPPING}->{$offset_char}->{"."})) {
812
+ $numeric_value .= ".";
813
+ } elsif ($offset_char eq $braille_comma) {
814
+ $numeric_value .= ",";
815
+ } elsif ($offset_char eq $braille_numeric_space) {
816
+ $numeric_value .= " ";
817
+ } elsif ($offset_char eq $braille_solidus) {
818
+ $numeric_value .= "/";
819
+ } elsif ($offset_char eq $braille_number_indicator) {
820
+ # stay in Braille numeric mode
821
+ } elsif ($offset_char eq $braille_letter_indicator) {
822
+ # consider as part of number, but without contributing to numeric_value
823
+ last;
824
+ } else {
825
+ $offset--;
826
+ last;
827
+ }
828
+ }
829
+ if ($offset) {
830
+ $next_index = $i + $offset + 1;
831
+ $node_id = $this->add_node($numeric_value, $i, $next_index, *chart_ht, "", "braille number");
832
+ $found_char_mapping_p = 1;
833
+ }
834
+ }
835
+
836
+ unless ($found_char_mapping_p) {
837
+ foreach $string_length (reverse(1 .. 6)) {
838
+ next if ($i + $string_length-1) > $#chars;
839
+ my $start_of_word_p = $chart_ht{CHAR_START_OF_WORD}->{$i} || 0;
840
+ my $end_of_word_p = $chart_ht{CHAR_END_OF_WORD}->{($i+$string_length-1)} || 0;
841
+ my $multi_char_substring = join("", @chars[$i..($i+$string_length-1)]);
842
+ my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
843
+ @mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings;
844
+ my @mappings_whole = ();
845
+ my @mappings_start_or_end = ();
846
+ my @mappings_other = ();
847
+ foreach $mapping (@mappings) {
848
+ next if $mapping =~ /\(__.*__\)/;
849
+ if ($ht{USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
850
+ || $ht{USE_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$mapping}) {
851
+ push(@mappings_whole, $mapping) if $start_of_word_p && $end_of_word_p;
852
+ } elsif ($ht{USE_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
853
+ || $ht{USE_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
854
+ push(@mappings_start_or_end, $mapping) if $start_of_word_p;
855
+ } elsif ($ht{USE_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
856
+ || $ht{USE_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
857
+ push(@mappings_start_or_end, $mapping) if $end_of_word_p;
858
+ } else {
859
+ push(@mappings_other, $mapping);
860
+ }
861
+ }
862
+ @mappings = @mappings_whole;
863
+ @mappings = @mappings_start_or_end unless @mappings;
864
+ @mappings = @mappings_other unless @mappings;
865
+ foreach $mapping (@mappings) {
866
+ next if $mapping =~ /\(__.*__\)/;
867
+ if ($ht{DONT_USE_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
868
+ || $ht{DONT_USE_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
869
+ next if $start_of_word_p;
870
+ }
871
+ if ($ht{DONT_USE_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
872
+ || $ht{DONT_USE_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
873
+ next if $end_of_word_p;
874
+ }
875
+ my $mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $mapping) : $mapping;
876
+ $node_id = $this->add_node($mapping2, $i, $i+$string_length, *chart_ht, "", "multi-char-mapping");
877
+ $next_index = $i + $string_length;
878
+ $found_char_mapping_p = 1;
879
+ if ($annotation) {
880
+ @annotation_elems = split(/,\s*/, $annotation);
881
+ foreach $annotation_elem (@annotation_elems) {
882
+ if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
883
+ $this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
884
+ } else {
885
+ $this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
886
+ }
887
+ }
888
+ }
889
+ }
890
+ my @alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
891
+ @alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING}->{$multi_char_substring}} unless @alt_mappings;
892
+ @alt_mappings = () if ($#alt_mappings == 0) && ($alt_mappings[0] eq "_NONE_");
893
+ foreach $alt_mapping (@alt_mappings) {
894
+ if ($chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$i}) {
895
+ next unless
896
+ $ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
897
+ || $ht{USE_ALT_IN_POINTED}->{$multi_char_substring}->{$alt_mapping};
898
+ }
899
+ if ($ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
900
+ || $ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$alt_mapping}) {
901
+ next unless $start_of_word_p && $end_of_word_p;
902
+ }
903
+ if ($ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
904
+ || $ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
905
+ next unless $start_of_word_p;
906
+ }
907
+ if ($ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
908
+ || $ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
909
+ next unless $end_of_word_p;
910
+ }
911
+ my $alt_mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $alt_mapping) : $alt_mapping;
912
+ $node_id = $this->add_node($alt_mapping2, $i, $i+$string_length, *chart_ht, "alt", "multi-char-mapping");
913
+ if ($annotation) {
914
+ @annotation_elems = split(/,\s*/, $annotation);
915
+ foreach $annotation_elem (@annotation_elems) {
916
+ if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
917
+ $this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
918
+ } else {
919
+ $this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
920
+ }
921
+ }
922
+ }
923
+ }
924
+ }
925
+ }
926
+ unless ($found_char_mapping_p) {
927
+ my $prev_node_id = $this->get_node_for_span($i-4, $i, *chart_ht)
928
+ || $this->get_node_for_span($i-3, $i, *chart_ht)
929
+ || $this->get_node_for_span($i-2, $i, *chart_ht)
930
+ || $this->get_node_for_span($i-1, $i, *chart_ht);
931
+ my $prev_char_roman = ($prev_node_id) ? $this->get_node_roman($prev_node_id, *chart_id) : "";
932
+ my $prev_node_start = ($prev_node_id) ? $chart_ht{NODE_START}->{$prev_node_id} : "";
933
+
934
+ # Number
935
+ if (($numeric_value =~ /\d/)
936
+ && (! ($char_name =~ /SUPERSCRIPT/))) {
937
+ my $prev_numeric_value = $this->get_node_for_span_with_slot_value($i-1, $i, "numeric-value", *chart_id);
938
+ my $sep = "";
939
+ $sep = " " if ($char_name =~ /^vulgar fraction /i) && ($prev_numeric_value =~ /\d/);
940
+ $node_id = $this->add_node("$sep$numeric_value", $i, $i+1, *chart_ht, "", "number");
941
+ $this->set_node_id_slot_value($node_id, "numeric-value", $numeric_value, *chart_ht);
942
+ if ((($prev_numeric_value =~ /\d/) && ($numeric_value =~ /\d\d/))
943
+ || (($prev_numeric_value =~ /\d\d/) && ($numeric_value =~ /\d/))) {
944
+ # pull in any other parts of single digits
945
+ my $j = 1;
946
+ # pull in any single digits adjoining on left
947
+ if ($prev_numeric_value =~ /^\d$/) {
948
+ while (1) {
949
+ if (($i-$j-1 >= 0)
950
+ && defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-1, $i-$j, "numeric-value", *chart_id))
951
+ && ($digit_value =~ /^\d$/)) {
952
+ $j++;
953
+ } elsif (($i-$j-2 >= 0)
954
+ && ($chart_ht{ORIG_CHAR}->{($i-$j-1)} =~ /^[.,]$/)
955
+ && defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-2, $i-$j-1, "numeric-value", *chart_id))
956
+ && ($digit_value =~ /^\d$/)) {
957
+ $j += 2;
958
+ } else {
959
+ last;
960
+ }
961
+ }
962
+ }
963
+ # pull in any single digits adjoining on right
964
+ my $k = 0;
965
+ if ($numeric_value =~ /^\d$/) {
966
+ while (1) {
967
+ if (defined($next_numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{($i+$k+1)})
968
+ && ($next_numeric_value =~ /^\d$/)) {
969
+ $k++;
970
+ } else {
971
+ last;
972
+ }
973
+ }
974
+ }
975
+ $this->register_new_complex_number_span_segment($i-$j, $i, $i+$k+1, *chart_ht, $line_number);
976
+ }
977
+ if ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
978
+ && ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
979
+ $de_accented_translit = $util->de_accent_string($tonal_translit);
980
+ if ($numeric_value =~ /^(10000|1000000000000|10000000000000000)$/) {
981
+ $chart_ht{NODE_TYPE}->{$node_id} = "alt"; # keep, but demote
982
+ $alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
983
+ } else {
984
+ $alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "alt", "CJK");
985
+ }
986
+ }
987
+
988
+ # ASCII
989
+ } elsif ($char =~ /^[\x00-\x7F]$/) {
990
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "ASCII"); # ASCII character, incl. control characters
991
+
992
+ # Emoji, dingbats, pictographs
993
+ } elsif ($char =~ /^(\xE2[\x98-\x9E]|\xF0\x9F[\x8C-\xA7])/) {
994
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "pictograph");
995
+
996
+ # Hangul (Korean)
997
+ } elsif (($char =~ /^[\xEA-\xED]/)
998
+ && ($romanized_char = $this->unicode_hangul_romanization($char))) {
999
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "Hangul");
1000
+
1001
+ # CJK (Chinese, Japanese, Korean)
1002
+ } elsif ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
1003
+ && ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
1004
+ $de_accented_translit = $util->de_accent_string($tonal_translit);
1005
+ $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
1006
+
1007
+ # Virama (cancel preceding vowel in Abudiga scripts)
1008
+ } elsif ($char_name =~ /\bSIGN (?:VIRAMA|AL-LAKUNA|ASAT|COENG|PAMAAEH)\b/) {
1009
+ # VIRAMA: cancel preceding default vowel (in Abudiga scripts)
1010
+ if (($prev_script eq $current_script)
1011
+ && (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
1012
+ && ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
1013
+ $this->add_node($prev_char_roman_consonant, $prev_node_start, $i+1, *chart_ht, "", "virama");
1014
+ } else {
1015
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-virama");
1016
+ }
1017
+
1018
+ # Nukta (special (typically foreign) variant)
1019
+ } elsif ($char_name =~ /\bSIGN (?:NUKTA)\b/) {
1020
+ # NUKTA (dot): indicates special (typically foreign) variant; normally covered by multi-mappings
1021
+ if ($prev_script eq $current_script) {
1022
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "nukta");
1023
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1024
+ $this->set_node_id_slot_value($node_id, "nukta", 1, *chart_ht);
1025
+ } else {
1026
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-nukta");
1027
+ }
1028
+
1029
+ # Zero-width character, incl. zero width space/non-joiner/joiner, left-to-right/right-to-left mark
1030
+ } elsif ($char =~ /^\xE2\x80[\x8B-\x8F\xAA-\xAE]$/) {
1031
+ if ($prev_node_id) {
1032
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
1033
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1034
+ } else {
1035
+ $this->add_node("", $i, $i+1, *chart_ht, "", "zero-width-char");
1036
+ }
1037
+ } elsif (($char =~ /^\xEF\xBB\xBF$/) && $prev_node_id) { # OK to leave byte-order-mark at beginning of line
1038
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
1039
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1040
+
1041
+ # Tone mark
1042
+ } elsif ($tone_mark) {
1043
+ if ($prev_script eq $current_script) {
1044
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "tone-mark");
1045
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1046
+ $this->set_node_id_slot_value($node_id, "tone-mark", $tone_mark, *chart_ht);
1047
+ } else {
1048
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-tone-mark");
1049
+ }
1050
+
1051
+ # Diacritic
1052
+ } elsif (($char_name =~ /\b(ACCENT|TONE|COMBINING DIAERESIS|COMBINING DIAERESIS BELOW|COMBINING MACRON|COMBINING VERTICAL LINE ABOVE|COMBINING DOT ABOVE RIGHT|COMBINING TILDE|COMBINING CYRILLIC|MUUSIKATOAN|TRIISAP)\b/) && ($ht{UTF_TO_CAT}->{$char} =~ /^Mn/)) {
1053
+ if ($prev_script eq $current_script) {
1054
+ my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "diacritic");
1055
+ $this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
1056
+ $diacritic = lc $char_name;
1057
+ $diacritic =~ s/^.*(?:COMBINING CYRILLIC|COMBINING|SIGN)\s+//i;
1058
+ $diacritic =~ s/^.*(ACCENT|TONE)/$1/i;
1059
+ $diacritic =~ s/^\s*//;
1060
+ $this->set_node_id_slot_value($node_id, "diacritic", $diacritic, *chart_ht);
1061
+ # print STDERR "diacritic: $diacritic\n";
1062
+ } else {
1063
+ $this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-diacritic");
1064
+ }
1065
+
1066
+ # Romanize to find out more
1067
+ } elsif ($char_name) {
1068
+ if (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))) {
1069
+ # print STDERR "ROM l.$line_number/$i: $romanized_char\n" if $line_number =~ /^[12]$/;
1070
+ print STDOUT "ROM l.$line_number/$i: $romanized_char\n" if $verbosePM;
1071
+
1072
+ # Empty string mapping
1073
+ if ($romanized_char eq "\"\"") {
1074
+ $this->add_node("", $i, $i+1, *chart_ht, "", "empty-string-mapping");
1075
+ # consider adding something for implausible romanizations of length 6+
1076
+
1077
+ # keep original character (instead of romanized_char lengthener, character-18b00 etc.)
1078
+ } elsif (($romanized_char =~ /^(character|lengthener|modifier)/)) {
1079
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "nevermind-keep-original");
1080
+
1081
+ # Syllabic suffix in Abudiga languages, e.g. -m, -ng
1082
+ } elsif (($romanized_char =~ /^\+(H|M|N|NG)$/i)
1083
+ && ($prev_script eq $current_script)
1084
+ && ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{"a"})) {
1085
+ my $core_suffix = $romanized_char;
1086
+ $core_suffix =~ s/^\+//;
1087
+ if ($prev_char_roman =~ /[aeiou]$/i) {
1088
+ $this->add_node($core_suffix, $i, $i+1, *chart_ht, "", "syllable-end-consonant");
1089
+ } else {
1090
+ $this->add_node(join("", $prev_char_roman, "a", $core_suffix), $prev_node_start, $i+1, *chart_ht, "", "syllable-end-consonant-with-added-a");
1091
+ $this->add_node(join("", "a", $core_suffix), $i, $i+1, *chart_ht, "backup", "syllable-end-consonant");
1092
+ }
1093
+
1094
+ # Japanese special cases
1095
+ } elsif ($char_name =~ /(?:HIRAGANA|KATAKANA) LETTER SMALL Y/) {
1096
+ if (($prev_script eq $current_script)
1097
+ && (($prev_char_roman_consonant) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])i$/i))) {
1098
+ unless ($this->get_node_for_span_and_type($prev_node_start, $i+1, *chart_ht, "")) {
1099
+ $this->add_node("$prev_char_roman_consonant$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "japanese-contraction");
1100
+ }
1101
+ } else {
1102
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "unexpected-japanese-contraction-character");
1103
+ }
1104
+ } elsif (($prev_script =~ /^(HIRAGANA|KATAKANA)$/i)
1105
+ && ($char_name eq "KATAKANA-HIRAGANA PROLONGED SOUND MARK") # Choonpu
1106
+ && (($prev_char_roman_vowel) = ($prev_char_roman =~ /([aeiou])$/i))) {
1107
+ $this->add_node("$prev_char_roman$prev_char_roman_vowel", $prev_node_start, $i+1, *chart_ht, "", "japanese-vowel-lengthening");
1108
+ } elsif (($current_script =~ /^(Hiragana|Katakana)$/i)
1109
+ && ($char_name =~ /^(HIRAGANA|KATAKANA) LETTER SMALL TU$/i) # Sokuon/Sukun
1110
+ && ($next_script eq $current_script)
1111
+ && ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
1112
+ && (($doubled_consonant) = ($romanized_next_char =~ /^(ch|[bcdfghjklmnpqrstwz])/i))) {
1113
+ # Note: $romanized_next_char could be part of a multi-character mapping
1114
+ # print STDERR "current_script: $current_script char_name: $char_name next_script: $next_script romanized_next_char: $romanized_next_char doubled_consonant: $doubled_consonant\n";
1115
+ $doubled_consonant = "t" if $doubled_consonant eq "ch";
1116
+ $this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "japanese-consonant-doubling");
1117
+
1118
+ # Greek small letter mu to micro-sign (instead of to "m") as used in abbreviations for microgram/micrometer/microliter/microsecond/micromolar/microfarad etc.
1119
+ } elsif (($char_name eq "GREEK SMALL LETTER MU")
1120
+ && (! ($prev_script =~ /^GREEK$/))
1121
+ && ($i < $#chars)
1122
+ && ($chart_ht{ORIG_CHAR}->{($i+1)} =~ /^[cfgjlmstv]$/i)) {
1123
+ $this->add_node("\xC2\xB5", $i, $i+1, *chart_ht, "", "greek-mu-to-micro-sign");
1124
+
1125
+ # Gurmukhi addak (doubles following consonant)
1126
+ } elsif (($current_script eq "Gurmukhi")
1127
+ && ($char_name eq "GURMUKHI ADDAK")) {
1128
+ if (($next_script eq $current_script)
1129
+ && ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
1130
+ && (($doubled_consonant) = ($romanized_next_char =~ /^([bcdfghjklmnpqrstvwxz])/i))) {
1131
+ $this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "gurmukhi-consonant-doubling");
1132
+ } else {
1133
+ $this->add_node("'", $i, $i+1, *chart_ht, "", "gurmukhi-unexpected-addak");
1134
+ }
1135
+
1136
+ # Subjoined character
1137
+ } elsif ($subjoined_char_p
1138
+ && ($prev_script eq $current_script)
1139
+ && (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
1140
+ && ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
1141
+ my $new_roman = "$prev_char_roman_consonant$romanized_char";
1142
+ $this->add_node($new_roman, $prev_node_start, $i+1, *chart_ht, "", "subjoined-character");
1143
+ # print STDERR " Subjoin l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
1144
+
1145
+ # Thai special case: written-pre-consonant-spoken-post-consonant
1146
+ } elsif (($char_name =~ /THAI CHARACTER/)
1147
+ && ($prev_script eq $current_script)
1148
+ && ($chart_ht{CHAR_SYLLABLE_INFO}->{($i-1)} =~ /written-pre-consonant-spoken-post-consonant/i)
1149
+ && ($prev_char_roman =~ /^[aeiou]+$/i)
1150
+ && ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]/)) {
1151
+ $this->add_node("$romanized_char$prev_char_roman", $prev_node_start, $i+1, *chart_ht, "", "thai-vowel-consonant-swap");
1152
+
1153
+ # Thai special case: THAI CHARACTER O ANG (U+0E2D "\xE0\xB8\xAD")
1154
+ } elsif ($char_name eq "THAI CHARACTER O ANG") {
1155
+ if ($prev_script ne $current_script) {
1156
+ $this->add_node("", $i, $i+1, *chart_ht, "", "thai-initial-o-ang-drop");
1157
+ } elsif ($next_script ne $current_script) {
1158
+ $this->add_node("", $i, $i+1, *chart_ht, "", "thai-final-o-ang-drop");
1159
+ } else {
1160
+ my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
1161
+ my $romanized_prev2_char = $this->romanize_char_at_position($i-2, $lang_code, $output_style, *ht, *chart_ht);
1162
+ if (($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
1163
+ && ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
1164
+ $this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonants
1165
+ } elsif (($prev2_script eq $current_script)
1166
+ && 0
1167
+ && ($prev_char_name =~ /^THAI CHARACTER MAI [A-Z]+$/) # Thai tone
1168
+ && ($romanized_prev2_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
1169
+ && ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
1170
+ $this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonant+tone-mark and consonant
1171
+ } else {
1172
+ $this->add_node("", $i, $i+1, *chart_ht, "", "thai-middle-o-ang-drop"); # drop next to vowel
1173
+ }
1174
+ }
1175
+
1176
+ # Romanization with space
1177
+ } elsif ($romanized_char =~ /\s/) {
1178
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "space");
1179
+
1180
+ # Tibetan special cases
1181
+ } elsif ($current_script eq "Tibetan") {
1182
+
1183
+ if ($subjoined_char_p
1184
+ && ($prev_script eq $current_script)
1185
+ && $prev_letter_plus_char_p
1186
+ && ($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
1187
+ $this->add_node("$prev_char_roman$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "subjoined-tibetan-character");
1188
+ } elsif ($romanized_char =~ /^-A$/i) {
1189
+ my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
1190
+ if (! $prev_letter_plus_char_p) {
1191
+ $this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-frontal-dash-a");
1192
+ } elsif (($prev_script eq $current_script)
1193
+ && ($next_script eq $current_script)
1194
+ && ($prev_char_roman =~ /[bcdfghjklmnpqrstvwxyz]$/)
1195
+ && ($romanized_next_char =~ /^[aeiou]/)) {
1196
+ $this->add_node("a'", $i, $i+1, *chart_ht, "", "tibetan-medial-dash-a");
1197
+ } elsif (($prev_script eq $current_script)
1198
+ && ($next_script eq $current_script)
1199
+ && ($prev_char_roman =~ /[aeiou]$/)
1200
+ && ($romanized_next_char =~ /[aeiou]/)) {
1201
+ $this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-reduced-medial-dash-a");
1202
+ } elsif (($prev_script eq $current_script)
1203
+ && (! ($prev_char_roman =~ /[aeiou]/))
1204
+ && (! $next_letter_plus_char_p)) {
1205
+ $this->add_node("a", $i, $i+1, *chart_ht, "", "tibetan-final-dash-a");
1206
+ } else {
1207
+ $this->add_node("a", $i, $i+1, *chart_ht, "", "unexpected-tibetan-dash-a");
1208
+ }
1209
+ } elsif (($romanized_char =~ /^[AEIOU]/i)
1210
+ && ($prev_script eq $current_script)
1211
+ && ($prev_char_roman =~ /^A$/i)
1212
+ && (! $prev2_letter_plus_char_p)) {
1213
+ $this->add_node($romanized_char, $prev_node_start, $i+1, *chart_ht, "", "tibetan-dropped-word-initial-a");
1214
+ } else {
1215
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
1216
+ }
1217
+
1218
+ # Khmer (for MUUSIKATOAN etc. see under "Diacritic" above)
1219
+ } elsif (($current_script eq "Khmer")
1220
+ && (($char_roman_consonant, $char_roman_vowel) = ($romanized_char =~ /^(.*[bcdfghjklmnpqrstvwxyz])([ao]+)-$/i))) {
1221
+ my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
1222
+ if (($next_script eq $current_script)
1223
+ && ($romanized_next_char =~ /^[aeiouy]/i)) {
1224
+ $this->add_node($char_roman_consonant, $i, $i+1, *chart_ht, "", "khmer-vowel-drop");
1225
+ } else {
1226
+ $this->add_node("$char_roman_consonant$char_roman_vowel", $i, $i+1, *chart_ht, "", "khmer-standard-unicode-based-romanization");
1227
+ }
1228
+
1229
+ # Abudiga add default vowel
1230
+ } elsif ((@abudiga_default_vowels = sort keys %{$ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}})
1231
+ && ($abudiga_default_vowel = $abudiga_default_vowels[0])
1232
+ && ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
1233
+ my $new_roman = join("", $romanized_char, $abudiga_default_vowel);
1234
+ $this->add_node($new_roman, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization-plus-abudiga-default-vowel");
1235
+ # print STDERR " Abudiga add default vowel l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
1236
+
1237
+ # Standard romanization
1238
+ } else {
1239
+ $node_id = $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
1240
+ }
1241
+ } else {
1242
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original");
1243
+ }
1244
+ } elsif (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))
1245
+ && ((length($romanized_char) <= 2)
1246
+ || ($ht{UTF_TO_CHAR_ROMANIZATION}->{$char}))) { # or from unicode_overwrite_romanization table
1247
+ $romanized_char =~ s/^""$//;
1248
+ $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "romanized-without-character-name");
1249
+ } else {
1250
+ $this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original-without-character-name");
1251
+ }
1252
+ }
1253
+ $i = $next_index;
1254
+ }
1255
+
1256
+ $this->schwa_deletion(0, $n_characters, *chart_ht, $lang_code);
1257
+ $this->default_vowelize_tibetan(0, $n_characters, *chart_ht, $lang_code, $line_number) if $chart_ht{CHART_CONTAINS_SCRIPT}->{"Tibetan"};
1258
+ $this->assemble_numbers_in_chart(*chart_ht, $line_number);
1259
+
1260
+ if ($return_chart_p) {
1261
+ } elsif ($return_offset_mappings_p) {
1262
+ ($result, $offset_mappings, $new_char_offset, $new_rom_char_offset) = $this->best_romanized_string(0, $n_characters, *chart_ht, $control, $initial_char_offset, $initial_rom_char_offset);
1263
+ } else {
1264
+ $result = $this->best_romanized_string(0, $n_characters, *chart_ht) unless $return_chart_p;
1265
+ }
1266
+
1267
+ if ($verbosePM) {
1268
+ my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-log.txt";
1269
+ $util->append_to_file($logfile, $log) if $log && (-r $logfile);
1270
+ }
1271
+
1272
+ return ($result, $offset_mappings) if $return_offset_mappings_p;
1273
+ return *chart_ht if $return_chart_p;
1274
+ return $result;
1275
+ }
1276
+
1277
# Encode a single string as a JSON string literal (including the quotes).
#
#   $s  - byte string; it is passed through utf8::decode before encoding.
#
# The string is encoded as a one-element JSON array and the enclosing
# square brackets are then stripped, leaving only the string literal.
sub string_to_json_string {
   my ($this, $s) = @_;

   utf8::decode($s);
   my $encoder = JSON->new->utf8;
   my $json_literal = $encoder->encode([$s]);
   # Drop the "[" and "]" that wrap the single array element.
   $json_literal =~ s/^\[(.*)\]$/$1/;
   return $json_literal;
}
1285
+
1286
# Render the romanization segments of a chart span as JSON object lines.
#
#   $chart_start, $chart_end - chart positions to cover (end is exclusive)
#   *chart_ht                - chart hash, passed as a typeglob
#   $line_number             - value emitted verbatim in the "line" field
#
# Returns a string with one line per segment of the form
#   { "line": N, "start": S, "end": E, "orig": "...", "roms": [ { "rom": "..." }, ... ] },
# where "end" is the index of the last original character in the segment.
# When no romanization is available at a position, the single original
# character at that position is emitted as its own romanization.
sub chart_to_json_romanization_elements {
   local($this, $chart_start, $chart_end, *chart_ht, $line_number) = @_;

   my $result = "";
   my $start = $chart_start;
   while ($start < $chart_end) {
      my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
      my @best_romanizations = ();
      my ($orig_segment, $next_start);
      if (($end && ($start < $end))
       && (@best_romanizations = $this->best_romanizations($start, $end, *chart_ht))) {
         $orig_segment = $this->orig_string_at_span($start, $end, *chart_ht);
         $next_start = $end;
      } else {
         # Fallback: pass the original character through unchanged.
         # (Previously this stored the never-assigned $orig, yielding "null".)
         $orig_segment = $chart_ht{ORIG_CHAR}->{$start};
         @best_romanizations = ($orig_segment);
         $next_start = $start + 1;
      }
      # Derive the reported end from the span actually consumed; $end is ""
      # or refers to an unused span in the fallback branch.
      my $inclusive_end = $next_start - 1;
      my $guarded_orig = $this->string_to_json_string($orig_segment);
      $result .= " { \"line\": $line_number, \"start\": $start, \"end\": $inclusive_end, \"orig\": $guarded_orig, \"roms\": [";
      foreach my $i ((0 .. $#best_romanizations)) {
         my $guarded_rom = $this->string_to_json_string($best_romanizations[$i]);
         $result .= " { \"rom\": $guarded_rom";
         $result .= " }";
         $result .= "," if $i < $#best_romanizations;
      }
      $result .= " ] },\n";
      $start = $next_start;
   }
   return $result;
}
1322
+
1323
# Post-process Tibetan letter tokens in the chart: add a default vowel "a"
# to tokens whose romanization contains no vowel, and apply a few
# token-level clean-ups (leading apostrophe before "o", token-final
# consonant+"y", and "-a" -> "a").
#
#   $chart_start, $chart_end - chart positions to scan (end is exclusive)
#   *chart_ht                - chart hash, passed as a typeglob; modified in place
#   $lang_code, $line_number - accepted for interface compatibility; not used here
#
# Every change is made by adding a new node over the same span and demoting
# the old node's NODE_TYPE to "backup" or "alt"; old nodes are never removed.
sub default_vowelize_tibetan {
   local($this, $chart_start, $chart_end, *chart_ht, $lang_code, $line_number) = @_;

   # Add a node with $new_roman over the span of $old_node_id, copy the old
   # node's slot values, then demote the old node to $demoted_type.
   # NOTE(review): *chart_id (not *chart_ht) is passed to copy_slot_values,
   # exactly as in the original code; this looks like a typo, but
   # copy_slot_values is not visible here -- confirm before changing.
   my $replace_node = sub {
      my ($old_node_id, $new_roman, $new_node_label, $demoted_type) = @_;
      my $old_start = $chart_ht{NODE_START}->{$old_node_id};
      my $old_end = $chart_ht{NODE_END}->{$old_node_id};
      my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", $new_node_label);
      $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
      $chart_ht{NODE_TYPE}->{$old_node_id} = $demoted_type;  # keep, but demote
   };

   my $token_start = $chart_start;
   my $next_token_start = $chart_start;
   while (($token_start = $next_token_start) < $chart_end) {
      $next_token_start = $token_start + 1;

      # Only consider positions that start a Tibetan letter token.
      next unless $chart_ht{CHAR_LETTER_PLUS}->{$token_start};
      my $current_script = $chart_ht{CHAR_SCRIPT}->{$token_start};
      next unless ($current_script eq "Tibetan");
      my $token_end = $chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$token_start};
      next unless $token_end;
      next unless $token_end > $token_start;
      $next_token_start = $token_end;

      # Collect, segment by segment, the lowest-numbered node of each segment.
      my $start = $token_start;
      my @node_ids = ();
      while ($start < $token_end) {
         my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
         last unless $end && ($end > $start);
         my @alt_node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}};
         last unless @alt_node_ids;
         push(@node_ids, $alt_node_ids[0]);
         $start = $end;
      }
      # Nothing to rewrite if no node was found; without this guard the code
      # below would call add_node with an undefined node id and span.
      next unless @node_ids;

      my $contains_vowel_p = 0;
      my @romanizations = ();
      foreach my $node_id (@node_ids) {
         my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
         $roman = "" unless defined($roman);
         push(@romanizations, $roman);
         $contains_vowel_p = 1 if $roman =~ /[aeiou]/i;
      }

      # Token without any vowel: choose the node that receives the default "a".
      unless ($contains_vowel_p) {
         my $default_vowel_target_index;
         if ($#node_ids <= 1) {
            $default_vowel_target_index = 0;
         } elsif ($romanizations[$#romanizations] eq "s") {
            if ($romanizations[($#romanizations-1)] eq "y") {
               $default_vowel_target_index = $#romanizations-1;
            } else {
               $default_vowel_target_index = $#romanizations-2;
            }
         } else {
            $default_vowel_target_index = $#romanizations-1;
         }
         $romanizations[$default_vowel_target_index] .= "a";
         my $old_node_id = $node_ids[$default_vowel_target_index];
         my $new_roman = $chart_ht{NODE_ROMAN}->{$old_node_id} . "a";
         $replace_node->($old_node_id, $new_roman, "tibetan-default-vowel", "backup");
      }

      # Token-initial apostrophe directly followed by "o": romanize it as "".
      if (($romanizations[0] eq "'")
       && ($#romanizations >= 1)
       && ($romanizations[1] =~ /^[o]$/)) {
         $replace_node->($node_ids[0], "", "tibetan-delete-apostrophe", "alt");
      }

      # Token-final consonant(s) + "y": append "a".
      if (($#node_ids >= 1)
       && ($romanizations[$#romanizations] =~ /^[bcdfghjklmnpqrstvwxz]+y$/)) {
         my $old_node_id = $node_ids[$#romanizations];
         my $new_roman = $chart_ht{NODE_ROMAN}->{$old_node_id} . "a";
         $replace_node->($old_node_id, $new_roman, "tibetan-syllable-final-vowel", "alt");
      }

      # Replace the first "-a" in any node romanization by plain "a".
      foreach my $old_node_id (@node_ids) {
         my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
         next unless $old_roman =~ /-a/;
         my $new_roman = $old_roman;
         $new_roman =~ s/-a/a/;
         $replace_node->($old_node_id, $new_roman, "tibetan-syllable-delete-dash", "alt");
      }
   }
}
1418
+
1419
# Delete a word-final simple "a" in Devanagari segments (e.g. nepaala -> nepaal);
# see the Wikipedia article "Schwa deletion in Indo-Aryan languages".
#
#   $chart_start, $chart_end - chart positions to scan (end is exclusive)
#   *chart_ht                - chart hash, passed as a typeglob; modified in place
#   $lang_code               - accepted for interface compatibility; not used here
#
# For each Devanagari script segment of at least two characters whose last
# node romanizes as consonant(s) + "a" and whose preceding node contains a
# vowel, the last node is demoted to type "alt" and a new node holding only
# the consonant(s) is added over the same position.
sub schwa_deletion {
   local($this, $chart_start, $chart_end, *chart_ht, $lang_code) = @_;

   return unless $chart_ht{CHART_CONTAINS_SCRIPT}->{"Devanagari"};

   my $cursor = $chart_start;
   while ($cursor < $chart_end) {
      my $script_start = $cursor;
      $cursor = $script_start + 1;   # default advance; overridden once a segment is accepted

      next unless ($chart_ht{CHAR_SCRIPT}->{$script_start} eq "Devanagari");
      my $script_end = $chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start};
      next unless $script_end;
      next unless ($script_end - $script_start) >= 2;
      $cursor = $script_end;

      # The segment's last character must romanize as consonant(s) + "a".
      my $last_pos = $script_end - 1;
      my $end_node_id = $this->get_node_for_span($last_pos, $script_end, *chart_ht);
      next unless $end_node_id;
      my $end_roman = $chart_ht{NODE_ROMAN}->{$end_node_id};
      my ($end_consonant) = ($end_roman =~ /^([bcdfghjklmnpqrstvwxz]+)a$/i);
      next unless defined($end_consonant);

      # Look for the node just before it, trying the widest span (3) first.
      my $prev_node_id;
      foreach my $width (3, 2, 1) {
         $prev_node_id = $this->get_node_for_span($last_pos - $width, $last_pos, *chart_ht);
         last if $prev_node_id;
      }
      next unless $prev_node_id;
      my $prev_roman = $chart_ht{NODE_ROMAN}->{$prev_node_id};
      next unless $prev_roman =~ /[aeiou]/i;
      # TO DO: check further back for vowel (e.g. if $prev_roman eq "r" due to vowel cancelation)

      $chart_ht{NODE_TYPE}->{$end_node_id} = "alt";  # keep, but demote
      $this->add_node($end_consonant, $last_pos, $script_end, *chart_ht, "", "devanagari-with-deleted-final-schwa");
   }
}
1454
+
1455
# Assemble the best romanization for the chart span [$chart_start, $chart_end).
#
#   *chart_ht          - chart hash, passed as a typeglob
#   $control           - optional; if it contains the phrase
#                        "return offset mappings", offsets are tracked
#   $orig_char_offset  - optional initial offset in the original string (default 0)
#   $rom_char_offset   - optional initial offset in the romanized string (default 0)
#
# Returns the romanized string. With offset tracking it returns the list
# (string, "o:r,o:r,..." offset pairs joined by commas, final original
# offset, final romanized offset). Positions without a usable romanization
# contribute their original character unchanged.
sub best_romanized_string {
   local($this, $chart_start, $chart_end, *chart_ht, $control, $orig_char_offset, $rom_char_offset) = @_;

   $control = "" unless defined($control);
   my $want_offsets_p = ($control =~ /\breturn offset mappings\b/);
   my $orig_offset = $orig_char_offset || 0;
   my $rom_offset = $rom_char_offset || 0;
   my @offset_pairs = ("$orig_offset:$rom_offset");
   my $romanized = "";
   my $pos = $chart_start;
   while ($pos < $chart_end) {
      my $segment_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
      my $piece;
      if ($segment_end && ($pos < $segment_end)) {
         my @candidates = $this->best_romanizations($pos, $segment_end, *chart_ht);
         $piece = $candidates[0] if @candidates;
      }
      my $n_orig_chars;
      if (defined($piece)) {
         # A romanization covers the whole segment.
         $n_orig_chars = $segment_end - $pos;
         $pos = $segment_end;
      } else {
         # No usable romanization: copy one original character.
         $piece = $chart_ht{ORIG_CHAR}->{$pos};
         $n_orig_chars = 1;
         $pos++;
      }
      $romanized .= $piece;
      next unless $want_offsets_p;
      $orig_offset = $orig_offset + $n_orig_chars;
      $rom_offset = $rom_offset + $utf8->length_in_utf8_chars($piece);
      push(@offset_pairs, "$orig_offset:$rom_offset");
   }
   return ($romanized, join(",", @offset_pairs), $orig_offset, $rom_offset) if $want_offsets_p;
   return $romanized;
}
1510
+
1511
# Return the original characters at chart positions $start .. $end-1,
# concatenated in order. An empty span yields the empty string.
#
#   *chart_ht - chart hash, passed as a typeglob; ORIG_CHAR maps position -> character
sub orig_string_at_span {
   local($this, $start, $end, *chart_ht) = @_;

   my @positions = ($start .. ($end-1));
   return join("", map { $chart_ht{ORIG_CHAR}->{$_} } @positions);
}
1520
+
1521
# Find the end position of the longest romanization segment starting at $start.
#
#   $start     - chart position where the segment must begin
#   $chart_end - upper bound; candidate ends beyond it are ignored
#   *chart_ht  - chart hash, passed as a typeglob; the keys of
#                NODES_STARTING_AND_ENDING_AT->{$start} are the candidate ends
#
# Returns the largest candidate end that is <= $chart_end and > $start,
# or "" if there is none.
sub find_end_of_rom_segment {
   local($this, $start, $chart_end, *chart_ht) = @_;

   my @ends = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}};
   # Walk down from the largest end until one fits inside the chart bound.
   my $end_index = $#ends;
   $end_index-- while (($end_index >= 0) && ($ends[$end_index] > $chart_end));
   return "" if $end_index < 0;

   # Lexical on purpose: the original assigned to the package global $end,
   # a name that other subs in this file use as a parameter.
   my $end = $ends[$end_index];
   return (defined($end) && ($start < $end)) ? $end : "";
}
1537
+
1538
# Return the candidate romanizations for the chart span [$start, $end).
#
#   *chart_ht - chart hash, passed as a typeglob; nodes for the span are the
#               keys of NODES_STARTING_AND_ENDING_AT->{$start}->{$end}
#
# Nodes without a defined NODE_ROMAN are ignored. The remaining nodes are
# grouped by NODE_TYPE: "backup", "alt", and everything else (regular).
# Returns the sorted regular romanizations followed by the sorted "alt"
# romanizations not already present; if that list is empty, the sorted
# "backup" romanizations are returned instead. Duplicates are dropped.
# Intended to be called in list context.
sub best_romanizations {
   local($this, $start, $end, *chart_ht) = @_;

   # Lexical result lists; the original kept these in package globals.
   my @regular_romanizations = ();
   my @alt_romanizations = ();
   my @backup_romanizations = ();

   foreach my $node_id (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) {
      my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
      next unless defined($roman);
      my $type = $chart_ht{NODE_TYPE}->{$node_id};
      $type = "" unless defined($type);
      # Classify by node type only. The original additionally required the
      # unrelated package globals $backup_romanization / $alt_romanization
      # to be undefined, so stray values there turned demoted nodes into
      # regular ones.
      my $bucket = ($type eq "backup") ? \@backup_romanizations
                 : ($type eq "alt")    ? \@alt_romanizations
                 :                       \@regular_romanizations;
      push(@{$bucket}, $roman) unless $util->member($roman, @{$bucket});
   }

   my @regular_alt_romanizations = sort @regular_romanizations;
   foreach my $alt_romanization (sort @alt_romanizations) {
      push(@regular_alt_romanizations, $alt_romanization)
         unless $util->member($alt_romanization, @regular_alt_romanizations);
   }
   return @regular_alt_romanizations if @regular_alt_romanizations;
   return sort @backup_romanizations;
}
1565
+
1566
# Join alternative romanizations into one display string, separated by ", ".
# An empty romanization is shown as "-" so that it remains visible.
sub join_alt_romanizations_for_viz {
   local($this, @list) = @_;

   return join(", ", map { ($_ eq "") ? "-" : $_ } @list);
}
1580
+
1581
# Build parallel HTML markup for the romanized and the original text of the
# chart span [$chart_start, $chart_end).
#
#   *ht                  - romanization tables, passed as a typeglob (read:
#                          UTF_TO_NUMERIC, UTF_TO_PICTURE_DESCR, UTF_TO_CHAR_NAME)
#   *chart_ht            - chart hash, passed as a typeglob
#   *pinyin_ht           - table handed on to $chinesePM->tonal_pinyin
#   $last_group_id_index - last span-group number already used; incremented
#                          once per emitted segment
#
# Each segment (plus any directly following combining characters) becomes a
# pair of <span> elements with ids "span-N-1" (romanization) and "span-N-2"
# (original) that share mouse-over highlighting and carry descriptive
# title attributes.
#
# Returns ($marked_up_rom, $marked_up_orig, $last_group_id_index).
sub markup_orig_rom_strings {
   local($this, $chart_start, $chart_end, *ht, *chart_ht, *pinyin_ht, $last_group_id_index) = @_;

   my $marked_up_rom = "";
   my $marked_up_orig = "";
   my $start = $chart_start;
   while ($start < $chart_end) {
      my $segment_start = $start;
      my $segment_end = $start+1;
      my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
      my $rom_segment = "";
      my $orig_segment = "";
      my $rom_title = "";
      my $orig_title = "";
      my $contains_alt_romanizations = 0;
      my @best_romanizations = ();
      my $best_romanization;

      # Core segment: best romanization if there is one, else the original character.
      if ($end) {
         @best_romanizations = $this->best_romanizations($start, $end, *chart_ht);
         $best_romanization = (@best_romanizations) ? $best_romanizations[0] : undef;
         if (defined($best_romanization)) {
            $rom_segment .= $best_romanization;
            $orig_segment .= $this->orig_string_at_span($start, $end, *chart_ht);
            $segment_end = $end;
            if ($#best_romanizations >= 1) {
               $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n");
               $contains_alt_romanizations = 1;
            }
         } else {
            my $segment = $this->orig_string_at_span($start, $start+1, *chart_ht);
            $rom_segment .= $segment;
            $orig_segment .= $segment;
            $segment_end = $start+1;
         }
      } else {
         $rom_segment .= $chart_ht{ORIG_CHAR}->{$start};
         $orig_segment .= $this->orig_string_at_span($start, $start+1, *chart_ht);
         $segment_end = $start+1;
      }
      $start = $segment_end;

      # Absorb following combining characters into the same segment.
      my $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
      my $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
      while ($next_char_is_combining_p
          && ($segment_end < $chart_end)
          && ($end = $this->find_end_of_rom_segment($segment_end, $chart_end, *chart_ht))
          && ($end > $segment_end)
          && (@best_romanizations = $this->best_romanizations($segment_end, $end, *chart_ht))
          && defined($best_romanization = $best_romanizations[0])) {
         $orig_segment .= $this->orig_string_at_span($segment_end, $end, *chart_ht);
         $rom_segment .= $best_romanization;
         if ($#best_romanizations >= 1) {
            $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n");
            $contains_alt_romanizations = 1;
         }
         $segment_end = $end;
         $start = $segment_end;
         $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
         $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
      }

      # Title for the original text: one entry per original character.
      foreach my $i (($segment_start .. ($segment_end-1))) {
         $orig_title .= "+&#x200E; &#x200E;" unless $orig_title eq "";
         my $char = $chart_ht{ORIG_CHAR}->{$i};
         my $numeric = $ht{UTF_TO_NUMERIC}->{$char};
         $numeric = "" unless defined($numeric);
         my $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
         $pic_descr = "" unless defined($pic_descr);
         # Look the name up for THIS character; the original interpolated a
         # package-global $char_name in the first branch before assigning it,
         # i.e. a stale name from some earlier character.
         my $char_name = $ht{UTF_TO_CHAR_NAME}->{$char};
         if ($char =~ /^\xE4\xB7[\x80-\xBF]$/) {
            # U+4DC0 .. U+4DFF; handled before the CJK ideograph test below.
            if (defined($char_name) && ($char_name ne "")) {
               $orig_title .= "$char_name\n";
            } else {
               my $unicode = $utf8->utf8_to_unicode($char);
               $orig_title .= "Unicode character U+" . (uc sprintf("%04x", $unicode)) . "\n";
            }
         } elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
            my $unicode = $utf8->utf8_to_unicode($char);
            my $tonal_translit;
            $orig_title .= "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode)) . "\n";
            $orig_title .= "Chinese: $tonal_translit\n" if $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "");
            $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
         } elsif ($char_name) {
            $orig_title .= "$char_name\n";
            $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
            $orig_title .= "Picture: $pic_descr\n" if $pic_descr =~ /\S/;
         } else {
            my $unicode = $utf8->utf8_to_unicode($char);
            if (($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
               $orig_title .= "Hangul syllable U+" . (uc sprintf("%04x", $unicode)) . "\n";
            } else {
               $orig_title .= "Unicode character U+" . (uc sprintf("%04x", $unicode)) . "\n";
            }
         }
      }

      # Title for the romanization: describe any non-ASCII characters in it.
      my @non_ascii_roms = ($rom_segment =~ /([\xC0-\xFF][\x80-\xBF]*)/g);
      foreach my $rom_char (@non_ascii_roms) {
         my $rom_char_name = $ht{UTF_TO_CHAR_NAME}->{$rom_char};
         my $unicode = $utf8->utf8_to_unicode($rom_char);
         my $unicode_s = "U+" . (uc sprintf("%04x", $unicode));
         if ($rom_char_name) {
            $rom_title .= "$rom_char_name\n";
         } else {
            $rom_title .= "$unicode_s\n";
         }
      }

      $last_group_id_index++;
      $rom_title =~ s/\s*$//;
      $rom_title =~ s/\n/&#xA;/g;
      $orig_title =~ s/\s*$//;
      $orig_title =~ s/\n/&#xA;&#x200E;/g;
      $orig_title = "&#x202D;" . $orig_title . "&#x202C;";
      my $rom_title_clause = ($rom_title eq "") ? "" : " title=\"$rom_title\"";
      my $orig_title_clause = ($orig_title eq "") ? "" : " title=\"$orig_title\"";
      my $alt_rom_clause = ($contains_alt_romanizations) ? "border-bottom:1px dotted;" : "";
      $marked_up_rom .= "<span id=\"span-$last_group_id_index-1\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\" style=\"color:#00BB00;$alt_rom_clause\"$rom_title_clause>" . $util->guard_html($rom_segment) . "<\/span>";
      $marked_up_orig .= "<span id=\"span-$last_group_id_index-2\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\"$orig_title_clause>" . $util->guard_html($orig_segment) . "<\/span>";

      # Allow a line break after certain closing punctuation characters.
      my ($last_char, $last_char_name);
      if (($last_char = $chart_ht{ORIG_CHAR}->{($segment_end-1)})
       && ($last_char_name = $ht{UTF_TO_CHAR_NAME}->{$last_char})
       && ($last_char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET|BRAILLE PATTERN BLANK|TIBETAN MARK .*)$/)) {
         $marked_up_orig .= "<wbr>";
         $marked_up_rom .= "<wbr>";
      }
   }
   return ($marked_up_rom, $marked_up_orig, $last_group_id_index);
}
1699
+
1700
# Romanize the chart span [$chart_start, $chart_end), showing alternatives.
#
#   *ht, *pinyin_ht - accepted for interface compatibility; not used here
#   *chart_ht       - chart hash, passed as a typeglob
#   $chart_start    - optional, defaults to 0
#   $chart_end      - optional, defaults to $chart_ht{N_CHARS}
#
# A segment with exactly one romanization contributes that romanization;
# a segment with several contributes "{rom1|rom2|...}"; a position without
# any romanization contributes its original character.
sub romanizations_with_alternatives {
   local($this, *ht, *chart_ht, *pinyin_ht, $chart_start, $chart_end) = @_;

   $chart_start = 0 unless defined($chart_start);
   $chart_end = $chart_ht{N_CHARS} unless defined($chart_end);
   my @pieces = ();
   my $pos = $chart_start;
   while ($pos < $chart_end) {
      my $span_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
      unless ($span_end) {
         # No node starts here: keep the original character.
         push(@pieces, $chart_ht{ORIG_CHAR}->{$pos});
         $pos = $pos + 1;
         next;
      }
      my @options = $this->best_romanizations($pos, $span_end, *chart_ht);
      if (@options) {
         push(@pieces, ($#options == 0) ? $options[0] : "{" . join("|", @options) . "}");
         $pos = $span_end;
      } else {
         push(@pieces, $this->orig_string_at_span($pos, $pos+1, *chart_ht));
         $pos = $pos + 1;
      }
   }
   return join("", @pieces);
}
1742
+
1743
# Table-only romanization of string $s, longest match first.
#
#   $s         - UTF-8 string to romanize
#   $lang_code - key into the language-specific mapping table
#   *ht        - romanization tables, passed as a typeglob (read:
#                UTF_CHAR_MAPPING_LANG_SPEC and UTF_CHAR_MAPPING)
#
# At each position, substrings of 4, 3, 2 and 1 characters are looked up,
# first in the language-specific table and then in the general table. The
# first substring with a mapping is replaced by it; a character without any
# mapping is copied unchanged. When a substring has several mappings, the
# lexicographically first one is used, so the result no longer depends on
# hash key order. Lookups do not add entries to the tables.
sub quick_romanize {
   local($this, $s, $lang_code, *ht) = @_;

   my $result = "";
   my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
   # Resolve the two tables once with plain rvalue lookups; the original
   # "keys %{...->{$substring}}" created an empty entry for every probe.
   my $lang_key = defined($lang_code) ? $lang_code : "";
   my $lang_spec_table = ($ht{UTF_CHAR_MAPPING_LANG_SPEC} || {})->{$lang_key} || {};
   my $general_table = $ht{UTF_CHAR_MAPPING} || {};
   while (@chars) {
      my $matched_length = 0;
      foreach my $string_length (reverse(1..4)) {
         next if $string_length > scalar(@chars);
         my $multi_char_substring = join("", @chars[0..($string_length-1)]);
         my @mappings = sort keys %{$lang_spec_table->{$multi_char_substring} || {}};
         @mappings = sort keys %{$general_table->{$multi_char_substring} || {}} unless @mappings;
         next unless @mappings;
         $result .= $mappings[0];
         $matched_length = $string_length;
         last;
      }
      if ($matched_length) {
         splice(@chars, 0, $matched_length);   # consume the matched characters
      } else {
         $result .= shift @chars;              # no mapping: copy as is
      }
   }
   return $result;
}
1772
+
1773
# Tell whether character $c is a combining character, i.e. whether its
# category in $ht{UTF_TO_CAT} starts with "M".
#
#   *ht - romanization tables, passed as a typeglob
#
# Returns 0 if $c is false or has no category; otherwise the result of the
# category match.
sub char_is_combining_char {
   local($this, $c, *ht) = @_;

   my $category = $c ? $ht{UTF_TO_CAT}->{$c} : undef;
   return 0 unless $category;
   return $category =~ /^M/;
}
1781
+
1782
# mark_up_string_for_mouse_over
# Builds an HTML rendering of $s in which characters are wrapped in
# <span title="..."> elements whose title describes the character.
# Arguments:
#   $s          string to mark up (split into UTF-8 characters below)
#   *ht         glob of the hash providing UTF_TO_NUMERIC, UTF_TO_PICTURE_DESCR,
#               UTF_TO_CHAR_NAME (and UTF_TO_CAT via char_is_combining_char)
#   $control    optional; if it contains "NO-ASCII", ASCII characters that are
#               not followed by a combining character are emitted without a span
#   *pinyin_ht  glob handed on to $chinesePM->tonal_pinyin
# Returns the HTML string; all character text goes through $util->guard_html.
# NOTE(review): apart from $result, the working variables below (@chars, $char,
# $title, ...) are not declared with "my" and are therefore package globals.
+ sub mark_up_string_for_mouse_over {
1783
+ local($this, $s, *ht, $control, *pinyin_ht) = @_;
1784
+
1785
+ $control = "" unless defined($control);
1786
+ $no_ascii_p = ($control =~ /NO-ASCII/);
1787
+ my $result = "";
1788
+ @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
1789
+ while (@chars) {
1790
+ $char = shift @chars;
1791
+ $numeric = $ht{UTF_TO_NUMERIC}->{$char};
1792
+ $numeric = "" unless defined($numeric);
1793
+ $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
1794
+ $pic_descr = "" unless defined($pic_descr);
# Look ahead one character: a following combining character must stay
# attached to the current character.
1795
+ $next_char = ($#chars >= 0) ? $chars[0] : "";
1796
+ $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
# Case 1: plain ASCII under NO-ASCII control -> no tooltip.
1797
+ if ($no_ascii_p
1798
+ && ($char =~ /^[\x00-\x7F]*$/)
1799
+ && ! $next_char_is_combining_p) {
1800
+ $result .= $util->guard_html($char);
# Case 2: three-byte character that $chinesePM identifies as a CJK unified
# ideograph -> title with code point, plus pinyin and numeric value if any.
1801
+ } elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
1802
+ $unicode = $utf8->utf8_to_unicode($char);
1803
+ $title = "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode));
1804
+ $title .= "&#xA;Chinese: $tonal_translit" if $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "");
1805
+ $title .= "&#xA;Number: $numeric" if $numeric =~ /\d/;
1806
+ $result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>";
# Case 3: character with a known name -> title is the name, extended by the
# numeric value, the picture description and the names of all directly
# following combining characters, which are pulled into the same span.
1807
+ } elsif ($char_name = $ht{UTF_TO_CHAR_NAME}->{$char}) {
1808
+ $title = $char_name;
1809
+ $title .= "&#xA;Number: $numeric" if $numeric =~ /\d/;
1810
+ $title .= "&#xA;Picture: $pic_descr" if $pic_descr =~ /\S/;
1811
+ $char_plus = $char;
1812
+ while ($next_char_is_combining_p) {
1813
+ # combining marks (Mn: non-spacing, Mc: spacing combining, Me: enclosing)
1814
+ $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char};
1815
+ $title .= "&#xA;+ $next_char_name";
1816
+ $char = shift @chars;
1817
+ $char_plus .= $char;
1818
+ $next_char = ($#chars >= 0) ? $chars[0] : "";
1819
+ $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
1820
+ }
1821
+ $result .= "<span title=\"$title\">" . $util->guard_html($char_plus) . "<\/span>";
# Emit a <wbr> element after the listed full-width/ideographic punctuation.
1822
+ $result .= "<wbr>" if $char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET)$/;
# Case 4: code point in U+AC00..U+D7A3 -> generic "Hangul syllable" title.
1823
+ } elsif (($unicode = $utf8->utf8_to_unicode($char))
1824
+ && ($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
1825
+ $title = "Hangul syllable U+" . (uc sprintf("%04x", $unicode));
1826
+ $result .= "<span title=\"$title\">" . $util->guard_html($char) . "<\/span>";
# Case 5: anything else is emitted without a tooltip.
1827
+ } else {
1828
+ $result .= $util->guard_html($char);
1829
+ }
1830
+ }
1831
+ return $result;
1832
+ }
1833
+
1834
sub romanize_char_at_position_incl_multi {
   # Romanize the character at chart position $i.
   # Lookup order: language-specific mapping table for $lang_code, then the
   # general mapping table, then romanize_char_at_position as fallback.
   # Returns "" if the chart has no character at position $i.
   local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

   my $orig_char = $chart_ht{ORIG_CHAR}->{$i};
   if (defined($orig_char)) {
      my @lang_specific = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$orig_char}};
      if (@lang_specific) {
         return $lang_specific[0];
      }
      my @general = keys %{$ht{UTF_CHAR_MAPPING}->{$orig_char}};
      if (@general) {
         return $general[0];
      }
      return $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht);
   }
   return "";
}
1845
+
1846
sub romanize_char_at_position {
   # Romanize the single character at chart position $i.
   # ASCII characters are returned unchanged; otherwise a true entry in
   # $ht{UTF_TO_CHAR_ROMANIZATION} wins; otherwise the romanization is derived
   # from the character name via romanize_charname. Derived romanizations that
   # pass none of the plausibility tests below are counted in
   # $ht{SUSPICIOUS_ROMANIZATION}. Returns "" if there is no character at $i.
   local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

   my $orig_char = $chart_ht{ORIG_CHAR}->{$i};
   return "" unless defined($orig_char);
   if ($orig_char =~ /^[\x00-\x7F]$/) {   # ASCII
      return $orig_char;
   }
   my $table_romanization = $ht{UTF_TO_CHAR_ROMANIZATION}->{$orig_char};
   if ($table_romanization) {
      return $table_romanization;
   }

   my $unicode_name = $chart_ht{CHAR_NAME}->{$i};
   my $derived = $this->romanize_charname($unicode_name, $lang_code, $output_style, *ht, $orig_char);
   my $plausible_p
      = (length($derived) < 4)
        || ($derived =~ /\s/)
        || ($derived =~ /^[bcdfghjklmnpqrstvwxyz]{2,3}[aeiou]-$/) # Khmer ngo-/nyo-/pho- OK
        || ($derived =~ /^[bcdfghjklmnpqrstvwxyz]{2,2}[aeiougw][aeiou]{1,2}$/) # Canadian, Ethiopic syllable OK
        || ($derived =~ /^(allah|bbux|nyaa|nnya|quuv|rrep|shch|shur|syrx)$/i) # Arabic; Yi; Ethiopic syllable nyaa; Cyrillic letter shcha
        || (($unicode_name =~ /^(YI SYLLABLE|VAI SYLLABLE|ETHIOPIC SYLLABLE|CANADIAN SYLLABICS|CANADIAN SYLLABICS CARRIER)\s+(\S+)$/) && (length($derived) <= 5));
   unless ($plausible_p) {
      my $count_so_far = $ht{SUSPICIOUS_ROMANIZATION}->{$unicode_name}->{$derived} || 0;
      $ht{SUSPICIOUS_ROMANIZATION}->{$unicode_name}->{$derived} = $count_so_far + 1;
   }
   return $derived;
}
1867
+
1868
sub romanize_charname {
   # Derive a romanization from a Unicode character name by stripping script
   # names, qualifiers and suffixes and reducing what is left.
   # input:  $char_name     Unicode character name
   #         $lang_code, $output_style   only used as cache keys here
   #         *ht            glob of the hash holding the ROMANIZE_CHARNAME cache
   #         $char          the character itself (consulted for Thai only)
   # output: romanization; upper case if the name contains the word CAPITAL,
   #         lower case otherwise.
   # Results are cached in $ht{ROMANIZE_CHARNAME} under the ORIGINAL character
   # name. (The earlier code tested a misspelled variable, so the cache never
   # hit, and it stored results under the already stripped name.)
   local($this, $char_name, $lang_code, $output_style, *ht, $char) = @_;

   $char_name = "" unless defined($char_name);
   my $lang_key = defined($lang_code) ? $lang_code : "";
   my $style_key = defined($output_style) ? $output_style : "";
   my $orig_char_name = $char_name;

   # Cache lookup, written so that a miss does not create empty cache entries.
   my $cache_for_name = $ht{ROMANIZE_CHARNAME} ? $ht{ROMANIZE_CHARNAME}->{$orig_char_name} : undef;
   my $cache_for_lang = $cache_for_name ? $cache_for_name->{$lang_key} : undef;
   my $cached_result = $cache_for_lang ? $cache_for_lang->{$style_key} : undef;
   # print STDERR "(C) romanize_charname($char_name): $cached_result\n" if $cached_result && ($char_name =~ /middle/i);
   return $cached_result if defined($cached_result);

   # Trailing qualifiers; stripped once before and once after prefix removal.
   my $trailing_qualifier_re = qr/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$/;

   $char_name =~ s/^.* LETTER\s+([A-Z]+)-\d+$/$1/; # HENTAIGANA LETTER A-3
   $char_name =~ s/^.* LETTER\s+//;
   $char_name =~ s/^.* SYLLABLE\s+B\d\d\d\s+//; # Linear B syllables
   $char_name =~ s/^.* SYLLABLE\s+//;
   $char_name =~ s/^.* SYLLABICS\s+//;
   $char_name =~ s/^.* LIGATURE\s+//;
   $char_name =~ s/^.* VOWEL SIGN\s+//;
   $char_name =~ s/^.* CONSONANT SIGN\s+//;
   $char_name =~ s/^.* CONSONANT\s+//;
   $char_name =~ s/^.* VOWEL\s+//;
   $char_name =~ s/ WITH .*$//;
   $char_name =~ s/ WITHOUT .*$//;
   $char_name =~ s/$trailing_qualifier_re//;
   $char_name =~ s/^([A-Z]+)\d+$/$1/; # Linear B syllables etc.
   # Several leading qualifiers may be stacked; strip up to three layers.
   foreach my $pass (1 .. 3) {
      $char_name =~ s/^.*\b(?:ABKHASIAN|ACADEMY|AFRICAN|AIVILIK|AITON|AKHMIMIC|ALEUT|ALI GALI|ALPAPRAANA|ALTERNATE|ALTERNATIVE|AMBA|ARABIC|ARCHAIC|ASPIRATED|ATHAPASCAN|BASELINE|BLACKLETTER|BARRED|BASHKIR|BERBER|BHATTIPROLU|BIBLE-CREE|BIG|BINOCULAR|BLACKFOOT|BLENDED|BOTTOM|BROAD|BROKEN|CANDRA|CAPITAL|CARRIER|CHILLU|CLOSE|CLOSED|COPTIC|CROSSED|CRYPTOGRAMMIC|CURLED|CURLY|CYRILLIC|DANTAJA|DENTAL|DIALECT-P|DIAERESIZED|DOTLESS|DOUBLE|DOUBLE-STRUCK|EASTERN PWO KAREN|EGYPTOLOGICAL|FARSI|FINAL|FLATTENED|GLOTTAL|GREAT|GREEK|HALF|HIGH|INITIAL|INSULAR|INVERTED|IOTIFIED|JONA|KANTAJA|KASHMIRI|KHAKASSIAN|KHAMTI|KHANDA|KINNA|KIRGHIZ|KOMI|L-SHAPED|LATINATE|LITTLE|LONG|LONG-LEGGED|LOOPED|LOW|MAHAAPRAANA|MALAYALAM|MANCHU|MANDAILING|MATHEMATICAL|MEDIAL|MIDDLE-WELSH|MON|MONOCULAR|MOOSE-CREE|MULTIOCULAR|MUURDHAJA|N-CREE|NARROW|NASKAPI|NDOLE|NEUTRAL|NIKOLSBURG|NORTHERN|NUBIAN|NUNAVIK|NUNAVUT|OJIBWAY|OLD|OPEN|ORKHON|OVERLONG|PALI|PERSIAN|PHARYNGEAL|PRISHTHAMATRA|R-CREE|REDUPLICATION|REVERSED|ROMANIAN|ROUND|ROUNDED|RUDIMENTA|RUMAI PALAUNG|SANSKRIT|SANYAKA|SARA|SAYISI|SCRIPT|SEBATBEIT|SEMISOFT|SGAW KAREN|SHAN|SHARP|SHWE PALAUNG|SHORT|SIBE|SIDEWAYS|SIMALUNGUN|SMALL|SOGDIAN|SOFT|SOUTH-SLAVEY|SOUTHERN|SPIDERY|STIRRUP|STRAIGHT|STRETCHED|SUBSCRIPT|SWASH|TAI LAING|TAILED|TAILLESS|TAALUJA|TH-CREE|TALL|THREE-LEGGED|TURNED|TODO|TOP|TROKUTASTI|TUAREG|UKRAINIAN|UNBLENDED|VISIGOTHIC|VOCALIC|VOICED|VOICELESS|VOLAPUK|WAVY|WESTERN PWO KAREN|WEST-CREE|WESTERN|WIDE|WOODS-CREE|Y-CREE|YENISEI|YIDDISH)\s+//;
   }
   $char_name =~ s/$trailing_qualifier_re//;
   if ($char_name =~ /THAI CHARACTER/) {
      $char_name =~ s/^THAI CHARACTER\s+//;
      if ($char =~ /^\xE0\xB8[\x81-\xAE]/) {
         # Thai consonants
         $char_name =~ s/^([^AEIOU]*).*/$1/i;
      } elsif ($char_name =~ /^SARA [AEIOU]/) {
         # Thai vowels
         $char_name =~ s/^SARA\s+//;
      } else {
         $char_name = $char;
      }
   }
   if ($orig_char_name =~ /(HIRAGANA LETTER|KATAKANA LETTER|SYLLABLE|LIGATURE)/) {
      $char_name = lc $char_name;
   } elsif ($char_name =~ /\b(ANUSVARA|ANUSVARAYA|NIKAHIT|SIGN BINDI|TIPPI)\b/) {
      $char_name = "+m";
   } elsif ($char_name =~ /\bSCHWA\b/) {
      $char_name = "e";
   } elsif ($char_name =~ /\bIOTA\b/) {
      $char_name = "i";
   } elsif ($char_name =~ /\s/) {
      # multi-word remainder: keep as is
   } elsif ($orig_char_name =~ /KHMER LETTER/) {
      $char_name .= "-";
   } elsif ($orig_char_name =~ /CHEROKEE LETTER/) {
      # use whole letter as is
   } elsif ($orig_char_name =~ /KHMER INDEPENDENT VOWEL/) {
      $char_name =~ s/q//;
   } elsif ($orig_char_name =~ /LETTER/) {
      $char_name =~ s/^[AEIOU]+([^AEIOU]+)$/$1/i;
      $char_name =~ s/^([^-AEIOUY]+)[AEIOU].*/$1/i;
      $char_name =~ s/^(Y)[AEIOU].*/$1/i if $orig_char_name =~ /\b(?:BENGALI|DEVANAGARI|GURMUKHI|GUJARATI|KANNADA|MALAYALAM|MODI|MYANMAR|ORIYA|TAMIL|TELUGU|TIBETAN)\b.*\bLETTER YA\b/;
      $char_name =~ s/^(Y[AEIOU]+)[^AEIOU].*$/$1/i;
      $char_name =~ s/^([AEIOU]+)[^AEIOU]+[AEIOU].*/$1/i;
   }

   my $result = ($orig_char_name =~ /\bCAPITAL\b/) ? (uc $char_name) : (lc $char_name);
   # print STDERR "(R) romanize_charname($orig_char_name): $result\n" if $orig_char_name =~ /middle/i;
   $ht{ROMANIZE_CHARNAME}->{$orig_char_name}->{$lang_key}->{$style_key} = $result;
   return $result;
}
1933
+
1934
sub assemble_numbers_in_chart {
   # For every span recorded in $chart_ht{COMPLEX_NUMERIC_START_END}
   # (start => end), collect the numeric value of each single-character
   # position in the span, combine them with assemble_number and add the
   # result to the chart as a "complex-number" node.
   # input: *chart_ht     glob of the chart hash
   #        $line_number  passed on to assemble_number
   local($this, *chart_ht, $line_number) = @_;

   foreach my $start (sort { $a <=> $b } keys %{$chart_ht{COMPLEX_NUMERIC_START_END}}) {
      my $end = $chart_ht{COMPLEX_NUMERIC_START_END}->{$start};
      my @numbers = ();
      foreach my $i ($start .. ($end-1)) {
         my $orig_char = $chart_ht{ORIG_CHAR}->{$i};
         # Look the node up in the chart passed to this sub. The earlier code
         # passed *chart_id, a glob that is not bound anywhere in this sub.
         # NOTE(review): get_node_for_span_with_slot is not visible here;
         # confirm that its last parameter is the chart glob.
         my $node_id = $this->get_node_for_span_with_slot($i, $i+1, "numeric-value", *chart_ht);
         if (defined($node_id)) {
            my $number = $chart_ht{NODE_ROMAN}->{$node_id};
            if (defined($number)) {
               push(@numbers, $number);
            } elsif ($orig_char =~ /^[.,]$/) { # decimal point, comma separator
               push(@numbers, $orig_char);
            } else {
               print STDERR "Found no romanization for node_id $node_id ($i-" . ($i+1) . ") in assemble_numbers_in_chart\n" if $verbosePM;
            }
         } else {
            print STDERR "Found no node_id for span $i-" . ($i+1) . " in assemble_numbers_in_chart\n" if $verbosePM;
         }
      }
      # Tokens are joined with a middle dot (U+00B7), which assemble_number splits on.
      my $complex_number = $this->assemble_number(join("\xC2\xB7", @numbers), $line_number);
      # print STDERR "assemble_numbers_in_chart l.$line_number $start-$end $complex_number (@numbers)\n";
      $this->add_node($complex_number, $start, $end, *chart_ht, "", "complex-number");
   }
}
1961
+
1962
# assemble_number
# Combines a sequence of number tokens into as few tokens as possible.
# Arguments:
#   $s            tokens joined by a middle dot (U+00B7, bytes C2 B7)
#   $line_number  only referenced by the commented-out debug statement
# Returns the remaining tokens joined by a middle dot; a fully assembled
# number is a single token.
+ sub assemble_number {
1963
+ local($this, $s, $line_number) = @_;
1964
+ # e.g. 10 9 100 7 10 8 = 1978
1965
+
1966
+ my $middot = "\xC2\xB7";
1967
+ my @tokens = split(/$middot/, $s); # middle dot U+00B7
1968
+ my $i = 0;
1969
+ my @orig_tokens = @tokens;
1970
+
# Step 1: a single-digit token followed by tokens that are each a single
# digit, "." or "," is merged into one token; commas are dropped.
1971
+ # assemble single digit numbers, e.g. 1 7 5 -> 175
1972
+ while ($i < $#tokens) {
1973
+ if ($tokens[$i] =~ /^\d$/) {
1974
+ my $j = $i+1;
1975
+ while (($j <= $#tokens) && ($tokens[$j] =~ /^[0-9.,]$/)) {
1976
+ $j++;
1977
+ }
# $j has run one past the last mergeable token; step back onto it.
1978
+ $j--;
1979
+ if ($j>$i) {
1980
+ my $new_token = join("", @tokens[$i .. $j]);
1981
+ $new_token =~ s/,//g;
1982
+ splice(@tokens, $i, $j-$i+1, $new_token);
1983
+ }
1984
+ }
1985
+ $i++;
1986
+ }
1987
+
# Step 2: process the powers from small to large. Note that the list as
# written contains no entry for 10,000,000.
1988
+ foreach $power ((10, 100, 1000, 10000, 100000, 1000000, 100000000, 1000000000, 1000000000000)) {
1989
+ for (my $i=0; $i <= $#tokens; $i++) {
# Rule A: a smaller token directly before the power is multiplied with it
# (e.g. 7 10 -> 70); a smaller token directly after is then added
# (e.g. 70 8 -> 78). $i is decremented after each splice so that it keeps
# pointing at the merged token.
1990
+ if ($tokens[$i] == $power) {
1991
+ if (($i > 0) && ($tokens[($i-1)] < $power)) {
1992
+ splice(@tokens, $i-1, 2, ($tokens[($i-1)] * $tokens[$i]));
1993
+ $i--;
1994
+ if (($i < $#tokens) && ($tokens[($i+1)] < $power)) {
1995
+ splice(@tokens, $i, 2, ($tokens[$i] + $tokens[($i+1)]));
1996
+ $i--;
1997
+ }
1998
+ }
1999
+ }
# Rule B: a token of the form "digit 1-9 followed by the zeros of the power"
# absorbs a following smaller token by addition. The pattern is made from
# the power by replacing its leading 1 with [1-9].
2000
+ # 400 30 (e.g. Egyptian)
2001
+ my $gen_pattern = $power;
2002
+ $gen_pattern =~ s/^1/\[1-9\]/;
2003
+ if (($tokens[$i] =~ /^$gen_pattern$/) && ($i < $#tokens) && ($tokens[($i+1)] < $power)) {
2004
+ splice(@tokens, $i, 2, ($tokens[$i] + $tokens[($i+1)]));
2005
+ $i--;
2006
+ }
2007
+ }
# Stop as soon as everything has been merged into one token.
2008
+ last if $#tokens == 0;
2009
+ }
2010
+ my $result = join($middot, @tokens);
# In verbose mode the conversion is appended to a log file at a fixed
# absolute path, but only if that file is readable.
2011
+ if ($verbosePM) {
2012
+ my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-number-log.txt";
2013
+ $util->append_to_file($logfile, "$s -> $result\n") if -r $logfile;
2014
+ # print STDERR " assemble number l.$line_number @orig_tokens -> $result\n" if $line_number == 43;
2015
+ }
2016
+ return $result;
2017
+ }
2018
+
2019
+ 1;
2020
+
uroman/lib/NLP/UTF8.pm ADDED
@@ -0,0 +1,1404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # UTF8 #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::UTF8;
8
+
9
+ use NLP::utilities;
10
+ $util = NLP::utilities;
11
+
12
+ %empty_ht = ();
13
+
14
sub new {
   # Constructor: returns an empty hash blessed into the caller's class.
   # $caller may be a class name or an existing object of that class.
   local($caller) = @_;

   my $class = ref($caller) || $caller;
   return bless({}, $class);
}
22
+
23
sub unicode_string2string {
   # input: string that might contain unicode sequences such as "U+0627"
   #        or "\u0627" (exactly four hex digits are consumed per sequence)
   # output: string in pure utf-8
   # All sequences are replaced in a single left-to-right pass. Text outside
   # the sequences, including newlines, is left untouched. (The earlier
   # recursive version used regexes without /s: it dropped a trailing newline
   # and converted nothing in strings containing an embedded newline.)
   local($caller,$s) = @_;

   return $s unless defined($s);
   my $decoded = $s;
   $decoded =~ s/(?:U\+|\\u)([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])/my $hex = $1; $caller->unicode_hex_string2string($hex)/ge;
   return $decoded;
}
43
+
44
sub unicode_hex_string2string {
   # input: "0627" (interpreted as hex code)
   # output: utf-8 string for that code point (here: Arabic letter alef);
   #         "" if no argument is given
   local($caller,$unicode) = @_;

   if (defined($unicode)) {
      my $code_point = hex($unicode);
      return $caller->unicode2string($code_point);
   }
   return "";
}
52
+
53
sub unicode2string {
   # input: non-neg integer, e.g. 0x627
   # output: utf-8 string for Arabic letter alef
   # Produces one byte for values up to 0x7F and otherwise a lead byte plus
   # one to five continuation bytes; returns "" for undefined, negative or
   # too large input.
   local($caller,$d) = @_;

   return "" unless defined($d) && $d >= 0;
   return sprintf("%c",$d) if $d <= 0x7F;

   # One entry per sequence length (2 to 6 bytes):
   # [marker bits of the lead byte, largest value that fits in the lead byte]
   my @lead_byte_specs = ([0xC0, 0x1F], [0xE0, 0xF], [0xF0, 0x7], [0xF8, 0x3], [0xFC, 0x1]);
   my $continuation_bytes = "";
   foreach my $spec (@lead_byte_specs) {
      my ($lead_marker, $max_lead_value) = @$spec;
      # Move the lowest six bits into a new continuation byte at the front.
      $continuation_bytes = sprintf("%c", ($d & 0x3F) | 0x80) . $continuation_bytes;
      $d >>= 6;
      return sprintf("%c", $d | $lead_marker) . $continuation_bytes if $d <= $max_lead_value;
   }
   return ""; # bad input
}
81
+
82
sub html2utf8 {
   # input: string that might contain decimal character references (&#233;)
   # output: string in which every reference whose code point lies in one of
   #         the ranges 160-255, 1500-1699 or 19968-40879 is replaced by the
   #         utf-8 character; all other references are left as they are.
   # All references are examined in one pass. (The earlier loop only ever
   # looked at the LAST reference in the string, so once that one was outside
   # the ranges, convertible references before it were never replaced.)
   local($caller, $string) = @_;

   return $string unless defined($string) && ($string =~ /\&\#\d{3,5};/);

   my $decode = sub {
      my ($entity, $d) = @_;
      my $convertible_p = (($d >= 160) && ($d <= 255))
                          || (($d >= 1500) && ($d <= 1699))
                          || (($d >= 19968) && ($d <= 40879));
      return $convertible_p ? $caller->unicode2string($d) : $entity;
   };
   my $s = $string;
   $s =~ s/(\&\#(\d+);)/$decode->($1, $2)/ge;
   return $s;
}
102
+
103
sub xhtml2utf8 {
   # input: string that might contain hexadecimal character references with
   #        2 to 5 hex digits, e.g. &#x627;
   # output: string in which each such reference is replaced by the utf-8
   #         character
   # All references are decoded in one pass, so references after an embedded
   # newline are handled as well, and the output of a replacement is not
   # scanned again (e.g. "&#x26;#x41;" becomes "&#x41;", not "A").
   local($caller, $string) = @_;

   return $string unless defined($string) && ($string =~ /\&\#x[0-9a-fA-F]{2,5};/);

   my $s = $string;
   $s =~ s/\&\#x([0-9a-fA-F]{2,5});/my $x = $1; $caller->unicode_hex_string2string($x)/ge;
   return $s;
}
119
+
120
sub utf8_marker {
   # Returns the three bytes EF BB BF followed by a newline.
   my $marker_bytes = pack("C3", 0xEF, 0xBB, 0xBF);
   return $marker_bytes . "\n";
}
123
+
124
sub enforcer {
   # input: string that might not conform to utf-8
   # output: string in pure utf-8, with a few "smart replacements" and possibly "?"
   #   $no_repair  if true, every offending byte becomes "?"; otherwise
   #               \x85 -> "...", \x91 and \x92 -> "'", \x93 and \x94 ->
   #               U+201C and U+201D, a stray byte \xC0-\xFF -> the two-byte
   #               sequence \xC3 plus the byte lowered by 0x40, anything else "?"
   # The string is scanned with \G, so newlines are ordinary ASCII characters.
   # (The earlier version used regexes without /s and silently dropped the
   # rest of the input when a non-ASCII character was followed by an embedded
   # newline.)
   local($caller,$s,$no_repair) = @_;

   return $s if $s =~ /^[\x00-\x7F]*$/;

   $no_repair = 0 unless defined($no_repair);
   my $result = "";
   my $total_length = length($s);

   pos($s) = 0;
   while (pos($s) < $total_length) {
      # run of ASCII characters
      if ($s =~ /\G([\x00-\x7F]+)/gc) {
         $result .= $1;
         next;
      }
      # well-formed sequence of two to five bytes
      if ($s =~ /\G([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])/gc) {
         $result .= $1;
         next;
      }
      # anything else: consume one character and repair or replace it
      last unless $s =~ /\G(.)/gcs;
      my $c = $1;
      if ($no_repair) { $result .= "?"; }
      elsif ($c =~ /\x85/) { $result .= "..."; }
      elsif ($c =~ /\x91/) { $result .= "'"; }
      elsif ($c =~ /\x92/) { $result .= "'"; }
      elsif ($c =~ /\x93/) { $result .= $caller->unicode2string(0x201C); }
      elsif ($c =~ /\x94/) { $result .= $caller->unicode2string(0x201D); }
      elsif ($c =~ /[\xC0-\xFF]/) {
         my $c2 = $c;
         $c2 =~ tr/\xC0-\xFF/\x80-\xBF/;
         $result .= "\xC3$c2";
      } else {
         $result .= "?";
      }
   }
   return $result;
}
181
+
182
# split_into_utf8_characters
# Splits $string into a list of UTF-8 characters, optionally grouping some
# ASCII/XML material into multi-character tokens.
# Arguments:
#   $string         the string to split
#   $group_control  optional string of switches, each recognized by substring
#                   match: "ASCII numbers", "ASCII spaces", "ASCII punct",
#                   "ASCII chars", "ASCII all" (the four preceding ones),
#                   "XML chars", "XML tags", "XML chars and tags" /
#                   "XML tags and chars", "return only chars",
#                   "return trailing whitespaces"
#   *ht             glob of a hash; only HTML_ENTITY_NAME_TO_DECUNICODE is read
# Returns @characters if "return only chars" is set, otherwise the list
#   ($skipped_bytes, $end_of_token_p_string, @characters)
# where $skipped_bytes collects bytes that do not start a UTF-8 sequence and
# $end_of_token_p_string holds one flag per character ("1" if whitespace
# follows it, "0" otherwise).
# NOTE(review): @characters and the other working variables are not declared
# with "my", i.e. they are package globals that every call overwrites.
+ sub split_into_utf8_characters {
183
+ # input: utf8 string
184
+ # output: list of sub-strings, each representing a utf8 character
185
+ local($caller,$string,$group_control, *ht) = @_;
186
+
187
+ @characters = ();
188
+ $end_of_token_p_string = "";
189
+ $skipped_bytes = "";
190
+ $group_control = "" unless defined($group_control);
191
+ $group_ascii_numbers = ($group_control =~ /ASCII numbers/);
192
+ $group_ascii_spaces = ($group_control =~ /ASCII spaces/);
193
+ $group_ascii_punct = ($group_control =~ /ASCII punct/);
194
+ $group_ascii_chars = ($group_control =~ /ASCII chars/);
195
+ $group_xml_chars = ($group_control =~ /XML chars/);
196
+ $group_xml_tags = ($group_control =~ /XML tags/);
197
+ $return_only_chars = ($group_control =~ /return only chars/);
198
+ $return_trailing_whitespaces = ($group_control =~ /return trailing whitespaces/);
199
+ if ($group_control =~ /ASCII all/) {
200
+ $group_ascii_numbers = 1;
201
+ $group_ascii_spaces = 1;
202
+ $group_ascii_chars = 1;
203
+ $group_ascii_punct = 1;
204
+ }
205
+ if ($group_control =~ /(XML chars and tags|XML tags and chars)/) {
206
+ $group_xml_chars = 1;
207
+ $group_xml_tags = 1;
208
+ }
209
+ $orig_string = $string;
# A space is appended so that the lookahead patterns below also work for the
# last token; it is removed again after the loop.
210
+ $string .= " ";
# The loop ends once only whitespace is left in $string.
211
+ while ($string =~ /\S/) {
212
+ # one-character UTF-8 = ASCII
213
+ if ($string =~ /^[\x00-\x7F]/) {
# decimal character reference, e.g. &#1575;
214
+ if ($group_xml_chars
215
+ && (($dec_unicode, $rest) = ($string =~ /^&#(\d+);(.*)$/s))
216
+ && ($utf8_char = $caller->unicode2string($dec_unicode))) {
217
+ push(@characters, $utf8_char);
218
+ $string = $rest;
# hexadecimal character reference, e.g. &#x627;
219
+ } elsif ($group_xml_chars
220
+ && (($hex_unicode, $rest) = ($string =~ /^&#x([0-9a-f]{1,6});(.*)$/is))
221
+ && ($utf8_char = $caller->unicode_hex_string2string($hex_unicode))) {
222
+ push(@characters, $utf8_char);
223
+ $string = $rest;
# named entity of up to six letters that is listed in the entity table
224
+ } elsif ($group_xml_chars
225
+ && (($html_entity_name, $rest) = ($string =~ /^&([a-z]{1,6});(.*)$/is))
226
+ && ($dec_unicode = $ht{HTML_ENTITY_NAME_TO_DECUNICODE}->{$html_entity_name})
227
+ && ($utf8_char = $caller->unicode2string($dec_unicode))
228
+ ) {
229
+ push(@characters, $utf8_char);
230
+ $string = $rest;
# complete XML tag (opening, closing or self-closing) kept as one token
231
+ } elsif ($group_xml_tags
232
+ && (($tag, $rest) = ($string =~ /^(<\/?[a-zA-Z][-_:a-zA-Z0-9]*(\s+[a-zA-Z][-_:a-zA-Z0-9]*=\"[^"]*\")*\s*\/?>)(.*)$/s))) {
233
+ push(@characters, $tag);
234
+ $string = $rest;
# date-like number such as 2023.12.31
# NOTE(review): the second separator in both patterns is an unescaped ".",
# which matches any character, not only a period - confirm this is intended.
235
+ } elsif ($group_ascii_numbers && ($string =~ /^[12]\d\d\d\.[01]?\d.[0-3]?\d([^0-9].*)?$/)) {
236
+ ($date) = ($string =~ /^(\d\d\d\d\.\d?\d.\d?\d)([^0-9].*)?$/);
237
+ push(@characters,$date);
238
+ $string = substr($string, length($date));
# number with optional thousands groups and decimal part, e.g. 1,234.56
239
+ } elsif ($group_ascii_numbers && ($string =~ /^\d/)) {
240
+ ($number) = ($string =~ /^(\d+(,\d\d\d)*(\.\d+)?)/);
241
+ push(@characters,$number);
242
+ $string = substr($string, length($number));
# run of whitespace: consumed, but nothing is added to @characters
243
+ } elsif ($group_ascii_spaces && ($string =~ /^(\s+)/)) {
244
+ ($space) = ($string =~ /^(\s+)/);
245
+ $string = substr($string, length($space));
# run of hyphens, run of periods, or one of : , % ( ) "
246
+ } elsif ($group_ascii_punct && (($punct_seq) = ($string =~ /^(-+|\.+|[:,%()"])/))) {
247
+ push(@characters,$punct_seq);
248
+ $string = substr($string, length($punct_seq));
# "$" followed by capital letters, or up to three capital letters followed by "$"
249
+ } elsif ($group_ascii_chars && (($word) = ($string =~ /^(\$[A-Z]*|[A-Z]{1,3}\$)/))) {
250
+ push(@characters,$word);
251
+ $string = substr($string, length($word));
# listed abbreviation including its final period
252
+ } elsif ($group_ascii_chars && (($abbrev) = ($string =~ /^((?:Jan|Feb|Febr|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|Mr|Mrs|Dr|a.m|p.m)\.)/))) {
253
+ push(@characters,$abbrev);
254
+ $string = substr($string, length($abbrev));
# unit word directly followed by "-long" or "-old": only the unit is taken
255
+ } elsif ($group_ascii_chars && (($word) = ($string =~ /^(second|minute|hour|day|week|month|year|inch|foot|yard|meter|kilometer|mile)-(?:long|old)/i))) {
256
+ push(@characters,$word);
257
+ $string = substr($string, length($word));
# number word directly followed by a hyphen: only the number word is taken
258
+ } elsif ($group_ascii_chars && (($word) = ($string =~ /^(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)-/i))) {
259
+ push(@characters,$word);
260
+ $string = substr($string, length($word));
# alphabetic word followed by one of the listed delimiters
261
+ } elsif ($group_ascii_chars && (($word) = ($string =~ /^([a-zA-Z]+)(?:[ ,;%?|()"]|'s |' |\. |\d+[:hms][0-9 ])/))) {
262
+ push(@characters,$word);
263
+ $string = substr($string, length($word));
264
+ } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x27\x2A-\x7E]+)/)) { # exclude ()
265
+ ($ascii) = ($string =~ /^([\x21-\x27\x2A-\x7E]+)/); # ASCII black-characters
266
+ push(@characters,$ascii);
267
+ $string = substr($string, length($ascii));
268
+ } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x7E]+)/)) {
269
+ ($ascii) = ($string =~ /^([\x21-\x7E]+)/); # ASCII black-characters
270
+ push(@characters,$ascii);
271
+ $string = substr($string, length($ascii));
272
+ } elsif ($group_ascii_chars && ($string =~ /^([\x00-\x7F]+)/)) {
273
+ ($ascii) = ($string =~ /^([\x00-\x7F]+)/);
274
+ push(@characters,$ascii);
275
+ $string = substr($string, length($ascii));
# no grouping applies: a single ASCII character
276
+ } else {
277
+ push(@characters,substr($string, 0, 1));
278
+ $string = substr($string, 1);
279
+ }
280
+
281
+ # two-character UTF-8
282
+ } elsif ($string =~ /^[\xC0-\xDF][\x80-\xBF]/) {
283
+ push(@characters,substr($string, 0, 2));
284
+ $string = substr($string, 2);
285
+
286
+ # three-character UTF-8
287
+ } elsif ($string =~ /^[\xE0-\xEF][\x80-\xBF][\x80-\xBF]/) {
288
+ push(@characters,substr($string, 0, 3));
289
+ $string = substr($string, 3);
290
+
291
+ # four-character UTF-8
292
+ } elsif ($string =~ /^[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
293
+ push(@characters,substr($string, 0, 4));
294
+ $string = substr($string, 4);
295
+
296
+ # five-character UTF-8
297
+ } elsif ($string =~ /^[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
298
+ push(@characters,substr($string, 0, 5));
299
+ $string = substr($string, 5);
300
+
301
+ # six-character UTF-8
302
+ } elsif ($string =~ /^[\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
303
+ push(@characters,substr($string, 0, 6));
304
+ $string = substr($string, 6);
305
+
306
+ # not a UTF-8 character
307
+ } else {
308
+ $skipped_bytes .= substr($string, 0, 1);
309
+ $string = substr($string, 1);
310
+ }
311
+
# If this iteration added a character, record its flag: "0" when a
# non-whitespace character follows directly, "1" otherwise.
312
+ $end_of_token_p_string .= ($string =~ /^\S/) ? "0" : "1"
313
+ if $#characters >= length($end_of_token_p_string);
314
+ }
315
+ $string =~ s/ $//; # remove previously added space, but keep original spaces
# Optionally return the spaces and tabs that remain at the start of what is
# left, plus a newline if the original string ended in one.
316
+ if ($return_trailing_whitespaces) {
317
+ while ($string =~ /^[ \t]/) {
318
+ push(@characters,substr($string, 0, 1));
319
+ $string = substr($string, 1);
320
+ }
321
+ push(@characters, "\n") if $orig_string =~ /\n$/;
322
+ }
323
+ return ($return_only_chars) ? @characters : ($skipped_bytes, $end_of_token_p_string, @characters);
324
+ }
325
+
326
sub max_substring_info {
   # Finds the longest common substring (in UTF-8 characters) of $s1 and $s2.
   # input:  $s1, $s2    strings to compare
   #         $info_type  selects what is returned:
   #            "max-ratio1"  length of the substring / number of chars in $s1
   #            "max-ratio2"  length of the substring / number of chars in $s2
   #            "substring"   the substring itself
   #            otherwise     an info string "s1=...;s2=...;best_substring_length=..."
   # output: see $info_type; 0 if either string contains bytes that
   #         split_into_utf8_characters had to skip.
   # All working variables are lexical, so nothing leaks into package globals.
   local($caller,$s1,$s2,$info_type) = @_;

   my ($skipped_bytes1, $end_of_token_p_string1, @char_list1) = $caller->split_into_utf8_characters($s1, "", *empty_ht);
   my ($skipped_bytes2, $end_of_token_p_string2, @char_list2) = $caller->split_into_utf8_characters($s2, "", *empty_ht);
   return 0 if $skipped_bytes1 || $skipped_bytes2;
   $info_type = "" unless defined($info_type);

   my $best_substring_start1 = 0;
   my $best_substring_start2 = 0;
   my $best_substring_length = 0;

   foreach my $start_pos2 (0 .. $#char_list2) {
      # stop when no longer match can start at or after this position
      last if $start_pos2 + $best_substring_length > $#char_list2;
      foreach my $start_pos1 (0 .. $#char_list1) {
         last if $start_pos1 + $best_substring_length > $#char_list1;
         my $matching_length = 0;
         while (($start_pos1 + $matching_length <= $#char_list1)
                && ($start_pos2 + $matching_length <= $#char_list2)
                && ($char_list1[$start_pos1+$matching_length] eq $char_list2[$start_pos2+$matching_length])) {
            $matching_length++;
         }
         if ($matching_length > $best_substring_length) {
            $best_substring_length = $matching_length;
            $best_substring_start1 = $start_pos1;
            $best_substring_start2 = $start_pos2;
         }
      }
   }

   my $length1 = $#char_list1 + 1;
   my $length2 = $#char_list2 + 1;
   if ($info_type =~ /^max-ratio1$/) {
      return ($length1 > 0) ? ($best_substring_length / $length1) : 0;
   } elsif ($info_type =~ /^max-ratio2$/) {
      return ($length2 > 0) ? ($best_substring_length / $length2) : 0;
   } elsif ($info_type =~ /^substring$/) {
      return join("", @char_list1[$best_substring_start1 .. $best_substring_start1+$best_substring_length-1]);
   } else {
      my $info = "s1=$s1;s2=$s2";
      $info .= ";best_substring_length=$best_substring_length";
      $info .= ";best_substring_start1=$best_substring_start1";
      $info .= ";best_substring_start2=$best_substring_start2";
      $info .= ";length1=$length1";
      $info .= ";length2=$length2";
      return $info;
   }
}
374
+
375
sub n_shared_chars_at_start {
   # input:  two strings
   # output: number of characters (a lead byte plus the continuation bytes
   #         \x80-\xBF that follow it) the two strings have in common at
   #         their start
   # The patterns use /s so that a newline is an ordinary character. (Without
   # it, both matches failed at a newline and the comparison of the two
   # undefined results counted as one more shared character.)
   local($caller,$s1,$s2) = @_;

   return 0 unless defined($s1) && defined($s2);
   my $n = 0;
   while (($s1 ne "") && ($s2 ne "")) {
      my ($c1, $rest1) = ($s1 =~ /^(.[\x80-\xBF]*)(.*)$/s);
      my ($c2, $rest2) = ($s2 =~ /^(.[\x80-\xBF]*)(.*)$/s);
      last unless defined($c1) && defined($c2) && ($c1 eq $c2);
      $n++;
      $s1 = $rest1;
      $s2 = $rest2;
   }
   return $n;
}
392
+
393
sub char_length {
   # input:  $string and an optional $byte_offset into it
   # output: number of bytes (1-6) announced by the lead byte found at that
   #         offset; 0 if that byte is not a lead byte (or there is none)
   local($caller,$string,$byte_offset) = @_;

   my $tail = $string;
   $tail = substr($string, $byte_offset) if $byte_offset;

   my @length_by_lead_byte = ([1, qr/^[\x00-\x7F]/],
                              [2, qr/^[\xC0-\xDF]/],
                              [3, qr/^[\xE0-\xEF]/],
                              [4, qr/^[\xF0-\xF7]/],
                              [5, qr/^[\xF8-\xFB]/],
                              [6, qr/^[\xFC-\xFD]/]);
   foreach my $entry (@length_by_lead_byte) {
      my ($n_bytes, $lead_byte_re) = @$entry;
      return $n_bytes if $tail =~ $lead_byte_re;
   }
   return 0;
}
405
+
406
sub length_in_utf8_chars {
   # input:  string
   # output: number of characters, obtained by counting everything that is
   #         not a continuation byte (\x80-\xBF)
   local($caller,$s) = @_;

   (my $without_continuation_bytes = $s) =~ s/[\x80-\xBF]+//g;
   return length($without_continuation_bytes);
}
413
+
414
sub byte_length_of_n_chars {
   # input:  $char_length         number of characters to measure
   #         $string              string to measure in
   #         $byte_offset         where to start (default 0)
   #         $undef_return_value  value returned on failure (default -1)
   # output: number of bytes taken up by the next $char_length characters, or
   #         $undef_return_value as soon as char_length reports 0 for a position
   local($caller,$char_length,$string,$byte_offset,$undef_return_value) = @_;

   $byte_offset = 0 unless defined($byte_offset);
   $undef_return_value = -1 unless defined($undef_return_value);

   my $n_bytes = 0;
   for my $nth_char (1 .. $char_length) {
      my $len = $caller->char_length($string, $byte_offset + $n_bytes);
      if (! $len) {
         return $undef_return_value;
      }
      $n_bytes += $len;
   }
   return $n_bytes;
}
428
+
429
sub replace_non_ASCII_bytes {
   # input:  $string
   #         $replacement  "Unicode", "U+4", "\u" or "HEX" (default) selects a
   #                       markup mode; any other value is used literally
   # output: in a markup mode, every control character other than tab and
   #         newline and every non-ASCII character or stray byte is replaced
   #         by <U...>, \u...., <U+....> or <HEX-...>; otherwise every byte
   #         \x80-\xFF is replaced by $replacement.
   # Working variables are lexical, so nothing leaks into package globals.
   local($caller,$string,$replacement) = @_;

   $replacement = "HEX" unless defined($replacement);
   unless ($replacement =~ /^(Unicode|U\+4|\\u|HEX)$/) {
      my $replaced = $string;
      $replaced =~ s/[\x80-\xFF]/$replacement/g;
      return $replaced;
   }

   my $new_string = "";
   # Each round splits off a prefix of tab, newline and printable ASCII, plus
   # the one character to be marked up that follows it.
   while (my ($pre, $utf8_char, $post) = ($string =~ /^([\x09\x0A\x20-\x7E]*)([\x00-\x08\x0B-\x1F\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF8-\xFF][\x80-\xBF]+|[\x80-\xBF])(.*)$/s)) {
      if ($replacement =~ /Unicode/) {
         $new_string .= $pre . "<U" . (uc $caller->utf8_to_unicode($utf8_char)) . ">";
      } elsif ($replacement =~ /\\u/) {
         $new_string .= $pre . "\\u" . (uc sprintf("%04x", $caller->utf8_to_unicode($utf8_char)));
      } elsif ($replacement =~ /U\+4/) {
         $new_string .= $pre . "<U+" . (uc $caller->utf8_to_4hex_unicode($utf8_char)) . ">";
      } else {
         $new_string .= $pre . "<HEX-" . $caller->utf8_to_hex($utf8_char) . ">";
      }
      $string = $post;
   }
   $new_string .= $string;
   return $new_string;
}
454
+
455
# Return true if $string consists solely of tab, linefeed, printable ASCII
# (\x20-\x7E) and structurally well-formed 2-, 3- and 4-byte UTF-8 sequences.
# Other ASCII control characters make the string invalid. Only the byte
# pattern (lead byte plus the right number of continuation bytes) is checked.
sub valid_utf8_string_p {
    my ($caller, $string) = @_;

    my $well_formed_re = qr/^(?:
          [\x09\x0A\x20-\x7E]
        | [\xC0-\xDF][\x80-\xBF]
        | [\xE0-\xEF][\x80-\xBF][\x80-\xBF]
        | [\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
    )*$/x;
    return $string =~ $well_formed_re;
}
460
+
461
# Return true if $string consists solely of ASCII bytes (\x00-\x7F, control
# characters included) and structurally well-formed 2-, 3- and 4-byte UTF-8
# sequences. Only the byte pattern is checked.
sub valid_utf8_string_incl_ascii_control_p {
    my ($caller, $string) = @_;

    my $well_formed_re = qr/^(?:
          [\x00-\x7F]
        | [\xC0-\xDF][\x80-\xBF]
        | [\xE0-\xEF][\x80-\xBF][\x80-\xBF]
        | [\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
    )*$/x;
    return $string =~ $well_formed_re;
}
466
+
467
# Return the upper-case hexadecimal dump of $s, at least two digits per
# character of the string (e.g. the bytes \xE2\x82\xAC give "E282AC").
# An empty or undefined $s gives "".
#
# The result is built in a lexical; the previous version accumulated it in
# the package global $hex, overwriting that variable on every call.
sub utf8_to_hex {
    my ($caller, $s) = @_;

    return "" unless defined($s) && length($s);
    return join("", map { sprintf("%02X", ord($_)) } split(//, $s));
}
476
+
477
# Convert a surface hex string such as "\xE2\x80\xBA" (literal backslash-x
# notation) or "E280BA" into the bytes it spells out.
# Decoding consumes two-digit hex pairs, each optionally preceded by "\x",
# from the start of $s and stops at the first text that is not such a pair;
# whatever follows is ignored. An undefined $s gives "".
#
# Fixes over the previous version:
#  - a newline anywhere in the remainder no longer discards the hex pairs
#    that precede it (the old pattern lacked /s, so the whole match failed);
#  - $hex and $rest are no longer written to as package globals.
sub hex_to_utf8 {
    my ($caller, $s) = @_;

    my $utf8 = "";
    return $utf8 unless defined($s);
    # \G anchors every match where the previous one ended, so decoding
    # cannot skip over non-hex text.
    while ($s =~ /\G(?:\\x)?([0-9A-Fa-f]{2})/gc) {
        $utf8 .= sprintf("%c", hex($1));
    }
    return $utf8;
}
488
+
489
# Return the code point of the UTF-8 character $s, as computed by
# $caller->utf8_to_unicode, formatted as lower-case hexadecimal padded with
# zeros to at least four digits.
sub utf8_to_4hex_unicode {
    my ($caller, $s) = @_;

    my $code_point = $caller->utf8_to_unicode($s);
    return sprintf("%4.4x", $code_point);
}
494
+
495
# Return the numeric code point encoded by the UTF-8 byte sequence $s.
# $s is expected to hold a single character; the bytes are folded left to
# right, each contributing the payload bits of its byte class. The bytes
# \xFE and \xFF carry no payload and are ignored. Empty or undefined input
# gives 0.
#
# Fixes over the previous version:
#  - ASCII bytes (\x00-\x7F) used to match no branch, so a one-byte
#    character always came back as 0; it now yields its own code;
#  - $unicode and $c are lexical instead of overwriting package globals.
sub utf8_to_unicode {
    my ($caller, $s) = @_;

    my $unicode = 0;
    return $unicode unless defined($s);
    foreach my $byte (split(//, $s)) {
        my $code = ord($byte);
        if ($code <= 0x7F) {                                   # ASCII: 7 payload bits
            $unicode = $unicode * 128 + $code;
        } elsif ($code <= 0xBF) {                              # continuation byte: 6 bits
            $unicode = $unicode * 64 + ($code & 0x3F);
        } elsif ($code <= 0xDF) {                              # lead byte of 2-byte sequence
            $unicode = $unicode * 32 + ($code & 0x1F);
        } elsif ($code <= 0xEF) {                              # lead byte of 3-byte sequence
            $unicode = $unicode * 16 + ($code & 0x0F);
        } elsif ($code <= 0xF7) {                              # lead byte of 4-byte sequence
            $unicode = $unicode * 8 + ($code & 0x07);
        } elsif ($code <= 0xFB) {                              # lead byte of 5-byte sequence
            $unicode = $unicode * 4 + ($code & 0x03);
        } elsif ($code <= 0xFD) {                              # lead byte of 6-byte sequence
            $unicode = $unicode * 2 + ($code & 0x01);
        }
    }
    return $unicode;
}
517
+
518
# Return $string with every character outside printable ASCII (space through
# tilde) replaced by "<HEX-XX>", XX being the character's code in upper-case
# hex with at least two digits. Printable ASCII is copied unchanged.
# An undefined $string gives "".
#
# Done in one substitution pass with lexical variables only; the previous
# version peeled one character at a time with substr (re-copying the
# remainder on each step) and overwrote the package globals $char and $hex.
sub charhex {
    my ($caller, $string) = @_;

    my $result = defined($string) ? $string : "";
    $result =~ s/([^ -~])/sprintf("<HEX-%02X>", ord($1))/ge;
    return $result;
}
535
+
536
# Convert the Windows-1252 encoded byte string $s to UTF-8.
#
# Parameters:
#   $norm_to_ascii_p             (default 1) when true, the ellipsis, the
#                                single quotation marks, the en/em dashes and
#                                the small tilde are rendered as the ASCII
#                                strings "...", "`", "'", "-", "-" and "~"
#                                instead of their UTF-8 encodings.
#   $preserve_potential_utf8s_p  (default 1) when true, byte runs that already
#                                look like a well-formed 2- to 5-byte UTF-8
#                                sequence are copied through unchanged rather
#                                than re-encoded byte by byte.
#
# Bytes \xA0-\xFF are encoded as the code point of the same number. Bytes
# \x80-\x9F are looked up in the Windows-1252 table; the five bytes that table
# leaves undefined (\x81, \x8D, \x8F, \x90, \x9D) become "?".
# A pure-ASCII (or undefined) $s is returned as is.
#
# Behaviour is that of the earlier elsif chain, except that the per-step byte
# count is now a lexical instead of the package global $n_bytes.
sub windows1252_to_utf8 {
    my ($caller, $s, $norm_to_ascii_p, $preserve_potential_utf8s_p) = @_;

    return $s if $s =~ /^[\x00-\x7F]*$/; # all ASCII

    $norm_to_ascii_p = 1 unless defined($norm_to_ascii_p);
    $preserve_potential_utf8s_p = 1 unless defined($preserve_potential_utf8s_p);

    # UTF-8 encodings of the characters Windows-1252 places in \x80-\x9F.
    my %utf8_for_byte = (
        "\x80" => "\xE2\x82\xAC", # Euro sign
        "\x82" => "\xE2\x80\x9A", # single low quotation mark
        "\x83" => "\xC6\x92",     # Latin small letter f with hook
        "\x84" => "\xE2\x80\x9E", # double low quotation mark
        "\x85" => "\xE2\x80\xA6", # horizontal ellipsis (three dots)
        "\x86" => "\xE2\x80\xA0", # dagger
        "\x87" => "\xE2\x80\xA1", # double dagger
        "\x88" => "\xCB\x86",     # circumflex
        "\x89" => "\xE2\x80\xB0", # per mille sign
        "\x8A" => "\xC5\xA0",     # Latin capital letter S with caron
        "\x8B" => "\xE2\x80\xB9", # single left-pointing angle quotation mark
        "\x8C" => "\xC5\x92",     # OE ligature
        "\x8E" => "\xC5\xBD",     # Latin capital letter Z with caron
        "\x91" => "\xE2\x80\x98", # left single quotation mark
        "\x92" => "\xE2\x80\x99", # right single quotation mark
        "\x93" => "\xE2\x80\x9C", # left double quotation mark
        "\x94" => "\xE2\x80\x9D", # right double quotation mark
        "\x95" => "\xE2\x80\xA2", # bullet
        "\x96" => "\xE2\x80\x93", # n dash
        "\x97" => "\xE2\x80\x94", # m dash
        "\x98" => "\xCB\x9C",     # small tilde
        "\x99" => "\xE2\x84\xA2", # trade mark sign
        "\x9A" => "\xC5\xA1",     # Latin small letter s with caron
        "\x9B" => "\xE2\x80\xBA", # single right-pointing angle quotation mark
        "\x9C" => "\xC5\x93",     # oe ligature
        "\x9E" => "\xC5\xBE",     # Latin small letter z with caron
        "\x9F" => "\xC5\xB8",     # Latin capital letter Y with diaeresis
    );
    # ASCII stand-ins used instead of the above when $norm_to_ascii_p is set.
    my %ascii_for_byte = (
        "\x85" => "...",
        "\x91" => "`",
        "\x92" => "'",
        "\x96" => "-",
        "\x97" => "-",
        "\x98" => "~",
    );
    # A lead byte followed by exactly the number of continuation bytes it announces.
    my $utf8_sequence_re = qr/^([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3}|[\xF8-\xFB][\x80-\xBF]{4})/;

    my $result = "";
    while ($s ne "") {
        my $byte = substr($s, 0, 1);
        my $n_bytes = 1;
        if ($byte =~ /^[\x00-\x7F]\z/) {
            $result .= $byte; # ASCII
        } elsif ($preserve_potential_utf8s_p && $s =~ $utf8_sequence_re) {
            $result .= $1;    # already looks like UTF-8: keep as is
            $n_bytes = length($1);
        } elsif ($byte =~ /^[\xA0-\xBF]\z/) {
            $result .= "\xC2$byte";
        } elsif ($byte =~ /^[\xC0-\xFF]\z/) {
            # U+00C0-U+00FF: lead byte \xC3, continuation byte = byte - 0x40
            $result .= "\xC3" . chr(ord($byte) - 0x40);
        } elsif ($norm_to_ascii_p && exists($ascii_for_byte{$byte})) {
            $result .= $ascii_for_byte{$byte};
        } elsif (exists($utf8_for_byte{$byte})) {
            $result .= $utf8_for_byte{$byte};
        } else {
            $result .= "?";   # not defined in Windows-1252
        }
        $s = substr($s, $n_bytes);
    }
    return $result;
}
629
+
630
# Return $s with invisible and formatting-only content removed: control
# characters (except tab and linefeed), zero-width characters, byte order
# mark, directional marks, join marks, variation selectors, Arabic tatweel.
# $s is treated as a UTF-8 encoded byte string.
sub delete_weird_stuff {
    my ($caller, $s) = @_;

    $s =~ s{(
          [\x00-\x08\x0B-\x1F\x7F]          # ASCII control characters other than tab and linefeed
        | \xC2[\x80-\x9F]                   # U+0080 to U+009F
        | \xD9\x80                          # U+0640 Arabic tatweel
        | \xE2\x80[\x8B-\x8F]               # U+200B to U+200F: zero-width space, joiners, directional marks
        | \xEF\xB8[\x80-\x8F]               # U+FE00 to U+FE0F: variation selectors
        | \xEF\xBB\xBF                      # U+FEFF: byte order mark
        | \xF3\xA0[\x84-\x87][\x80-\xBF]    # U+E0100 to U+E01FF: supplementary variation selectors
    )}{}gx;
    return $s;
}
638
+
639
# Return the number of characters in the UTF-8 encoded byte string $s,
# counted as the number of bytes that are not continuation bytes (\x80-\xBF).
#
# The working copy is lexical; the previous version kept it in the package
# global $s2 and therefore overwrote that variable on every call.
sub number_of_utf8_character {
    my ($caller, $s) = @_;

    (my $lead_bytes = $s) =~ s/[\x80-\xBF]//g;
    return length($lead_bytes);
}
646
+
647
# Return, as a string, a regular-expression alternation that matches one
# capital letter in UTF-8 bytes: A-Z plus other Latin-based capital letters
# with accents, umlauts and other decorations (Latin-1 U+00C0-U+00DE except
# the multiplication sign, and the capitals of Latin Extended-A).
# The string has no enclosing group; wrap it in (?:...) before appending to it.
#
# Fixes over the previous version: the \xC4 class contained "\x964", i.e. the
# byte \x96 followed by a literal digit "4", so "\xC4" . "4" counted as a
# capital letter while \xC4\x92 (U+0112) did not. The \xC5 class lacked
# \x94 (U+0154) and \xAE (U+016E).
sub cap_letter_reg_exp {
    return "[A-Z]"
         . "|\xC3[\x80-\x96\x98-\x9E]"
         . "|\xC4[\x80\x82\x84\x86\x88\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB9\xBB\xBD\xBF]"
         . "|\xC5[\x81\x83\x85\x87\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB8\xB9\xBB\xBD]";
}
651
+
652
# Rewrite the regular-expression source text $s so that certain lower-case
# accented letters (in UTF-8 bytes) also match their upper-case form: the
# second byte of the letter is replaced by a two-member character class
# holding the upper- and lower-case byte.
# Covered: Latin-1 lower-case letters \xC3\xA0-\xC3\xBE (except the division
# sign \xC3\xB7) and the three letters \xC5\x91, \xC5\xA1, \xC5\xB1.
# Example: "\xC3\xA9" becomes "\xC3[\x89\xA9]".
sub regex_extended_case_expansion {
    my ($caller, $s) = @_;

    # Latin-1: the upper-case byte is the lower-case byte minus 0x20.
    $s =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3[" . chr(ord($1) - 0x20) . $1 . "]"/ge;
    # Latin Extended-A pairs handled here: upper-case byte = lower-case byte minus 1.
    $s =~ s/\xC5([\x91\xA1\xB1])/"\xC5[" . chr(ord($1) - 1) . $1 . "]"/ge;

    return $s;
}
695
+
696
# Return $s (a UTF-8 encoded byte string) converted to lower case.
# Handles ASCII A-Z, the Latin-1 capitals, Latin Extended-A, Greek capitals
# (including those with tonos or dialytika), Cyrillic U+0400-U+042F and the
# fullwidth forms of A-Z. Everything else is returned unchanged.
#
# All patterns are written as byte escapes and byte arithmetic, the same way
# extended_upper_case is written, so the result no longer depends on how the
# source file itself is encoded or read.
#
# Fix over the previous version: U+0178 (capital Y with diaeresis) is now
# lower-cased to U+00FF; it was missing from the table.
# NOTE(review): U+0130 (capital I with dot above) still maps to U+0131
# (dotless small i), as in the previous table - confirm this is intended.
sub extended_lower_case {
    my ($caller, $s) = @_;

    $s =~ tr/A-Z/a-z/;
    # None of the rules below can apply without one of these lead bytes.
    return $s unless $s =~ /[\xC3-\xC5\xCE\xD0\xEF]/;

    # Latin-1, U+00C0-U+00DE except the multiplication sign U+00D7:
    # lower case is 0x20 higher, same lead byte.
    $s =~ s/\xC3([\x80-\x96\x98-\x9E])/"\xC3" . chr(ord($1) + 0x20)/ge;

    # Latin Extended-A: lower case is always the next code point.
    if ($s =~ /[\xC4\xC5][\x80-\xBF]/) {
        # U+0100-U+0137: capitals are the even code points; odd ones stay as they are.
        $s =~ s/\xC4([\x80-\xB7])/"\xC4" . chr(ord($1) | 1)/ge;
        # U+0139-U+0147: capitals are the odd code points.
        $s =~ s/\xC4([\xB9\xBB\xBD])/"\xC4" . chr(ord($1) + 1)/ge;
        $s =~ s/\xC4\xBF/\xC5\x80/g;    # U+013F -> U+0140 crosses into lead byte \xC5
        $s =~ s/\xC5([\x81\x83\x85\x87])/"\xC5" . chr(ord($1) + 1)/ge;
        # U+014A-U+0177: capitals are the even code points again.
        $s =~ s/\xC5([\x8A-\xB7])/"\xC5" . chr(ord($1) | 1)/ge;
        # U+0178 has its lower case in Latin-1 (U+00FF).
        $s =~ s/\xC5\xB8/\xC3\xBF/g;
        # U+0179-U+017D: capitals are the odd code points.
        $s =~ s/\xC5([\xB9\xBB\xBD])/"\xC5" . chr(ord($1) + 1)/ge;
    }

    # Greek letters
    if ($s =~ /\xCE[\x86-\xAB]/) {
        # U+0391-U+039F -> U+03B1-U+03BF (same lead byte)
        $s =~ s/\xCE([\x91-\x9F])/"\xCE" . chr(ord($1) + 0x20)/ge;
        # U+03A0-U+03AB (U+03A2 is unassigned) -> U+03C0-U+03CB (lead byte \xCF)
        $s =~ s/\xCE([\xA0\xA1\xA3-\xAB])/"\xCF" . chr(ord($1) - 0x20)/ge;
        # Capitals with tonos have no regular offset to their lower-case forms.
        my %lower_of_tonos_capital = (
            "\x86" => "\xCE\xAC",    # U+0386 -> U+03AC
            "\x88" => "\xCE\xAD",    # U+0388 -> U+03AD
            "\x89" => "\xCE\xAE",    # U+0389 -> U+03AE
            "\x8A" => "\xCE\xAF",    # U+038A -> U+03AF
            "\x8C" => "\xCF\x8C",    # U+038C -> U+03CC
            "\x8E" => "\xCF\x8D",    # U+038E -> U+03CD
            "\x8F" => "\xCF\x8E",    # U+038F -> U+03CE
        );
        $s =~ s/\xCE([\x86\x88-\x8A\x8C\x8E\x8F])/$lower_of_tonos_capital{$1}/g;
    }

    # Cyrillic letters
    if ($s =~ /\xD0[\x80-\xAF]/) {
        $s =~ s/\xD0([\x90-\x9F])/"\xD0" . chr(ord($1) + 0x20)/ge;    # U+0410-U+041F -> U+0430-U+043F
        $s =~ s/\xD0([\xA0-\xAF])/"\xD1" . chr(ord($1) - 0x20)/ge;    # U+0420-U+042F -> U+0440-U+044F
        $s =~ s/\xD0([\x80-\x8F])/"\xD1" . chr(ord($1) + 0x10)/ge;    # U+0400-U+040F -> U+0450-U+045F
    }

    # Fullwidth A-Z, U+FF21-U+FF3A -> U+FF41-U+FF5A
    $s =~ s/\xEF\xBC([\xA1-\xBA])/"\xEF\xBD" . chr(ord($1) - 0x20)/ge;

    return $s;
}
918
+
919
# Return $s (a UTF-8 encoded byte string) converted to upper case.
# Handles ASCII a-z, the Latin-1 lower-case letters \xC3\xA0-\xC3\xBE (except
# the division sign \xC3\xB7) and the three Latin Extended-A letters
# \xC5\x91, \xC5\xA1 and \xC5\xB1. Everything else is returned unchanged.
sub extended_upper_case {
    my ($caller, $s) = @_;

    $s =~ tr/a-z/A-Z/;
    if ($s =~ /[\xC3-\xC5][\x80-\xBF]/) {
        # Latin-1: the upper-case byte is the lower-case byte minus 0x20.
        $s =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3" . chr(ord($1) - 0x20)/ge;
        # Latin Extended-A pairs handled here: the capital is the preceding code point.
        $s =~ s/\xC5([\x91\xA1\xB1])/"\xC5" . chr(ord($1) - 1)/ge;
    }
    return $s;
}
963
+
964
# Return $s with its first character upper-cased via
# $caller->extended_upper_case and the rest left untouched.
# The first character may be ASCII or a 2- or 3-byte UTF-8 sequence; if $s
# starts with anything else (or is empty or undefined) it is returned as is.
#
# Fixes over the previous version:
#  - the pattern lacked /s, so a string with an embedded newline was returned
#    unchanged and a string ending in a newline lost that newline;
#  - $first_char and $rest are lexical instead of overwriting package globals.
sub extended_first_upper_case {
    my ($caller, $s) = @_;

    if (defined($s)
        && $s =~ /^([\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF])(.*)\z/s) {
        my ($first_char, $rest) = ($1, $2);
        return $caller->extended_upper_case($first_char) . $rest;
    }
    return $s;
}
973
+
974
# Undo a double UTF-8 conversion for two-byte characters with lead bytes
# \xC2-\xC5: the four-byte pattern \xC3 [\x82-\x85] \xC2 X (X being a
# continuation byte) is collapsed back to the two bytes [\xC2-\xC5] X.
# Example: "\xC3\x83\xC2\xA9" becomes "\xC3\xA9".
sub repair_doubly_converted_utf8_strings {
    my ($caller, $s) = @_;

    if ($s =~ /\xC3[\x82-\x85]\xC2[\x80-\xBF]/) {
        # The four lead bytes are handled one after another, in ascending
        # order, because the output of one pass can complete a pattern that
        # a later pass then repairs.
        foreach my $step (0 .. 3) {
            my $mangled_lead = "\xC3" . chr(0x82 + $step) . "\xC2";
            my $true_lead = chr(0xC2 + $step);
            $s =~ s/${mangled_lead}([\x80-\xBF])/${true_lead}$1/g;
        }
    }
    return $s;
}
985
+
986
# Repair UTF-8 text that went through a wrong single-byte-to-UTF-8 conversion
# and return the repaired string. Three passes run in order:
#  1. \xC3\xA2 \xC2\x80 \xC2 X is collapsed to \xE2\x80 X (a three-byte
#     \xE2\x80 X character whose bytes were each converted as Latin-1).
#  2. Each \xC2 followed by a byte in \x80-\x9F has that byte converted with
#     $caller->windows1252_to_utf8 (ASCII normalization switched off); the
#     pair is kept as is when that conversion yields "?".
#  3. A fixed table maps the byte sequences that result from reading a UTF-8
#     character as Windows-1252 back to the intended character.
# NOTE(review): $pre, $last_c, $post, $c_windows and $c_utf8 are not declared
# with my/local in this sub, so they are written as package variables.
sub repair_misconverted_windows_to_utf8_strings {
    local($caller, $s) = @_;

    # correcting conversions of UTF8 using Latin1-to-UTF converter
    if ($s =~ /\xC3\xA2\xC2\x80\xC2[\x90-\xEF]/) {
        my $result = "";
        # Consume the string left to right: text before the damaged sequence,
        # its last byte, and the remainder still to be scanned.
        while (($pre,$last_c,$post) = ($s =~ /^(.*?)\xC3\xA2\xC2\x80\xC2([\x90-\xEF])(.*)$/s)) {
            $result .= "$pre\xE2\x80$last_c";
            $s = $post;
        }
        $result .= $s;
        $s = $result;
    }
    # correcting conversions of Windows1252-to-UTF8 using Latin1-to-UTF converter
    if ($s =~ /\xC2[\x80-\x9F]/) {
        my $result = "";
        while (($pre,$c_windows,$post) = ($s =~ /^(.*?)\xC2([\x80-\x9F])(.*)$/s)) {
            $c_utf8 = $caller->windows1252_to_utf8($c_windows, 0);
            # "?" is what windows1252_to_utf8 returns for a byte it cannot map;
            # in that case the original two bytes are kept.
            $result .= ($c_utf8 eq "?") ? ($pre . "\xC2" . $c_windows) : "$pre$c_utf8";
            $s = $post;
        }
        $result .= $s;
        $s = $result;
    }
    # Every pattern below starts with \xC3, so the whole table is skipped
    # when that byte is absent. The xNN labels name the Windows-1252 byte
    # whose character is being restored.
    if ($s =~ /\xC3/) {
        $s =~ s/\xC3\xA2\xE2\x80\x9A\xC2\xAC/\xE2\x82\xAC/g; # x80 -> Euro sign
        # x81 codepoint undefined in Windows 1252
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xA1/\xE2\x80\x9A/g; # x82 -> single low-9 quotation mark
        $s =~ s/\xC3\x86\xE2\x80\x99/\xC6\x92/g; # x83 -> Latin small letter f with hook
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xBE/\xE2\x80\x9E/g; # x84 -> double low-9 quotation mark
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA6/\xE2\x80\xA6/g; # x85 -> horizontal ellipsis
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA0/\xE2\x80\xA0/g; # x86 -> dagger
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA1/\xE2\x80\xA1/g; # x87 -> double dagger
        $s =~ s/\xC3\x8B\xE2\x80\xA0/\xCB\x86/g; # x88 -> modifier letter circumflex accent
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB0/\xE2\x80\xB0/g; # x89 -> per mille sign
        $s =~ s/\xC3\x85\xC2\xA0/\xC5\xA0/g; # x8A -> Latin capital letter S with caron
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB9/\xE2\x80\xB9/g; # x8B -> single left-pointing angle quotation mark
        $s =~ s/\xC3\x85\xE2\x80\x99/\xC5\x92/g; # x8C -> Latin capital ligature OE
        # x8D codepoint undefined in Windows 1252
        $s =~ s/\xC3\x85\xC2\xBD/\xC5\xBD/g; # x8E -> Latin capital letter Z with caron
        # x8F codepoint undefined in Windows 1252
        # x90 codepoint undefined in Windows 1252
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xCB\x9C/\xE2\x80\x98/g; # x91 a-circumflex+euro+small tilde -> left single quotation mark
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2/\xE2\x80\x99/g; # x92 a-circumflex+euro+trademark -> right single quotation mark
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\x93/\xE2\x80\x9C/g; # x93 a-circumflex+euro+Latin small ligature oe -> left double quotation mark
        # x94 maps through undefined intermediate code point
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA2/\xE2\x80\xA2/g; # x95 a-circumflex+euro+cent sign -> bullet
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C/\xE2\x80\x93/g; # x96 a-circumflex+euro+left double quotation mark -> en dash
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D/\xE2\x80\x94/g; # x97 a-circumflex+euro+right double quotation mark -> em dash
        $s =~ s/\xC3\x8B\xC5\x93/\xCB\x9C/g; # x98 Latin capital e diaeresis+Latin small ligature oe -> small tilde
        $s =~ s/\xC3\xA2\xE2\x80\x9E\xC2\xA2/\xE2\x84\xA2/g; # x99 -> trade mark sign
        $s =~ s/\xC3\x85\xC2\xA1/\xC5\xA1/g; # x9A -> Latin small letter s with caron
        $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xBA/\xE2\x80\xBA/g; # x9B -> single right-pointing angle quotation mark
        $s =~ s/\xC3\x85\xE2\x80\x9C/\xC5\x93/g; # x9C -> Latin small ligature oe
        # x9D codepoint undefined in Windows 1252
        $s =~ s/\xC3\x85\xC2\xBE/\xC5\xBE/g; # x9E -> Latin small letter z with caron
        $s =~ s/\xC3\x85\xC2\xB8/\xC5\xB8/g; # x9F -> Latin capital letter Y with diaeresis
        $s =~ s/\xC3\xAF\xC2\xBF\xC2\xBD/\xEF\xBF\xBD/g; # replacement character
    }

    return $s;
}
1048
+
1049
# Convert the Latin-1 (ISO-8859-1) byte string $s to UTF-8: bytes \x80-\xBF
# become \xC2 followed by the byte, bytes \xC0-\xFF become \xC3 followed by
# the byte minus 0x40. ASCII is copied unchanged. An undefined $s gives "".
#
# Done in a single substitution pass with lexical variables only; the previous
# version re-matched and re-copied the whole remainder for every non-ASCII
# byte and overwrote the package globals $pre, $c and $post.
sub latin1_to_utf {
    my ($caller, $s) = @_;

    my $result = defined($s) ? $s : "";
    $result =~ s/([\x80-\xFF])/ord($1) < 0xC0 ? "\xC2$1" : "\xC3" . chr(ord($1) - 0x40)/ge;
    return $result;
}
1066
+
1067
# Tell whether a character-type label (a string such as "Greek letter")
# denotes a letter-like type: true when the label contains, as whole words,
# "letter", "syllable", "diacritic", or "character" preceded by one of
# CJK, hiragana, kana, katakana.
# The result is that of the pattern match itself, so in list context the
# captured groups are returned.
sub character_type_is_letter_type {
    my ($caller, $char_type) = @_;

    my $letter_like_re = qr/\b((CJK|hiragana|kana|katakana)\s+character|diacritic|letter|syllable)\b/;
    return ($char_type =~ $letter_like_re);
}
1072
+
1073
+ sub character_type {
1074
+ local($caller, $c) = @_;
1075
+
1076
+ if ($c =~ /^[\x00-\x7F]/) {
1077
+ return "XML tag" if $c =~ /^<.*>$/;
1078
+ return "ASCII Latin letter" if $c =~ /^[a-z]$/i;
1079
+ return "ASCII digit" if $c =~ /^[0-9]$/i;
1080
+ return "ASCII whitespace" if $c =~ /^[\x09-\x0D\x20]$/;
1081
+ return "ASCII control-character" if $c =~ /^[\x00-\x1F\x7F]$/;
1082
+ return "ASCII currency" if $c eq "\$";
1083
+ return "ASCII punctuation";
1084
+ } elsif ($c =~ /^[\xC0-\xDF]/) {
1085
+ return "non-UTF8 (invalid)" unless $c =~ /^[\xC0-\xDF][\x80-\xBF]$/;
1086
+ return "non-shortest-UTF8 (invalid)" if $c =~ /[\xC0-\xC1]/;
1087
+ return "non-ASCII control-character" if $c =~ /\xC2[\x80-\x9F]/;
1088
+ return "non-ASCII whitespace" if $c =~ /\xC2\xA0/;
1089
+ return "non-ASCII currency" if $c =~ /\xC2[\xA2-\xA5]/;
1090
+ return "fraction" if $c =~ /\xC2[\xBC-\xBE]/; # NEW
1091
+ return "superscript digit" if $c =~ /\xC2[\xB2\xB3\xB9]/;
1092
+ return "non-ASCII Latin letter" if $c =~ /\xC2\xB5/; # micro sign
1093
+ return "non-ASCII punctuation" if $c =~ /\xC2[\xA0-\xBF]/;
1094
+ return "non-ASCII punctuation" if $c =~ /\xC3[\x97\xB7]/;
1095
+ return "non-ASCII Latin letter" if $c =~ /\xC3[\x80-\xBF]/;
1096
+ return "Latin ligature letter" if $c =~ /\xC4[\xB2\xB3]/;
1097
+ return "Latin ligature letter" if $c =~ /\xC5[\x92\x93]/;
1098
+ return "non-ASCII Latin letter" if $c =~ /[\xC4-\xC8]/;
1099
+ return "non-ASCII Latin letter" if $c =~ /\xC9[\x80-\x8F]/;
1100
+ return "IPA" if $c =~ /\xC9[\x90-\xBF]/;
1101
+ return "IPA" if $c =~ /\xCA[\x80-\xBF]/;
1102
+ return "IPA" if $c =~ /\xCB[\x80-\xBF]/;
1103
+ return "combining-diacritic" if $c =~ /\xCC[\x80-\xBF]/;
1104
+ return "combining-diacritic" if $c =~ /\xCD[\x80-\xAF]/;
1105
+ return "Greek punctuation" if $c =~ /\xCD[\xBE]/; # Greek question mark
1106
+ return "Greek punctuation" if $c =~ /\xCE[\x87]/; # Greek semicolon
1107
+ return "Greek letter" if $c =~ /\xCD[\xB0-\xBF]/;
1108
+ return "Greek letter" if $c =~ /\xCE/;
1109
+ return "Greek letter" if $c =~ /\xCF[\x80-\xA1\xB3\xB7\xB8\xBA\xBB]/;
1110
+ return "Coptic letter" if $c =~ /\xCF[\xA2-\xAF]/;
1111
+ return "Cyrillic letter" if $c =~ /[\xD0-\xD3]/;
1112
+ return "Cyrillic letter" if $c =~ /\xD4[\x80-\xAF]/;
1113
+ return "Armenian punctuation" if $c =~ /\xD5[\x9A-\x9F]/;
1114
+ return "Armenian punctuation" if $c =~ /\xD6[\x89-\x8F]/;
1115
+ return "Armenian letter" if $c =~ /\xD4[\xB0-\xBF]/;
1116
+ return "Armenian letter" if $c =~ /\xD5/;
1117
+ return "Armenian letter" if $c =~ /\xD6[\x80-\x8F]/;
1118
+ return "Hebrew accent" if $c =~ /\xD6[\x91-\xAE]/;
1119
+ return "Hebrew punctuation" if $c =~ /\xD6\xBE/;
1120
+ return "Hebrew punctuation" if $c =~ /\xD7[\x80\x83\x86\xB3\xB4]/;
1121
+ return "Hebrew point" if $c =~ /\xD6[\xB0-\xBF]/;
1122
+ return "Hebrew point" if $c =~ /\xD7[\x81\x82\x87]/;
1123
+ return "Hebrew letter" if $c =~ /\xD7[\x90-\xB2]/;
1124
+ return "other Hebrew" if $c =~ /\xD6[\x90-\xBF]/;
1125
+ return "other Hebrew" if $c =~ /\xD7/;
1126
+ return "Arabic currency" if $c =~ /\xD8\x8B/; # Afghani sign
1127
+ return "Arabic punctuation" if $c =~ /\xD8[\x89-\x8D\x9B\x9E\x9F]/;
1128
+ return "Arabic punctuation" if $c =~ /\xD9[\xAA-\xAD]/;
1129
+ return "Arabic punctuation" if $c =~ /\xDB[\x94]/;
1130
+ return "Arabic tatweel" if $c =~ /\xD9\x80/;
1131
+ return "Arabic letter" if $c =~ /\xD8[\xA0-\xBF]/;
1132
+ return "Arabic letter" if $c =~ /\xD9[\x81-\x9F]/;
1133
+ return "Arabic letter" if $c =~ /\xD9[\xAE-\xBF]/;
1134
+ return "Arabic letter" if $c =~ /\xDA[\x80-\xBF]/;
1135
+ return "Arabic letter" if $c =~ /\xDB[\x80-\x95]/;
1136
+ return "Arabic Indic digit" if $c =~ /\xD9[\xA0-\xA9]/;
1137
+ return "Arabic Indic digit" if $c =~ /\xDB[\xB0-\xB9]/;
1138
+ return "other Arabic" if $c =~ /[\xD8-\xDB]/;
1139
+ return "Syriac punctuation" if $c =~ /\xDC[\x80-\x8F]/;
1140
+ return "Syriac letter" if $c =~ /\xDC[\x90-\xAF]/;
1141
+ return "Syriac diacritic" if $c =~ /\xDC[\xB0-\xBF]/;
1142
+ return "Syriac diacritic" if $c =~ /\xDD[\x80-\x8A]/;
1143
+ return "Thaana letter" if $c =~ /\xDE/;
1144
+ } elsif ($c =~ /^[\xE0-\xEF]/) {
1145
+ return "non-UTF8 (invalid)" unless $c =~ /^[\xE0-\xEF][\x80-\xBF]{2,2}$/;
1146
+ return "non-shortest-UTF8 (invalid)" if $c =~ /\xE0[\x80-\x9F]/;
1147
+ return "Arabic letter" if $c =~ /\xE0\xA2[\xA0-\xBF]/; # extended letters
1148
+ return "other Arabic" if $c =~ /\xE0\xA3/; # extended characters
1149
+ return "Devanagari punctuation" if $c =~ /\xE0\xA5[\xA4\xA5]/; # danda, double danda
1150
+ return "Devanagari digit" if $c =~ /\xE0\xA5[\xA6-\xAF]/;
1151
+ return "Devanagari letter" if $c =~ /\xE0[\xA4-\xA5]/;
1152
+ return "Bengali digit" if $c =~ /\xE0\xA7[\xA6-\xAF]/;
1153
+ return "Bengali currency" if $c =~ /\xE0\xA7[\xB2-\xB9]/;
1154
+ return "Bengali letter" if $c =~ /\xE0[\xA6-\xA7]/;
1155
+ return "Gurmukhi digit" if $c =~ /\xE0\xA9[\xA6-\xAF]/;
1156
+ return "Gurmukhi letter" if $c =~ /\xE0[\xA8-\xA9]/;
1157
+ return "Gujarati digit" if $c =~ /\xE0\xAB[\xA6-\xAF]/;
1158
+ return "Gujarati letter" if $c =~ /\xE0[\xAA-\xAB]/;
1159
+ return "Oriya digit" if $c =~ /\xE0\xAD[\xA6-\xAF]/;
1160
+ return "Oriya fraction" if $c =~ /\xE0\xAD[\xB2-\xB7]/;
1161
+ return "Oriya letter" if $c =~ /\xE0[\xAC-\xAD]/;
1162
+ return "Tamil digit" if $c =~ /\xE0\xAF[\xA6-\xAF]/;
1163
+ return "Tamil number" if $c =~ /\xE0\xAF[\xB0-\xB2]/; # number (10, 100, 1000)
1164
+ return "Tamil letter" if $c =~ /\xE0[\xAE-\xAF]/;
1165
+ return "Telegu digit" if $c =~ /\xE0\xB1[\xA6-\xAF]/;
1166
+ return "Telegu fraction" if $c =~ /\xE0\xB1[\xB8-\xBE]/;
1167
+ return "Telegu letter" if $c =~ /\xE0[\xB0-\xB1]/;
1168
+ return "Kannada digit" if $c =~ /\xE0\xB3[\xA6-\xAF]/;
1169
+ return "Kannada letter" if $c =~ /\xE0[\xB2-\xB3]/;
1170
+ return "Malayalam digit" if $c =~ /\xE0\xB5[\x98-\x9E\xA6-\xB8]/;
1171
+ return "Malayalam punctuation" if $c =~ /\xE0\xB5\xB9/; # date mark
1172
+ return "Malayalam letter" if $c =~ /\xE0[\xB4-\xB5]/;
1173
+ return "Sinhala digit" if $c =~ /\xE0\xB7[\xA6-\xAF]/;
1174
+ return "Sinhala punctuation" if $c =~ /\xE0\xB7\xB4/;
1175
+ return "Sinhala letter" if $c =~ /\xE0[\xB6-\xB7]/;
1176
+ return "Thai currency" if $c =~ /\xE0\xB8\xBF/;
1177
+ return "Thai digit" if $c =~ /\xE0\xB9[\x90-\x99]/;
1178
+ return "Thai character" if $c =~ /\xE0[\xB8-\xB9]/;
1179
+ return "Lao punctuation" if $c =~ /\xE0\xBA\xAF/; # Lao ellipsis
1180
+ return "Lao digit" if $c =~ /\xE0\xBB[\x90-\x99]/;
1181
+ return "Lao character" if $c =~ /\xE0[\xBA-\xBB]/;
1182
+ return "Tibetan punctuation" if $c =~ /\xE0\xBC[\x81-\x94]/;
1183
+ return "Tibetan sign" if $c =~ /\xE0\xBC[\x95-\x9F]/;
1184
+ return "Tibetan digit" if $c =~ /\xE0\xBC[\xA0-\xB3]/;
1185
+ return "Tibetan punctuation" if $c =~ /\xE0\xBC[\xB4-\xBD]/;
1186
+ return "Tibetan letter" if $c =~ /\xE0[\xBC-\xBF]/;
1187
+ return "Myanmar digit" if $c =~ /\xE1\x81[\x80-\x89]/;
1188
+ return "Myanmar digit" if $c =~ /\xE1\x82[\x90-\x99]/; # Myanmar Shan digits
1189
+ return "Myanmar punctuation" if $c =~ /\xE1\x81[\x8A-\x8B]/;
1190
+ return "Myanmar letter" if $c =~ /\xE1[\x80-\x81]/;
1191
+ return "Myanmar letter" if $c =~ /\xE1\x82[\x80-\x9F]/;
1192
+ return "Georgian punctuation" if $c =~ /\xE1\x83\xBB/;
1193
+ return "Georgian letter" if $c =~ /\xE1\x82[\xA0-\xBF]/;
1194
+ return "Georgian letter" if $c =~ /\xE1\x83/;
1195
+ return "Georgian letter" if $c =~ /\xE1\xB2[\x90-\xBF]/; # Georgian Mtavruli capital letters
1196
+ return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/; # Georgian small letters (Khutsuri)
1197
+ return "Korean Hangul letter" if $c =~ /\xE1[\x84-\x87]/;
1198
+ return "Ethiopic punctuation" if $c =~ /\xE1\x8D[\xA0-\xA8]/;
1199
+ return "Ethiopic digit" if $c =~ /\xE1\x8D[\xA9-\xB1]/;
1200
+ return "Ethiopic number" if $c =~ /\xE1\x8D[\xB2-\xBC]/;
1201
+ return "Ethiopic syllable" if $c =~ /\xE1[\x88-\x8D]/;
1202
+ return "Cherokee letter" if $c =~ /\xE1\x8E[\xA0-\xBF]/;
1203
+ return "Cherokee letter" if $c =~ /\xE1\x8F/;
1204
+ return "Canadian punctuation" if $c =~ /\xE1\x90\x80/; # Canadian Syllabics hyphen
1205
+ return "Canadian punctuation" if $c =~ /\xE1\x99\xAE/; # Canadian Syllabics full stop
1206
+ return "Canadian syllable" if $c =~ /\xE1[\x90-\x99]/;
1207
+ return "Canadian syllable" if $c =~ /\xE1\xA2[\xB0-\xBF]/;
1208
+ return "Canadian syllable" if $c =~ /\xE1\xA3/;
1209
+ return "Ogham whitespace" if $c =~ /\xE1\x9A\x80/;
1210
+ return "Ogham letter" if $c =~ /\xE1\x9A[\x81-\x9A]/;
1211
+ return "Ogham punctuation" if $c =~ /\xE1\x9A[\x9B-\x9C]/;
1212
+ return "Runic punctuation" if $c =~ /\xE1\x9B[\xAB-\xAD]/;
1213
+ return "Runic letter" if $c =~ /\xE1\x9A[\xA0-\xBF]/;
1214
+ return "Runic letter" if $c =~ /\xE1\x9B/;
1215
+ return "Khmer currency" if $c =~ /\xE1\x9F\x9B/;
1216
+ return "Khmer digit" if $c =~ /\xE1\x9F[\xA0-\xA9]/;
1217
+ return "Khmer letter" if $c =~ /\xE1[\x9E-\x9F]/;
1218
+ return "Mongolian punctuation" if $c =~ /\xE1\xA0[\x80-\x8A]/;
1219
+ return "Mongolian digit" if $c =~ /\xE1\xA0[\x90-\x99]/;
1220
+ return "Mongolian letter" if $c =~ /\xE1[\xA0-\xA1]/;
1221
+ return "Mongolian letter" if $c =~ /\xE1\xA2[\x80-\xAF]/;
1222
+ return "Buginese letter" if $c =~ /\xE1\xA8[\x80-\x9B]/;
1223
+ return "Buginese punctuation" if $c =~ /\xE1\xA8[\x9E-\x9F]/;
1224
+ return "Balinese letter" if $c =~ /\xE1\xAC/;
1225
+ return "Balinese letter" if $c =~ /\xE1\xAD[\x80-\x8F]/;
1226
+ return "Balinese digit" if $c =~ /\xE1\xAD[\x90-\x99]/;
1227
+ return "Balinese puncutation" if $c =~ /\xE1\xAD[\x9A-\xA0]/;
1228
+ return "Balinese symbol" if $c =~ /\xE1\xAD[\xA1-\xBF]/;
1229
+ return "Sundanese digit" if $c =~ /\xE1\xAE[\xB0-\xB9]/;
1230
+ return "Sundanese letter" if $c =~ /\xE1\xAE/;
1231
+ return "Cyrillic letter" if $c =~ /\xE1\xB2[\x80-\x8F]/;
1232
+ return "Sundanese punctuation" if $c =~ /\xE1\xB3[\x80-\x8F]/;
1233
+ return "IPA" if $c =~ /\xE1[\xB4-\xB6]/;
1234
+ return "non-ASCII Latin letter" if $c =~ /\xE1[\xB8-\xBB]/;
1235
+ return "Greek letter" if $c =~ /\xE1[\xBC-\xBF]/;
1236
+ return "non-ASCII whitespace" if $c =~ /\xE2\x80[\x80-\x8A\xAF]/;
1237
+ return "zero-width space" if $c =~ /\xE2\x80\x8B/;
1238
+ return "zero-width non-space" if $c =~ /\xE2\x80\x8C/;
1239
+ return "zero-width joiner" if $c =~ /\xE2\x80\x8D/;
1240
+ return "directional mark" if $c =~ /\xE2\x80[\x8E-\x8F\xAA-\xAE]/;
1241
+ return "non-ASCII punctuation" if $c =~ /\xE2\x80[\x90-\xBF]/;
1242
+ return "non-ASCII punctuation" if $c =~ /\xE2\x81[\x80-\x9E]/;
1243
+ return "superscript letter" if $c =~ /\xE2\x81[\xB1\xBF]/;
1244
+ return "superscript digit" if $c =~ /\xE2\x81[\xB0-\xB9]/;
1245
+ return "superscript punctuation" if $c =~ /\xE2\x81[\xBA-\xBE]/;
1246
+ return "subscript digit" if $c =~ /\xE2\x82[\x80-\x89]/;
1247
+ return "subscript punctuation" if $c =~ /\xE2\x82[\x8A-\x8E]/;
1248
+ return "non-ASCII currency" if $c =~ /\xE2\x82[\xA0-\xBF]/;
1249
+ return "letterlike symbol" if $c =~ /\xE2\x84/;
1250
+ return "letterlike symbol" if $c =~ /\xE2\x85[\x80-\x8F]/;
1251
+ return "fraction" if $c =~ /\xE2\x85[\x90-\x9E]/; # NEW
1252
+ return "Roman number" if $c =~ /\xE2\x85[\xA0-\xBF]/; # NEW
1253
+ return "arrow symbol" if $c =~ /\xE2\x86[\x90-\xBF]/;
1254
+ return "arrow symbol" if $c =~ /\xE2\x87/;
1255
+ return "mathematical operator" if $c =~ /\xE2[\x88-\x8B]/;
1256
+ return "technical symbol" if $c =~ /\xE2[\x8C-\x8F]/;
1257
+ return "enclosed alphanumeric" if $c =~ /\xE2\x91[\xA0-\xBF]/;
1258
+ return "enclosed alphanumeric" if $c =~ /\xE2[\x92-\x93]/;
1259
+ return "box drawing" if $c =~ /\xE2[\x94-\x95]/;
1260
+ return "geometric shape" if $c =~ /\xE2\x96[\xA0-\xBF]/;
1261
+ return "geometric shape" if $c =~ /\xE2\x97/;
1262
+ return "pictograph" if $c =~ /\xE2[\x98-\x9E]/;
1263
+ return "arrow symbol" if $c =~ /\xE2\xAC[\x80-\x91\xB0-\xBF]/;
1264
+ return "geometric shape" if $c =~ /\xE2\xAC[\x92-\xAF]/;
1265
+ return "arrow symbol" if $c =~ /\xE2\xAD[\x80-\x8F\x9A-\xBF]/;
1266
+ return "geometric shape" if $c =~ /\xE2\xAD[\x90-\x99]/;
1267
+ return "arrow symbol" if $c =~ /\xE2\xAE[\x80-\xB9]/;
1268
+ return "geometric shape" if $c =~ /\xE2\xAE[\xBA-\xBF]/;
1269
+ return "geometric shape" if $c =~ /\xE2\xAF[\x80-\x88\x8A-\x8F]/;
1270
+ return "symbol" if $c =~ /\xE2[\xAC-\xAF]/;
1271
+ return "Coptic fraction" if $c =~ /\xE2\xB3\xBD/;
1272
+ return "Coptic punctuation" if $c =~ /\xE2\xB3[\xB9-\xBF]/;
1273
+ return "Coptic letter" if $c =~ /\xE2[\xB2-\xB3]/;
1274
+ return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/;
1275
+ return "Tifinagh punctuation" if $c =~ /\xE2\xB5\xB0/;
1276
+ return "Tifinagh letter" if $c =~ /\xE2\xB4[\xB0-\xBF]/;
1277
+ return "Tifinagh letter" if $c =~ /\xE2\xB5/;
1278
+ return "Ethiopic syllable" if $c =~ /\xE2\xB6/;
1279
+ return "Ethiopic syllable" if $c =~ /\xE2\xB7[\x80-\x9F]/;
1280
+ return "non-ASCII punctuation" if $c =~ /\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]/;
1281
+ return "symbol" if $c =~ /\xE3\x80[\x91\x92\xA0\xB6\xB7]/;
1282
+ return "Japanese hiragana character" if $c =~ /\xE3\x81/;
1283
+ return "Japanese hiragana character" if $c =~ /\xE3\x82[\x80-\x9F]/;
1284
+ return "Japanese katakana character" if $c =~ /\xE3\x82[\xA0-\xBF]/;
1285
+ return "Japanese katakana character" if $c =~ /\xE3\x83/;
1286
+ return "Bopomofo letter" if $c =~ /\xE3\x84[\x80-\xAF]/;
1287
+ return "Korean Hangul letter" if $c =~ /\xE3\x84[\xB0-\xBF]/;
1288
+ return "Korean Hangul letter" if $c =~ /\xE3\x85/;
1289
+ return "Korean Hangul letter" if $c =~ /\xE3\x86[\x80-\x8F]/;
1290
+ return "Bopomofo letter" if $c =~ /\xE3\x86[\xA0-\xBF]/;
1291
+ return "CJK stroke" if $c =~ /\xE3\x87[\x80-\xAF]/;
1292
+ return "Japanese kana character" if $c =~ /\xE3\x87[\xB0-\xBF]/;
1293
+ return "CJK symbol" if $c =~ /\xE3[\x88-\x8B]/;
1294
+ return "CJK square Latin abbreviation" if $c =~ /\xE3\x8D[\xB1-\xBA]/;
1295
+ return "CJK square Latin abbreviation" if $c =~ /\xE3\x8E/;
1296
+ return "CJK square Latin abbreviation" if $c =~ /\xE3\x8F[\x80-\x9F\xBF]/;
1297
+ return "CJK character" if $c =~ /\xE4[\xB8-\xBF]/;
1298
+ return "CJK character" if $c =~ /[\xE5-\xE9]/;
1299
+ return "Yi syllable" if $c =~ /\xEA[\x80-\x92]/;
1300
+ return "Lisu letter" if $c =~ /\xEA\x93[\x90-\xBD]/;
1301
+ return "Lisu punctuation" if $c =~ /\xEA\x93[\xBE-\xBF]/;
1302
+ return "Cyrillic letter" if $c =~ /\xEA\x99/;
1303
+ return "Cyrillic letter" if $c =~ /\xEA\x9A[\x80-\x9F]/;
1304
+ return "modifier tone" if $c =~ /\xEA\x9C[\x80-\xA1]/;
1305
+ return "Javanese punctuation" if $c =~ /\xEA\xA7[\x81-\x8D\x9E-\x9F]/;
1306
+ return "Javanese digit" if $c =~ /\xEA\xA7[\x90-\x99]/;
1307
+ return "Javanese letter" if $c =~ /\xEA\xA6/;
1308
+ return "Javanese letter" if $c =~ /\xEA\xA7[\x80-\x9F]/;
1309
+ return "Ethiopic syllable" if $c =~ /\xEA\xAC[\x80-\xAF]/;
1310
+ return "Cherokee letter" if $c =~ /\xEA\xAD[\xB0-\xBF]/;
1311
+ return "Cherokee letter" if $c =~ /\xEA\xAE/;
1312
+ return "Meetai Mayek digit" if $c =~ /\xEA\xAF[\xB0-\xB9]/;
1313
+ return "Meetai Mayek letter" if $c =~ /\xEA\xAF/;
1314
+ return "Korean Hangul syllable" if $c =~ /\xEA[\xB0-\xBF]/;
1315
+ return "Korean Hangul syllable" if $c =~ /[\xEB-\xEC]/;
1316
+ return "Korean Hangul syllable" if $c =~ /\xED[\x80-\x9E]/;
1317
+ return "Klingon letter" if $c =~ /\xEF\xA3[\x90-\xA9]/;
1318
+ return "Klingon digit" if $c =~ /\xEF\xA3[\xB0-\xB9]/;
1319
+ return "Klingon punctuation" if $c =~ /\xEF\xA3[\xBD-\xBE]/;
1320
+ return "Klingon symbol" if $c =~ /\xEF\xA3\xBF/;
1321
+ return "private use character" if $c =~ /\xEE/;
1322
+ return "Latin typographic ligature" if $c =~ /\xEF\xAC[\x80-\x86]/;
1323
+ return "Hebrew presentation letter" if $c =~ /\xEF\xAC[\x9D-\xBF]/;
1324
+ return "Hebrew presentation letter" if $c =~ /\xEF\xAD[\x80-\x8F]/;
1325
+ return "Arabic presentation letter" if $c =~ /\xEF\xAD[\x90-\xBF]/;
1326
+ return "Arabic presentation letter" if $c =~ /\xEF[\xAE-\xB7]/;
1327
+ return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\x90-\x99]/;
1328
+ return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\xB0-\xBF]/;
1329
+ return "non-ASCII punctuation" if $c =~ /\xEF\xB9[\x80-\xAB]/;
1330
+ return "Arabic presentation letter" if $c =~ /\xEF\xB9[\xB0-\xBF]/;
1331
+ return "Arabic presentation letter" if $c =~ /\xEF\xBA/;
1332
+ return "Arabic presentation letter" if $c =~ /\xEF\xBB[\x80-\xBC]/;
1333
+ return "byte-order mark/zero-width no-break space" if $c eq "\xEF\xBB\xBF";
1334
+ return "fullwidth currency" if $c =~ /\xEF\xBC\x84/;
1335
+ return "fullwidth digit" if $c =~ /\xEF\xBC[\x90-\x99]/;
1336
+ return "fullwidth Latin letter" if $c =~ /\xEF\xBC[\xA1-\xBA]/;
1337
+ return "fullwidth Latin letter" if $c =~ /\xEF\xBD[\x81-\x9A]/;
1338
+ return "fullwidth punctuation" if $c =~ /\xEF\xBC/;
1339
+ return "fullwidth punctuation" if $c =~ /\xEF\xBD[\x9B-\xA4]/;
1340
+ return "halfwidth Japanese punctuation" if $c =~ /\xEF\xBD[\xA1-\xA4]/;
1341
+ return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBD[\xA5-\xBF]/;
1342
+ return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBE[\x80-\x9F]/;
1343
+ return "fullwidth currency" if $c =~ /\xEF\xBF[\xA0-\xA6]/;
1344
+ return "replacement character" if $c eq "\xEF\xBF\xBD";
1345
+ } elsif ($c =~ /[\xF0-\xF7]/) {
1346
+ return "non-UTF8 (invalid)" unless $c =~ /[\xF0-\xF7][\x80-\xBF]{3,3}$/;
1347
+ return "non-shortest-UTF8 (invalid)" if $c =~ /\xF0[\x80-\x8F]/;
1348
+ return "Linear B syllable" if $c =~ /\xF0\x90\x80/;
1349
+ return "Linear B syllable" if $c =~ /\xF0\x90\x81[\x80-\x8F]/;
1350
+ return "Linear B symbol" if $c =~ /\xF0\x90\x81[\x90-\x9F]/;
1351
+ return "Linear B ideogram" if $c =~ /\xF0\x90[\x82-\x83]/;
1352
+ return "Gothic letter" if $c =~ /\xF0\x90\x8C[\xB0-\xBF]/;
1353
+ return "Gothic letter" if $c =~ /\xF0\x90\x8D[\x80-\x8F]/;
1354
+ return "Phoenician letter" if $c =~ /\xF0\x90\xA4[\x80-\x95]/;
1355
+ return "Phoenician number" if $c =~ /\xF0\x90\xA4[\x96-\x9B]/;
1356
+ return "Phoenician punctuation" if $c =~ /\xF0\x90\xA4\x9F/; # word separator
1357
+ return "Old Hungarian number" if $c =~ /\xF0\x90\xB3[\xBA-\xBF]/;
1358
+ return "Old Hungarian letter" if $c =~ /\xF0\x90[\xB2-\xB3]/;
1359
+ return "Cuneiform digit" if $c =~ /\xF0\x92\x90/; # numberic sign
1360
+ return "Cuneiform digit" if $c =~ /\xF0\x92\x91[\x80-\xAF]/; # numberic sign
1361
+ return "Cuneiform punctuation" if $c =~ /\xF0\x92\x91[\xB0-\xBF]/;
1362
+ return "Cuneiform sign" if $c =~ /\xF0\x92[\x80-\x95]/;
1363
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x81\xA8/;
1364
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x82[\xAD-\xB6]/;
1365
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x86[\x90\xBC-\xBF]/;
1366
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x87[\x80-\x84]/;
1367
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8D[\xA2-\xAB]/;
1368
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8E[\x86-\x92]/;
1369
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8F[\xBA-\xBF]/;
1370
+ return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x90[\x80-\x83]/;
1371
+ return "Egyptian hieroglyph" if $c =~ /\xF0\x93[\x80-\x90]/;
1372
+ return "enclosed alphanumeric" if $c =~ /\xF0\x9F[\x84-\x87]/;
1373
+ return "Mahjong symbol" if $c =~ /\xF0\x9F\x80[\x80-\xAF]/;
1374
+ return "Domino symbol" if $c =~ /\xF0\x9F\x80[\xB0-\xBF]/;
1375
+ return "Domino symbol" if $c =~ /\xF0\x9F\x81/;
1376
+ return "Domino symbol" if $c =~ /\xF0\x9F\x82[\x80-\x9F]/;
1377
+ return "Playing card symbol" if $c =~ /\xF0\x9F\x82[\xA0-\xBF]/;
1378
+ return "Playing card symbol" if $c =~ /\xF0\x9F\x83/;
1379
+ return "CJK symbol" if $c =~ /\xF0\x9F[\x88-\x8B]/;
1380
+ return "pictograph" if $c =~ /\xF0\x9F[\x8C-\x9B]/;
1381
+ return "geometric shape" if $c =~ /\xF0\x9F[\x9E-\x9F]/;
1382
+ return "non-ASCII punctuation" if $c =~ /\xF0\x9F[\xA0-\xA3]/;
1383
+ return "pictograph" if $c =~ /\xF0\x9F[\xA4-\xAB]/;
1384
+ return "CJK character" if $c =~ /\xF0[\xA0-\xAF]/;
1385
+ return "tag" if $c =~ /\xF3\xA0[\x80-\x81]/;
1386
+ return "variation selector" if $c =~ /\xF3\xA0[\x84-\x87]/;
1387
+ return "private use character" if $c =~ /\xF3[\xB0-\xBF]/;
1388
+ return "private use character" if $c =~ /\xF4[\x80-\x8F]/;
1389
+ # ...
1390
+ } elsif ($c =~ /[\xF8-\xFB]/) {
1391
+ return "non-UTF8 (invalid)" unless $c =~ /[\xF8-\xFB][\x80-\xBF]{4,4}$/;
1392
+ } elsif ($c =~ /[\xFC-\xFD]/) {
1393
+ return "non-UTF8 (invalid)" unless $c =~ /[\xFC-\xFD][\x80-\xBF]{5,5}$/;
1394
+ } elsif ($c =~ /\xFE/) {
1395
+ return "non-UTF8 (invalid)" unless $c =~ /\xFE][\x80-\xBF]{6,6}$/;
1396
+ } else {
1397
+ return "non-UTF8 (invalid)";
1398
+ }
1399
+ return "other character";
1400
+ }
1401
+
1402
+ 1;
1403
+
1404
+
uroman/lib/NLP/stringDistance.pm ADDED
@@ -0,0 +1,724 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################################################
2
+ # #
3
+ # stringDistance #
4
+ # #
5
+ ################################################################
6
+
7
+ package NLP::stringDistance;
8
+
9
+ use List::Util qw(min max);
10
# Class names of sibling NLP modules, kept in scalars so that their methods can
# be invoked as $utf8->method(...), $util->method(...), $romanizer->method(...).
($utf8, $util, $romanizer) = ("NLP::UTF8", "NLP::utilities", "NLP::Romanizer");

# Empty placeholder hash, handed over as *dummy_ht wherever a callee takes a
# hash-table argument that this module has no use for.
%dummy_ht = ();
15
+
16
# Records, for rule string $s and language code $lang_code, which proper
# prefixes of $s can be extended to a longer prefix of $s:
#   $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$shorter}->{$longer} = 1
#   $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$shorter} = 1
# for every pair of prefixes where $shorter has fewer characters than $longer.
# *ht is the rule table, passed as a typeglob and updated in place.
sub rule_string_expansion {
   local($this, *ht, $s, $lang_code) = @_;

   my @characters = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);

   # $prefixes[$n] holds the first ($n + 1) characters of $s.
   my @prefixes = ();
   my $prefix = "";
   foreach my $character (@characters) {
      $prefix .= $character;
      push(@prefixes, $prefix);
   }

   my $last_index = $#prefixes;
   for (my $short = 0; $short < $last_index; $short++) {
      my $shorter = $prefixes[$short];
      $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$shorter} = 1;
      for (my $long = $short + 1; $long <= $last_index; $long++) {
         $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$shorter}->{$prefixes[$long]} = 1;
      }
   }
}
31
+
32
# Validates a ::left1/::left2 slot value.
# A value of the form /regex/ is returned with the enclosing slashes removed.
# Any other non-empty value is reported on STDERR (including the offending
# value) and replaced by "". Empty/false values are returned unchanged.
sub _normalized_left_context {
   my ($context, $slot, $line_number, $filename) = @_;

   return $context unless $context;
   if ($context =~ /^\/.*\/$/) {
      $context =~ s/^\///;
      $context =~ s/\/$//;
      return $context;
   }
   print STDERR "Ignoring unrecognized non-regular-express ::$slot $context in $line_number of $filename\n";
   return "";
}

# Validates a ::right1/::right2 slot value.
# A value consisting of one or more bracketed items ([...][...]) is returned
# unchanged. Any other non-empty value is reported on STDERR (including the
# offending value) and replaced by "". Empty/false values are returned unchanged.
sub _normalized_right_context {
   my ($context, $slot, $line_number, $filename) = @_;

   return $context unless $context;
   return $context if $context =~ /^(\[[^\[\]]*\])+$/;
   print STDERR "Ignoring unrecognized right-context ::$slot $context in $line_number of $filename\n";
   return "";
}

# Reads string-distance cost rules from $filename into the rule table *ht.
#
# Arguments:
#   $filename  path of the rule file; the sub dies if it cannot be opened
#   *ht        rule table (typeglob), updated in place
#   $verbose   optional; if true, a summary line is printed to STDERR
#
# Each non-blank, non-comment line is read as a list of "::slot value" pairs
# (slots: s1, s2, cost, lc1, lc2, left1, left2, right1, right2, in-lc1,
# in-lc2, out-lc1, out-lc2). Lines are skipped with a message on STDERR when
# both s1 and s2 are empty, when the cost is missing or not a non-negative
# decimal number, or when a slot ::left or ::right (without 1/2) is present.
#
# For every combination of language codes from lc1 and lc2 the rule is stored
# under a fresh rule id in $ht{COST}, $ht{LEFT1}, $ht{LEFT2}, $ht{RIGHT1},
# $ht{RIGHT2}, $ht{INLC1}, $ht{INLC2}, $ht{OUTLC1} and $ht{OUTLC2}; both strings
# are registered in $ht{RULE_STRING}. Unless the rule is fully symmetric, the
# flipped rule (s2 -> s1) is stored as well under its own rule id.
sub load_string_distance_data {
   local($this, $filename, *ht, $verbose) = @_;

   $verbose = 0 unless defined($verbose);
   open(IN,$filename) || die "Could not open $filename";
   my $line_number = 0;
   my $n_cost_rules = 0;
   while (<IN>) {
      $line_number++;
      my $line = $_;
      $line =~ s/^\xEF\xBB\xBF//;            # drop a leading UTF-8 byte-order mark
      $line =~ s/\s*$//;
      next if $line =~ /^\s*(\#.*)?$/;        # blank line or comment line
      print STDERR "** Warning: line $line_number contains suspicious control character: $line\n" if $line =~ /[\x00-\x1F]/;
      my $s1 = $util->slot_value_in_double_colon_del_list($line, "s1");
      my $s2 = $util->slot_value_in_double_colon_del_list($line, "s2");
      $s1 = $util->dequote_string($s1);  # 'can\'t' => can't
      $s2 = $util->dequote_string($s2);
      my $cost = $util->slot_value_in_double_colon_del_list($line, "cost");
      if (($s1 eq "") && ($s2 eq "")) {
         print STDERR "Ignoring bad line $line_number in $filename, because both s1 and s2 are empty strings\n";
         next;
      }
      unless ($cost =~ /^\d+(\.\d+)?$/) {
         if ($cost eq "") {
            print STDERR "Ignoring bad line $line_number in $filename, because of missing cost\n";
         } else {
            print STDERR "Ignoring bad line $line_number in $filename, because of ill-formed cost $cost\n";
         }
         next;
      }
      my $lang_code1_s = $util->slot_value_in_double_colon_del_list($line, "lc1");
      my $lang_code2_s = $util->slot_value_in_double_colon_del_list($line, "lc2");
      # A missing language-code slot stands for the single language code "".
      my @lang_codes_1 = ($lang_code1_s eq "") ? ("") : split(/,\s*/, $lang_code1_s);
      my @lang_codes_2 = ($lang_code2_s eq "") ? ("") : split(/,\s*/, $lang_code2_s);
      my $left_context1 = $util->slot_value_in_double_colon_del_list($line, "left1");
      my $left_context2 = $util->slot_value_in_double_colon_del_list($line, "left2");
      my $right_context1 = $util->slot_value_in_double_colon_del_list($line, "right1");
      my $right_context2 = $util->slot_value_in_double_colon_del_list($line, "right2");
      my $bad_left = $util->slot_value_in_double_colon_del_list($line, "left");
      if ($bad_left) {
         print STDERR "** Warning: slot '::left $bad_left' in line $line_number\n";
         next;
      }
      my $bad_right = $util->slot_value_in_double_colon_del_list($line, "right");
      if ($bad_right) {
         print STDERR "** Warning: slot '::right $bad_right' in line $line_number\n";
         next;
      }
      my $in_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "in-lc1");
      my $in_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "in-lc2");
      my $out_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "out-lc1");
      my $out_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "out-lc2");
      # The helpers print the warning before discarding the value, so that the
      # message shows what was actually rejected.
      $left_context1 = _normalized_left_context($left_context1, "left1", $line_number, $filename);
      $left_context2 = _normalized_left_context($left_context2, "left2", $line_number, $filename);
      $right_context1 = _normalized_right_context($right_context1, "right1", $line_number, $filename);
      $right_context2 = _normalized_right_context($right_context2, "right2", $line_number, $filename);
      foreach $lang_code1 (@lang_codes_1) {
         foreach $lang_code2 (@lang_codes_2) {
            $n_cost_rules++;
            my $cost_rule_id = $n_cost_rules;
            $ht{COST}->{$lang_code1}->{$lang_code2}->{$s1}->{$s2}->{$cost_rule_id} = $cost;
            $ht{RULE_STRING}->{$lang_code1}->{$s1} = 1;
            $ht{RULE_STRING}->{$lang_code2}->{$s2} = 1;
            $ht{LEFT1}->{$cost_rule_id} = $left_context1;
            $ht{LEFT2}->{$cost_rule_id} = $left_context2;
            $ht{RIGHT1}->{$cost_rule_id} = $right_context1;
            $ht{RIGHT2}->{$cost_rule_id} = $right_context2;
            $ht{INLC1}->{$cost_rule_id} = $in_lang_codes1;
            $ht{INLC2}->{$cost_rule_id} = $in_lang_codes2;
            $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes1;
            $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes2;
            # Store the mirrored rule too, unless mirroring would yield the very same rule.
            unless (($s1 eq $s2)
                 && ($lang_code1 eq $lang_code2)
                 && ($left_context1 eq $left_context2)
                 && ($right_context1 eq $right_context2)
                 && ($in_lang_codes1 eq $in_lang_codes2)
                 && ($out_lang_codes1 eq $out_lang_codes2)) {
               $n_cost_rules++;
               $cost_rule_id = $n_cost_rules;
               $ht{COST}->{$lang_code2}->{$lang_code1}->{$s2}->{$s1}->{$cost_rule_id} = $cost;
               $ht{LEFT1}->{$cost_rule_id} = $left_context2;
               $ht{LEFT2}->{$cost_rule_id} = $left_context1;
               $ht{RIGHT1}->{$cost_rule_id} = $right_context2;
               $ht{RIGHT2}->{$cost_rule_id} = $right_context1;
               $ht{INLC1}->{$cost_rule_id} = $in_lang_codes2;
               $ht{INLC2}->{$cost_rule_id} = $in_lang_codes1;
               $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes2;
               $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes1;
            }
            $this->rule_string_expansion(*ht, $s1, $lang_code1);
            $this->rule_string_expansion(*ht, $s2, $lang_code2);
         }
      }
   }
   close(IN);
   print STDERR "Read in $n_cost_rules rules from $line_number lines in $filename\n" if $verbose;
}
157
+
158
# Builds a simple chart for string $s in *chart_ht (typeglob, updated in
# place): $chart_ht{N_CHARS} is set to the number of characters of $s,
# $chart_ht{N_NODES} is reset to 0, and one node per character, spanning
# positions n to n+1, is added through $romanizer->add_node.
sub romanized_string_to_simple_chart {
   local($this, $s, *chart_ht) = @_;

   my @characters = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);
   $chart_ht{N_CHARS} = scalar(@characters);
   $chart_ht{N_NODES} = 0;
   foreach my $position (0 .. $#characters) {
      my $next_position = $position + 1;
      $romanizer->add_node($characters[$position], $position, $next_position, *chart_ht, "", "");
   }
}
168
+
169
# Maps the positions of chart *chart_ht onto "linear" positions in which every
# character of a node's romanization gets its own step, and records the
# resulting character edges in *sd_ht under chart id $chart_id.
#
# Arguments:
#   *chart_ht  chart (typeglob); NODES_ENDING_AT, NODES_STARTING_AT, NODE_END
#              and NODE_ROMAN are read
#   $chart_id  key under which all results are stored in *sd_ht
#   *sd_ht     string-distance table (typeglob), updated in place
#   $verbose   optional; if true, trace output is printed to STDERR
#
# Results written to *sd_ht (all keyed by $chart_id):
#   POS2LINPOS / LINPOS2POS            chart position <-> linear position
#   SPLITPOS2LINPOS / LINPOS2SPLITPOS  linear positions inside a multi-character node
#   LIN_IJ_ROMAN->{from}->{to}->{char} one edge per romanized character
#                                      (char is "" for a node without romanization)
#   MAXPOS / MAXLINPOS                 last chart position / its linear position
sub linearize_chart_points {
   local($this, *chart_ht, $chart_id, *sd_ht, $verbose) = @_;

   $verbose = 0 unless defined($verbose);
   print STDERR "Linearize $chart_id\n" if $verbose;
   my $current_chart_pos = 0;
   my $current_linear_chart_pos = 0;
   $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
   $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
   print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
   my @end_chart_positions = keys %{$chart_ht{NODES_ENDING_AT}};
   my $end_chart_pos = (@end_chart_positions) ? max(@end_chart_positions) : 0;
   $sd_ht{MAXPOS}->{$chart_id} = $end_chart_pos;
   print STDERR " Chart span: $current_chart_pos-$end_chart_pos\n" if $verbose;

   # Pass 1: hand out linear positions, walking the chart from left to right.
   while ($current_chart_pos < $end_chart_pos) {
      my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
      foreach $node_id (@node_ids) {
         my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
         my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
         print STDERR " $current_chart_pos/$current_linear_chart_pos node: $node_id $roman_s (@roman_chars)\n" if $verbose;
         # A romanization of k characters needs k-1 split positions inside the node.
         if ($#roman_chars >= 1) {
            foreach $i ((1 .. $#roman_chars)) {
               $current_linear_chart_pos++;
               $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i} = $current_linear_chart_pos;
               $sd_ht{LINPOS2SPLITPOS}->{$chart_id}->{$current_linear_chart_pos}->{$current_chart_pos}->{$node_id}->{$i} = 1;
               print STDERR " LINPOS2SPLITPOS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id I: $i\n" if $verbose;
            }
         }
      }
      $current_chart_pos++;
      # Only chart positions at which some node ends get a linear position of their own.
      if ($util->member($current_chart_pos, @end_chart_positions)) {
         $current_linear_chart_pos++;
         $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
         $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
         print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
      }
   }

   # Pass 2: record one LIN_IJ_ROMAN edge per romanized character of each node.
   $current_chart_pos = 0;
   while ($current_chart_pos <= $end_chart_pos) {
      my $current_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
      $current_linear_chart_pos = "?" unless defined($current_linear_chart_pos);
      my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
      foreach $node_id (@node_ids) {
         my $end_pos = $chart_ht{NODE_END}->{$node_id};
         my $end_linpos = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_pos};
         my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
         my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
         print STDERR " LINROM.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id CHARS: @roman_chars\n" if $verbose;
         if (@roman_chars) {
            foreach $i ((0 .. $#roman_chars)) {
               # The first character starts at the node's start position, later ones at split positions.
               my $from_linear_chart_pos
                  = (($i == 0)
                     ? $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos}
                     : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i});
               print STDERR " FROM.$chart_id I: $i POS: $current_chart_pos NODE: $node_id FROM: $from_linear_chart_pos\n" if $verbose;
               # The last character ends at the node's end position, earlier ones at split positions.
               my $to_linear_chart_pos
                  = (($i == $#roman_chars)
                     ? $end_linpos
                     : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{($i+1)});
               print STDERR " TO.$chart_id I: $i POS: $current_chart_pos NODE: $node_id TO: $to_linear_chart_pos\n" if $verbose;
               my $roman_char = $roman_chars[$i];
               $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{$roman_char} = 1;
            }
         } else {
            # Node without romanization: add an empty-string edge to the next
            # chart position that has a linear position.
            my $from_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
            my $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+1)};
            my $i = 1;
            # Pass 1 assigns linear positions only up to $end_chart_pos, so the
            # search stops there instead of running on without bound.
            while ((! (defined($to_linear_chart_pos)))
                && (($current_chart_pos + $i) < $end_chart_pos)) {
               $i++;
               $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+$i)};
            }
            if (defined($from_linear_chart_pos) && defined($to_linear_chart_pos)) {
               $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{""} = 1
            } else {
               print STDERR " UNDEF.$chart_id from: "
                  . ((defined($from_linear_chart_pos)) ? $from_linear_chart_pos : "?")
                  . " to: "
                  . ((defined($to_linear_chart_pos)) ? $to_linear_chart_pos : "?")
                  . "\n";
            }
         }
      }
      $current_chart_pos++;
   }
   $sd_ht{MAXLINPOS}->{$chart_id} = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_chart_pos};
}
257
+
258
# Walks all edges $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{start}->{end}->{roman}
# (starts and ends in numeric order, romans in string order) and, for every
# roman string marked in $ht{RULE_STRING_HAS_EXPANSION} for $lang_code or for
# the language code "", calls expand_lin_ij_roman_rec on that edge.
# *sd_ht and *ht are typeglobs; *sd_ht is updated in place by the callee.
sub expand_lin_ij_roman {
   local($this, *sd_ht, $chart_id, $lang_code, *ht) = @_;

   my @from_positions = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}};
   foreach my $from (@from_positions) {
      my @to_positions = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}};
      foreach my $to (@to_positions) {
         my @edge_romans = sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}->{$to}};
         foreach my $edge_roman (@edge_romans) {
            my $expandable = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$edge_roman}
                          || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$edge_roman};
            next unless $expandable;
            $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $from, $to, $edge_roman, $lang_code, *ht);
         }
      }
   }
}
272
+
273
# Tries to lengthen the edge $start-$end labelled $roman by appending the label
# of each edge that leaves $end. Whenever the concatenation is a rule string
# ($ht{RULE_STRING}, for $lang_code or for the language code ""), a new edge
# from $start to the end of the appended edge is added to
# $sd_ht{LIN_IJ_ROMAN}->{$chart_id}. Whenever the concatenation can itself be
# extended ($ht{RULE_STRING_HAS_EXPANSION}), the sub recurses on it.
# Nothing happens if $roman is not marked as expandable.
sub expand_lin_ij_roman_rec {
   local($this, *sd_ht, $chart_id, $start, $end, $roman, $lang_code, *ht) = @_;

   if ((! $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$roman})
    && (! $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman})) {
      return;
   }
   foreach my $following_end (keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}}) {
      foreach my $following_roman (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}->{$following_end}}) {
         my $longer_roman = $roman . $following_roman;
         my $is_rule_string = $ht{RULE_STRING}->{$lang_code}->{$longer_roman}
                           || $ht{RULE_STRING}->{""}->{$longer_roman};
         $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}->{$following_end}->{$longer_roman} = 1 if $is_rule_string;
         my $is_expandable = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$longer_roman}
                          || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$longer_roman};
         $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $start, $following_end, $longer_roman, $lang_code, *ht) if $is_expandable;
      }
   }
}
294
+
295
# Follows the back pointers of a computed string distance from the end of both
# charts to their start.
#
# Arguments:
#   *sd_ht        string-distance table (typeglob); MISMATCH, MAXLINPOS,
#                 INCR_COST_IJ, PREC_I, PREC_J, ROMAN1 and ROMAN2 are read
#   $chart1_id, $chart2_id   ids of the two charts; "id1/id2" is the lookup key
#   $control      option string; "verbose" adds positions to every trace item,
#                 "chunks" switches to chunk statistics (see below)
#   $line_number  recorded with every chunk as an example
#   $cost         chunks are marked with "*" if this value exceeds 5
#
# Returns "mismatch" if the chart pair is flagged in $sd_ht{MISMATCH}.
# Without "chunks": returns the trace items, in left-to-right order, joined by
# a space. Steps with a false incremental cost are left out unless "verbose"
# is given.
# With "chunks": maximal runs of steps with a true incremental cost are counted
# in $sd_ht{N_COST_CHUNK} and exemplified in $sd_ht{EX_COST_CHUNK}; no trace
# string is returned.
sub trace_string_distance {
   local($this, *sd_ht, $chart1_id, $chart2_id, $control, $line_number, $cost) = @_;

   my $chart_comb_id = join("/", $chart1_id, $chart2_id);
   return "mismatch" if $sd_ht{MISMATCH}->{$chart_comb_id};
   my $chart1_end = $sd_ht{MAXLINPOS}->{$chart1_id};
   my $chart2_end = $sd_ht{MAXLINPOS}->{$chart2_id};
   my $verbose = ($control =~ /verbose/);
   my $chunks_p = ($control =~ /chunks/);
   my @traces = ();
   # Parallel per-step lists, filled back to front (only in "chunks" mode).
   my @s1_s = ();
   my @s2_s = ();
   my @e1_s = ();
   my @e2_s = ();
   my @r1_s = ();
   my @r2_s = ();
   my @ic_s = ();

   while ($chart1_end || $chart2_end) {
      my $incr_cost = $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
      my $prec_i = $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
      my $prec_j = $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
      if ($incr_cost || $verbose || $chunks_p) {
         my $roman1 = $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
         my $roman2 = $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
         if ($verbose) {
            push(@traces, "$prec_i-$chart1_end/$prec_j-$chart2_end:$roman1/$roman2:$incr_cost");
         } else {
            if (defined($roman1)) {
               push(@traces, "$roman1/$roman2:$incr_cost");
            } else {
               # No romanization recorded for this step: report the span,
               # showing "?" for a missing predecessor position.
               my $print_prec_i = (defined($prec_i)) ? $prec_i : "?";
               my $print_prec_j = (defined($prec_j)) ? $prec_j : "?";
               print STDERR " $print_prec_i-$chart1_end, $print_prec_j-$chart2_end\n";
            }
         }
         if ($chunks_p) {
            push(@s1_s, $prec_i);
            push(@s2_s, $prec_j);
            push(@e1_s, $chart1_end);
            push(@e2_s, $chart2_end);
            push(@r1_s, $roman1);
            push(@r2_s, $roman2);
            push(@ic_s, $incr_cost);
         }
      }
      $chart1_end = $prec_i;
      $chart2_end = $prec_j;
   }
   if ($chunks_p) {
      my $r1 = "";
      my $r2 = "";
      my $tc = 0;
      my $in_chunk = 0;
      foreach $i ((0 .. $#ic_s)) {
         if ($ic_s[$i]) {
            # Steps were collected back to front, so prepend to rebuild the chunk text.
            $r1 = $r1_s[$i] . $r1;
            $r2 = $r2_s[$i] . $r2;
            $tc += $ic_s[$i];
            $in_chunk = 1;
         } elsif ($in_chunk) {
            my $chunk = "$r1/$r2/$tc";
            $chunk .= "*" if $cost > 5;
            $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1;
            $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1;
            $r1 = "";
            $r2 = "";
            $tc = 0;
            $in_chunk = 0;
         }
      }
      # Flush a chunk that reaches the start of the strings.
      if ($in_chunk) {
         my $chunk = "$r1/$r2/$tc";
         $chunk .= "*" if $cost > 5;
         $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1;
         $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1;
      }
   } else {
      return join(" ", reverse @traces);
   }
}
377
+
378
# Checks whether right-context rule $right_context_rule matches the chart
# $chart_id (edges in $sd_ht{LIN_IJ_ROMAN}) starting at linear position
# $start_pos. Returns 1 on a match, 0 otherwise.
#
# The rule is a sequence of bracketed items, e.g. "[aeiou][$]". The content of
# an item is used as a regular-expression character class that has to match
# the first character of the label of an edge leaving the current position;
# the rest of the rule is then matched from the end of that edge.
#   - The empty rule always matches.
#   - At a position without outgoing edges, an empty item or an item
#     containing "$" matches.
#   - All outgoing edges are tried, so the result does not depend on the order
#     in which the hash returns them.
#   - An empty item at a position that does have outgoing edges does not match
#     (an empty character class would not even be a valid regular expression).
sub right_context_match {
   local($this, $right_context_rule, *sd_ht, $chart_id, $start_pos) = @_;

   return 1 if $right_context_rule eq "";
   # Lexical variables: the recursive calls below must not overwrite the rest
   # of the rule that this invocation still needs.
   my ($right_context_item, $right_context_rest) = ($right_context_rule =~ /^\[([^\[\]]*)\]*(.*)$/);
   return 0 unless defined($right_context_item);

   my @end_positions = keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}};
   return 1 if ($#end_positions == -1)
            && (($right_context_item eq "")
             || ($right_context_item =~ /\$/));
   return 0 if $right_context_item eq "";

   my $guarded_right_context_item = $right_context_item;
   $guarded_right_context_item =~ s/\$/\\\$/g;   # keep "$" a literal character inside the class
   foreach my $end_pos (@end_positions) {
      my @romans = keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}->{$end_pos}};
      foreach my $roman (@romans) {
         next unless $roman =~ /^[$guarded_right_context_item]/;
         return 1 if $this->right_context_match($right_context_rest, *sd_ht, $chart_id, $end_pos);
         # Any other label on this edge leads to the same end position and
         # would give the same answer, so move on to the next edge.
         last;
      }
   }
   return 0;
}
400
+
401
+ sub string_distance {
402
+ local($this, *sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control) = @_;
403
+
404
+ my $verbose = ($control =~ /verbose/i);
405
+ my $chart_comb_id = join("/", $chart1_id, $chart2_id);
406
+
407
+ my $chart1_end_pos = $sd_ht{MAXLINPOS}->{$chart1_id};
408
+ my $chart2_end_pos = $sd_ht{MAXLINPOS}->{$chart2_id};
409
+ print STDERR "string_distance.$chart_comb_id $chart1_end_pos/$chart2_end_pos\n" if $verbose;
410
+ $sd_ht{COST_IJ}->{$chart_comb_id}->{0}->{0} = 0;
411
+ $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{0}->{0} = "";
412
+ $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{0}->{0} = "";
413
+ # HHERE
414
+ foreach $chart1_start ((0 .. $chart1_end_pos)) {
415
+ # print STDERR " C1 $chart1_start- ($chart1_start .. $chart1_end_pos)\n";
416
+ my $prev_further_expansion_possible = 0;
417
+ my @chart1_ends = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}};
418
+ my $max_chart1_ends = (@chart1_ends) ? $chart1_ends[$#chart1_ends] : -1;
419
+ foreach $chart1_end (($chart1_start .. $chart1_end_pos)) {
420
+ my $further_expansion_possible = ($chart1_start == $chart1_end)
421
+ || defined($sd_ht{LINPOS2SPLITPOS}->{$chart1_id}->{$chart1_start})
422
+ || ($chart1_end < $max_chart1_ends);
423
+ my @romans1 = (($chart1_start == $chart1_end)
424
+ ? ("")
425
+ : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}->{$chart1_end}}));
426
+ if ($#romans1 == -1) {
427
+ $further_expansion_possible = 1 if $prev_further_expansion_possible;
428
+ } else {
429
+ $prev_further_expansion_possible = 0;
430
+ }
431
+ # print STDERR " C1 $chart1_start-$chart1_end romans1: @romans1 {$further_expansion_possible} *l*\n";
432
+ foreach $roman1 (@romans1) {
433
+ # print STDERR " C1 $chart1_start-$chart1_end $roman1 {$further_expansion_possible} *?*\n";
434
+ next unless $ht{RULE_STRING}->{$lang_code1}->{$roman1}
435
+ || $ht{RULE_STRING}->{""}->{$roman1};
436
+ # print STDERR " C1 $chart1_start-$chart1_end $roman1 {$further_expansion_possible} ***\n";
437
+ foreach $lang_code1o (($lang_code1, "")) {
438
+ foreach $lang_code2o (($lang_code2, "")) {
439
+ my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}});
440
+ foreach $chart2_start (@chart2_starts) {
441
+ # print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start- (@chart2_starts)\n";
442
+ foreach $chart2_end (($chart2_start .. $chart2_end_pos)) {
443
+ print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end\n";
444
+ my @romans2 = (($chart2_start == $chart2_end)
445
+ ? ("")
446
+ : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart2_id}->{$chart2_start}->{$chart2_end}}));
447
+ foreach $roman2 (@romans2) {
448
+ if ($roman1 eq $roman2) {
449
+ print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2 (IDENTITY)\n";
450
+ my $cost = 0;
451
+ my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
452
+ my $combined_cost = $preceding_cost + $cost;
453
+ my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
454
+ if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
455
+ $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
456
+ push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
457
+ $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
458
+ $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
459
+ $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
460
+ $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
461
+ $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
462
+ = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
463
+ $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
464
+ = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
465
+ $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
466
+ $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
467
+ $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = "IDENTITY";
468
+ print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
469
+ }
470
+ } else {
471
+ next unless $ht{RULE_STRING}->{$lang_code2o}->{$roman2};
472
+ print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2\n";
473
+ next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2});
474
+ my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}};
475
+ foreach $cost_rule_id (@cost_rule_ids) {
476
+ ## check whether any context requirements are satisfied
477
+ # left context rules are regular expressions
478
+ my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id};
479
+ if ($left_context_rule1) {
480
+ my $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
481
+ if (defined($comb_left_roman1)) {
482
+ next unless $comb_left_roman1 =~ /$left_context_rule1/;
483
+ } else {
484
+ print STDERR " No comb_left_roman1 value for $chart_comb_id $chart1_start,$chart2_start\n";
485
+ }
486
+ }
487
+ my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id};
488
+ if ($left_context_rule2) {
489
+ my $comb_left_roman2 = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
490
+ if (defined($comb_left_roman2)) {
491
+ next unless $comb_left_roman2 =~ /$left_context_rule2/;
492
+ } else {
493
+ print STDERR " No comb_left_roman2 value for $chart_comb_id $chart1_start,$chart2_start\n";
494
+ }
495
+ }
496
+ my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id};
497
+ if ($right_context_rule1) {
498
+ my $match_p = $this->right_context_match($right_context_rule1, *sd_ht, $chart1_id, $chart1_end);
499
+ # print STDERR " Match?($right_context_rule1, 1, $chart1_end) = $match_p\n";
500
+ next unless $match_p;
501
+ }
502
+ my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id};
503
+ if ($right_context_rule2) {
504
+ my $match_p = $this->right_context_match($right_context_rule2, *sd_ht, $chart2_id, $chart2_end);
505
+ # print STDERR " Match?($right_context_rule2, 2, $chart2_end) = $match_p\n";
506
+ next unless $match_p;
507
+ }
508
+ my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}->{$cost_rule_id};
509
+ my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
510
+ my $combined_cost = $preceding_cost + $cost;
511
+ my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
512
+ if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
513
+ $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
514
+ push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
515
+ $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
516
+ $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
517
+ $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
518
+ $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
519
+ $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
520
+ = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
521
+ $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
522
+ = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
523
+ $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
524
+ # print STDERR " Comb-left-roman1($chart_comb_id,$chart1_end,$chart2_end) = $comb_left_roman1\n";
525
+ $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
526
+ $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost_rule_id;
527
+ print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
528
+ }
529
+ }
530
+ }
531
+ }
532
+ }
533
+ }
534
+ }
535
+ }
536
+ $further_expansion_possible = 1
537
+ if $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code1}->{$roman1}
538
+ || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman1};
539
+ # print STDERR " further_expansion_possible: $further_expansion_possible (lc: $lang_code1 r1: $roman1) ***\n";
540
+ }
541
+ # print STDERR " last C1 $chart1_start-$chart1_end (@romans1)\n" unless $further_expansion_possible;
542
+ last unless $further_expansion_possible;
543
+ $prev_further_expansion_possible = 1 if $further_expansion_possible;
544
+ }
545
+ }
546
+ my $total_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end_pos}->{$chart2_end_pos};
547
+ unless (defined($total_cost)) {
548
+ $total_cost = 99.9999;
549
+ $sd_ht{MISMATCH}->{$chart_comb_id} = 1;
550
+ }
551
+ return $total_cost;
552
+ }
553
+
554
# Dump the linearized romanization spans of two string-distance charts.
#
# Arguments (after the invocant):
#   *sd_ht                  glob of the string-distance hash; only its
#                           LIN_IJ_ROMAN table is read, indexed as
#                           {chart id}->{from pos}->{to pos}->{romanization}
#   $chart1_id, $chart2_id  ids of the charts to dump, in that order
#   *OUT                    glob of the filehandle to print to
#
# Positions are listed in ascending numeric order, romanizations in string order.
sub print_sd_ht {
    local($this, *sd_ht, $chart1_id, $chart2_id, *OUT) = @_;

    print OUT "string-distance chart:\n";
    for my $id ($chart1_id, $chart2_id) {
        print OUT "SD chart $id:\n";
        # '||= {}' creates a missing per-chart table, exactly as dereferencing
        # it inside keys() would, so the hash ends up in the same state.
        my $spans_from = ($sd_ht{LIN_IJ_ROMAN}->{$id} ||= {});
        for my $from (sort { $a <=> $b } keys %{$spans_from}) {
            for my $to (sort { $a <=> $b } keys %{$spans_from->{$from}}) {
                my @romans = sort keys %{$spans_from->{$from}->{$to}};
                print OUT " Lnode($from-$to): $_\n" foreach @romans;
            }
        }
    }
}
569
+
570
# Dump the nodes of a uroman chart, one line per node.
#
# Arguments (after the invocant):
#   *chart_ht  glob of the chart hash; reads NODES_STARTING_AT (for the start
#              positions), NODES_STARTING_AND_ENDING_AT ({start}->{end}->{node id})
#              and NODE_ROMAN ({node id} -> romanization)
#   *OUT       glob of the filehandle to print to
#
# Start and end positions are listed in ascending numeric order. Node ids
# sharing a span are listed in string order so that the dump is reproducible
# from run to run (plain keys() order is arbitrary).
sub print_chart_ht {
    local($this, *chart_ht, *OUT) = @_;

    print OUT "uroman chart:\n";
    foreach my $start (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AT}}) {
        foreach my $end (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}}) {
            foreach my $node_id (sort keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) {
                # Lexical, so the value no longer leaks into a package global.
                my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
                # A node without a romanization prints as an empty string.
                $roman_s = "" unless defined($roman_s);
                print OUT " Node $node_id ($start-$end): $roman_s\n";
            }
        }
    }
}
583
+
584
# Normalize a string before romanization and comparison.
#
# Applies, in this order:
#   1. the byte sequences E2 80 93 and E2 80 94 (UTF-8 en dash and em dash)
#      become an ASCII hyphen;
#   2. a run of the same character (one lead byte plus its continuation
#      bytes) is shortened to exactly two occurrences;
#   3. each run of spaces and tabs becomes a single space.
# Deletion of the zero-width non-joiner (E2 80 8C) is deliberately not done.
#
# Returns the normalized copy; the caller's string is not modified.
sub normalize_string {
    my ($this, $text) = @_;

    for ($text) {
        s/(\xE2\x80[\x93-\x94])/-/g;
        s/([\x00-\x7F\xC0-\xFE][\x80-\xBF]*)\1+/$1$1/g;
        s/[ \t]+/ /g;
    }
    return $text;
}
594
+
595
# Source of chart ids; every romanized string gets the next integer.
my $string_distance_chart_id = 0;

# Compute the string distance between $s1 and $s2 via romanization charts.
#
# Arguments (after the invocant):
#   $s1, $s2                   the two strings to compare
#   $lang_code1, $lang_code2   language codes passed to the romanizer and to
#                              the rule lookup for the respective string
#   *ht                        glob of the rule hash
#   *pinyin_ht                 glob of the hash handed to the romanizer
#   $control                   optional control string (defaults to "")
#
# Each string is normalized, lower-cased, romanized into a chart, linearized
# and expanded into the shared %sd_ht, which is emptied at the start of every
# call. Returns whatever string_distance() returns for the two charts.
sub string_distance_by_chart {
    local($this, $s1, $s2, $lang_code1, $lang_code2, *ht, *pinyin_ht, $control) = @_;

    defined($control) or $control = "";
    %sd_ht = ();

    # Chart for the first string.
    $s1 = $this->normalize_string($s1);
    my $lowered1 = $utf8->extended_lower_case($s1);
    my $chart1_id = ++$string_distance_chart_id;
    *chart_ht = $romanizer->romanize($lowered1, $lang_code1, "", *ht, *pinyin_ht, 0, "return chart", $chart1_id);
    $this->linearize_chart_points(*chart_ht, $chart1_id, *sd_ht);
    $this->expand_lin_ij_roman(*sd_ht, $chart1_id, $lang_code1, *ht);

    # Chart for the second string.
    $s2 = $this->normalize_string($s2);
    my $lowered2 = $utf8->extended_lower_case($s2);
    my $chart2_id = ++$string_distance_chart_id;
    *chart_ht = $romanizer->romanize($lowered2, $lang_code2, "", *ht, *pinyin_ht, 0, "return chart", $chart2_id);
    $this->linearize_chart_points(*chart_ht, $chart2_id, *sd_ht);
    $this->expand_lin_ij_roman(*sd_ht, $chart2_id, $lang_code2, *ht);

    my $distance = $this->string_distance(*sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control);
    return $distance;
}
621
+
622
# Number of quick distance computations that ran to completion (cache hits
# return early and are not counted).
my $n_quick_romanized_string_distance = 0;

# Return 1 if the text to the right of a span satisfies right-context rule
# $rule, else 0. The rule is applied as a regex anchored at the start of
# $right_context. In addition, a rule that starts with '[' and has a literal
# '$' before any closing ']' accepts an empty right context.
sub _qrsd_right_context_ok {
    my ($rule, $right_context) = @_;

    return 1 if $right_context =~ /^$rule/;
    return 1 if ($rule =~ /^\[[^\[\]]*\$/) && ($right_context eq "");
    return 0;
}

# Compute a rule-based distance between two already-romanized strings.
#
# Arguments (after the invocant):
#   $s1, $s2      the strings to compare; both are lower-cased with lc
#   *ht           glob of the rule hash; reads COST
#                 ({lang1}->{lang2}->{substr1}->{substr2}->{rule id} = cost),
#                 LEFT1/LEFT2/RIGHT1/RIGHT2 ({rule id} = context regex) and
#                 reads/writes CACHED_QRSD
#   $control      optional; if it matches /cache/, results are cached in
#                 $ht{CACHED_QRSD}->{$s1}->{$s2}
#   $lang_code1, $lang_code2
#                 optional language codes; rules for the given code and for
#                 the "" code are both tried
#
# Cell (i, j) of the cost table holds the cheapest known cost of aligning the
# first i positions of $s1 with the first j positions of $s2. A step from
# (i, j) to (i2, j2) is free when the two substrings are equal and otherwise
# costs what an applicable COST rule says.
#
# Returns the cost in cell (length($s1), length($s2)), or 99.99 if that cell
# was never reached.
#
# NOTE(review): the cache key does not include the language codes, so cached
# results are shared between calls with different codes - confirm intended.
sub quick_romanized_string_distance_by_chart {
    local($this, $s1, $s2, *ht, $control, $lang_code1, $lang_code2) = @_;

    $s1 = lc $s1;
    $s2 = lc $s2;
    $control = "" unless defined($control);
    $lang_code1 = "" unless defined($lang_code1);
    $lang_code2 = "" unless defined($lang_code2);
    my $cache_p = ($control =~ /cache/);
    my $total_cost;
    if ($cache_p) {
        $total_cost = $ht{CACHED_QRSD}->{$s1}->{$s2};
        return $total_cost if defined($total_cost);
    }
    my @lang_codes1 = ($lang_code1 eq "") ? ("") : ($lang_code1, "");
    my @lang_codes2 = ($lang_code2 eq "") ? ("") : ($lang_code2, "");
    my $chart1_end_pos = length($s1);
    my $chart2_end_pos = length($s2);
    my %sd_ht = ();
    $sd_ht{COST_IJ}->{0}->{0} = 0;
    foreach my $chart1_start (0 .. $chart1_end_pos) {
        my $left_context1 = substr($s1, 0, $chart1_start);
        foreach my $chart1_end ($chart1_start .. $chart1_end_pos) {
            my $substr1 = substr($s1, $chart1_start, ($chart1_end - $chart1_start));
            my $right_context1 = substr($s1, $chart1_end);
            my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart1_start}});
            # Index loop on purpose: the list grows while it is being walked,
            # and modifying an array inside foreach is not supported.
            for (my $k = 0; $k <= $#chart2_starts; $k++) {
                my $chart2_start = $chart2_starts[$k];
                # Positions appended below are reachable on row $chart1_end,
                # not necessarily on row $chart1_start. A cell that was never
                # reached must not be extended (it used to count as cost 0).
                my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start};
                next unless defined($preceding_cost);
                my $left_context2 = substr($s2, 0, $chart2_start);
                foreach my $chart2_end ($chart2_start .. $chart2_end_pos) {
                    my $substr2 = substr($s2, $chart2_start, ($chart2_end - $chart2_start));

                    # Collect the cost of every admissible step (i,j) -> (i2,j2).
                    my @step_costs = ();
                    if ($substr1 eq $substr2) {
                        @step_costs = (0);   # identical substrings are free
                    } else {
                        my $right_context2 = substr($s2, $chart2_end);
                        foreach my $lang_code1o (@lang_codes1) {
                            foreach my $lang_code2o (@lang_codes2) {
                                next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2});
                                my $rule_costs = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2};
                                foreach my $cost_rule_id (keys %{$rule_costs}) {
                                    # Left-context rules are regexes matched
                                    # against everything before the span.
                                    my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id};
                                    next if $left_context_rule1
                                         && ($left_context1 !~ /$left_context_rule1/);
                                    my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id};
                                    next if $left_context_rule2
                                         && ($left_context2 !~ /$left_context_rule2/);
                                    my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id};
                                    next if $right_context_rule1
                                         && (! _qrsd_right_context_ok($right_context_rule1, $right_context1));
                                    my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id};
                                    next if $right_context_rule2
                                         && (! _qrsd_right_context_ok($right_context_rule2, $right_context2));
                                    push(@step_costs, $rule_costs->{$cost_rule_id});
                                }
                            }
                        }
                    }

                    # Keep the cheapest way of reaching ($chart1_end, $chart2_end).
                    foreach my $cost (@step_costs) {
                        my $combined_cost = $preceding_cost + $cost;
                        my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end};
                        if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                            $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost;
                            push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                        }
                    }
                }
            }
        }
    }
    $total_cost = $sd_ht{COST_IJ}->{$chart1_end_pos}->{$chart2_end_pos};
    $total_cost = 99.99 unless defined($total_cost);
    $ht{CACHED_QRSD}->{$s1}->{$s2} = $total_cost if $cache_p;
    $n_quick_romanized_string_distance++;
    return $total_cost;
}
718
+
719
# Report how many quick_romanized_string_distance_by_chart computations have
# run to completion so far. Calls answered from the cache return before the
# counter is incremented and are therefore not included.
# Any arguments (including the invocant) are ignored.
sub get_n_quick_romanized_string_distance {
    my $completed_runs = $n_quick_romanized_string_distance;
    return $completed_runs;
}
722
+
723
+ 1;
724
+
uroman/lib/NLP/utilities.pm ADDED
The diff for this file is too large to render. See raw diff
 
uroman/tarballs/uroman-v1.0.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:912655beef069e5abb43c8fc4c3c4428fd0af6f4a1697accc98277933d3e1ee5
3
+ size 440252
uroman/tarballs/uroman-v1.1.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df990f6096a10e093ac5f28c2b86d5ef9e9098ef7472855843f9a841bb3b963d
3
+ size 507234
uroman/tarballs/uroman-v1.2.4.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77d707f3c17d5c45869b80fe71caee6023d1d9949ccffb446626f374605a25e2
3
+ size 503690
uroman/tarballs/uroman-v1.2.5.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e9044afff8b4483f43a99b1fb1279889336760d76245ee93f300e660a46660
3
+ size 575581
uroman/tarballs/uroman-v1.2.6.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02f6f73b067b972a8f7d408da2f9b22741629af67f55b2ea768d11710fbf40a4
3
+ size 567522
uroman/tarballs/uroman-v1.2.7.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbb51506ed3ea6dcb902c824e62bea39b3741f6526564ba05d6e0083d8d876e5
3
+ size 566800
uroman/tarballs/uroman-v1.2.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c69e56d9c5eea9416ae00ca4dd859a1ef5129c1867778b66ad2f811f0fd33c9
3
+ size 494625
uroman/test/multi-script.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::lcode deu Grüße aus Bordeaux
2
+ ::lcode tur İstanbul, Türkiye'de yer alan şehir ve ülkenin 81 ilinden biri.
3
+ ::lcode eng ⠠⠺⠑⠀⠓⠕⠇⠙⠀⠘⠮⠀⠞⠗⠥⠹⠎⠀⠞⠕⠀⠆⠀⠎⠑⠇⠋⠤⠑⠧⠊⠙⠢⠞⠂⠀⠞⠀⠁⠇⠇⠀⠍⠑⠝⠀⠜⠑⠀⠉⠗⠂⠞⠫⠀⠑⠟⠥⠁⠇⠂⠀⠞⠀⠮⠽⠀⠜⠑⠀⠑⠝⠙⠪⠫⠀⠃⠽⠀⠸⠮⠀⠠⠉⠗⠑⠁⠞⠕⠗⠀⠾⠀⠉⠻⠞⠁⠔⠀⠥⠝⠁⠇⠊⠑⠝⠁⠃⠇⠑⠀⠠⠐⠗⠎⠂⠀⠞⠀⠁⠍⠰⠛⠀⠘⠮⠀⠜⠑⠀⠠⠇⠊⠋⠑⠂⠀⠠⠇⠊⠃⠻⠞⠽⠀⠯⠀⠮⠀⠏⠥⠗⠎⠥⠊⠞⠀⠷⠀⠠⠓⠁⠏⠏⠊⠰⠎⠲
4
+ ::lcode ell Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.
5
+ ::lcode rus Герма́ния (нем. Deutschland), официальное название — Федерати́вная Респу́блика Герма́ния (нем. Bundesrepublik Deutschland), ФРГ (нем. BRD) — государство в Западной Европе. Площадь территории — 357 021 км². Численность населения по переписи 2011 года — более 80 миллионов человек. [2][6].
6
+ ::lcode ukr Володи́мир Олекса́ндрович Зеле́нський (нар. 25 січня 1978, Кривий Ріг) — український державний діяч, політик, шоумен, актор, комік, режисер, продюсер та сценарист, шостий Президент України з 20 травня 2019 року.
7
+ ::lcode srp Сва људска бића рађају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства.
8
+ ::lcode ara كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
9
+ ::lcode fas کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لس‌آنجلس، سن دیگو، سن خوزه و سان‌فرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است.
10
+ ::lcode uig ئامېرىكا قوشما شتاتلىرى بولسا شىمالىي ئامېرىكاغا جايلاشقان بىر دۆلەت. ئۇنىڭ پايتەختى بولسا ۋاشىنگتون، ئەڭ چوڭ شەھىرى بولسا نيۇيورك شەھىرى. دۆلەت تىلى بولسا ئېنگلىزتىلى. ھازىرقى زۇڭتۇڭ باراك ئوباما. بۇ دۆلەت ئەسلىدە ئەنگىلىيەنىڭ مۇستەملىكىسى بولۇپ ۋاشىنگىتوننىڭ رەھپەرلىكىدە 1776 يىلى 7 ئاينىڭ 4 كۇنى مۇستەقىل بولغان، يەر مەيدانى 9 مىلىيون 826 مىڭ 630 كۋادىرات كلومېتىر، نوپۇسى 306 مىللىيون 142 مىڭ، بۇلارنىڭ ئاسساسلىق دىنى خرىستىئان دىنى.
11
+ ::lcode amh ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
12
+ ::lcode hin कैलिफ़ोर्निया शब्द का पहला अर्थ था जो क्षेत्र जहाँ आज बाहा कैलिफ़ोर्निया प्रायद्वीप, नेवाडा, यूटा और एरिज़ोना, नया मेक्सिको, और वायोमिंग के कई विभाग स्थित हैं।
13
+ ::lcode mar लंडन (इंग्लिश: London ) हे इंग्लंडचे व युनायटेड किंग्डमचे राजधानीचे व सर्वात मोठे शहर तसेच युरोपियन संघामधील सर्वात मोठे महानगर क्षेत्र आहे.
14
+ ::lcode nep यसको उचाइ समुन्द्र सतहबाट ८,८४८ मीटर (२९,०२८ फीट) छ। यो नेपालको सोलुखुम्बु जिल्लाको खुम्जुङ्ग गा. वि. स. मा पर्छ ।
15
+ ::lcode tam தமிழ்நாடு (Tamil Nadu) இந்தியாவின் 29 மாநிலங்களில் ஒன்றாகும். தமிழ்நாடு, தமிழகம் என்றும் பரவலாக அழைக்கப்படுகிறது.
16
+ ::lcode mal ഇന്ത്യയുടെ തെക്കുപടിഞ്ഞാറെ അറ്റത്തുള്ള സംസ്ഥാനമാണ് കേരളം.
17
+ ::lcode ori ଓଡ଼ିଶା ଭାରତର ପୂର୍ବ ଉପକୂଳରେ ଥିବା ଏକ ପ୍ରଶାସନିକ ରାଜ୍ୟ । ଏହାର ଉତ୍ତର-ପୂର୍ବରେ ପଶ୍ଚିମବଙ୍ଗ, ଉତ୍ତରରେ ଝାଡ଼ଖଣ୍ଡ, ପଶ୍ଚିମ ଓ ଉତ୍ତର-ପଶ୍ଚିମରେ ଛତିଶଗଡ଼, ଦକ୍ଷିଣ ଓ ଦକ୍ଷିଣ-ପଶ୍ଚିମରେ ଆନ୍ଧ୍ରପ୍ରଦେଶ ଅବସ୍ଥିତ । ଏହା ଆୟତନ ହିସାବରେ ନବମ ଓ ଜନସଂଖ୍ୟା ହିସାବରେ ଏଗାରତମ ରାଜ୍ୟ । ଓଡ଼ିଆ ଭାଷା ରାଜ୍ୟର ସରକାରୀ ଭାଷା । ୨୦୦୧ ଜନଗଣନା ଅନୁସାରେ ରାଜ୍ୟର ପ୍ରାୟ ୩୩.୨ ନିୟୁତ ଲୋକ ଓଡ଼ିଆ ଭାଷା ବ୍ୟବହାର କରନ୍ତି ।
18
+ ::lcode zho 加拿大在一万四千年前即有原住民在此生活。
19
+ ::lcode heb כֹּל עוֹד בַּלֵּבָב פְּנִימָה נֶפֶשׁ יְהוּדִי הוֹמִיָּה וּלְפַאֲתֵי מִזְרָח, קָדִימָה, עַיִן לְצִיּוֹן צוֹפִיָּה, עוֹד לֹא אָבְדָה תִּקְוָתֵנוּ, הַתִּקְוָה בַּת שְׁנוֹת אַלְפַּיִם לִהְיוֹת עַם חָפְשִׁי בְּאַרְצֵנוּ, אֶרֶץ צִיּוֹן וִירוּשָׁלַיִם.
20
+ ::lcode yid דווקא איז אן העברעישער זשורנאל וואס באשרייבט די יידיש־שפראכיקע קולטור. עס איז דערשינען געווארן תמוז ה'תשס"ז (יולי 2006).
21
+ ::lcode hye Տալնոեի շրջան (ուկր.՝ Тальнівський район), շրջան Ուկրաինայի Չերկասիի մարզում։ Ստեղծվել է 1923 թվականին։ Վարչական կենտրոնը՝ Տալնոե։ Աշխարհագրությունը Շրջանի տարածքի մակերեսը կազմում է 917 կմ²։ Բնակչություն
22
+ ::lcode tai มีประเทศอิสระ 2 ประเทศ คือ ซานมารีโนและนครรัฐวาติกัน เป็นดินแดนที่ล้อมรอบไปด้วยพื้นที่ของอิตาลี ในขณะที่เมืองกัมปีโอเนดีตาเลีย เป็นดินแดนส่วนแยกของอิตาลีที่ถูกล้อมรอบด้วยพื้นที่ประเทศสวิตเซอร์แลนด์
23
+ 북쪽에는 인도네시아와 동티모르, 파푸아 뉴기니, 북동쪽에는 솔로몬 제도와 바누아투, 누벨칼레도니, 그리고 남동쪽에는 뉴질랜드가 있다.
24
+ ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗೀ... ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗಿ ಭವ ಭವದಿ ಭತಿಸಿಹೇ ಭವತಿ ದೂರ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ || ಬಾ ಇಲ್ಲಿ ||
25
+ ვეპხის ტყაოსანი შოთა რუსთაველი ღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე, მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა.
26
+ ᚛ᚐᚅᚋ ᚋᚖᚂᚓᚌᚖᚋᚏᚔᚇ ᚋᚐᚉᚔ ᚍᚓᚉᚒᚋᚓᚅ᚜
27
+ ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬
28
+ 𓊪𓏏𓍯𓃭𓐝𓇌𓋴
29
+ チェコスロバキア
30
+ ལྷ་ས་གྲོང་ཁྱེར
31
+ ᓵᓕ ᓴᕕᐊᕐᔪᒃ ᐃᒻᒥᓂᒃ ᓂᓪᓕᕈᑎᖃᓲᖑᕗᖅ ᑕᐃᑦᓱᒪᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ. ᐃᒻᒥᓂᓪᓗᑕᐅᖅ ᓂᓪᓕᕈᑎᖃᓱᖑᒻᒥᓱᓂ ᐅᓪᓗᒥᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ.
32
+ ⴰⵎⴰⴳⵔⴰⴷ 1 ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ.
uroman/test/multi-script.uroman-ref.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ::lcode deu Gruesse aus Bordeaux
2
+ ::lcode tur Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri.
3
+ ::lcode eng We hold ⠘e truos to ; self-evid⠢t, t all men aee cr,te equal, t ey aee endoee by ⠸e Creator u cita⠔ unalienable ⠠⠐rs, t amg ⠘e aee Life, Libity ⠯ e pursuit a Happis.
4
+ ::lcode ell To Los Andzeles (sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou.
5
+ ::lcode rus Germaniya (nem. Deutschland), ofitsialnoe nazvanie — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoi Evrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — bolee 80 millionov chelovek. [2][6].
6
+ ::lcode ukr Volodimir Oleksandrovich Zelensky (nar. 25 sichnya 1978, Krivy Rig) — ukrayinsky derzhavny diyach, politik, shoumen, aktor, komik, rezhiser, prodyuser ta stsenarist, shosty Prezident Ukrayini z 20 travnya 2019 roku.
7
+ ::lcode srp Sva ljudska bitsha radjaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sveshtshu i treba jedni prema drugima da postupaju u dukhu bratstva.
8
+ ::lcode ara knda (balinjlyzya: Canada) hy dwla fy amryka alshmalya ttalf mn 10 mqat'at wthlatha aqalym. tq' fy alqsm alshmaly mn alqara wtmtd mn almhyt alatlsy fy alshrq ila almhyt alhadye fy alghrb wtmtd shmalan fy almhyt almtjmd alshmaly. knda hy albld althany 'almyan mn hyth almsaha alklya. kma an hdwd knda almshtrka m' alwlayat almthda mn aljnwb walshmal alghrby hy alatwl fy al'alm.
9
+ ::lcode fas kalifrnia (bh anglisi: California) ialti dr ghrb amrika br kranh' aqianws aram ast. mrkz an sakramntw w shhrhai mhm an lsanjls, sn digw, sn khwzh w sanfransiskw hstnd.hmtchnin in ialt pr jm'it trin ialt amrika ast.
10
+ ::lcode uig yeameraka qwshma shtatlara bwlsa shamalay yeamerakagha jaylashqan bar doelaet. yeunang paytaekhta bwlsa vashangtwn, yeaeng tchwng shaehara bwlsa nyuywrk shaehara. doelaet tala bwlsa yeenglaztala. hazarqa zungtung barak yewbama. bu doelaet yeaesladae yeaengalayaenang mustaemlakasa bwlup vashangatwnnang raehpaerlakadae 1776 yala 7 yeaynang 4 kuna mustaeqal bwlghan, yaer maeydana 9 malaywn 826 mang 630 kvadarat klwmetar, nwpusa 306 mallaywn 142 mang, bularnang yeassaslaq dana khrastayean dana.
11
+ ::lcode amh iteyopheyaa kaaalame sosetu teleqe yaaberehaame hayemaanotoche gaare taarikaawi genenyunate alaate.
12
+ ::lcode hin kailiphorniyaa shabda kaa pahalaa artha thaa jo kssetra jahaam aaj baahaa kailiphorniyaa praayadviip, nevaaddaa, yuuttaa aur erijonaa, nayaa meksiko, aur vaayomimga ke kaii vibhaag sthit haim.
13
+ ::lcode mar lamddan (imglish: London ) he imglamddace va yunaayattedd kimgddamace raajadhaaniice va sarvaat motthe shahar tasec yuropiyan samghaamadhiil sarvaat motthe mahaanagar kssetra aahe.
14
+ ::lcode nep yasako ucaai samundra satahabaatt 8,848 miittar (29,028 phiitt) cha. yo nepaalako solukhumbu jillaako khumjungga gaa. vi. sa. maa parcha .
15
+ ::lcode tam tamilnaadu (Tamil Nadu) intiyaavin 29 maanilangkalil onraakum. tamilnaadu, tamilakam enrum paravalaaka alaikkappadukiratu.
16
+ ::lcode mal intyayutte tekkupattinynyaarre arrrrattulllla samsthaanamaann keerallam.
17
+ ::lcode ori oddishaa bhaaratara puurba upakuullare thibaa eka prashaasanika raajya . ehaara uttara-puurbare pashcimabangga, uttarare jhaaddakhanndda, pashcima o uttara-pashcimare chatishagadda, dakssinna o dakssinna-pashcimare aandhrapradesha abasthita . ehaa aayatana hisaabare nabama o janasamkhyaa hisaabare egaaratama raajya . oddiaa bhaassaa raajyara sarakaarii bhaassaa . 2001 janagannanaa anusaare raajyara praaya 33.2 niyuta loka oddiaa bhaassaa byabahaara karanti .
18
+ ::lcode zho jianadazai14000nianqianjiyouyuanzhuminzaicishenghuo.
19
+ ::lcode heb kol 'od balevav penimah nefesh yehudi homiyah ulefa'ate mizerach, qadimah, 'ayin letsiyon tsofiyah, 'od lo avedah tiqvatenu, hatiqvah bat shenot 'alepayim liheyot 'am chafeshiy be'aretsenu, erets tsiyon virushalayim.
20
+ ::lcode yid dvvqa ayz an h'vr'ysh'r zshvrnal vvas vashryyvt dy yydysh-shfrakyq' qvltvr. 's ayz d'rshyn'n g'vvarn tmvz h'tshs"z (yvly 2006).
21
+ ::lcode hye Talnoei shrjan (ukr., Talnivsky raion), shrjan Ukrainayi Cherkasii marzum. Steghtsvel e 1923 tvakanin. Varchakan kentrone, Talnoe. Ashkharhagrutyune Shrjani taratski makerese kazmum e 917 km². Bnakchutyun
22
+ ::lcode tai miipratesisra 2 prates kuee saanmaariinolaeankrratwaatikan peondindaentiilomrobpaidwypueentiikongitaalii naiknatiimeueengkampiionediitaaleiiy peondindaenswnyaekkongitaaliitiituuklomrobdwypueentiipratesswitserlaend
23
+ bugjjogeneun indonesiawa dongtimoreu, papua nyugini, bugdongjjogeneun solromon jedowa banuatu, nubelkalredoni, geurigo namdongjjogeneun nyujilraendeuga issda.
24
+ baa illi sambhavisu imdenna hrdayadali nityavuu avataripa satyaavataara mannnnaagi maravaagi migavaagi kagavaagii... mannnnaagi maravaagi migavaagi kagavaagi bhava bhavadi bhatisihee bhavati duura nityavuu avataripa satyaavataara || baa illi ||
25
+ vepxis tqaosani shota rustaveli ghmertsi shemvedre, nutu kvla damxsnas sophlisa shromasa, tsetsxls, tsqalsa da mitsasa, haerta tana mromasa; momtsnes phrteni da aghvphrinde, mivhxvde mas chemsa ndomasa, dghisit da ghamit vhxedvide mzisa elvata krtomaasa.
26
+ anm moilegoimrid maki vekumen
27
+ ic mag glas eotan ond hit ne hearmiath me.
28
+ ptolmys
29
+ chekosurobakia
30
+ lha·sa·grong·khyer
31
+ saali safiaryok imminik nillirotiqasoongofoq taitsomanitatsayaonirarsoni. imminillotaoq nillirotiqasongommisoni ollominitatsayaonirarsoni.
32
+ amagrad 1 ar d ttlalan middn gan ilellitn mgaddan gh waddur d izrfan, yili ak darsn unlli d ufrak, illa flla sn ad ttmyawasn ngratsn s tagmat.
uroman/test/string-similarity-test-input.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ trap strap
2
+ colour color
3
+ labeling labelling
4
+ organisation organization
5
+ Philadelphia Filadelfia
6
+ Vladimir Volodymyr
7
+ Moskva Moskvoy
uroman/test/string-similarity-test-output-ref.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Lang-code-1: eng Lang-code-2: eng
2
+ trap strap 1
3
+ colour color 0.1
4
+ labeling labelling 0.02
5
+ organisation organization 0.1
6
+ Philadelphia Filadelfia 0.02
7
+ Vladimir Volodymyr 0.5
8
+ Moskva Moskvoy 0.5
uroman/text/amh.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
2
+ ክርስትናን በአራተኛው ምዕተ-ዓመት ተቀብላለች።
3
+ ከሕዝቡ አንድ ሶስተኛው እስላም ነው።
4
+ የመጀመሪያው የእስላም ሂጅራ ወደ ኢትዮጵያ ነው የተከናወነው።
5
+ ነጋሽ በአፍሪካ የመጀመሪያው የእስላም መቀመጫ ናት።
6
+ እስከ ፲፱፻፸ ዎቹ ድረስ ብዙ ቤተ-እስራኤሎች በኢትዮጵያ ይኖሩ ነበር።
7
+ የራስ ተፈሪ እንቅስቃሴ ኢትዮጵያን በትልቅ ክብር ነው የሚያያት።
uroman/text/ara.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
2
+ أراضي كندا مأهولة منذ آلاف السنين من قبل مجموعات مختلفة من السكان الأصليين. مع حلول أواخر القرن الخامس عشر بدأت الحملات البريطانية والفرنسية استكشاف المنطقة ومن ثم استوطنتها على طول ساحل المحيط الأطلسي. تنازلت فرنسا عن ما يقرب من جميع مستعمراتها في أمريكا الشمالية في عام 1763 بعد حرب السنوات السبع. في عام 1867، مع اتحاد ثلاثة مستعمرات بريطانية في أمريكا الشمالية عبر كونفدرالية تشكلت كندا باعتبارها كيانًا فدراليًا ذا سيادة يضم أربع مقاطعات. بدأ ذلك عملية اتسعت فيها مساحة كندا وتوسع حكمها الذاتي عن المملكة المتحدة. تجلت هذه الاستقلالية من خلال تشريع وستمنستر عام 1931 وبلغت ذروتها في صورة قانون كندا عام 1982 والذي قطع الاعتماد القانوني لكندا على البرلمان البريطاني.
3
+ كندا دولة فيدرالية يحكمها نظام ديمقراطي تمثيلي وملكية دستورية حيث الملكة إليزابيث الثانية قائدة للدولة. الأمة الكندية أمة ثنائية اللغة حيث الإنكليزية والفرنسية لغتان رسميتان على المستوى الاتحادي. تعد كندا واحدة من أكثر دول العالم تطوراً، حيث تمتلك اقتصاداً متنوعاً وتعتمد على مواردها الطبيعية الوفيرة، وعلى التجارة وبخاصة مع الولايات المتحدة اللتان تربطهما علاقة طويلة ومعقدة. كندا عضو في مجموعة الدول الصناعية السبع ومجموعة الثماني ومجموعة العشرين وحلف شمال الأطلسي ومنظمة التعاون والتنمية الاقتصادية ومنظمة التجارة العالمية ودول الكومنولث والفرنكوفونية ومنظمة الدول الأمريكية والإبيك والأمم المتحدة. تمتلك كندا واحداً من أعلى مستويات المعيشة في العالم حيث مؤشر التنمية البشرية يضعها في المرتبة الثامنة عالمياً.