Spaces:
Runtime error
Runtime error
Duplicate from Akmyradov/TurkmenTTSweSTT
Browse files
Co-authored-by: Yslam <Akmyradov@users.noreply.huggingface.co>
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +34 -0
- README.md +14 -0
- app.py +94 -0
- asr.py +41 -0
- data/asr/all_langs.tsv +1 -0
- data/lid/all_langs.tsv +4017 -0
- data/tts/all_langs.tsv +1 -0
- requirements.txt +11 -0
- tts.py +173 -0
- uroman/.gitignore +35 -0
- uroman/LICENSE.txt +11 -0
- uroman/README.md +165 -0
- uroman/README.txt +141 -0
- uroman/bin/de-accent.pl +201 -0
- uroman/bin/string-distance.pl +99 -0
- uroman/bin/uroman-quick.pl +58 -0
- uroman/bin/uroman-tsv.sh +28 -0
- uroman/bin/uroman.pl +138 -0
- uroman/data/Chinese_to_Pinyin.txt +0 -0
- uroman/data/Scripts.txt +135 -0
- uroman/data/UnicodeData.txt +0 -0
- uroman/data/UnicodeDataOverwrite.txt +442 -0
- uroman/data/romanization-table-arabic-block.txt +179 -0
- uroman/data/romanization-table.txt +2019 -0
- uroman/data/romanization-table.v1.2.1.txt +814 -0
- uroman/data/string-distance-cost-rules.txt +896 -0
- uroman/lib/JSON.pm +2317 -0
- uroman/lib/JSON/backportPP.pm +2806 -0
- uroman/lib/JSON/backportPP/Boolean.pm +27 -0
- uroman/lib/JSON/backportPP/Compat5005.pm +131 -0
- uroman/lib/JSON/backportPP/Compat5006.pm +173 -0
- uroman/lib/NLP/Chinese.pm +239 -0
- uroman/lib/NLP/English.pm +0 -0
- uroman/lib/NLP/Romanizer.pm +2020 -0
- uroman/lib/NLP/UTF8.pm +1404 -0
- uroman/lib/NLP/stringDistance.pm +724 -0
- uroman/lib/NLP/utilities.pm +0 -0
- uroman/tarballs/uroman-v1.0.tar.gz +3 -0
- uroman/tarballs/uroman-v1.1.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.4.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.5.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.6.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.7.tar.gz +3 -0
- uroman/tarballs/uroman-v1.2.tar.gz +3 -0
- uroman/test/multi-script.txt +32 -0
- uroman/test/multi-script.uroman-ref.txt +32 -0
- uroman/test/string-similarity-test-input.txt +7 -0
- uroman/test/string-similarity-test-output-ref.txt +8 -0
- uroman/text/amh.txt +7 -0
- uroman/text/ara.txt +3 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: MMS
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: indigo
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.32.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: cc-by-nc-4.0
|
11 |
+
duplicated_from: Akmyradov/TurkmenTTSweSTT
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import librosa
|
3 |
+
from asr import transcribe
|
4 |
+
from tts import synthesize, TTS_EXAMPLES
|
5 |
+
|
6 |
+
# Language tables keyed by task ("tts", "asr", "lid"). Each table maps an ISO
# code (e.g. "tuk-script_latin") to its human-readable language name.
ALL_LANGUAGES = {}

for task in ["tts", "asr", "lid"]:
    ALL_LANGUAGES.setdefault(task, {})
    # The tables contain non-ASCII names (e.g. "Norwegian Bokmål"), so decode
    # explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the data files are stored as UTF-8 - confirm.
    with open(f"data/{task}/all_langs.tsv", encoding="utf-8") as f:
        for line in f:
            # Drop the line terminator; otherwise every name ends in "\n" and
            # the "<iso>: <name>" strings built from it never equal defaults
            # such as "tuk-script_latin: Turkmen".
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Split on the first run of whitespace so both space- and
            # tab-separated rows work; names may themselves contain spaces.
            # A row holding only an ISO code still raises ValueError.
            iso, name = line.split(None, 1)
            ALL_LANGUAGES[task][iso] = name
|
14 |
+
|
15 |
+
|
16 |
+
def identify(microphone, file_upload):
    """Return language-identification scores for a recorded or uploaded clip.

    Args:
        microphone: File path of the microphone recording, or None.
        file_upload: File path of the uploaded audio file, or None.

    Returns:
        An ``"ERROR: ..."`` string when neither input is provided; otherwise a
        dict mapping ``"<iso>: <language name>"`` labels to scores.

    Raises:
        KeyError: If a scored ISO code is missing from ``ALL_LANGUAGES["lid"]``.

    NOTE(review): the scores are hard-coded and do not depend on the audio;
    presumably a placeholder until a real LID model is wired in - confirm.
    """
    LID_SAMPLING_RATE = 16_000

    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # When both inputs are present the microphone recording takes precedence.
    # (The original built a warning string for that case but never returned
    # it; the dict result has no slot for free text, so it is dropped here.)
    audio_fp = microphone if microphone is not None else file_upload

    # The decoded samples are not used yet, but the call is kept so that an
    # unreadable file still raises exactly as before.
    librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)

    raw_output = {"eng": 0.9, "hin": 0.04, "heb": 0.03, "ara": 0.02, "fra": 0.01}
    lid_names = ALL_LANGUAGES["lid"]
    return {(k + ": " + lid_names[k]): v for k, v in raw_output.items()}
|
34 |
+
|
35 |
+
|
36 |
+
# Root container of the app; the tabbed interfaces are attached to it below.
demo = gr.Blocks()
|
37 |
+
|
38 |
+
# Speech-to-text tab: a microphone recording and/or an uploaded file plus a
# language selector are passed to `transcribe`, which returns plain text.
_asr_language_choices = [
    f"{code}: {label}" for code, label in ALL_LANGUAGES["asr"].items()
]
_asr_inputs = [
    gr.Audio(source="microphone", type="filepath"),
    gr.Audio(source="upload", type="filepath"),
    gr.Dropdown(
        _asr_language_choices,
        label="Language",
        value="tuk-script_latin: Turkmen",
    ),
]

mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=_asr_inputs,
    outputs="text",
    title="Speech-to-text",
    description="Transcribe audio!",
    allow_flagging="never",
)
|
54 |
+
|
55 |
+
# Text-to-speech tab: input text, a language selector and a speed slider are
# passed to `synthesize`; the outputs are the generated audio and the text
# that remained after out-of-vocabulary characters were removed.
_tts_language_choices = [
    f"{code}: {label}" for code, label in ALL_LANGUAGES["tts"].items()
]
_tts_inputs = [
    gr.Text(label="Input text"),
    gr.Dropdown(
        _tts_language_choices,
        label="Language",
        value="tuk-script_latin: Turkmen",
    ),
    gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
]
_tts_outputs = [
    gr.Audio(label="Generated Audio", type="numpy"),
    gr.Text(label="Filtered text after removing OOVs"),
]

mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=_tts_inputs,
    outputs=_tts_outputs,
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description="Generate audio!",
    allow_flagging="never",
)
|
75 |
+
|
76 |
+
# Language-identification tab: a microphone recording and/or an uploaded file
# are passed to `identify`; the result is rendered as a label widget showing
# up to ten classes.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=10),
    title="Language Identification",
    # Fixed user-facing typo: "Identity" -> "Identify".
    description="Identify the language of audio!",
    allow_flagging="never",
)
|
87 |
+
|
88 |
+
# Attach the three interfaces to the root container as tabs, in display
# order, then start the Gradio server.
with demo:
    gr.TabbedInterface(
        interface_list=[mms_synthesize, mms_transcribe, mms_identify],
        tab_names=["Text-to-speech", "Speech-to-text", "Language Identification"],
    )

demo.launch()
|
asr.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
3 |
+
import torch
|
4 |
+
|
5 |
+
# Sampling rate (Hz) that input audio is resampled to before it is passed to
# the processor and model.
ASR_SAMPLING_RATE = 16_000

# Identifier of the pretrained checkpoint handed to `from_pretrained`.
MODEL_ID = "facebook/mms-1b-all"

# Both objects are created once at import time and shared by every
# `transcribe` call.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
|
12 |
+
|
13 |
+
|
14 |
+
def transcribe(microphone, file_upload, lang):
    """Transcribe a recorded or uploaded audio clip in the selected language.

    Args:
        microphone: File path of the microphone recording, or None.
        file_upload: File path of the uploaded audio file, or None.
        lang: Language selection formatted as ``"<code>: <name>"``; only the
            part before the first ``":"`` is used.

    Returns:
        The transcription, prefixed with a warning when both audio inputs were
        supplied, or an ``"ERROR: ..."`` string when no audio or no language
        was provided.
    """
    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"
    if not lang:
        # A missing selection would otherwise fail with AttributeError on
        # `lang.split`; report it in the same style as the check above.
        return "ERROR: You have to select a language"

    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    audio_fp = microphone if microphone is not None else file_upload
    audio_samples = librosa.load(audio_fp, sr=ASR_SAMPLING_RATE, mono=True)[0]

    # Point the shared tokenizer and model at the requested language before
    # running inference.
    lang_code = lang.split(":")[0]
    processor.tokenizer.set_target_lang(lang_code)
    model.load_adapter(lang_code)

    inputs = processor(
        audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Take the highest-scoring token id along the last dimension for the first
    # (only) item in the batch and decode it to text.
    ids = torch.argmax(outputs, dim=-1)[0]
    transcription = processor.decode(ids)
    return warn_output + transcription
|
data/asr/all_langs.tsv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tuk-script_latin Turkmen
|
data/lid/all_langs.tsv
ADDED
@@ -0,0 +1,4017 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ara Arabic
|
2 |
+
eng English
|
3 |
+
cmn Chinese, Mandarin
|
4 |
+
spa Spanish
|
5 |
+
fra French
|
6 |
+
mlg Malagasy
|
7 |
+
ful Fulah
|
8 |
+
swe Swedish
|
9 |
+
por Portuguese
|
10 |
+
zlm Malay
|
11 |
+
sun Sunda
|
12 |
+
tuk Turkmen
|
13 |
+
vie Vietnamese
|
14 |
+
kor Korean
|
15 |
+
hin Hindi
|
16 |
+
ben Bengali
|
17 |
+
som Somali
|
18 |
+
asm Assamese
|
19 |
+
swh Swahili
|
20 |
+
urd Urdu
|
21 |
+
hau Hausa
|
22 |
+
ind Indonesian
|
23 |
+
tat Tatar
|
24 |
+
bod Tibetan, Central
|
25 |
+
tel Telugu
|
26 |
+
mon Mongolian
|
27 |
+
aze Azerbaijani
|
28 |
+
rus Russian
|
29 |
+
tgl Tagalog
|
30 |
+
tur Turkish
|
31 |
+
mar Marathi
|
32 |
+
amh Amharic
|
33 |
+
ron Romanian
|
34 |
+
yor Yoruba
|
35 |
+
tha Thai
|
36 |
+
slv Slovene
|
37 |
+
heb Hebrew
|
38 |
+
mkd Macedonian
|
39 |
+
bel Belarusian
|
40 |
+
nya Chichewa
|
41 |
+
mal Malayalam
|
42 |
+
bul Bulgarian
|
43 |
+
hun Hungarian
|
44 |
+
hat Haitian Creole
|
45 |
+
fas Persian
|
46 |
+
hrv Croatian
|
47 |
+
cat Catalan
|
48 |
+
tam Tamil
|
49 |
+
orm Oromo
|
50 |
+
kmr Kurdish, Northern
|
51 |
+
nld Dutch
|
52 |
+
cak Kaqchikel
|
53 |
+
afr Afrikaans
|
54 |
+
pol Polish
|
55 |
+
jav Javanese
|
56 |
+
lin Lingala
|
57 |
+
cym Welsh
|
58 |
+
kik Gikuyu
|
59 |
+
nob Norwegian Bokmål
|
60 |
+
grn Guarani
|
61 |
+
snd Sindhi
|
62 |
+
kaz Kazakh
|
63 |
+
isl Icelandic
|
64 |
+
uzb Uzbek
|
65 |
+
bos Bosnian
|
66 |
+
mya Burmese
|
67 |
+
lat Latin
|
68 |
+
deu German, Standard
|
69 |
+
npi Nepali
|
70 |
+
che Chechen
|
71 |
+
yue Chinese, Yue
|
72 |
+
kat Georgian
|
73 |
+
kan Kannada
|
74 |
+
lit Lithuanian
|
75 |
+
mam Mam
|
76 |
+
sqi Albanian
|
77 |
+
hye Armenian
|
78 |
+
jpn Japanese
|
79 |
+
ell Greek
|
80 |
+
crh Crimean Tatar
|
81 |
+
lav Latvian
|
82 |
+
khm Khmer
|
83 |
+
bak Bashkort
|
84 |
+
poh Poqomchi’
|
85 |
+
quc K’iche’
|
86 |
+
pan Punjabi, Eastern
|
87 |
+
ixl Ixil
|
88 |
+
xog Soga
|
89 |
+
ces Czech
|
90 |
+
tgk Tajik
|
91 |
+
cfm Chin, Falam
|
92 |
+
fao Faroese
|
93 |
+
guj Gujarati
|
94 |
+
aka Akan
|
95 |
+
ukr Ukrainian
|
96 |
+
glg Galician
|
97 |
+
ltz Luxembourgish
|
98 |
+
sxn Sangir
|
99 |
+
sna Shona
|
100 |
+
lao Lao
|
101 |
+
mlt Maltese
|
102 |
+
sin Sinhala
|
103 |
+
lug Ganda
|
104 |
+
aiw Aari
|
105 |
+
kia Kim
|
106 |
+
ayo Ayoreo
|
107 |
+
dtp Kadazan Dusun
|
108 |
+
cmo Mnong, Central
|
109 |
+
nhx Nahuatl, Isthmus-Mecayapan
|
110 |
+
gag Gagauz
|
111 |
+
tzj Tz’utujil
|
112 |
+
tuv Turkana
|
113 |
+
acr Achi
|
114 |
+
mri Maori
|
115 |
+
eus Basque
|
116 |
+
pus Pushto
|
117 |
+
quy Quechua, Ayacucho
|
118 |
+
srp Serbian
|
119 |
+
ita Italian
|
120 |
+
nno Norwegian Nynorsk
|
121 |
+
xsm Kasem
|
122 |
+
luo Dholuo
|
123 |
+
ory Odia
|
124 |
+
gur Farefare
|
125 |
+
cac Chuj
|
126 |
+
quh Quechua, South Bolivian
|
127 |
+
ewe Éwé
|
128 |
+
kbp Kabiyè
|
129 |
+
saq Samburu
|
130 |
+
slk Slovak
|
131 |
+
xon Konkomba
|
132 |
+
fin Finnish
|
133 |
+
mos Mòoré
|
134 |
+
bwq Bobo Madaré, Southern
|
135 |
+
yao Yao
|
136 |
+
hne Chhattisgarhi
|
137 |
+
rif Tarifit
|
138 |
+
new Newar
|
139 |
+
hus Huastec
|
140 |
+
dyu Jula
|
141 |
+
bre Breton
|
142 |
+
guh Guahibo
|
143 |
+
bis Bislama
|
144 |
+
yid Yiddish
|
145 |
+
txa Tombonuo
|
146 |
+
mnk Mandinka
|
147 |
+
uig Uyghur
|
148 |
+
bqc Boko
|
149 |
+
dan Danish
|
150 |
+
ngl Lomwe
|
151 |
+
pse Malay, Central
|
152 |
+
bam Bamanankan
|
153 |
+
mtg Una
|
154 |
+
pmf Pamona
|
155 |
+
onb Lingao
|
156 |
+
ntm Nateni
|
157 |
+
tso Tsonga
|
158 |
+
bno Bantoanon
|
159 |
+
teo Ateso
|
160 |
+
uhn Damal
|
161 |
+
ycl Lolopo
|
162 |
+
bus Bokobaru
|
163 |
+
ttq Tamajaq, Tawallammat
|
164 |
+
mcr Menya
|
165 |
+
seh Sena
|
166 |
+
kru Kurux
|
167 |
+
lok Loko
|
168 |
+
est Estonian
|
169 |
+
tpi Tok Pisin
|
170 |
+
zne Zande
|
171 |
+
bxk Bukusu
|
172 |
+
mzi Mazatec, Ixcatlán
|
173 |
+
amf Hamer-Banna
|
174 |
+
rel Rendille
|
175 |
+
sck Sadri
|
176 |
+
lcp Lawa, Western
|
177 |
+
gbo Grebo, Northern
|
178 |
+
adx Tibetan, Amdo
|
179 |
+
tcc Datooga
|
180 |
+
cnh Chin, Hakha
|
181 |
+
pwg Gapapaiwa
|
182 |
+
wlx Wali
|
183 |
+
rjs Rajbanshi
|
184 |
+
thl Tharu, Dangaura
|
185 |
+
xal Kalmyk-Oirat
|
186 |
+
dos Dogosé
|
187 |
+
lis Lisu
|
188 |
+
txu Kayapó
|
189 |
+
sxb Suba
|
190 |
+
gng Ngangam
|
191 |
+
ifa Ifugao, Amganad
|
192 |
+
beh Biali
|
193 |
+
poe Popoloca, San Juan Atzingo
|
194 |
+
dga Dagaare, Southern
|
195 |
+
dsh Daasanach
|
196 |
+
vmw Makhuwa
|
197 |
+
mup Malvi
|
198 |
+
lnd Lundayeh
|
199 |
+
kbo Keliko
|
200 |
+
cwa Kabwa
|
201 |
+
rol Romblomanon
|
202 |
+
khg Tibetan, Khams
|
203 |
+
nko Nkonya
|
204 |
+
dgi Dagara, Northern
|
205 |
+
kml Kalinga, Tanudan
|
206 |
+
nxq Naxi
|
207 |
+
acn Achang
|
208 |
+
pxm Mixe, Quetzaltepec
|
209 |
+
wal Wolaytta
|
210 |
+
ctg Chittagonian
|
211 |
+
dnw Dani, Western
|
212 |
+
pui Puinave
|
213 |
+
lew Kaili, Ledo
|
214 |
+
bfa Bari
|
215 |
+
mqj Mamasa
|
216 |
+
rmc Romani, Carpathian
|
217 |
+
mhy Ma’anyan
|
218 |
+
xsr Sherpa
|
219 |
+
gri Ghari
|
220 |
+
bfy Bagheli
|
221 |
+
kqp Kimré
|
222 |
+
frd Fordata
|
223 |
+
ayr Aymara, Central
|
224 |
+
mip Mixtec, Apasco-Apoala
|
225 |
+
nym Nyamwezi
|
226 |
+
tzh Tzeltal
|
227 |
+
kcg Tyap
|
228 |
+
tex Tennet
|
229 |
+
lbw Tolaki
|
230 |
+
sda Toraja-Sa’dan
|
231 |
+
kdt Kuay
|
232 |
+
bfo Birifor, Malba
|
233 |
+
qxl Quichua, Salasaca Highland
|
234 |
+
ttc Tektiteko
|
235 |
+
bfz Pahari, Mahasu
|
236 |
+
mhx Lhao Vo
|
237 |
+
sbp Sangu
|
238 |
+
mco Mixe, Coatlán
|
239 |
+
mbu Mbula-Bwazza
|
240 |
+
mxt Mixtec, Jamiltepec
|
241 |
+
nzi Nzema
|
242 |
+
suz Sunwar
|
243 |
+
hlt Chin, Matu
|
244 |
+
tzo Tzotzil
|
245 |
+
any Anyin
|
246 |
+
gna Kaansa
|
247 |
+
sid Sidamo
|
248 |
+
alp Alune
|
249 |
+
maj Mazatec, Jalapa de Díaz
|
250 |
+
zim Mesme
|
251 |
+
knj Akateko
|
252 |
+
zar Zapotec, Rincón
|
253 |
+
mxb Mixtec, Tezoatlán
|
254 |
+
bdu Oroko
|
255 |
+
bbc Batak Toba
|
256 |
+
ddn Dendi
|
257 |
+
obo Manobo, Obo
|
258 |
+
krs Gbaya
|
259 |
+
zaq Zapotec, Aloápam
|
260 |
+
ife Ifè
|
261 |
+
soy Miyobe
|
262 |
+
trs Triqui, Chicahuaxtla
|
263 |
+
mbj Nadëb
|
264 |
+
tuo Tucano
|
265 |
+
atb Zaiwa
|
266 |
+
vif Vili
|
267 |
+
mim Mixtec, Alacatlatzala
|
268 |
+
grc Greek, Ancient
|
269 |
+
cek Chin, Eastern Khumi
|
270 |
+
kfx Pahari, Kullu
|
271 |
+
naw Nawuri
|
272 |
+
tgj Tagin
|
273 |
+
xed Hdi
|
274 |
+
hnn Hanunoo
|
275 |
+
had Hatam
|
276 |
+
kij Kilivila
|
277 |
+
nlc Nalca
|
278 |
+
kek Q’eqchi’
|
279 |
+
rej Rejang
|
280 |
+
fon Fon
|
281 |
+
amk Ambai
|
282 |
+
kyb Kalinga, Butbut
|
283 |
+
dnj Dan
|
284 |
+
oku Oku
|
285 |
+
gil Kiribati
|
286 |
+
mag Magahi
|
287 |
+
lln Lele
|
288 |
+
pil Yom
|
289 |
+
pls Popoloca, San Marcos Tlacoyalco
|
290 |
+
box Buamu
|
291 |
+
kwf Kwara’ae
|
292 |
+
mgd Moru
|
293 |
+
xtm Mixtec, Magdalena Peñasco
|
294 |
+
ctd Chin, Tedim
|
295 |
+
akb Batak Angkola
|
296 |
+
nlg Gela
|
297 |
+
bmq Bomu
|
298 |
+
bmv Bum
|
299 |
+
mgo Meta’
|
300 |
+
cla Ron
|
301 |
+
rug Roviana
|
302 |
+
enx Enxet
|
303 |
+
mpm Mixtec, Yosondúa
|
304 |
+
gof Gofa
|
305 |
+
bom Berom
|
306 |
+
mbc Macushi
|
307 |
+
btx Batak Karo
|
308 |
+
did Didinga
|
309 |
+
mej Meyah
|
310 |
+
bgq Bagri
|
311 |
+
maa Mazatec, San Jerónimo Tecóatl
|
312 |
+
nmz Nawdm
|
313 |
+
mfk Mofu, North
|
314 |
+
aeu Akeu
|
315 |
+
mqn Moronene
|
316 |
+
tob Toba
|
317 |
+
hlb Halbi
|
318 |
+
nin Ninzo
|
319 |
+
kqe Kalagan
|
320 |
+
lex Luang
|
321 |
+
mkl Mokole
|
322 |
+
icr Islander English Creole
|
323 |
+
lns Lamnso’
|
324 |
+
tlj Talinga-Bwisi
|
325 |
+
bzh Buang, Mapos
|
326 |
+
bdh Baka
|
327 |
+
kle Kulung
|
328 |
+
pib Yine
|
329 |
+
vut Vute
|
330 |
+
btd Batak Dairi
|
331 |
+
xmm Malay, Manado
|
332 |
+
yka Yakan
|
333 |
+
btt Bete-Bendi
|
334 |
+
hoc Ho
|
335 |
+
yba Yala
|
336 |
+
mib Mixtec, Atatlahuca
|
337 |
+
kpq Korupun-Sela
|
338 |
+
xsb Sambal
|
339 |
+
muy Muyang
|
340 |
+
zyp Chin, Zyphe
|
341 |
+
bbo Konabéré
|
342 |
+
krc Karachay-Balkar
|
343 |
+
eka Ekajuk
|
344 |
+
mcp Makaa
|
345 |
+
bqj Bandial
|
346 |
+
mcq Ese
|
347 |
+
ybb Yemba
|
348 |
+
hyw Armenian, Western
|
349 |
+
tmc Tumak
|
350 |
+
mih Mixtec, Chayuco
|
351 |
+
blt Tai Dam
|
352 |
+
zpz Zapotec, Texmelucan
|
353 |
+
tng Tobanga
|
354 |
+
not Nomatsigenga
|
355 |
+
pny Pinyin
|
356 |
+
nuj Nyole
|
357 |
+
bhz Bada
|
358 |
+
kvn Kuna, Border
|
359 |
+
lje Rampi
|
360 |
+
sne Bidayuh, Bau
|
361 |
+
ndy Lutos
|
362 |
+
ksb Shambala
|
363 |
+
nhy Nahuatl, Northern Oaxaca
|
364 |
+
kwd Kwaio
|
365 |
+
moz Mukulu
|
366 |
+
cmr Mro-Khimi
|
367 |
+
xuo Kuo
|
368 |
+
zpu Zapotec, Yalálag
|
369 |
+
avn Avatime
|
370 |
+
pap Papiamentu
|
371 |
+
pss Kaulong
|
372 |
+
akp Siwu
|
373 |
+
ted Krumen, Tepo
|
374 |
+
rro Waima
|
375 |
+
muv Muthuvan
|
376 |
+
gau Gadaba, Mudhili
|
377 |
+
ake Akawaio
|
378 |
+
guq Aché
|
379 |
+
lsi Lacid
|
380 |
+
cul Kulina
|
381 |
+
tna Tacana
|
382 |
+
cle Chinantec, Lealao
|
383 |
+
iri Rigwe
|
384 |
+
flr Fuliiru
|
385 |
+
bkd Binukid
|
386 |
+
bmr Muinane
|
387 |
+
twb Tawbuid
|
388 |
+
ikk Ika
|
389 |
+
tbl Tboli
|
390 |
+
mnw Mon
|
391 |
+
asa Asu
|
392 |
+
abi Abidji
|
393 |
+
yaz Lokaa
|
394 |
+
bgw Bhatri
|
395 |
+
miy Mixtec, Ayutla
|
396 |
+
gai Mbore
|
397 |
+
smo Samoan
|
398 |
+
cnl Chinantec, Lalana
|
399 |
+
far Fataleka
|
400 |
+
poi Popoluca, Highland
|
401 |
+
tgo Sudest
|
402 |
+
gud Dida, Yocoboué
|
403 |
+
kak Kalanguya
|
404 |
+
gub Guajajára
|
405 |
+
yre Yaouré
|
406 |
+
cso Chinantec, Sochiapam
|
407 |
+
gwr Gwere
|
408 |
+
ati Attié
|
409 |
+
urt Urat
|
410 |
+
mil Mixtec, Peñoles
|
411 |
+
ndv Ndut
|
412 |
+
rnl Ranglong
|
413 |
+
sch Sakachep
|
414 |
+
zpc Zapotec, Choapan
|
415 |
+
tom Tombulu
|
416 |
+
tnt Tontemboan
|
417 |
+
atg Ivbie North-Okpela-Arhe
|
418 |
+
kdl Tsikimba
|
419 |
+
mto Mixe, Totontepec
|
420 |
+
bov Tuwuli
|
421 |
+
myy Macuna
|
422 |
+
ava Avar
|
423 |
+
ami Amis
|
424 |
+
luc Aringa
|
425 |
+
plw Palawano, Brooke’s Point
|
426 |
+
cab Garifuna
|
427 |
+
sey Paicoca
|
428 |
+
zpg Zapotec, Guevea de Humboldt
|
429 |
+
xnj Chingoni
|
430 |
+
kdc Kutu
|
431 |
+
zpt Zapotec, San Vicente Coatlán
|
432 |
+
prk Wa, Parauk
|
433 |
+
qxr Quichua, Cañar Highland
|
434 |
+
nga Ngbaka
|
435 |
+
ubl Bikol, Buhi’non
|
436 |
+
crs Seychelles French Creole
|
437 |
+
cwe Kwere
|
438 |
+
pps Popoloca, San Luís Temalacayuca
|
439 |
+
bjw Bakwé
|
440 |
+
aia Arosi
|
441 |
+
taq Tamasheq
|
442 |
+
idd Ede Idaca
|
443 |
+
ceb Cebuano
|
444 |
+
blh Kuwaa
|
445 |
+
kfw Naga, Kharam
|
446 |
+
gqr Gor
|
447 |
+
suc Subanon, Western
|
448 |
+
cok Cora, Santa Teresa
|
449 |
+
kzf Kaili, Da’a
|
450 |
+
myv Erzya
|
451 |
+
mge Mango
|
452 |
+
tly Talysh
|
453 |
+
udm Udmurt
|
454 |
+
tmf Toba-Maskoy
|
455 |
+
cbi Chachi
|
456 |
+
kqr Kimaragang
|
457 |
+
yas Nugunu
|
458 |
+
nsu Nahuatl, Sierra Negra
|
459 |
+
pez Penan, Eastern
|
460 |
+
moa Mwan
|
461 |
+
dgk Dagba
|
462 |
+
tao Yami
|
463 |
+
lon Lomwe, Malawi
|
464 |
+
kog Kogi
|
465 |
+
tlb Tobelo
|
466 |
+
azg Amuzgo, San Pedro Amuzgos
|
467 |
+
xtd Mixtec, Diuxi-Tilantongo
|
468 |
+
bqp Bisã
|
469 |
+
kpv Komi-Zyrian
|
470 |
+
hwc Hawaii Pidgin
|
471 |
+
cpu Ashéninka, Pichis
|
472 |
+
yat Yambeta
|
473 |
+
kje Kisar
|
474 |
+
met Mato
|
475 |
+
zmz Mbandja
|
476 |
+
ury Orya
|
477 |
+
cpb Ashéninka, Ucayali-Yurúa
|
478 |
+
bep Behoa
|
479 |
+
yea Ravula
|
480 |
+
zga Kinga
|
481 |
+
asg Cishingini
|
482 |
+
kaq Capanahua
|
483 |
+
jun Juang
|
484 |
+
knb Kalinga, Lubuagan
|
485 |
+
kyf Kouya
|
486 |
+
rap Rapa Nui
|
487 |
+
ess Yupik, Saint Lawrence Island
|
488 |
+
stn Owa
|
489 |
+
byr Yipma
|
490 |
+
sjm Mapun
|
491 |
+
mjv Mannan
|
492 |
+
rub Gungu
|
493 |
+
kjh Khakas
|
494 |
+
kmd Kalinga, Majukayang
|
495 |
+
dbq Daba
|
496 |
+
wap Wapishana
|
497 |
+
blx Ayta, Mag-Indi
|
498 |
+
kne Kankanaey
|
499 |
+
arl Arabela
|
500 |
+
abp Ayta, Abellen
|
501 |
+
tuf Tunebo, Central
|
502 |
+
cgc Kagayanen
|
503 |
+
ksr Borong
|
504 |
+
ojb Ojibwa, Northwestern
|
505 |
+
cbr Kakataibo-Kashibo
|
506 |
+
chv Chuvash
|
507 |
+
ktj Krumen, Plapo
|
508 |
+
omw Tairora, South
|
509 |
+
cjo Ashéninka, Pajonal
|
510 |
+
mhr Mari, Meadow
|
511 |
+
atq Aralle-Tabulahan
|
512 |
+
rkt Rangpuri
|
513 |
+
ium Iu Mien
|
514 |
+
crt Chorote, Iyojwa’ja
|
515 |
+
nog Nogai
|
516 |
+
snn Siona
|
517 |
+
tte Bwanabwana
|
518 |
+
tvw Sedoa
|
519 |
+
pjt Pitjantjatjara
|
520 |
+
nlk Yali, Ninia
|
521 |
+
tih Murut, Timugon
|
522 |
+
ppk Uma
|
523 |
+
lid Nyindrou
|
524 |
+
cui Cuiba
|
525 |
+
cot Caquinte
|
526 |
+
tav Tatuyo
|
527 |
+
log Logo
|
528 |
+
prt Prai
|
529 |
+
boj Anjam
|
530 |
+
huu Witoto, Murui
|
531 |
+
mqf Momuna
|
532 |
+
med Melpa
|
533 |
+
snp Siane
|
534 |
+
dah Gwahatike
|
535 |
+
tnr Ménik
|
536 |
+
tbk Tagbanwa, Calamian
|
537 |
+
mtj Moskona
|
538 |
+
men Mende
|
539 |
+
ubu Umbu-Ungu
|
540 |
+
agu Awakateko
|
541 |
+
kmu Kanite
|
542 |
+
trn Trinitario
|
543 |
+
zaj Zaramo
|
544 |
+
dnt Dani, Mid Grand Valley
|
545 |
+
qvh Quechua, Huamalíes-Dos de Mayo Huánuco
|
546 |
+
mcd Sharanahua
|
547 |
+
urb Kaapor
|
548 |
+
wsg Gondi, Adilabad
|
549 |
+
war Waray-Waray
|
550 |
+
ame Yanesha’
|
551 |
+
cof Tsafiki
|
552 |
+
bbb Barai
|
553 |
+
hap Hupla
|
554 |
+
law Lauje
|
555 |
+
crq Chorote, Iyo’wujwa
|
556 |
+
bor Borôro
|
557 |
+
kri Krio
|
558 |
+
nhe Nahuatl, Eastern Huasteca
|
559 |
+
bjr Binumarien
|
560 |
+
xte Ketengban
|
561 |
+
eip Lik
|
562 |
+
dav Dawida
|
563 |
+
mpd Machinere
|
564 |
+
mai Maithili
|
565 |
+
sil Sisaala, Tumulung
|
566 |
+
pis Pijin
|
567 |
+
crk Cree, Plains
|
568 |
+
kyz Kayabí
|
569 |
+
ngu Nahuatl, Guerrero
|
570 |
+
guo Guayabero
|
571 |
+
mnx Sougb
|
572 |
+
nij Ngaju
|
573 |
+
qva Quechua, Ambo-Pasco
|
574 |
+
lif Limbu
|
575 |
+
bvz Bauzi
|
576 |
+
awa Awadhi
|
577 |
+
kir Kyrgyz
|
578 |
+
kin Kinyarwanda
|
579 |
+
iba Iban
|
580 |
+
niy Ngiti
|
581 |
+
nas Naasioi
|
582 |
+
knk Kuranko
|
583 |
+
gog Gogo
|
584 |
+
gvc Wanano
|
585 |
+
mdm Mayogo
|
586 |
+
pkb Kipfokomo
|
587 |
+
sho Shanga
|
588 |
+
gbm Garhwali
|
589 |
+
dig Chidigo
|
590 |
+
bsq Bassa
|
591 |
+
tye Kyanga
|
592 |
+
gux Gourmanchéma
|
593 |
+
yal Yalunka
|
594 |
+
zyb Zhuang, Yongbei
|
595 |
+
run Rundi
|
596 |
+
bky Bokyi
|
597 |
+
yan Mayangna
|
598 |
+
tbt Tembo
|
599 |
+
set Sentani
|
600 |
+
oci Occitan
|
601 |
+
nyy Nyakyusa-Ngonde
|
602 |
+
shn Shan
|
603 |
+
bcc Balochi, Southern
|
604 |
+
kno Kono
|
605 |
+
yaa Yaminahua
|
606 |
+
bwu Buli
|
607 |
+
bgr Chin, Bawm
|
608 |
+
mfz Mabaan
|
609 |
+
keo Kakwa
|
610 |
+
led Lendu
|
611 |
+
kue Kuman
|
612 |
+
grt Garo
|
613 |
+
sus Susu
|
614 |
+
mdy Male
|
615 |
+
sah Yakut
|
616 |
+
dug Chiduruma
|
617 |
+
pkr Kurumba, Attapady
|
618 |
+
tir Tigrigna
|
619 |
+
suk Sukuma
|
620 |
+
san Sanskrit
|
621 |
+
kdj Ng’akarimojong
|
622 |
+
nyf Kigiryama
|
623 |
+
bem Bemba
|
624 |
+
hak Chinese, Hakka
|
625 |
+
dag Dagbani
|
626 |
+
nan Chinese, Min Nan
|
627 |
+
kdh Tem
|
628 |
+
gum Misak
|
629 |
+
hnj Hmong Njua
|
630 |
+
aha Ahanta
|
631 |
+
lsm Saamya-Gwe
|
632 |
+
nyn Nyankore
|
633 |
+
lam Lamba
|
634 |
+
tgw Sénoufo, Tagwana
|
635 |
+
kde Makonde
|
636 |
+
lhu Lahu
|
637 |
+
wme Wambule
|
638 |
+
guc Wayuu
|
639 |
+
mur Murle
|
640 |
+
kam Kamba
|
641 |
+
bru Bru, Eastern
|
642 |
+
nsk Naskapi
|
643 |
+
guk Gumuz
|
644 |
+
cas Tsimané
|
645 |
+
nnw Nuni, Southern
|
646 |
+
jow Jowulu
|
647 |
+
bvc Baelelea
|
648 |
+
gjn Gonja
|
649 |
+
cko Anufo
|
650 |
+
rim Nyaturu
|
651 |
+
mfi Wandala
|
652 |
+
thf Thangmi
|
653 |
+
trq Triqui, San Martín Itunyoso
|
654 |
+
bmu Somba-Siawari
|
655 |
+
ade Adele
|
656 |
+
rmy Romani, Vlax
|
657 |
+
nim Nilamba
|
658 |
+
mbb Manobo, Western Bukidnon
|
659 |
+
mxv Mixtec, Metlatónoc
|
660 |
+
ses Songhay, Koyraboro Senni
|
661 |
+
dyo Jola-Fonyi
|
662 |
+
taj Tamang, Eastern
|
663 |
+
mnb Muna
|
664 |
+
sbd Samo, Southern
|
665 |
+
hui Huli
|
666 |
+
esi Inupiatun, North Alaskan
|
667 |
+
wba Warao
|
668 |
+
kqn Kaonde
|
669 |
+
spy Sabaot
|
670 |
+
raw Rawang
|
671 |
+
kbr Kafa
|
672 |
+
tem Themne
|
673 |
+
bst Basketo
|
674 |
+
oss Ossetic
|
675 |
+
omi Omi
|
676 |
+
qul Quechua, North Bolivian
|
677 |
+
car Carib
|
678 |
+
kff Koya
|
679 |
+
ptu Bambam
|
680 |
+
mev Maan
|
681 |
+
mgh Makhuwa-Meetto
|
682 |
+
cly Chatino, Eastern Highland
|
683 |
+
mpx Misima-Panaeati
|
684 |
+
kus Kusaal
|
685 |
+
mwq Chin, Müün
|
686 |
+
khq Songhay, Koyra Chiini
|
687 |
+
nia Nias
|
688 |
+
urk Urak Lawoi’
|
689 |
+
spp Sénoufo, Supyire
|
690 |
+
dzo Dzongkha
|
691 |
+
sgb Ayta, Mag-antsi
|
692 |
+
kma Konni
|
693 |
+
iou Tuma-Irumu
|
694 |
+
lef Lelemi
|
695 |
+
nst Naga, Tangshang
|
696 |
+
udg Muduga
|
697 |
+
vag Vagla
|
698 |
+
kum Kumyk
|
699 |
+
maw Mampruli
|
700 |
+
quz Quechua, Cusco
|
701 |
+
kaa Karakalpak
|
702 |
+
mpg Marba
|
703 |
+
yva Yawa
|
704 |
+
bgc Haryanvi
|
705 |
+
bim Bimoba
|
706 |
+
fij Fijian
|
707 |
+
bud Ntcham
|
708 |
+
ceg Chamacoco
|
709 |
+
tpm Tampulma
|
710 |
+
mrj Mari, Hill
|
711 |
+
nus Nuer
|
712 |
+
sba Ngambay
|
713 |
+
lom Loma
|
714 |
+
bib Bisa
|
715 |
+
twu Termanu
|
716 |
+
acd Gikyode
|
717 |
+
mak Makasar
|
718 |
+
cni Asháninka
|
719 |
+
pbb Nasa
|
720 |
+
qvm Quechua, Margos-Yarowilca-Lauricocha
|
721 |
+
zab Zapotec, Western Tlacolula Valley
|
722 |
+
csk Jola-Kasa
|
723 |
+
gxx Wè Southern
|
724 |
+
bgt Bughotu
|
725 |
+
yuz Yuracare
|
726 |
+
emp Emberá, Northern
|
727 |
+
mzj Manya
|
728 |
+
mfq Moba
|
729 |
+
guw Gun
|
730 |
+
kac Jingpho
|
731 |
+
ilo Ilocano
|
732 |
+
qvo Quichua, Napo
|
733 |
+
las Lama
|
734 |
+
ctu Chol
|
735 |
+
cdj Churahi
|
736 |
+
yam Yamba
|
737 |
+
dip Dinka, Northeastern
|
738 |
+
kfy Kumaoni
|
739 |
+
sig Paasaal
|
740 |
+
srx Sirmauri
|
741 |
+
mie Mixtec, Ocotepec
|
742 |
+
tca Ticuna
|
743 |
+
cap Chipaya
|
744 |
+
nav Navajo
|
745 |
+
mca Maka
|
746 |
+
pce Palaung, Ruching
|
747 |
+
upv Uripiv-Wala-Rano-Atchin
|
748 |
+
bgd Bareli, Rathwi
|
749 |
+
blz Balantak
|
750 |
+
dik Dinka, Southwestern
|
751 |
+
gbi Galela
|
752 |
+
dgo Dogri
|
753 |
+
nnb Nande
|
754 |
+
cax Chiquitano
|
755 |
+
myb Mbay
|
756 |
+
txq Tii
|
757 |
+
dhi Dhimal
|
758 |
+
mad Madura
|
759 |
+
shk Shilluk
|
760 |
+
ktb Kambaata
|
761 |
+
quw Quichua, Tena Lowland
|
762 |
+
rav Sampang
|
763 |
+
sag Sango
|
764 |
+
nyu Nyungwe
|
765 |
+
ljp Lampung Api
|
766 |
+
mzm Mumuye
|
767 |
+
stb Subanen, Northern
|
768 |
+
pab Parecís
|
769 |
+
mzw Deg
|
770 |
+
mhi Ma’di
|
771 |
+
gor Gorontalo
|
772 |
+
agd Agarabi
|
773 |
+
gnd Zulgo-Gemzek
|
774 |
+
xnr Kangri
|
775 |
+
mor Moro
|
776 |
+
kyu Kayah, Western
|
777 |
+
ese Ese Ejja
|
778 |
+
myk Sénoufo, Mamara
|
779 |
+
zaw Zapotec, Mitla
|
780 |
+
cme Cerma
|
781 |
+
aaz Amarasi
|
782 |
+
cnt Chinantec, Tepetotutla
|
783 |
+
zpo Zapotec, Amatlán
|
784 |
+
anv Denya
|
785 |
+
ach Acholi
|
786 |
+
mwv Mentawai
|
787 |
+
kfb Kolami, Northwestern
|
788 |
+
otn Otomi, Tenango
|
789 |
+
kbq Kamano
|
790 |
+
kss Kisi, Southern
|
791 |
+
jiv Shuar
|
792 |
+
dop Lukpa
|
793 |
+
nhw Nahuatl, Western Huasteca
|
794 |
+
ahk Akha
|
795 |
+
tbz Ditammari
|
796 |
+
zas Zapotec, Santo Domingo Albarradas
|
797 |
+
huv Huave, San Mateo del Mar
|
798 |
+
xtn Mixtec, Northern Tlaxiaco
|
799 |
+
bex Jur Modo
|
800 |
+
crn Cora, El Nayar
|
801 |
+
cuk Kuna, San Blas
|
802 |
+
gbk Gaddi
|
803 |
+
toi Tonga
|
804 |
+
key Kupia
|
805 |
+
ifb Ifugao, Batad
|
806 |
+
ztq Zapotec, Quioquitani-Quierí
|
807 |
+
nag Nagamese
|
808 |
+
toc Totonac, Coyutla
|
809 |
+
ken Kenyang
|
810 |
+
agr Awajún
|
811 |
+
bfd Bafut
|
812 |
+
kyq Kenga
|
813 |
+
ker Kera
|
814 |
+
ntr Delo
|
815 |
+
usp Uspanteko
|
816 |
+
alz Alur
|
817 |
+
mas Maasai
|
818 |
+
lme Pévé
|
819 |
+
nhu Noone
|
820 |
+
dwr Dawro
|
821 |
+
ksp Kabba
|
822 |
+
ncu Chumburung
|
823 |
+
min Minangkabau
|
824 |
+
wol Wolof
|
825 |
+
hif Hindi, Fiji
|
826 |
+
tll Tetela
|
827 |
+
bba Baatonum
|
828 |
+
cco Chinantec, Comaltepec
|
829 |
+
tbc Takia
|
830 |
+
lia Limba, West-Central
|
831 |
+
mgq Malila
|
832 |
+
mnf Mundani
|
833 |
+
hil Hiligaynon
|
834 |
+
kyc Kyaka
|
835 |
+
ozm Koonzime
|
836 |
+
gyr Guarayu
|
837 |
+
pcm Pidgin, Nigerian
|
838 |
+
sml Sama, Central
|
839 |
+
npl Nahuatl, Southeastern Puebla
|
840 |
+
tby Tabaru
|
841 |
+
lem Nomaande
|
842 |
+
udu Uduk
|
843 |
+
xsu Sanumá
|
844 |
+
soq Kanasi
|
845 |
+
tik Tikar
|
846 |
+
ibg Ibanag
|
847 |
+
zpl Zapotec, Lachixío
|
848 |
+
sbl Sambal, Botolan
|
849 |
+
itv Itawit
|
850 |
+
noa Woun Meu
|
851 |
+
ace Aceh
|
852 |
+
ign Ignaciano
|
853 |
+
shp Shipibo-Conibo
|
854 |
+
jbu Jukun Takum
|
855 |
+
kub Kutep
|
856 |
+
knf Mankanya
|
857 |
+
mvp Duri
|
858 |
+
jac Jakalteko
|
859 |
+
wwa Waama
|
860 |
+
biv Birifor, Southern
|
861 |
+
kkj Kako
|
862 |
+
ter Terêna
|
863 |
+
pbi Parkwa
|
864 |
+
csy Chin, Siyin
|
865 |
+
xrb Karaboro, Eastern
|
866 |
+
mxq Mixe, Juquila
|
867 |
+
mfh Matal
|
868 |
+
bht Bhattiyali
|
869 |
+
fal Fali, South
|
870 |
+
adj Adioukrou
|
871 |
+
mcu Mambila, Cameroon
|
872 |
+
otq Otomi, Querétaro
|
873 |
+
bpr Blaan, Koronadal
|
874 |
+
miq Mískito
|
875 |
+
tee Tepehua, Huehuetla
|
876 |
+
mrw Maranao
|
877 |
+
nfr Nafaanra
|
878 |
+
izr Izere
|
879 |
+
bzi Bisu
|
880 |
+
sas Sasak
|
881 |
+
cou Wamey
|
882 |
+
cbt Shawi
|
883 |
+
lwo Luwo
|
884 |
+
ban Bali
|
885 |
+
kab Amazigh
|
886 |
+
cbs Kashinawa
|
887 |
+
prf Paranan
|
888 |
+
nhi Nahuatl, Zacatlán-Ahuacatlán-Tepetzintla
|
889 |
+
dyi Sénoufo, Djimini
|
890 |
+
cnw Chin, Ngawn
|
891 |
+
zaa Zapotec, Sierra de Juárez
|
892 |
+
mfe Morisyen
|
893 |
+
mio Mixtec, Pinotepa Nacional
|
894 |
+
kjb Q’anjob’al
|
895 |
+
myx Masaaba
|
896 |
+
con Cofán
|
897 |
+
bkv Bekwarra
|
898 |
+
sur Mwaghavul
|
899 |
+
eza Ezaa
|
900 |
+
qxn Quechua, Northern Conchucos Ancash
|
901 |
+
lgg Lugbara
|
902 |
+
cya Chatino, Nopala
|
903 |
+
zao Zapotec, Ozolotepec
|
904 |
+
kez Kukele
|
905 |
+
sja Epena
|
906 |
+
bdq Bahnar
|
907 |
+
acf Lesser Antillean French Creole
|
908 |
+
ruf Luguru
|
909 |
+
cce Chopi
|
910 |
+
old Mochi
|
911 |
+
acu Achuar-Shiwiar
|
912 |
+
jmc Machame
|
913 |
+
xpe Kpelle, Liberia
|
914 |
+
alj Alangan
|
915 |
+
awb Awa
|
916 |
+
srn Sranan Tongo
|
917 |
+
zad Zapotec, Cajonos
|
918 |
+
lob Lobi
|
919 |
+
tsz Purepecha
|
920 |
+
ote Otomi, Mezquital
|
921 |
+
bcl Bikol, Central
|
922 |
+
mbt Manobo, Matigsalug
|
923 |
+
yua Maya, Yucatec
|
924 |
+
sgw Sebat Bet Gurage
|
925 |
+
tue Tuyuca
|
926 |
+
kao Xaasongaxango
|
927 |
+
mjl Mandeali
|
928 |
+
maz Mazahua, Central
|
929 |
+
miz Mixtec, Coatzospan
|
930 |
+
qvw Quechua, Huaylla Wanca
|
931 |
+
cpa Chinantec, Palantla
|
932 |
+
kxc Konso
|
933 |
+
bss Akoose
|
934 |
+
laj Lango
|
935 |
+
nyo Nyoro
|
936 |
+
ndp Kebu
|
937 |
+
hag Hanga
|
938 |
+
lip Sekpele
|
939 |
+
agn Agutaynen
|
940 |
+
mfy Mayo
|
941 |
+
nod Thai, Northern
|
942 |
+
zos Zoque, Francisco León
|
943 |
+
gde Gude
|
944 |
+
qub Quechua, Huallaga
|
945 |
+
tri Trió
|
946 |
+
way Wayana
|
947 |
+
umb Umbundu
|
948 |
+
gwi Gwich’in
|
949 |
+
qwh Quechua, Huaylas Ancash
|
950 |
+
bsc Oniyan
|
951 |
+
qvn Quechua, North Junín
|
952 |
+
ncj Nahuatl, Northern Puebla
|
953 |
+
tnk Kwamera
|
954 |
+
mit Mixtec, Southern Puebla
|
955 |
+
irk Iraqw
|
956 |
+
djk Aukan
|
957 |
+
vun Vunjo
|
958 |
+
rai Ramoaaina
|
959 |
+
mda Mada
|
960 |
+
gym Ngäbere
|
961 |
+
wob Wè Northern
|
962 |
+
pam Kapampangan
|
963 |
+
mop Maya, Mopán
|
964 |
+
tpp Tepehua, Pisaflores
|
965 |
+
mzk Mambila, Nigeria
|
966 |
+
hig Kamwe
|
967 |
+
tap Taabwa
|
968 |
+
hto Witoto, Minika
|
969 |
+
pww Karen, Pwo Northern
|
970 |
+
kxm Khmer, Northern
|
971 |
+
pbc Patamona
|
972 |
+
ifu Ifugao, Mayoyao
|
973 |
+
heh Hehe
|
974 |
+
bnp Bola
|
975 |
+
nwb Nyabwa
|
976 |
+
pko Pökoot
|
977 |
+
jam Jamaican English Creole
|
978 |
+
gej Gen
|
979 |
+
sld Sissala
|
980 |
+
iqw Ikwo
|
981 |
+
pae Pagibete
|
982 |
+
tac Tarahumara, Western
|
983 |
+
zai Zapotec, Isthmus
|
984 |
+
alt Altai, Southern
|
985 |
+
snw Selee
|
986 |
+
ann Obolo
|
987 |
+
lee Lyélé
|
988 |
+
bao Waimaha
|
989 |
+
klv Maskelynes
|
990 |
+
izz Izii
|
991 |
+
pag Pangasinan
|
992 |
+
thk Kitharaka
|
993 |
+
hay Haya
|
994 |
+
mog Mongondow
|
995 |
+
krj Kinaray-a
|
996 |
+
klu Klao
|
997 |
+
apb Sa’a
|
998 |
+
gmv Gamo
|
999 |
+
ycn Yucuna
|
1000 |
+
kqy Koorete
|
1001 |
+
msy Aruamu
|
1002 |
+
qvs Quechua, San Martín
|
1003 |
+
ood Tohono O’odham
|
1004 |
+
cbc Carapana
|
1005 |
+
stp Tepehuan, Southeastern
|
1006 |
+
bts Batak Simalungun
|
1007 |
+
enb Markweeta
|
1008 |
+
bcw Bana
|
1009 |
+
muh Mündü
|
1010 |
+
adh Jopadhola
|
1011 |
+
gkn Gokana
|
1012 |
+
tgp Tangoa
|
1013 |
+
ziw Zigula
|
1014 |
+
kpz Kupsapiiny
|
1015 |
+
poy Pogolo
|
1016 |
+
daa Dangaléat
|
1017 |
+
tnn Tanna, North
|
1018 |
+
shi Tachelhit
|
1019 |
+
guu Yanomamö
|
1020 |
+
kdi Kumam
|
1021 |
+
ata Pele-Ata
|
1022 |
+
bav Vengo
|
1023 |
+
neb Toura
|
1024 |
+
mif Mofu-Gudur
|
1025 |
+
mbh Mangseng
|
1026 |
+
srm Saramaccan
|
1027 |
+
vid Vidunda
|
1028 |
+
vmy Mazatec, Ayautla
|
1029 |
+
nnq Ngindo
|
1030 |
+
dts Dogon, Toro So
|
1031 |
+
ilb Ila
|
1032 |
+
ngp Ngulu
|
1033 |
+
tpt Tepehua, Tlachichilco
|
1034 |
+
kki Kagulu
|
1035 |
+
gvl Gulay
|
1036 |
+
chz Chinantec, Ozumacín
|
1037 |
+
ndj Ndamba
|
1038 |
+
toh Tonga
|
1039 |
+
zae Zapotec, Yareni
|
1040 |
+
caa Ch’orti’
|
1041 |
+
pau Palauan
|
1042 |
+
zpi Zapotec, Santa María Quiegolani
|
1043 |
+
cjp Cabécar
|
1044 |
+
bng Benga
|
1045 |
+
bjv Bedjond
|
1046 |
+
cuc Chinantec, Usila
|
1047 |
+
krl Karelian
|
1048 |
+
wmw Mwani
|
1049 |
+
nch Nahuatl, Central Huasteca
|
1050 |
+
nse Nsenga
|
1051 |
+
ndz Ndogo
|
1052 |
+
meq Merey
|
1053 |
+
mah Marshallese
|
1054 |
+
gso Gbaya, Southwest
|
1055 |
+
kwi Awa-Cuaiquer
|
1056 |
+
qve Quechua, Eastern Apurímac
|
1057 |
+
mza Mixtec, Santa María Zacatepec
|
1058 |
+
rng Ronga
|
1059 |
+
azz Nahuatl, Highland Puebla
|
1060 |
+
hns Hindustani, Sarnami
|
1061 |
+
npy Napu
|
1062 |
+
bps Blaan, Sarangani
|
1063 |
+
mqb Mbuko
|
1064 |
+
ura Urarina
|
1065 |
+
zty Zapotec, Yatee
|
1066 |
+
inb Inga
|
1067 |
+
cwt Kuwaataay
|
1068 |
+
yli Yali, Angguruk
|
1069 |
+
pad Paumarí
|
1070 |
+
mox Molima
|
1071 |
+
zpm Zapotec, Mixtepec
|
1072 |
+
tos Totonac, Highland
|
1073 |
+
bzj Belize English Creole
|
1074 |
+
apr Arop-Lokep
|
1075 |
+
ifk Ifugao, Tuwali
|
1076 |
+
nca Iyo
|
1077 |
+
boa Bora
|
1078 |
+
rmo Romani, Sinte
|
1079 |
+
jic Tol
|
1080 |
+
ded Dedua
|
1081 |
+
waw Waiwai
|
1082 |
+
saj Sahu
|
1083 |
+
lnl Banda, South Central
|
1084 |
+
pir Piratapuyo
|
1085 |
+
quf Quechua, Lambayeque
|
1086 |
+
sri Siriano
|
1087 |
+
kdn Kunda
|
1088 |
+
cbv Cacua
|
1089 |
+
lac Lacandon
|
1090 |
+
mpp Migabac
|
1091 |
+
gam Kandawo
|
1092 |
+
qvc Quechua, Cajamarca
|
1093 |
+
qvz Quichua, Northern Pastaza
|
1094 |
+
qxh Quechua, Panao
|
1095 |
+
lai Lambya
|
1096 |
+
hub Wampís
|
1097 |
+
jvn Javanese, Suriname
|
1098 |
+
coe Koreguaje
|
1099 |
+
ify Kallahan, Keley-i
|
1100 |
+
nab Nambikuára, Southern
|
1101 |
+
mir Mixe, Isthmus
|
1102 |
+
apf Agta, Pahanan
|
1103 |
+
des Desano
|
1104 |
+
lww Lewo
|
1105 |
+
cbu Kandozi-Chapra
|
1106 |
+
tfr Teribe
|
1107 |
+
beq Beembe
|
1108 |
+
nbw Ngbandi, Southern
|
1109 |
+
loq Lobala
|
1110 |
+
tbg Tairora, North
|
1111 |
+
avu Avokaya
|
1112 |
+
mcb Matsigenka
|
1113 |
+
bto Bikol, Rinconada
|
1114 |
+
mnh Mono
|
1115 |
+
lgl Wala
|
1116 |
+
yad Yagua
|
1117 |
+
qxo Quechua, Southern Conchucos
|
1118 |
+
hno Hindko, Northern
|
1119 |
+
bxg Bangala
|
1120 |
+
pao Paiute, Northern
|
1121 |
+
ibo Igbo
|
1122 |
+
jnj Yemsa
|
1123 |
+
sgj Surgujia
|
1124 |
+
ldi Laari
|
1125 |
+
sab Buglere
|
1126 |
+
bci Baoulé
|
1127 |
+
bxh Buhutu
|
1128 |
+
haw Hawaiian
|
1129 |
+
tnc Tanimuca-Letuama
|
1130 |
+
mfx Melo
|
1131 |
+
tyv Tuvan
|
1132 |
+
neq Mixe, North Central
|
1133 |
+
wbi Vwanji
|
1134 |
+
bcq Bench
|
1135 |
+
ksw Karen, S’gaw
|
1136 |
+
guz Ekegusii
|
1137 |
+
mkw Kituba
|
1138 |
+
ore Maijuna
|
1139 |
+
ige Igede
|
1140 |
+
bjz Baruga
|
1141 |
+
zca Zapotec, Coatecas Altas
|
1142 |
+
mer Kimîîru
|
1143 |
+
aui Anuki
|
1144 |
+
arn Mapudungun
|
1145 |
+
zul Zulu
|
1146 |
+
kxf Kawyaw
|
1147 |
+
alw Alaba-K’abeena
|
1148 |
+
xho Xhosa
|
1149 |
+
loz Lozi
|
1150 |
+
mww Hmong Daw
|
1151 |
+
mey Hassaniyya
|
1152 |
+
ijc Izon
|
1153 |
+
mwt Moken
|
1154 |
+
bza Bandi
|
1155 |
+
lun Lunda
|
1156 |
+
kby Kanuri, Manga
|
1157 |
+
pov Guinea-Bissau Creole
|
1158 |
+
bdg Bonggi
|
1159 |
+
ipi Ipili
|
1160 |
+
sfw Esahie
|
1161 |
+
knc Kanuri, Yerwa
|
1162 |
+
syl Sylheti
|
1163 |
+
bho Bhojpuri
|
1164 |
+
tum Tumbuka
|
1165 |
+
tdy Tadyawan
|
1166 |
+
nso Sotho, Northern
|
1167 |
+
lbj Ladakhi
|
1168 |
+
ckb Kurdish, Central
|
1169 |
+
ndc Ndau
|
1170 |
+
bwr Bura-Pabir
|
1171 |
+
pci Duruwa
|
1172 |
+
dje Zarma
|
1173 |
+
bax Bamun
|
1174 |
+
top Totonac, Papantla
|
1175 |
+
gkp Kpelle, Guinea
|
1176 |
+
lub Luba-Katanga
|
1177 |
+
qug Quichua, Chimborazo Highland
|
1178 |
+
lus Mizo
|
1179 |
+
csh Chin, Asho
|
1180 |
+
gvr Gurung
|
1181 |
+
tew Tewa
|
1182 |
+
cag Nivaclé
|
1183 |
+
bev Bété, Daloa
|
1184 |
+
ggu Gban
|
1185 |
+
vai Vai
|
1186 |
+
tiv Tiv
|
1187 |
+
dgr Tlicho
|
1188 |
+
epo Esperanto
|
1189 |
+
srr Serer-Sine
|
1190 |
+
elm Eleme
|
1191 |
+
maf Mafa
|
1192 |
+
abk Abkhaz
|
1193 |
+
ijn Kalabari
|
1194 |
+
lua Luba-Kasai
|
1195 |
+
kck Kalanga
|
1196 |
+
ngb Ngbandi, Northern
|
1197 |
+
zpq Zapotec, Zoogocho
|
1198 |
+
etu Ejagham
|
1199 |
+
gvs Gumawana
|
1200 |
+
bft Balti
|
1201 |
+
tzm Tamazight, Central Atlas
|
1202 |
+
ida Luidakho-Luisukha-Lutirichi
|
1203 |
+
enl Enlhet
|
1204 |
+
ada Dangme
|
1205 |
+
nzb Njebi
|
1206 |
+
xdy Malayic Dayak
|
1207 |
+
aca Achagua
|
1208 |
+
ktu Kituba
|
1209 |
+
ebu Kiembu
|
1210 |
+
pdt Plautdietsch
|
1211 |
+
gaa Ga
|
1212 |
+
swk Sena, Malawi
|
1213 |
+
awn Awngi
|
1214 |
+
okr Kirike
|
1215 |
+
kvj Psikye
|
1216 |
+
xkl Kenyah, Mainstream
|
1217 |
+
knp Kwanja
|
1218 |
+
krw Krahn, Western
|
1219 |
+
mzl Mixe, Mazatlán
|
1220 |
+
ndi Samba Leko
|
1221 |
+
mug Musgu
|
1222 |
+
soe Songomeno
|
1223 |
+
sea Semai
|
1224 |
+
kfc Konda-Dora
|
1225 |
+
lol Mongo-Nkundu
|
1226 |
+
tsc Tswa
|
1227 |
+
idu Idoma
|
1228 |
+
mni Meitei
|
1229 |
+
trc Triqui, Copala
|
1230 |
+
mgr Mambwe-Lungu
|
1231 |
+
mcn Masana
|
1232 |
+
lrc Luri, Northern
|
1233 |
+
kfi Kurumba, Kannada
|
1234 |
+
bzw Basa
|
1235 |
+
mzz Maiadomu
|
1236 |
+
mrt Marghi Central
|
1237 |
+
rml Romani, Baltic
|
1238 |
+
rhg Rohingya
|
1239 |
+
urh Urhobo
|
1240 |
+
lag Langi
|
1241 |
+
its Isekiri
|
1242 |
+
ego Eggon
|
1243 |
+
gle Irish
|
1244 |
+
ubr Ubir
|
1245 |
+
hdy Hadiyya
|
1246 |
+
jen Dza
|
1247 |
+
sru Suruí
|
1248 |
+
ngc Ngombe
|
1249 |
+
lmp Limbum
|
1250 |
+
isn Isanzu
|
1251 |
+
kqs Kissi, Northern
|
1252 |
+
kpm Koho
|
1253 |
+
nup Nupe-Nupe-Tako
|
1254 |
+
mwm Sar
|
1255 |
+
kng Koongo
|
1256 |
+
nnc Nancere
|
1257 |
+
bkm Kom
|
1258 |
+
tui Tupuri
|
1259 |
+
ogo Khana
|
1260 |
+
lic Hlai
|
1261 |
+
mkn Malay, Kupang
|
1262 |
+
wed Wedau
|
1263 |
+
ald Alladian
|
1264 |
+
ksf Bafia
|
1265 |
+
dur Dii
|
1266 |
+
jaa Jamamadí
|
1267 |
+
kmb Kimbundu
|
1268 |
+
mua Mundang
|
1269 |
+
cje Chru
|
1270 |
+
igb Ebira
|
1271 |
+
gya Gbaya, Northwest
|
1272 |
+
skr Saraiki
|
1273 |
+
dow Doyayo
|
1274 |
+
dww Dawawa
|
1275 |
+
iso Isoko
|
1276 |
+
giz Giziga
|
1277 |
+
bum Bulu
|
1278 |
+
zza Zaza
|
1279 |
+
mfa Malay, Pattani
|
1280 |
+
snf Noon
|
1281 |
+
mgw Matumbi
|
1282 |
+
bin Edo
|
1283 |
+
vmk Makhuwa-Shirima
|
1284 |
+
dua Duala
|
1285 |
+
kea Kabuverdianu
|
1286 |
+
sef Sénoufo, Cebaara
|
1287 |
+
kaj Jju
|
1288 |
+
kqf Kakabai
|
1289 |
+
ayz Mai Brat
|
1290 |
+
ksz Kodaku
|
1291 |
+
ncl Nahuatl, Michoacán
|
1292 |
+
bzd Bribri
|
1293 |
+
ssn Waata
|
1294 |
+
mro Mru
|
1295 |
+
bhi Bhilali
|
1296 |
+
wes Pidgin, Cameroon
|
1297 |
+
adi Adi
|
1298 |
+
efi Efik
|
1299 |
+
ena Apal
|
1300 |
+
nde Ndebele
|
1301 |
+
ast Asturian
|
1302 |
+
mhw Mbukushu
|
1303 |
+
bbj Ghomálá’
|
1304 |
+
geb Kire
|
1305 |
+
igl Igala
|
1306 |
+
aoi Anindilyakwa
|
1307 |
+
rao Rao
|
1308 |
+
nnh Ngiemboon
|
1309 |
+
byv Medumba
|
1310 |
+
sat Santhali
|
1311 |
+
dzg Dazaga
|
1312 |
+
gnn Gumatj
|
1313 |
+
bhb Bhili
|
1314 |
+
swp Suau
|
1315 |
+
sgc Kipsigis
|
1316 |
+
wim Wik-Mungkan
|
1317 |
+
viv Iduna
|
1318 |
+
ady Adyghe
|
1319 |
+
krr Krung
|
1320 |
+
fan Fang
|
1321 |
+
coh Chichonyi-Chidzihana-Chikauma
|
1322 |
+
nbq Nggem
|
1323 |
+
gvo Gavião do Jiparaná
|
1324 |
+
glk Gilaki
|
1325 |
+
acz Acheron
|
1326 |
+
mwf Murrinh-Patha
|
1327 |
+
wbp Warlpiri
|
1328 |
+
tod Toma
|
1329 |
+
unr Mundari
|
1330 |
+
khe Korowai
|
1331 |
+
ntj Ngaanyatjarra
|
1332 |
+
wnc Wantoat
|
1333 |
+
suj Shubi
|
1334 |
+
emk Maninkakan, Eastern
|
1335 |
+
kel Kela
|
1336 |
+
dks Dinka, Southeastern
|
1337 |
+
zav Zapotec, Yatzachi
|
1338 |
+
jra Jarai
|
1339 |
+
dhg Dhangu-Djangu
|
1340 |
+
wlo Wolio
|
1341 |
+
bmk Ghayavi
|
1342 |
+
lgr Lengo
|
1343 |
+
njz Nyishi
|
1344 |
+
lue Luvale
|
1345 |
+
mhu Digaro-Mishmi
|
1346 |
+
tsn Setswana
|
1347 |
+
beo Bedamuni
|
1348 |
+
lgm Lega-Mwenga
|
1349 |
+
haq Ha
|
1350 |
+
trp Kok Borok
|
1351 |
+
tdh Thulung
|
1352 |
+
tuy Tugen
|
1353 |
+
lzz Laz
|
1354 |
+
gvj Guajá
|
1355 |
+
gom Konkani, Goan
|
1356 |
+
kjl Kham, Western Parbate
|
1357 |
+
tke Takwane
|
1358 |
+
mpj Martu Wangka
|
1359 |
+
ven Venda
|
1360 |
+
xer Xerénte
|
1361 |
+
nyd Olunyole
|
1362 |
+
byd Benyadu’
|
1363 |
+
snc Sinaugoro
|
1364 |
+
sdr Sadri, Oraon
|
1365 |
+
toq Toposa
|
1366 |
+
wod Wolani
|
1367 |
+
nhr Naro
|
1368 |
+
tvs Taveta
|
1369 |
+
ble Balanta-Kentohe
|
1370 |
+
bcp Bali
|
1371 |
+
rag Lulogooli
|
1372 |
+
jmx Mixtec, Western Juxtlahuaca
|
1373 |
+
bvd Baeggu
|
1374 |
+
bvu Malay, Bukit
|
1375 |
+
dbj Ida’an
|
1376 |
+
her Herero
|
1377 |
+
mwc Are
|
1378 |
+
sou Thai, Southern
|
1379 |
+
ktz Juǀ’hoansi
|
1380 |
+
rmn Romani, Balkan
|
1381 |
+
qxu Quechua, Arequipa-La Unión
|
1382 |
+
nmn !Xóõ
|
1383 |
+
haj Hajong
|
1384 |
+
bee Byangsi
|
1385 |
+
wbf Wara
|
1386 |
+
sot Sotho, Southern
|
1387 |
+
fmu Muria, Far Western
|
1388 |
+
swb Comorian, Maore
|
1389 |
+
dde Doondo
|
1390 |
+
mve Marwari
|
1391 |
+
mlk Kiwilwana
|
1392 |
+
mjt Sauria Paharia
|
1393 |
+
bjg Bidyogo
|
1394 |
+
jmd Yamdena
|
1395 |
+
mwn Nyamwanga
|
1396 |
+
yml Iamalele
|
1397 |
+
kha Khasi
|
1398 |
+
mzp Movima
|
1399 |
+
tvk Ambrym, Southeast
|
1400 |
+
tkr Tsakhur
|
1401 |
+
dim Dime
|
1402 |
+
mix Mixtec, Mixtepec
|
1403 |
+
tbo Tawala
|
1404 |
+
lma Limba, East
|
1405 |
+
pln Palenquero
|
1406 |
+
koe Suri, Kacipo-Bale
|
1407 |
+
glv Manx
|
1408 |
+
kjg Khmu
|
1409 |
+
wof Wolof, Gambian
|
1410 |
+
kjc Konjo, Coastal
|
1411 |
+
xuu Khwedam
|
1412 |
+
brv Bru, Western
|
1413 |
+
aoz Uab Meto
|
1414 |
+
evn Evenki
|
1415 |
+
tsb Tsamai
|
1416 |
+
djr Djambarrpuyngu
|
1417 |
+
mch Maquiritari
|
1418 |
+
kgk Kaiwá
|
1419 |
+
klr Khaling
|
1420 |
+
gno Gondi, Northern
|
1421 |
+
nuy Nunggubuyu
|
1422 |
+
srq Sirionó
|
1423 |
+
sep Sénoufo, Sìcìté
|
1424 |
+
oki Okiek
|
1425 |
+
trd Turi
|
1426 |
+
msc Maninka, Sankaran
|
1427 |
+
twm Monpa, Tawang
|
1428 |
+
rki Rakhine
|
1429 |
+
mfv Mandjak
|
1430 |
+
mhs Buru
|
1431 |
+
mjx Mahali
|
1432 |
+
ggw Gogodala
|
1433 |
+
nfa Dhao
|
1434 |
+
mym Me’en
|
1435 |
+
hvn Hawu
|
1436 |
+
nuz Nahuatl, Tlamacazapa
|
1437 |
+
are Arrarnta, Western
|
1438 |
+
lbm Lodhi
|
1439 |
+
hni Hani
|
1440 |
+
chf Chontal, Tabasco
|
1441 |
+
mtd Mualang
|
1442 |
+
div Maldivian
|
1443 |
+
the Tharu, Central
|
1444 |
+
rgs Roglai, Southern
|
1445 |
+
nys Nyungar
|
1446 |
+
tpe Tippera
|
1447 |
+
eyo Keiyo
|
1448 |
+
ghr Ghera
|
1449 |
+
kls Kalasha
|
1450 |
+
lrm Olumarama
|
1451 |
+
pmy Malay, Papuan
|
1452 |
+
lbx Lawangan
|
1453 |
+
akh Angal Heneng
|
1454 |
+
kpc Curripaco
|
1455 |
+
sco Scots
|
1456 |
+
lwg Oluwanga
|
1457 |
+
kay Kamayurá
|
1458 |
+
zac Zapotec, Ocotlán
|
1459 |
+
ccp Chakma
|
1460 |
+
pof Poke
|
1461 |
+
seg Segeju
|
1462 |
+
nos Nisu, Eastern
|
1463 |
+
abt Ambulas
|
1464 |
+
llc Lele
|
1465 |
+
sbe Saliba
|
1466 |
+
khz Keapara
|
1467 |
+
yup Yukpa
|
1468 |
+
khw Khowar
|
1469 |
+
bjn Banjar
|
1470 |
+
kyg Keyagana
|
1471 |
+
tab Tabasaran
|
1472 |
+
wci Gbe, Waci
|
1473 |
+
llg Lole
|
1474 |
+
lig Ligbi
|
1475 |
+
tcz Chin, Thado
|
1476 |
+
tog Tonga
|
1477 |
+
bqi Bakhtiâri
|
1478 |
+
psa Awyu, Asue
|
1479 |
+
knx Kendayan
|
1480 |
+
wat Kaninuwa
|
1481 |
+
xem Kembayan
|
1482 |
+
suv Puroik
|
1483 |
+
hix Hixkaryána
|
1484 |
+
bmf Bom-Kim
|
1485 |
+
bkx Baikeno
|
1486 |
+
imo Imbongu
|
1487 |
+
cjs Shor
|
1488 |
+
cto Embera Catío
|
1489 |
+
nyk Nyaneka
|
1490 |
+
tet Tetun
|
1491 |
+
slu Selaru
|
1492 |
+
xmc Makhuwa-Marrevone
|
1493 |
+
knu Kono
|
1494 |
+
rgu Rikou
|
1495 |
+
bgz Banggai
|
1496 |
+
zam Zapotec, Miahuatlán
|
1497 |
+
xdn
|
1498 |
+
iru Irula
|
1499 |
+
mbp Malayo
|
1500 |
+
ymm Maay
|
1501 |
+
kuj Kuria
|
1502 |
+
bfg Kayan, Busang
|
1503 |
+
thq Tharu, Mid-Eastern
|
1504 |
+
otd Ot Danum
|
1505 |
+
tnv Tangchangya
|
1506 |
+
esg Gondi, Aheri
|
1507 |
+
ajg Aja
|
1508 |
+
dwy Dhuwaya
|
1509 |
+
yrl Nhengatu
|
1510 |
+
kud ’Auhelawa
|
1511 |
+
mau Mazatec, Huautla
|
1512 |
+
loe Saluan
|
1513 |
+
kiw Kiwai, Northeast
|
1514 |
+
zin Zinza
|
1515 |
+
bbr Girawa
|
1516 |
+
srb Sora
|
1517 |
+
gup Gunwinggu
|
1518 |
+
pht Phu Thai
|
1519 |
+
ztg Zapotec, Xanaguía
|
1520 |
+
tpa Taupota
|
1521 |
+
blr Blang
|
1522 |
+
awi Aekyom
|
1523 |
+
pgg Pangwali
|
1524 |
+
snk Soninke
|
1525 |
+
nni Nuaulu, North
|
1526 |
+
hts Hadza
|
1527 |
+
scg Sanggau
|
1528 |
+
xdo Kwandu
|
1529 |
+
adq Adangbe
|
1530 |
+
cnk Chin, Khumi
|
1531 |
+
nza Mbembe, Tigon
|
1532 |
+
agg Angor
|
1533 |
+
ina Interlingua (International Auxiliary Language Association)
|
1534 |
+
maq Mazatec, Chiquihuitlán
|
1535 |
+
blo Anii
|
1536 |
+
ctp Chatino, Western Highland
|
1537 |
+
lbf Tinani
|
1538 |
+
xta Mixtec, Alcozauca
|
1539 |
+
tix Tiwa, Southern
|
1540 |
+
mee Mengen
|
1541 |
+
dnn Dzùùngoo
|
1542 |
+
kap Bezhta
|
1543 |
+
ssy Saho
|
1544 |
+
yon Yongkom
|
1545 |
+
tlr Talise
|
1546 |
+
duc Duna
|
1547 |
+
tro Naga, Tarao
|
1548 |
+
tth Ta’oih, Upper
|
1549 |
+
kpo Ikposo
|
1550 |
+
nuf Nusu
|
1551 |
+
pbo Papel
|
1552 |
+
lla Lala-Roba
|
1553 |
+
mki Dhatki
|
1554 |
+
ckt Chukchi
|
1555 |
+
pri Paicî
|
1556 |
+
pnb Punjabi, Western
|
1557 |
+
rah Rabha
|
1558 |
+
fli Fali Muchella
|
1559 |
+
eto Eton
|
1560 |
+
beu Blagar
|
1561 |
+
xsq Makhuwa-Saka
|
1562 |
+
bhw Biak
|
1563 |
+
atd Manobo, Ata
|
1564 |
+
zpv Zapotec, Chichicapan
|
1565 |
+
sza Semelai
|
1566 |
+
bob Aweer
|
1567 |
+
afz Obokuitai
|
1568 |
+
mui Musi
|
1569 |
+
tkt Tharu, Kathariya
|
1570 |
+
phr Pahari-Potwari
|
1571 |
+
bha Bharia
|
1572 |
+
tdt Tetun Dili
|
1573 |
+
ton Tongan
|
1574 |
+
nwi Tanna, Southwest
|
1575 |
+
olu Kuvale
|
1576 |
+
mxx Mahou
|
1577 |
+
uki Kui
|
1578 |
+
mgp Magar, Eastern
|
1579 |
+
zgb Zhuang, Guibei
|
1580 |
+
bxr Buriat, Russia
|
1581 |
+
tsj Tshangla
|
1582 |
+
gwn Gwandara
|
1583 |
+
bon Bine
|
1584 |
+
enq Enga
|
1585 |
+
qxp Quechua, Puno
|
1586 |
+
bji Burji
|
1587 |
+
onr One, Northern
|
1588 |
+
xky Uma’ Lasan
|
1589 |
+
awu Awyu, Central
|
1590 |
+
kvo Dobel
|
1591 |
+
xav Xavánte
|
1592 |
+
yiu Awu
|
1593 |
+
sdq Semandang
|
1594 |
+
pdu Kayan
|
1595 |
+
vaa Vaagri Booli
|
1596 |
+
shr Shi
|
1597 |
+
kvw Wersing
|
1598 |
+
mvv Murut, Tahol
|
1599 |
+
blb Bilua
|
1600 |
+
ckh Chak
|
1601 |
+
kei Kei
|
1602 |
+
jml Jumli
|
1603 |
+
knl Keninjal
|
1604 |
+
tpr Tuparí
|
1605 |
+
pwo Karen, Pwo Western
|
1606 |
+
dgc Agta, Casiguran Dumagat
|
1607 |
+
bug Bugis
|
1608 |
+
age Angal
|
1609 |
+
kmw Komo
|
1610 |
+
sei Seri
|
1611 |
+
cbn Nyahkur
|
1612 |
+
ria Riang
|
1613 |
+
asy Asmat, Yaosakor
|
1614 |
+
nes Kinnauri, Bhoti
|
1615 |
+
mrr Maria
|
1616 |
+
oyb Oy
|
1617 |
+
vah Varhadi-Nagpuri
|
1618 |
+
gnk ǁGana
|
1619 |
+
gah Alekano
|
1620 |
+
ghe Ghale, Southern
|
1621 |
+
aoj Mufian
|
1622 |
+
kps Tehit
|
1623 |
+
tpx Me’phaa, Acatepec
|
1624 |
+
jab Hyam
|
1625 |
+
vaj Northwestern !Kung
|
1626 |
+
sie Simaa
|
1627 |
+
pcf Paliyan
|
1628 |
+
itl Itelmen
|
1629 |
+
gld Nanai
|
1630 |
+
hmd Miao, Large Flowery
|
1631 |
+
skx Seko Padang
|
1632 |
+
yoy Yoy
|
1633 |
+
dhw Danuwar
|
1634 |
+
sbu Stod Bhoti
|
1635 |
+
bun Sherbro
|
1636 |
+
khb Lü
|
1637 |
+
leu Kara
|
1638 |
+
kas Kashmiri
|
1639 |
+
hii Hinduri
|
1640 |
+
djo Jangkang
|
1641 |
+
krn Sapo
|
1642 |
+
bap Bantawa
|
1643 |
+
iii Nuosu
|
1644 |
+
row Dela-Oenale
|
1645 |
+
brx Boro
|
1646 |
+
lir Liberian English
|
1647 |
+
apz Safeyoka
|
1648 |
+
ssw Swati
|
1649 |
+
kib Koalib
|
1650 |
+
bmb Bembe
|
1651 |
+
cao Chácobo
|
1652 |
+
nbe Naga, Konyak
|
1653 |
+
jna Jangshung
|
1654 |
+
kca Khanty
|
1655 |
+
zyn Zhuang, Yongnan
|
1656 |
+
kpy Koryak
|
1657 |
+
peg Pengo
|
1658 |
+
tnl Lenakel
|
1659 |
+
nti Natioro
|
1660 |
+
gaj Gadsup
|
1661 |
+
lep Lepcha
|
1662 |
+
mxn Moi
|
1663 |
+
dry Darai
|
1664 |
+
kmc Dong, Southern
|
1665 |
+
kup Kunimaipa
|
1666 |
+
tqo Toaripi
|
1667 |
+
kqb Kovai
|
1668 |
+
ksd Kuanua
|
1669 |
+
hea Miao, Northern Qiandong
|
1670 |
+
pcc Bouyei
|
1671 |
+
dre Dolpo
|
1672 |
+
mxj Miju-Mishmi
|
1673 |
+
lyn Luyana
|
1674 |
+
kxv Kuvi
|
1675 |
+
cns Asmat, Central
|
1676 |
+
aix Aighon
|
1677 |
+
rwr Marwari
|
1678 |
+
anu Anuak
|
1679 |
+
aso Dano
|
1680 |
+
ino Inoke-Yate
|
1681 |
+
ncm Nambo
|
1682 |
+
kfq Korku
|
1683 |
+
dhn Dangi
|
1684 |
+
nii Nii
|
1685 |
+
bzf Boikin
|
1686 |
+
srl Isirawa
|
1687 |
+
bpe Bauni
|
1688 |
+
ong Olo
|
1689 |
+
mho Mashi
|
1690 |
+
sdo Bidayuh Serian
|
1691 |
+
kfv Kurmukar
|
1692 |
+
cch Atsam
|
1693 |
+
agx Aghul
|
1694 |
+
ewo Ewondo
|
1695 |
+
dta Daur
|
1696 |
+
mlu To’abaita
|
1697 |
+
zik Zimakani
|
1698 |
+
yom Yombe
|
1699 |
+
lae Pattani
|
1700 |
+
wbr Wagdi
|
1701 |
+
dar Dargwa
|
1702 |
+
mrm Mwerlap
|
1703 |
+
hmt Hamtai
|
1704 |
+
vay Wayu
|
1705 |
+
dib Dinka, South Central
|
1706 |
+
cdm Chepang
|
1707 |
+
ola Walungge
|
1708 |
+
yiz Azhe
|
1709 |
+
lri Olumarachi
|
1710 |
+
xmz Mori Bawah
|
1711 |
+
tpj Ñandeva
|
1712 |
+
kgp Kaingang
|
1713 |
+
bcf Bamu
|
1714 |
+
wib Toussian, Southern
|
1715 |
+
mji Kim Mun
|
1716 |
+
fwe Fwe
|
1717 |
+
apw Apache, Western
|
1718 |
+
xri Krikati-Timbira
|
1719 |
+
thr Tharu, Rana
|
1720 |
+
afe Utugwang-Irungene-Afrike
|
1721 |
+
gea Geruma
|
1722 |
+
gwj ǀGwi
|
1723 |
+
kai Karekare
|
1724 |
+
sgp Singpho
|
1725 |
+
ahl Igo
|
1726 |
+
pav Pakaásnovos
|
1727 |
+
zzj Zhuang, Zuojiang
|
1728 |
+
sip Sikkimese
|
1729 |
+
ybi Yamphu
|
1730 |
+
cli Chakali
|
1731 |
+
xtl Mixtec, Tijaltepec
|
1732 |
+
cro Crow
|
1733 |
+
pmi Pumi, Northern
|
1734 |
+
nmi Nyam
|
1735 |
+
kcl Kala
|
1736 |
+
ish Esan
|
1737 |
+
rab Chamling
|
1738 |
+
kvf Kabalai
|
1739 |
+
kwv Kaba Naa, Sara
|
1740 |
+
bwi Baniwa
|
1741 |
+
mrd Magar, Western
|
1742 |
+
kfk Kinnauri
|
1743 |
+
cfa Dikaka
|
1744 |
+
pex Petats
|
1745 |
+
aly Alyawarr
|
1746 |
+
lot Otuho
|
1747 |
+
twe Teiwa
|
1748 |
+
ygr Yagaria
|
1749 |
+
afu Awutu
|
1750 |
+
gol Gola
|
1751 |
+
dhd Dhundari
|
1752 |
+
bku Buhid
|
1753 |
+
ppt Pa
|
1754 |
+
ulu Uma’ Lung
|
1755 |
+
syw Syuba
|
1756 |
+
ekg Ekari
|
1757 |
+
boq Bogaya
|
1758 |
+
tsx Mubami
|
1759 |
+
stt Stieng, Budeh
|
1760 |
+
kwl Kofyar
|
1761 |
+
bzy Abanglekuo
|
1762 |
+
mjc Mixtec, San Juan Colorado
|
1763 |
+
tnp Whitesands
|
1764 |
+
njb Naga, Nocte
|
1765 |
+
mle Manambu
|
1766 |
+
ram Canela
|
1767 |
+
bas Basaa
|
1768 |
+
kjp Karen, Pwo Eastern
|
1769 |
+
shj Shatt
|
1770 |
+
hut Humla
|
1771 |
+
pud Punan Aput
|
1772 |
+
att Atta, Pamplona
|
1773 |
+
wbm Wa, Vo
|
1774 |
+
xuj Kurumba, Jennu
|
1775 |
+
bhj Bahing
|
1776 |
+
dhm Dhimba
|
1777 |
+
les Lese
|
1778 |
+
amn Amanab
|
1779 |
+
ass Ipulo
|
1780 |
+
kge Komering
|
1781 |
+
bwx Bunu, Bu-Nao
|
1782 |
+
onp Sartang
|
1783 |
+
nmo Naga, Moyon
|
1784 |
+
gju Gujari
|
1785 |
+
haz Hazaragi
|
1786 |
+
snx Sam
|
1787 |
+
bfb Bareli, Pauri
|
1788 |
+
kyo Klon
|
1789 |
+
tdf Talieng
|
1790 |
+
mgm Mambae
|
1791 |
+
swv Shekhawati
|
1792 |
+
blk Pa’o
|
1793 |
+
kqm Khisa
|
1794 |
+
ikx Ik
|
1795 |
+
yig Nasu, Wusa
|
1796 |
+
twh Tai Dón
|
1797 |
+
tjg Tunjung
|
1798 |
+
kpb Kurumba, Mullu
|
1799 |
+
kzs Sugut Dusun
|
1800 |
+
szb Ngalum
|
1801 |
+
ysn Sani
|
1802 |
+
bzz Evant
|
1803 |
+
nbu Naga, Rongmei
|
1804 |
+
cgk Chocangacakha
|
1805 |
+
kbd Kabardian
|
1806 |
+
cua Cua
|
1807 |
+
ntp Tepehuan, Northern
|
1808 |
+
zpj Zapotec, Quiavicuzas
|
1809 |
+
aii Assyrian Neo-Aramaic
|
1810 |
+
kpr Korafe-Yegha
|
1811 |
+
tpu Tampuan
|
1812 |
+
mfc Mba
|
1813 |
+
xra Krahô
|
1814 |
+
aai Miniafia Oyan
|
1815 |
+
shg Shua
|
1816 |
+
brg Baure
|
1817 |
+
tsg Tausug
|
1818 |
+
giw Duoluo
|
1819 |
+
myl Moma
|
1820 |
+
mks Mixtec, Silacayoapan
|
1821 |
+
say Saya
|
1822 |
+
goj Gowlan
|
1823 |
+
ywq Yi, Wuding-Luquan
|
1824 |
+
tsr Akei
|
1825 |
+
niq Nandi
|
1826 |
+
mtr Mewari
|
1827 |
+
lml Hano
|
1828 |
+
wtm Mewati
|
1829 |
+
mde Maba
|
1830 |
+
cik Kinnauri, Chitkuli
|
1831 |
+
dwz Dewas Rai
|
1832 |
+
uar Tairuma
|
1833 |
+
ian Iatmul
|
1834 |
+
lar Larteh
|
1835 |
+
ttr Tera
|
1836 |
+
dby Dibiyaso
|
1837 |
+
pah Tenharim
|
1838 |
+
wlv Bermejo Wichí
|
1839 |
+
mpr Vangunu
|
1840 |
+
uth ut-Hun
|
1841 |
+
krv Kavet
|
1842 |
+
mrg Mising
|
1843 |
+
grv Grebo, Central
|
1844 |
+
bpx Bareli, Palya
|
1845 |
+
dob Dobu
|
1846 |
+
knv Tabo
|
1847 |
+
scp Hyolmo
|
1848 |
+
shy Tachawit
|
1849 |
+
lbe Lak
|
1850 |
+
sya Siang
|
1851 |
+
loy Lhowa
|
1852 |
+
cux Cuicatec, Tepeuxila
|
1853 |
+
ybh Yakkha
|
1854 |
+
sso Essono
|
1855 |
+
ztp Zapotec, Loxicha
|
1856 |
+
jul Jirel
|
1857 |
+
kgq Kamoro
|
1858 |
+
dao Chin, Daai
|
1859 |
+
wad Wamesa
|
1860 |
+
mnz Moni
|
1861 |
+
kbc Kadiwéu
|
1862 |
+
agw Kahua
|
1863 |
+
wmt Walmajarri
|
1864 |
+
bco Kaluli
|
1865 |
+
pkh Pangkhua
|
1866 |
+
meu Motu
|
1867 |
+
gjk Koli, Kachi
|
1868 |
+
uss us-Saare
|
1869 |
+
raa Dungmali
|
1870 |
+
nkb Naga, Khoibu
|
1871 |
+
aau Abau
|
1872 |
+
bde Bade
|
1873 |
+
mzr Marubo
|
1874 |
+
sax Sa
|
1875 |
+
txo Toto
|
1876 |
+
mte Mono
|
1877 |
+
sdp Sherdukpen
|
1878 |
+
hmo Motu, Hiri
|
1879 |
+
gdb Gadaba, Pottangi Ollar
|
1880 |
+
tic Tira
|
1881 |
+
mdk Mangbutu
|
1882 |
+
baa Babatana
|
1883 |
+
sjp Surjapuri
|
1884 |
+
kun Kunama
|
1885 |
+
kbl Kanembu
|
1886 |
+
mql Mbelime
|
1887 |
+
qud Quichua, Calderón Highland
|
1888 |
+
lpo Lipo
|
1889 |
+
arr Karo
|
1890 |
+
kty Kango
|
1891 |
+
klw Tado
|
1892 |
+
mke Mawchi
|
1893 |
+
nfu Mfumte
|
1894 |
+
soi Sonha
|
1895 |
+
tar Tarahumara, Central
|
1896 |
+
xub Kurumba, Betta
|
1897 |
+
klz Kabola
|
1898 |
+
lra Bakati’, Rara
|
1899 |
+
mxu Mada
|
1900 |
+
kwx Khirwar
|
1901 |
+
mdr Mandar
|
1902 |
+
hoe Horom
|
1903 |
+
lsr Aruop
|
1904 |
+
mbz Mixtec, Amoltepec
|
1905 |
+
lbq Wampar
|
1906 |
+
mdd Mbum
|
1907 |
+
plj Polci
|
1908 |
+
all Allar
|
1909 |
+
kjo Kinnauri, Pahari
|
1910 |
+
xmt Matbat
|
1911 |
+
kft Kanjari
|
1912 |
+
mcf Matses
|
1913 |
+
tbf Mandara
|
1914 |
+
sif Siamou
|
1915 |
+
tio Teop
|
1916 |
+
tcy Tulu
|
1917 |
+
lnu Longuda
|
1918 |
+
ica Ede Ica
|
1919 |
+
bpp Kaure
|
1920 |
+
juk Wapan
|
1921 |
+
shb Ninam
|
1922 |
+
grj Grebo, Southern
|
1923 |
+
bec Iceve-Maci
|
1924 |
+
mvg Mixtec, Yucuañe
|
1925 |
+
cnb Chin, Uppu
|
1926 |
+
skj Seke
|
1927 |
+
noe Nimadi
|
1928 |
+
tba Aikanã
|
1929 |
+
sly Selayar
|
1930 |
+
dot Dass
|
1931 |
+
sfm Miao, Small Flowery
|
1932 |
+
yss Yessan-Mayo
|
1933 |
+
blw Balangao
|
1934 |
+
slr Salar
|
1935 |
+
soa Thai Song
|
1936 |
+
bla Blackfoot
|
1937 |
+
tan Tangale
|
1938 |
+
bns Bundeli
|
1939 |
+
xtc Katcha-Kadugli-Miri
|
1940 |
+
nmf Naga, Tangkhul
|
1941 |
+
grd Guruntum-Mbaaru
|
1942 |
+
amr Amarakaeri
|
1943 |
+
puu Punu
|
1944 |
+
mlm Mulam
|
1945 |
+
lec Leco
|
1946 |
+
bcs Hohumono
|
1947 |
+
byn Bilen
|
1948 |
+
ott Otomi, Temoaya
|
1949 |
+
arv Arbore
|
1950 |
+
xkk Kachok
|
1951 |
+
mjg Tu
|
1952 |
+
pnq Pana
|
1953 |
+
asc Asmat, Casuarina Coast
|
1954 |
+
aks Akaselem
|
1955 |
+
mmg Ambrym, North
|
1956 |
+
tld Talaud
|
1957 |
+
bkq Bakairí
|
1958 |
+
ort Oriya, Adivasi
|
1959 |
+
kxz Kerewo
|
1960 |
+
kwj Kwanga
|
1961 |
+
cub Cubeo
|
1962 |
+
eja Jola-Felupe
|
1963 |
+
wbl Wakhi
|
1964 |
+
uri Urim
|
1965 |
+
zua Zeem
|
1966 |
+
kjd Kiwai, Southern
|
1967 |
+
ruk Kuce
|
1968 |
+
lbk Bontok, Central
|
1969 |
+
bfw Bondo
|
1970 |
+
jao Yanyuwa
|
1971 |
+
hca Andaman Hindi Creole
|
1972 |
+
ssx Samberigi
|
1973 |
+
ldl Kaan
|
1974 |
+
byx Qaqet
|
1975 |
+
nku Kulango, Bouna
|
1976 |
+
gec Grebo, Gboloo
|
1977 |
+
zlj Zhuang, Liujiang
|
1978 |
+
bge Bauria
|
1979 |
+
btu Batu
|
1980 |
+
nlx Nahali
|
1981 |
+
hmr Hmar
|
1982 |
+
tcu Tarahumara, Southeastern
|
1983 |
+
lax Tiwa
|
1984 |
+
lhm Lhomi
|
1985 |
+
kdp Nikyob-Nindem
|
1986 |
+
tes Tengger
|
1987 |
+
mdb Morigi
|
1988 |
+
msi Malay, Sabah
|
1989 |
+
rog Roglai, Northern
|
1990 |
+
jda Jad
|
1991 |
+
zpa Zapotec, Lachiguiri
|
1992 |
+
poc Poqomam
|
1993 |
+
mgu Magi
|
1994 |
+
nnu Dwang
|
1995 |
+
kui Kuikúro-Kalapálo
|
1996 |
+
llp Efate, North
|
1997 |
+
kxj Kulfa
|
1998 |
+
mjz Majhi
|
1999 |
+
jms Mashi
|
2000 |
+
nto Ntomba
|
2001 |
+
hsn Chinese, Xiang
|
2002 |
+
bhu Bhunjia
|
2003 |
+
nfd Ahwai
|
2004 |
+
ksg Kusaghe
|
2005 |
+
kzr Karang
|
2006 |
+
lyg Lyngngam
|
2007 |
+
prp Parsi
|
2008 |
+
lle Lele
|
2009 |
+
kex Kukna
|
2010 |
+
brh Brahui
|
2011 |
+
bkk Brokskat
|
2012 |
+
wuu Chinese, Wu
|
2013 |
+
gry Grebo, Barclayville
|
2014 |
+
bgp Balochi, Eastern
|
2015 |
+
pai Pye
|
2016 |
+
cta Chatino, Tataltepec
|
2017 |
+
cog Chong
|
2018 |
+
oro Orokolo
|
2019 |
+
pug Phuie
|
2020 |
+
swi Sui
|
2021 |
+
inj Inga, Jungle
|
2022 |
+
wmo Wom
|
2023 |
+
kcv Kete
|
2024 |
+
cna Changthang
|
2025 |
+
xkf Khengkha
|
2026 |
+
jer Jere
|
2027 |
+
bca Bai, Central
|
2028 |
+
kua Oshiwambo
|
2029 |
+
roh Romansh
|
2030 |
+
mxe Mele-Fila
|
2031 |
+
jmn Naga, Makuri
|
2032 |
+
dus Dumi
|
2033 |
+
ssk Sunam
|
2034 |
+
bqg Bago-Kusuntu
|
2035 |
+
pwr Powari
|
2036 |
+
jbj Arandai
|
2037 |
+
yet Yetfa
|
2038 |
+
lhi Lahu Shi
|
2039 |
+
aar Afar
|
2040 |
+
ksu Khamyang
|
2041 |
+
mxy Mixtec, Southeastern Nochixtlán
|
2042 |
+
tcn Tichurong
|
2043 |
+
lmx Laimbue
|
2044 |
+
xua Kurumba, Alu
|
2045 |
+
khr Kharia
|
2046 |
+
zyj Zhuang, Youjiang
|
2047 |
+
mng Mnong, Eastern
|
2048 |
+
roo Rotokas
|
2049 |
+
anr Andh
|
2050 |
+
mdv Mixtec, Santa Lucía Monteverde
|
2051 |
+
msm Manobo, Agusan
|
2052 |
+
nbl Ndebele
|
2053 |
+
cin Cinta Larga
|
2054 |
+
sjl Miji
|
2055 |
+
saw Sawi
|
2056 |
+
xkz Kurtokha
|
2057 |
+
npb Nupbikha
|
2058 |
+
cnc Côông
|
2059 |
+
muk Mugom
|
2060 |
+
foi Foi
|
2061 |
+
sqq Sou
|
2062 |
+
tdd Tai Nüa
|
2063 |
+
kil Kariya
|
2064 |
+
bma Lame
|
2065 |
+
dad Marik
|
2066 |
+
bix Bijori
|
2067 |
+
nao Naaba
|
2068 |
+
pwb Panawa
|
2069 |
+
bhx Bhalay
|
2070 |
+
aro Araona
|
2071 |
+
qwa Quechua, Corongo Ancash
|
2072 |
+
gga Gao
|
2073 |
+
zau Zangskari
|
2074 |
+
brt Bitare
|
2075 |
+
tyz Tày
|
2076 |
+
keu Akebu
|
2077 |
+
anm Anal
|
2078 |
+
lro Laro
|
2079 |
+
ssb Sama, Southern
|
2080 |
+
der Deori
|
2081 |
+
kad Adara
|
2082 |
+
esk Inupiatun, Northwest Alaska
|
2083 |
+
clo Chontal, Lowland Oaxaca
|
2084 |
+
bli Bolia
|
2085 |
+
tuz Turka
|
2086 |
+
bra Braj Bhasha
|
2087 |
+
nnm Namia
|
2088 |
+
sui Suki
|
2089 |
+
tgs Nume
|
2090 |
+
gbe Niksek
|
2091 |
+
xwe Gbe, Xwela
|
2092 |
+
kfp Korwa
|
2093 |
+
apt Apatani
|
2094 |
+
dzl Dzalakha
|
2095 |
+
mpq Matís
|
2096 |
+
hal Halang
|
2097 |
+
bio Nai
|
2098 |
+
jib Jibu
|
2099 |
+
kph Kplang
|
2100 |
+
hia Lamang
|
2101 |
+
yij Yindjibarndi
|
2102 |
+
chq Chinantec, Quiotepec
|
2103 |
+
xbi Kombio
|
2104 |
+
mpc Mangarrayi
|
2105 |
+
ebo Teke-Eboo
|
2106 |
+
tcs Torres Strait Creole
|
2107 |
+
kvi Kwang
|
2108 |
+
zyg Zhuang, Yang
|
2109 |
+
bww Bwa
|
2110 |
+
kpl Kpala
|
2111 |
+
hoy Holiya
|
2112 |
+
nhp Nahuatl, Isthmus-Pajapan
|
2113 |
+
abo Abon
|
2114 |
+
dai Day
|
2115 |
+
zom Zo
|
2116 |
+
lea Lega-Shabunda
|
2117 |
+
kej Kadar
|
2118 |
+
aup Makayam
|
2119 |
+
tcx Toda
|
2120 |
+
kmi Kami
|
2121 |
+
jio Jiamao
|
2122 |
+
bhd Bhadrawahi
|
2123 |
+
cav Cavineña
|
2124 |
+
bda Bayot
|
2125 |
+
ppq Pefiyahe
|
2126 |
+
bbk Babanki
|
2127 |
+
apu Apurinã
|
2128 |
+
ahr Ahirani
|
2129 |
+
wsi Wusi
|
2130 |
+
tdj Tajio
|
2131 |
+
myu Mundurukú
|
2132 |
+
kzq Kaike
|
2133 |
+
bfu Gahri
|
2134 |
+
sgh Shughni
|
2135 |
+
kfg Kudiya
|
2136 |
+
bcn Bali
|
2137 |
+
ygw Yagwoia
|
2138 |
+
ttv Titan
|
2139 |
+
iyo Mesaka
|
2140 |
+
pcn Abishi
|
2141 |
+
lkt Lakota
|
2142 |
+
aim Aimol
|
2143 |
+
tcf Me’phaa, Malinaltepec
|
2144 |
+
fod Foodo
|
2145 |
+
phk Phake
|
2146 |
+
scu Shumcho
|
2147 |
+
lch Luchazi
|
2148 |
+
nbm Ngbaka Ma’bo
|
2149 |
+
bei Bakati’
|
2150 |
+
jid Bu
|
2151 |
+
sce Dongxiang
|
2152 |
+
noi Noiri
|
2153 |
+
hmj Ge
|
2154 |
+
tyr Tai Daeng
|
2155 |
+
rop Kriol
|
2156 |
+
tsv Tsogo
|
2157 |
+
nbr Numana
|
2158 |
+
kvx Koli, Parkari
|
2159 |
+
ums Pendau
|
2160 |
+
dka Dakpakha
|
2161 |
+
alu ’Are’are
|
2162 |
+
pid Piaroa
|
2163 |
+
mab Mixtec, Yutanduchi
|
2164 |
+
gaq Gata’
|
2165 |
+
kgy Kyerung
|
2166 |
+
abs Malay, Ambonese
|
2167 |
+
alk Alak
|
2168 |
+
gdn Umanakaina
|
2169 |
+
ths Thakali
|
2170 |
+
khn Khandesi
|
2171 |
+
gaw Nobonob
|
2172 |
+
aac Ari
|
2173 |
+
tvd Tsuvadi
|
2174 |
+
bkr Bakumpai
|
2175 |
+
xkb Nago, Northern
|
2176 |
+
aot Atong
|
2177 |
+
lmn Lambadi
|
2178 |
+
kgr Abun
|
2179 |
+
moc Mocoví
|
2180 |
+
mbk Malol
|
2181 |
+
sss So
|
2182 |
+
dbv Dungu
|
2183 |
+
ngt Kriang
|
2184 |
+
tja Tajuasohn
|
2185 |
+
kif Kham, Eastern Parbate
|
2186 |
+
okv Orokaiva
|
2187 |
+
qvi Quichua, Imbabura Highland
|
2188 |
+
esu Yupik, Central
|
2189 |
+
bby Befang
|
2190 |
+
koi Komi-Permyak
|
2191 |
+
cvg Chug
|
2192 |
+
gdr Wipi
|
2193 |
+
kxp Koli, Wadiyari
|
2194 |
+
mme Mae
|
2195 |
+
pmj Pumi, Southern
|
2196 |
+
suy Suyá
|
2197 |
+
vas Vasavi
|
2198 |
+
suo Bouni
|
2199 |
+
nbc Naga, Chang
|
2200 |
+
bvr Burarra
|
2201 |
+
tts Thai, Northeastern
|
2202 |
+
diu Gciriku
|
2203 |
+
ndx Nduga
|
2204 |
+
bkl Berik
|
2205 |
+
lhp Lhokpu
|
2206 |
+
alf Elege
|
2207 |
+
wog Wogamusin
|
2208 |
+
bxa Bauro
|
2209 |
+
xwl Gbe, Western Xwla
|
2210 |
+
jae Yabem
|
2211 |
+
xbr Kambera
|
2212 |
+
bwd Bwaidoka
|
2213 |
+
nar Iguta
|
2214 |
+
dcc Deccan
|
2215 |
+
bjx Kalinga, Vanaw
|
2216 |
+
yes Nyankpa
|
2217 |
+
kul Kulere
|
2218 |
+
ssi Sansi
|
2219 |
+
hre Hre
|
2220 |
+
mtt Mota
|
2221 |
+
ysp Lolopo, Southern
|
2222 |
+
auc Waorani
|
2223 |
+
thy Tha
|
2224 |
+
dza Tunzuii
|
2225 |
+
tkb Buksa
|
2226 |
+
lkr Päri
|
2227 |
+
skn Subanon, Kolibugan
|
2228 |
+
tgd Ciwogai
|
2229 |
+
myp Pirahã
|
2230 |
+
eve Even
|
2231 |
+
bgg Bugun
|
2232 |
+
ril Riang Lang
|
2233 |
+
dbm Duguri
|
2234 |
+
bew Betawi
|
2235 |
+
aps Orop
|
2236 |
+
aon Weri
|
2237 |
+
dub Dubli
|
2238 |
+
hld Halang Doan
|
2239 |
+
jwi Jwira-Pepesa
|
2240 |
+
ayg Ginyanga
|
2241 |
+
wno Wano
|
2242 |
+
bfr Bazigar
|
2243 |
+
kpk Kpan
|
2244 |
+
bcg Baga Pokur
|
2245 |
+
avt Au
|
2246 |
+
nke Duke
|
2247 |
+
stk Aramba
|
2248 |
+
mkz Makasae
|
2249 |
+
hms Miao, Southern Qiandong
|
2250 |
+
duh Dungra Bhil
|
2251 |
+
scl Shina
|
2252 |
+
bfm Mmen
|
2253 |
+
ctl Chinantec, Tlacoatzintepec
|
2254 |
+
kra Kumal
|
2255 |
+
hmg Miao, Southwestern Guiyang
|
2256 |
+
zay Zayse
|
2257 |
+
faa Fasu
|
2258 |
+
lpn Naga, Long Phuri
|
2259 |
+
bqv Koro Wachi
|
2260 |
+
mpt Mian
|
2261 |
+
zak Zanaki
|
2262 |
+
pne Penan, Western
|
2263 |
+
apn Apinayé
|
2264 |
+
sbx Seberuang
|
2265 |
+
anp Angika
|
2266 |
+
bdv Bodo Parja
|
2267 |
+
juy Juray
|
2268 |
+
dso Desiya
|
2269 |
+
ndd Nde-Nsele-Nta
|
2270 |
+
ich Etkywan
|
2271 |
+
bkc Baka
|
2272 |
+
lez Lezgi
|
2273 |
+
lsh Lish
|
2274 |
+
mig Mixtec, San Miguel el Grande
|
2275 |
+
bdi Burun
|
2276 |
+
buu Budu
|
2277 |
+
ktn Karitiâna
|
2278 |
+
lbo Laven
|
2279 |
+
spn Sanapaná
|
2280 |
+
kgj Kham, Gamal
|
2281 |
+
kky Guugu Yimidhirr
|
2282 |
+
bjj Kanauji
|
2283 |
+
hve Huave, San Dionisio del Mar
|
2284 |
+
ghs Guhu-Samane
|
2285 |
+
vav Varli
|
2286 |
+
pih Pitcairn-Norfolk
|
2287 |
+
pcg Paniya
|
2288 |
+
ldj Lemoro
|
2289 |
+
brr Birao
|
2290 |
+
emn Eman
|
2291 |
+
lhl Lohar, Lahul
|
2292 |
+
pnc Pannei
|
2293 |
+
mnl Tiale
|
2294 |
+
ncq Katang, Northern
|
2295 |
+
xac Kachari
|
2296 |
+
xsn Sanga
|
2297 |
+
muz Mursi
|
2298 |
+
gwd Ale
|
2299 |
+
saf Safaliba
|
2300 |
+
dir Dirim
|
2301 |
+
dmg Kinabatangan, Upper
|
2302 |
+
isu Isu
|
2303 |
+
tpq Kinnauri, Chhoyul
|
2304 |
+
yuf Havasupai-Walapai-Yavapai
|
2305 |
+
oub Glio-Oubi
|
2306 |
+
ngn Ngwo
|
2307 |
+
fai Faiwol
|
2308 |
+
moi Mboi
|
2309 |
+
muo Mubako
|
2310 |
+
cih Chinali
|
2311 |
+
wew Wejewa
|
2312 |
+
luj Luna
|
2313 |
+
lkh Lakha
|
2314 |
+
wti Berta
|
2315 |
+
mse Musey
|
2316 |
+
bwo Borna
|
2317 |
+
nxr Ninggerum
|
2318 |
+
gru Kistane
|
2319 |
+
wiu Witu
|
2320 |
+
ndr Ndoola
|
2321 |
+
kmo Kwoma
|
2322 |
+
ksm Kumba
|
2323 |
+
ggb Gbii
|
2324 |
+
tqu Touo
|
2325 |
+
gia Kija
|
2326 |
+
aol Alor
|
2327 |
+
ute Ute-Southern Paiute
|
2328 |
+
xtj Mixtec, San Juan Teita
|
2329 |
+
khj Kuturmi
|
2330 |
+
bvh Bure
|
2331 |
+
kwc Likwala
|
2332 |
+
doz Dorze
|
2333 |
+
kga Koyaga
|
2334 |
+
cqd Miao, Chuanqiandian Cluster
|
2335 |
+
cjv Chuave
|
2336 |
+
hmb Songhay, Humburi Senni
|
2337 |
+
nac Narak
|
2338 |
+
iws Iwam, Sepik
|
2339 |
+
kxw Konai
|
2340 |
+
kmy Koma
|
2341 |
+
tww Tuwari
|
2342 |
+
arg Aragonese
|
2343 |
+
tig Tigré
|
2344 |
+
irx Kamberau
|
2345 |
+
ktv Katu, Eastern
|
2346 |
+
cdh Chambeali
|
2347 |
+
tis Itneg, Masadiit
|
2348 |
+
yeu Yerukula
|
2349 |
+
nzy Nzakambay
|
2350 |
+
drg Rungus
|
2351 |
+
wau Waurá
|
2352 |
+
mln Malango
|
2353 |
+
rmb Rembarrnga
|
2354 |
+
ldb Duya
|
2355 |
+
mjs Miship
|
2356 |
+
baw Bambili-Bambui
|
2357 |
+
dmo Kemedzung
|
2358 |
+
qxs Qiang, Southern
|
2359 |
+
kjq Keres, Western
|
2360 |
+
kwa Dâw
|
2361 |
+
azo Awing
|
2362 |
+
cjk Chokwe
|
2363 |
+
jeh Jeh
|
2364 |
+
drs Gedeo
|
2365 |
+
arh Arhuaco
|
2366 |
+
zdj Comorian, Ngazidja
|
2367 |
+
yaq Yaqui
|
2368 |
+
gyz Gyaazi
|
2369 |
+
fir Firan
|
2370 |
+
hbn Heiban
|
2371 |
+
ayb Gbe, Ayizo
|
2372 |
+
yde Yangum Dey
|
2373 |
+
gby Gbari
|
2374 |
+
byc Ubaghara
|
2375 |
+
bac Badui
|
2376 |
+
nhb Beng
|
2377 |
+
nms Letemboi
|
2378 |
+
pll Palaung, Shwe
|
2379 |
+
bwe Karen, Bwe
|
2380 |
+
ibb Ibibio
|
2381 |
+
agl Fembe
|
2382 |
+
nnp Naga, Wancho
|
2383 |
+
wmd Mamaindê
|
2384 |
+
kmt Kemtuik
|
2385 |
+
wja Waja
|
2386 |
+
bol Bole
|
2387 |
+
bhf Odiai
|
2388 |
+
xty Mixtec, Yoloxóchitl
|
2389 |
+
sgz Sursurunga
|
2390 |
+
apj Apache, Jicarilla
|
2391 |
+
drd Darmiya
|
2392 |
+
mqu Mandari
|
2393 |
+
brd Baram
|
2394 |
+
oym Wayampi
|
2395 |
+
uis Uisai
|
2396 |
+
eot Beti
|
2397 |
+
zpk Zapotec, Tlacolulita
|
2398 |
+
lbn Rmeet
|
2399 |
+
nqg Nago, Southern
|
2400 |
+
sme Saami, North
|
2401 |
+
zaz Zari
|
2402 |
+
sen Sénoufo, Nanerigé
|
2403 |
+
pca Popoloca, Santa Inés Ahuatempan
|
2404 |
+
biz Baloi
|
2405 |
+
brb Brao
|
2406 |
+
ppo Folopa
|
2407 |
+
amb Ambo
|
2408 |
+
krx Karon
|
2409 |
+
kwn Kwangali
|
2410 |
+
yiq Miqie
|
2411 |
+
gmb Gula’alaa
|
2412 |
+
res Reshe
|
2413 |
+
plc Palawano, Central
|
2414 |
+
bab Bainouk-Gunyuño
|
2415 |
+
kvb Kubu
|
2416 |
+
ymk Makwe
|
2417 |
+
nxk Naga, Kokak
|
2418 |
+
nut Nung
|
2419 |
+
dio Dibo
|
2420 |
+
tva Vaghua
|
2421 |
+
aez Aeka
|
2422 |
+
aoe Angal Enen
|
2423 |
+
bqh Baima
|
2424 |
+
otx Otomi, Texcatepec
|
2425 |
+
gdf Guduf-Gava
|
2426 |
+
mfl Putai
|
2427 |
+
adl Adi, Galo
|
2428 |
+
yay Agwagwune
|
2429 |
+
gas Garasia, Adiwasi
|
2430 |
+
aio Aiton
|
2431 |
+
tkx Tangko
|
2432 |
+
brf Bira
|
2433 |
+
usi Usoi
|
2434 |
+
vmz Mazatec, Mazatlán
|
2435 |
+
hru Hruso
|
2436 |
+
nja Nzanyi
|
2437 |
+
mfn Mbembe, Cross River
|
2438 |
+
ekr Yace
|
2439 |
+
nud Gala
|
2440 |
+
otr Otoro
|
2441 |
+
fie Fyer
|
2442 |
+
kwe Kwerba
|
2443 |
+
mgb Mararit
|
2444 |
+
yno Yong
|
2445 |
+
bef Benabena
|
2446 |
+
nux Mehek
|
2447 |
+
sto Stoney
|
2448 |
+
fqs Momu
|
2449 |
+
sbn Sindhi Bhil
|
2450 |
+
liq Libido
|
2451 |
+
jbm Bijim
|
2452 |
+
bfh Blafe
|
2453 |
+
isi Nkem-Nkum
|
2454 |
+
vig Viemo
|
2455 |
+
heg Helong
|
2456 |
+
kvl Kayaw
|
2457 |
+
thz Tamajeq, Tayart
|
2458 |
+
rin Nungu
|
2459 |
+
nco Sibe
|
2460 |
+
siw Motuna
|
2461 |
+
vmc Mixtec, Juxtlahuaca
|
2462 |
+
lev Pantar, Western
|
2463 |
+
mvn Minaveha
|
2464 |
+
tpl Me’phaa, Tlacoapa
|
2465 |
+
uiv Iyive
|
2466 |
+
pua Purepecha, Western Highland
|
2467 |
+
rnd Ruund
|
2468 |
+
cjm Cham, Eastern
|
2469 |
+
sym Samo, Maya
|
2470 |
+
bbt Mburku
|
2471 |
+
gvn Kuku-Yalanji
|
2472 |
+
kbx Ap Ma
|
2473 |
+
nsm Naga, Sumi
|
2474 |
+
bys Burak
|
2475 |
+
tlf Telefol
|
2476 |
+
mzq Mori Atas
|
2477 |
+
pck Chin, Paite
|
2478 |
+
hoo Holoholo
|
2479 |
+
wrm Warumungu
|
2480 |
+
tek Teke, Ibali
|
2481 |
+
zkr Zakhring
|
2482 |
+
ywl Lalu, Western
|
2483 |
+
mjw Karbi
|
2484 |
+
kmn Awtuw
|
2485 |
+
khs Kasua
|
2486 |
+
bnj Bangon
|
2487 |
+
mfd Mendankwe-Nkwen
|
2488 |
+
pqa Pa’a
|
2489 |
+
swo Shanenawa
|
2490 |
+
los Loniu
|
2491 |
+
nma Naga, Maram
|
2492 |
+
dgz Daga
|
2493 |
+
stj Samo, Matya
|
2494 |
+
ayu Ayu
|
2495 |
+
mxs Mixtec, Huitepec
|
2496 |
+
bpn Dzao Min
|
2497 |
+
tlx Khehek
|
2498 |
+
nbn Kuri
|
2499 |
+
ynq Yendang
|
2500 |
+
grh Tugbiri-Niragu
|
2501 |
+
juo Jiba
|
2502 |
+
amu Amuzgo, Guerrero
|
2503 |
+
myw Muyuw
|
2504 |
+
ybj Hasha
|
2505 |
+
hio Tshuwau
|
2506 |
+
kix Naga, Khiamniungan
|
2507 |
+
pma Paama
|
2508 |
+
bej Bedawiyet
|
2509 |
+
dni Dani, Lower Grand Valley
|
2510 |
+
naq Khoekhoe
|
2511 |
+
mrq Marquesan, North
|
2512 |
+
mrn Cheke Holo
|
2513 |
+
dgh Dghwede
|
2514 |
+
bau Mbat
|
2515 |
+
ite Itene
|
2516 |
+
crw Chrau
|
2517 |
+
ndb Kenswei Nsei
|
2518 |
+
nuk Nuu-chah-nulth
|
2519 |
+
dnd Daonda
|
2520 |
+
nlu Nchumbulu
|
2521 |
+
sge Segai
|
2522 |
+
can Chambri
|
2523 |
+
sre Bakati’, Sara
|
2524 |
+
kfr Kacchi
|
2525 |
+
hul Vula’a
|
2526 |
+
kid Koshin
|
2527 |
+
cyo Cuyonon
|
2528 |
+
ykm Kap
|
2529 |
+
ktm Kurti
|
2530 |
+
bsf Bauchi
|
2531 |
+
pio Piapoco
|
2532 |
+
kkc Odoodee
|
2533 |
+
thm Aheu
|
2534 |
+
xkn Kayan, Kayan River
|
2535 |
+
gfk Patpatar
|
2536 |
+
gel ut-Ma’in
|
2537 |
+
bsh Kati
|
2538 |
+
pmq Pame, Northern
|
2539 |
+
bfj Bafanji
|
2540 |
+
xwg Kwegu
|
2541 |
+
sng Sanga
|
2542 |
+
szp Suabo
|
2543 |
+
fvr Fur
|
2544 |
+
zwa Zay
|
2545 |
+
svs Savosavo
|
2546 |
+
chw Chuwabu
|
2547 |
+
nlv Nahuatl, Orizaba
|
2548 |
+
bsp Baga Sitemu
|
2549 |
+
bdl Bajau, Indonesian
|
2550 |
+
khy Kele
|
2551 |
+
ito Itonama
|
2552 |
+
naj Nalu
|
2553 |
+
bdd Bunama
|
2554 |
+
emb Embaloh
|
2555 |
+
zps Zapotec, Coatlán
|
2556 |
+
kee Keres, Eastern
|
2557 |
+
ukw Ukwuani-Aboh-Ndoni
|
2558 |
+
ldm Landoma
|
2559 |
+
duw Dusun Witu
|
2560 |
+
mxp Mixe, Tlahuitoltepec
|
2561 |
+
zln Zhuang, Lianshan
|
2562 |
+
zns Mangas
|
2563 |
+
blf Buol
|
2564 |
+
ksn Kasiguranin
|
2565 |
+
prm Kibiri
|
2566 |
+
lmd Lumun
|
2567 |
+
lop Lopa
|
2568 |
+
yev Yeri
|
2569 |
+
kwk Kwakwala
|
2570 |
+
tcp Chin, Tawr
|
2571 |
+
int Intha
|
2572 |
+
clj Chin, Laitu
|
2573 |
+
jit Jita
|
2574 |
+
mgc Morokodo
|
2575 |
+
ags Esimbi
|
2576 |
+
tvu Tunen
|
2577 |
+
ghk Karen, Geko
|
2578 |
+
hue Huave, San Francisco del Mar
|
2579 |
+
kkn Kon Keu
|
2580 |
+
pbm Mazatec, Puebla and Northeastern
|
2581 |
+
snl Sangil
|
2582 |
+
jkp Karen, Paku
|
2583 |
+
yrk Nenets
|
2584 |
+
ciw Chippewa
|
2585 |
+
mlf Mal
|
2586 |
+
pym Pyam
|
2587 |
+
vrs Varisi
|
2588 |
+
nnd Ambae, West
|
2589 |
+
akg Anakalangu
|
2590 |
+
udi Udi
|
2591 |
+
kys Kayan, Baram
|
2592 |
+
lky Lokoya
|
2593 |
+
bui Bongili
|
2594 |
+
zkd Kadu
|
2595 |
+
ihp Iha
|
2596 |
+
cdr Kamuku
|
2597 |
+
anj Anor
|
2598 |
+
ndm Ndam
|
2599 |
+
lga Lungga
|
2600 |
+
hmw Miao, Western Mashan
|
2601 |
+
zkn Kanan
|
2602 |
+
bpz Bilba
|
2603 |
+
taw Tay
|
2604 |
+
mez Menominee
|
2605 |
+
wuv Wuvulu-Aua
|
2606 |
+
mkk Byep
|
2607 |
+
aki Aiome
|
2608 |
+
gue Gurindji
|
2609 |
+
bse Wushi
|
2610 |
+
dsq Tadaksahak
|
2611 |
+
spt Spiti Bhoti
|
2612 |
+
hoj Haroti
|
2613 |
+
aom Ömie
|
2614 |
+
mdt Mbere
|
2615 |
+
nbb Ndoe
|
2616 |
+
ape Bukiyip
|
2617 |
+
eky Kayah, Eastern
|
2618 |
+
itd Tidung, Southern
|
2619 |
+
mcc Bitur
|
2620 |
+
kzi Kelabit
|
2621 |
+
bhq Tukang Besi South
|
2622 |
+
dia Dia
|
2623 |
+
asb Assiniboine
|
2624 |
+
wyy Fijian, Western
|
2625 |
+
nna Nyangumarta
|
2626 |
+
twx Tewe
|
2627 |
+
mlq Maninkakan, Western
|
2628 |
+
uta Itang
|
2629 |
+
hmz Sinicized Miao
|
2630 |
+
aof Bragat
|
2631 |
+
rue Rusyn
|
2632 |
+
pbs Pame, Central
|
2633 |
+
kio Kiowa
|
2634 |
+
tdn Tondano
|
2635 |
+
snm Ma’di, Southern
|
2636 |
+
cod Kukama-Kukamiria
|
2637 |
+
cde Chenchu
|
2638 |
+
ppl Nahuat
|
2639 |
+
tdg Tamang, Western
|
2640 |
+
jmb Zumbun
|
2641 |
+
eit Eitiep
|
2642 |
+
wni Comorian, Ndzwani
|
2643 |
+
tlp Totonac, Filomena Mata-Coahuitlán
|
2644 |
+
ilk Bogkalot
|
2645 |
+
nri Naga, Chokri
|
2646 |
+
kyv Kewat
|
2647 |
+
scs Slavey, North
|
2648 |
+
kji Zabana
|
2649 |
+
tku Totonac, Upper Necaxa
|
2650 |
+
byp Bumaji
|
2651 |
+
xkt Kantosi
|
2652 |
+
kcc Lubila
|
2653 |
+
yuq Yuqui
|
2654 |
+
cho Choctaw
|
2655 |
+
hot Malei
|
2656 |
+
kku Tumi
|
2657 |
+
bmi Bagirmi
|
2658 |
+
wlc Comorian, Mwali
|
2659 |
+
auu Auye
|
2660 |
+
wle Wolane
|
2661 |
+
mmm Maii
|
2662 |
+
cdo Chinese, Min Dong
|
2663 |
+
nez Nez Perce
|
2664 |
+
ukp Bukpe
|
2665 |
+
kwo Kwomtari
|
2666 |
+
zpx Zapotec, San Baltazar Loxicha
|
2667 |
+
mlv Mwotlap
|
2668 |
+
ppm Papuma
|
2669 |
+
bqr Burusu
|
2670 |
+
wut Wutung
|
2671 |
+
tji Tujia, Northern
|
2672 |
+
bbq Bamali
|
2673 |
+
ttk Totoro
|
2674 |
+
ets Etsako
|
2675 |
+
yin Riang Lai
|
2676 |
+
gim Gimi
|
2677 |
+
kow Kugama
|
2678 |
+
ksa Shuwa-Zamani
|
2679 |
+
git Gitxsan
|
2680 |
+
erk Efate, South
|
2681 |
+
vmx Mixtec, Tamazola
|
2682 |
+
duv Duvle
|
2683 |
+
bgf Bangandu
|
2684 |
+
wms Wambon
|
2685 |
+
isd Isnag
|
2686 |
+
pmx Naga, Poumai
|
2687 |
+
doy Dompo
|
2688 |
+
nak Nakanai
|
2689 |
+
bze Bozo, Jenaama
|
2690 |
+
gis Giziga, North
|
2691 |
+
miu Mixtec, Cacaloxtepec
|
2692 |
+
bzu Burmeso
|
2693 |
+
ckx Caka
|
2694 |
+
duu Drung
|
2695 |
+
jmr Kamara
|
2696 |
+
lur Laura
|
2697 |
+
wlw Walak
|
2698 |
+
rar Cook Islands Maori
|
2699 |
+
osi Osing
|
2700 |
+
mmd Maonan
|
2701 |
+
kmm Kom
|
2702 |
+
kvr Kerinci
|
2703 |
+
ncr Nchane
|
2704 |
+
for Fore
|
2705 |
+
bgn Balochi, Western
|
2706 |
+
gnm Ginuman
|
2707 |
+
alx Amol
|
2708 |
+
xks Kumbewaha
|
2709 |
+
lkn Lakon
|
2710 |
+
mbl Maxakalí
|
2711 |
+
bri Mokpwe
|
2712 |
+
mov Mohave
|
2713 |
+
pot Potawatomi
|
2714 |
+
pnu Bunu, Jiongnai
|
2715 |
+
djm Dogon, Jamsay
|
2716 |
+
ula Fungwa
|
2717 |
+
nnj Nyangatom
|
2718 |
+
ybl Yukuben
|
2719 |
+
aab Arum
|
2720 |
+
has Haisla
|
2721 |
+
alh Alawa
|
2722 |
+
mea Menka
|
2723 |
+
pum Puma
|
2724 |
+
spo Spokane
|
2725 |
+
cyb Cayubaba
|
2726 |
+
nbh Ngamo
|
2727 |
+
ont Ontenu
|
2728 |
+
ahp Aizi, Aproumu
|
2729 |
+
bpy Bishnupuriya
|
2730 |
+
utr Etulo
|
2731 |
+
auk Heiyoho
|
2732 |
+
bdb Basap
|
2733 |
+
klo Kapya
|
2734 |
+
nrf Guernésiais
|
2735 |
+
tmn Taman
|
2736 |
+
mvo Marovo
|
2737 |
+
kla Klamath-Modoc
|
2738 |
+
jnl Rawat
|
2739 |
+
jad Jahanka
|
2740 |
+
hrm Miao, Horned
|
2741 |
+
hoa Hoava
|
2742 |
+
mus Muskogee
|
2743 |
+
dna Dani, Upper Grand Valley
|
2744 |
+
btg Bété, Gagnoa
|
2745 |
+
ngs Gvoko
|
2746 |
+
lmu Lamenu
|
2747 |
+
add Lidzonka
|
2748 |
+
pha Pa-Hng
|
2749 |
+
kvq Karen, Geba
|
2750 |
+
pch Pardhan
|
2751 |
+
bgs Tagabawa
|
2752 |
+
nir Nimboran
|
2753 |
+
bcy Bacama
|
2754 |
+
var Huarijío
|
2755 |
+
sjo Xibe
|
2756 |
+
jle Ngile
|
2757 |
+
cuv Cuvok
|
2758 |
+
smf Auwe
|
2759 |
+
cnq Chung
|
2760 |
+
bhh Bukharic
|
2761 |
+
dox Mositacha
|
2762 |
+
ior Inor
|
2763 |
+
oma Omaha-Ponca
|
2764 |
+
abz Abui
|
2765 |
+
kza Karaboro, Western
|
2766 |
+
rbb Palaung, Rumai
|
2767 |
+
bfq Badaga
|
2768 |
+
kht Khamti
|
2769 |
+
sps Saposa
|
2770 |
+
syk Sukur
|
2771 |
+
slp Lamaholot
|
2772 |
+
jax Malay, Jambi
|
2773 |
+
byo Biyo
|
2774 |
+
qvj Quichua, Loja Highland
|
2775 |
+
bnx Bangubangu
|
2776 |
+
ngw Ngwaba
|
2777 |
+
krf Koro
|
2778 |
+
loa Loloda
|
2779 |
+
cox Nanti
|
2780 |
+
wwo Dorig
|
2781 |
+
akc Mpur
|
2782 |
+
kal Greenlandic
|
2783 |
+
siu Sinagen
|
2784 |
+
aqm Atohwaim
|
2785 |
+
rmt Domari
|
2786 |
+
nhn Nahuatl, Central
|
2787 |
+
jum Jumjum
|
2788 |
+
nix Hema
|
2789 |
+
ncg Nisga’a
|
2790 |
+
ccl Cutchi-Swahili
|
2791 |
+
kvu Yinbaw
|
2792 |
+
tnb Tunebo, Western
|
2793 |
+
cpx Chinese, Pu-Xian
|
2794 |
+
kgo Krongo
|
2795 |
+
nxd Ngando
|
2796 |
+
coj Cochimi
|
2797 |
+
grx Muno
|
2798 |
+
bfs Bai, Southern
|
2799 |
+
cov Cao Miao
|
2800 |
+
cbj Ede Cabe
|
2801 |
+
loh Narim
|
2802 |
+
iry Iraya
|
2803 |
+
cky Cakfem-Mushere
|
2804 |
+
bsn Barasana-Eduria
|
2805 |
+
xkv Kgalagadi
|
2806 |
+
itz Itza’
|
2807 |
+
tgc Tigak
|
2808 |
+
boh Boma
|
2809 |
+
mck Mbunda
|
2810 |
+
ccg Samba Daka
|
2811 |
+
piy Piya-Kwonci
|
2812 |
+
how Honi
|
2813 |
+
pwm Molbog
|
2814 |
+
tds Doutai
|
2815 |
+
ldg Lenyima
|
2816 |
+
csa Chinantec, Chiltepec
|
2817 |
+
cbk Chavacano
|
2818 |
+
ibl Ibaloi
|
2819 |
+
kql Kyenele
|
2820 |
+
smq Samo
|
2821 |
+
uya Doko-Uyanga
|
2822 |
+
tkd Tukudede
|
2823 |
+
cry Kyoli
|
2824 |
+
clk Idu-Mishmi
|
2825 |
+
cut Cuicatec, Teutila
|
2826 |
+
apm Apache, Mescalero-Chiricahua
|
2827 |
+
bya Batak
|
2828 |
+
nyi Ama
|
2829 |
+
nih Nyiha, Tanzania
|
2830 |
+
hbb Nya Huba
|
2831 |
+
huc ǂ’Amkhoe
|
2832 |
+
cdi Chodri
|
2833 |
+
rhp Yahang
|
2834 |
+
bcj Bardi
|
2835 |
+
pei Chichimeco-Jonaz
|
2836 |
+
gdl Dirasha
|
2837 |
+
emg Mewahang, Eastern
|
2838 |
+
mmz Mabaale
|
2839 |
+
afo Ajiri
|
2840 |
+
bhs Buwal
|
2841 |
+
lht Lo-Toga
|
2842 |
+
ktp Kaduo
|
2843 |
+
xns Kanashi
|
2844 |
+
sjb Sajau Basap
|
2845 |
+
pow Popoloca, San Felipe Otlaltepec
|
2846 |
+
rad Rade
|
2847 |
+
gut Maléku Jaíka
|
2848 |
+
vam Dumo
|
2849 |
+
kis Kis
|
2850 |
+
bet Bété, Guiberoua
|
2851 |
+
lva Makuva
|
2852 |
+
zoc Zoque, Copainalá
|
2853 |
+
goa Guro
|
2854 |
+
bkg Buraka
|
2855 |
+
yae Pumé
|
2856 |
+
won Wongo
|
2857 |
+
gpa Gupa-Abawa
|
2858 |
+
sde Vori
|
2859 |
+
mls Masalit
|
2860 |
+
jiu Jinuo, Youle
|
2861 |
+
bmd Baga Manduri
|
2862 |
+
czt Chin, Zotung
|
2863 |
+
tvn Tavoyan
|
2864 |
+
zng Mang
|
2865 |
+
ijj Ede Ije
|
2866 |
+
dms Dampelas
|
2867 |
+
mlw Moloko
|
2868 |
+
wow Wawonii
|
2869 |
+
png Pangu
|
2870 |
+
ikw Ikwere
|
2871 |
+
dtb Kadazan, Labuk-Kinabatangan
|
2872 |
+
bey Beli
|
2873 |
+
ntu Natügu
|
2874 |
+
sua Sulka
|
2875 |
+
kcx Kachama-Ganjule
|
2876 |
+
ekl Kol
|
2877 |
+
mhp Malay, Balinese
|
2878 |
+
slz Ma’ya
|
2879 |
+
skt Sakata
|
2880 |
+
dez Dengese
|
2881 |
+
ogc Ogbah
|
2882 |
+
byz Waran
|
2883 |
+
yui Wajiara
|
2884 |
+
kdx Kam
|
2885 |
+
erh Eruwa
|
2886 |
+
atp Atta, Pudtol
|
2887 |
+
qws Quechua, Sihuas Ancash
|
2888 |
+
ale Aleut
|
2889 |
+
lcm Tungag
|
2890 |
+
pbp Badyara
|
2891 |
+
anc Ngas
|
2892 |
+
khl Lusi
|
2893 |
+
mkc Siliput
|
2894 |
+
knm Kanamarí
|
2895 |
+
yah Yazgulyam
|
2896 |
+
hml Miao, Luopohe
|
2897 |
+
mfb Bangka
|
2898 |
+
mxl Gbe, Maxi
|
2899 |
+
lgt Pahi
|
2900 |
+
das Daho-Doo
|
2901 |
+
njo Naga, Ao
|
2902 |
+
iar Purari
|
2903 |
+
nou Ewage-Notu
|
2904 |
+
moh Mohawk
|
2905 |
+
tvl Tuvaluan
|
2906 |
+
yuy Yugur, East
|
2907 |
+
kvt Lahta
|
2908 |
+
sku Sakao
|
2909 |
+
hra Hrangkhol
|
2910 |
+
nka Nkoya
|
2911 |
+
crx Carrier
|
2912 |
+
tif Tifal
|
2913 |
+
pia Pima Bajo
|
2914 |
+
ppi Paipai
|
2915 |
+
nbp Nnam
|
2916 |
+
btm Batak Mandailing
|
2917 |
+
jya Jiarong
|
2918 |
+
mxd Modang
|
2919 |
+
psn Panasuan
|
2920 |
+
puc Punan Merap
|
2921 |
+
tty Sikaritai
|
2922 |
+
mzb Tumzabt
|
2923 |
+
zmb Zimba
|
2924 |
+
kdu Kadaru
|
2925 |
+
nnz Nda’nda’
|
2926 |
+
nmb V’ënen Taut
|
2927 |
+
tcd Tafi
|
2928 |
+
weh Weh
|
2929 |
+
jni Janji
|
2930 |
+
txn Tarangan, West
|
2931 |
+
pem Phende
|
2932 |
+
xod Kokoda
|
2933 |
+
byj Bina
|
2934 |
+
bpw Bo
|
2935 |
+
bbf Baibai
|
2936 |
+
sol Solos
|
2937 |
+
mmc Mazahua, Michoacán
|
2938 |
+
pta Pai Tavytera
|
2939 |
+
khc Tukang Besi North
|
2940 |
+
nau Nauruan
|
2941 |
+
llu Lau
|
2942 |
+
pnz Pana
|
2943 |
+
kuy Kuuku-Ya’u
|
2944 |
+
wbq Waddar
|
2945 |
+
wud Wudu
|
2946 |
+
mbi Manobo, Ilianen
|
2947 |
+
ikt Inuinnaqtun
|
2948 |
+
bhp Bima
|
2949 |
+
mdj Mangbetu
|
2950 |
+
swj Sira
|
2951 |
+
xom Komo
|
2952 |
+
rir Ribun
|
2953 |
+
sbr Murut, Sembakung
|
2954 |
+
tfn Tanaina
|
2955 |
+
pwa Pawaia
|
2956 |
+
msw Mansoanka
|
2957 |
+
zpn Zapotec, Santa Inés Yatzechi
|
2958 |
+
rkm Marka
|
2959 |
+
aun One, Molmo
|
2960 |
+
mxa Mixtec, Northwest Oaxaca
|
2961 |
+
abr Abron
|
2962 |
+
bxs Busam
|
2963 |
+
bly Notre
|
2964 |
+
gro Groma
|
2965 |
+
mvz Mesqan
|
2966 |
+
yum Quechan
|
2967 |
+
nxg Ngad’a
|
2968 |
+
akw Akwa
|
2969 |
+
kmp Gimme
|
2970 |
+
kfh Kurichiya
|
2971 |
+
ged Gade
|
2972 |
+
yuj Karkar-Yuri
|
2973 |
+
hwo Hwana
|
2974 |
+
pkt Maleng
|
2975 |
+
agc Agatu
|
2976 |
+
mgi Migili
|
2977 |
+
akt Akolet
|
2978 |
+
bkw Bekwel
|
2979 |
+
dun Dusun Deyah
|
2980 |
+
mrh Chin, Mara
|
2981 |
+
dgd Dagaari Dioula
|
2982 |
+
kci Kamantan
|
2983 |
+
yak Yakama
|
2984 |
+
kch Vono
|
2985 |
+
bxq Beele
|
2986 |
+
chx Chantyal
|
2987 |
+
gra Garasia, Rajput
|
2988 |
+
kih Kilmeri
|
2989 |
+
ono Onondaga
|
2990 |
+
adn Adang
|
2991 |
+
aug Aguna
|
2992 |
+
bqt Bamukumbit
|
2993 |
+
mum Maiwala
|
2994 |
+
atu Reel
|
2995 |
+
hop Hopi
|
2996 |
+
bhy Bhele
|
2997 |
+
zms Mbesa
|
2998 |
+
prx Purig
|
2999 |
+
bjp Fanamaket
|
3000 |
+
odu Odual
|
3001 |
+
azd Nahuatl, Eastern Durango
|
3002 |
+
bje Biao-Jiao Mien
|
3003 |
+
mct Mengisa
|
3004 |
+
njm Naga, Angami
|
3005 |
+
liu Logorik
|
3006 |
+
pwn Paiwan
|
3007 |
+
mav Sateré-Mawé
|
3008 |
+
gnu Gnau
|
3009 |
+
jub Wannu
|
3010 |
+
sez Chin, Senthang
|
3011 |
+
mgg Mpumpong
|
3012 |
+
ost Osatu
|
3013 |
+
vkl Kulisusu
|
3014 |
+
kbj Kari
|
3015 |
+
bag Tuki
|
3016 |
+
bjt Balanta-Ganja
|
3017 |
+
mkf Miya
|
3018 |
+
ngi Ngizim
|
3019 |
+
mds Maria
|
3020 |
+
gvf Golin
|
3021 |
+
thd Kuuk Thayorre
|
3022 |
+
rau Raute
|
3023 |
+
sse Sama, Balangingih
|
3024 |
+
nhz Nahuatl, Santa María la Alta
|
3025 |
+
cvn Chinantec, Valle Nacional
|
3026 |
+
nba Nyemba
|
3027 |
+
hnd Hindko, Southern
|
3028 |
+
nbi Naga, Mao
|
3029 |
+
bil Bille
|
3030 |
+
xmh Kugu-Muminh
|
3031 |
+
bip Bila
|
3032 |
+
zhi Zhire
|
3033 |
+
aal Afade
|
3034 |
+
mfg Mogofin
|
3035 |
+
wan Wan
|
3036 |
+
kkf Monpa, Kalaktang
|
3037 |
+
nyq Nayini
|
3038 |
+
ors Orang Seletar
|
3039 |
+
bbp Banda, West Central
|
3040 |
+
yle Yélî Dnye
|
3041 |
+
taz Tocho
|
3042 |
+
dri C’Lela
|
3043 |
+
nbv Ngamambo
|
3044 |
+
mqg Malay, Kota Bangun Kutai
|
3045 |
+
mdu Mboko
|
3046 |
+
aty Aneityum
|
3047 |
+
mbq Maisin
|
3048 |
+
hav Havu
|
3049 |
+
ner Yahadian
|
3050 |
+
glw Glavda
|
3051 |
+
nyb Nyangbo
|
3052 |
+
clt Chin, Lautu
|
3053 |
+
jiy Jinuo, Buyuan
|
3054 |
+
qxa Quechua, Chiquián
|
3055 |
+
win Ho-Chunk
|
3056 |
+
chr Cherokee
|
3057 |
+
vkn Koro Nulu
|
3058 |
+
quv Sakapulteko
|
3059 |
+
wrs Waris
|
3060 |
+
nit Kolami, Southeastern
|
3061 |
+
ver Verre
|
3062 |
+
nmk Namakura
|
3063 |
+
czh Chinese, Huizhou
|
3064 |
+
wrp Waropen
|
3065 |
+
mmx Madak
|
3066 |
+
yis Yis
|
3067 |
+
kce Kaivi
|
3068 |
+
ddg Fataluku
|
3069 |
+
sle Sholaga
|
3070 |
+
ega Ega
|
3071 |
+
jnd Jandavra
|
3072 |
+
kxx Likuba
|
3073 |
+
kna Dera
|
3074 |
+
yer Tarok
|
3075 |
+
amm Sawiyanu
|
3076 |
+
onn Onobasulu
|
3077 |
+
tce Tutchone, Southern
|
3078 |
+
buh Bunu, Younuo
|
3079 |
+
sst Sinasina
|
3080 |
+
wsk Waskia
|
3081 |
+
dln Darlong
|
3082 |
+
teq Temein
|
3083 |
+
org Oring
|
3084 |
+
cfg Karimjo
|
3085 |
+
nce Yale
|
3086 |
+
lgq Logba
|
3087 |
+
yif Ache
|
3088 |
+
kfo Koro
|
3089 |
+
jog Jogi
|
3090 |
+
nkx Nkoroo
|
3091 |
+
asr Asuri
|
3092 |
+
ktc Kholok
|
3093 |
+
gbz Dari, Zoroastrian
|
3094 |
+
kvy Yintale
|
3095 |
+
kvv Kola
|
3096 |
+
oia Oirata
|
3097 |
+
rdb Rudbari
|
3098 |
+
ymb Yambes
|
3099 |
+
sad Sandawe
|
3100 |
+
ntk Ikoma-Nata-Isenye
|
3101 |
+
dru Rukai
|
3102 |
+
bjh Bahinemo
|
3103 |
+
ywa Kalou
|
3104 |
+
nmc Ngam
|
3105 |
+
nat Cahungwarya
|
3106 |
+
ato Atong
|
3107 |
+
liw Col
|
3108 |
+
qux Quechua, Yauyos
|
3109 |
+
shw Shwai
|
3110 |
+
cfd Cara
|
3111 |
+
pip Pero
|
3112 |
+
zts Zapotec, Tilquiapan
|
3113 |
+
mcs Mambai
|
3114 |
+
sgi Nizaa
|
3115 |
+
mhl Mauwake
|
3116 |
+
ndu Dugun
|
3117 |
+
bqa Tchumbuli
|
3118 |
+
bqo Balo
|
3119 |
+
buz Bukwen
|
3120 |
+
fak Fang
|
3121 |
+
tii Tiene
|
3122 |
+
gvp Gavião, Pará
|
3123 |
+
kmh Kalam
|
3124 |
+
xkc Kho’ini
|
3125 |
+
max Malay, North Moluccan
|
3126 |
+
phl Palula
|
3127 |
+
gbg Gbanziri
|
3128 |
+
zag Zaghawa
|
3129 |
+
trf Trinidadian English Creole
|
3130 |
+
weo Wemale
|
3131 |
+
geg Gengle
|
3132 |
+
kxb Krobu
|
3133 |
+
pru Puragi
|
3134 |
+
kie Kibet
|
3135 |
+
mpn Mindiri
|
3136 |
+
mhz Mor
|
3137 |
+
gbh Gbe, Defi
|
3138 |
+
gbr Gbagyi
|
3139 |
+
tmy Tami
|
3140 |
+
rey Reyesano
|
3141 |
+
kpj Karajá
|
3142 |
+
nap Napoletano-Calabrese
|
3143 |
+
lgu Longgu
|
3144 |
+
bye Pouye
|
3145 |
+
tml Citak, Tamnim
|
3146 |
+
kpw Kobon
|
3147 |
+
kfa Kodava
|
3148 |
+
iyx Yaka
|
3149 |
+
twy Tawoyan
|
3150 |
+
sed Sedang
|
3151 |
+
bdm Buduma
|
3152 |
+
plg Pilagá
|
3153 |
+
buo Terei
|
3154 |
+
aww Awun
|
3155 |
+
yyu Yau
|
3156 |
+
cld Chaldean Neo-Aramaic
|
3157 |
+
xmg Mengaka
|
3158 |
+
pku Paku
|
3159 |
+
xkg Kagoro
|
3160 |
+
caq Nicobarese, Car
|
3161 |
+
kmq Gwama
|
3162 |
+
lel Lele
|
3163 |
+
gqa Ga’anda
|
3164 |
+
tfi Gbe, Tofin
|
3165 |
+
mml Man Met
|
3166 |
+
nxa Nauete
|
3167 |
+
tdk Rom
|
3168 |
+
kbv Dla
|
3169 |
+
bgv Warkay-Bipim
|
3170 |
+
bbw Supapya
|
3171 |
+
kvm Kendem
|
3172 |
+
aku Akum
|
3173 |
+
ert Eritai
|
3174 |
+
jdg Jadgali
|
3175 |
+
gow Gorowa
|
3176 |
+
doo Dongo
|
3177 |
+
jeb Jebero
|
3178 |
+
stf Seta
|
3179 |
+
nid Ngandi
|
3180 |
+
mqx Mamuju
|
3181 |
+
mta Manobo, Cotabato
|
3182 |
+
she Sheko
|
3183 |
+
mfm Marghi South
|
3184 |
+
jei Yei
|
3185 |
+
deg Degema
|
3186 |
+
gcf Guadeloupean French Creole
|
3187 |
+
bxb Belanda Bor
|
3188 |
+
mut Muria, Western
|
3189 |
+
diw Dinka, Northwestern
|
3190 |
+
nqy Naga, Akyaung Ari
|
3191 |
+
sop Songe
|
3192 |
+
kny Kanyok
|
3193 |
+
lse Lusengo
|
3194 |
+
ahg Qimant
|
3195 |
+
opa Okpamheri
|
3196 |
+
hah Hahon
|
3197 |
+
daq Maria, Dandami
|
3198 |
+
hac Gurani
|
3199 |
+
klg Tagakaulo
|
3200 |
+
kqi Koita
|
3201 |
+
slx Salampasu
|
3202 |
+
ots Otomí, Estado de México
|
3203 |
+
tru Turoyo
|
3204 |
+
sxw Gbe, Saxwe
|
3205 |
+
dij Dai
|
3206 |
+
aog Angoram
|
3207 |
+
kcr Katla
|
3208 |
+
agf Arguni
|
3209 |
+
alq Algonquin
|
3210 |
+
raf Mewahang, Western
|
3211 |
+
mij Mungbam
|
3212 |
+
gdu Gudu
|
3213 |
+
wgi Wahgi
|
3214 |
+
bbu Kulung
|
3215 |
+
ndo Ndonga
|
3216 |
+
mma Mama
|
3217 |
+
tal Tal
|
3218 |
+
odk Oadki
|
3219 |
+
etr Edolo
|
3220 |
+
umu Munsee
|
3221 |
+
kjs Kewapi, East
|
3222 |
+
bvm Bamunka
|
3223 |
+
jqr Jaqaru
|
3224 |
+
kfm Khunsari
|
3225 |
+
tbp Diebroud
|
3226 |
+
ems Yupik, Pacific Gulf
|
3227 |
+
kcq Kamo
|
3228 |
+
ruy Mala
|
3229 |
+
nng Naga, Maring
|
3230 |
+
jns Jaunsari
|
3231 |
+
sbk Safwa
|
3232 |
+
wji Warji
|
3233 |
+
sbz Sara Kaba
|
3234 |
+
bhl Bimin
|
3235 |
+
auy Awiyaana
|
3236 |
+
txt Citak
|
3237 |
+
nof Nomane
|
3238 |
+
cll Chala
|
3239 |
+
pak Parakanã
|
3240 |
+
tli Tlingit
|
3241 |
+
kqo Krahn, Eastern
|
3242 |
+
kbz Duhwa
|
3243 |
+
mbx Mari
|
3244 |
+
xrw Karawa
|
3245 |
+
crj Cree, Southern East
|
3246 |
+
jaq Yaqay
|
3247 |
+
pbn Kpasham
|
3248 |
+
dbi Doka
|
3249 |
+
kod Kodi
|
3250 |
+
bjk Barok
|
3251 |
+
syb Subanen, Central
|
3252 |
+
nyh Nyikina
|
3253 |
+
kfd Koraga, Korra
|
3254 |
+
mtk Mbo’
|
3255 |
+
mbd Manobo, Dibabawon
|
3256 |
+
jgk Gwak
|
3257 |
+
mmp Siawi
|
3258 |
+
uba Ubang
|
3259 |
+
kxh Karo
|
3260 |
+
tov Taromi, Upper
|
3261 |
+
buk Bugawac
|
3262 |
+
abn Abua
|
3263 |
+
kbh Camsá
|
3264 |
+
slc Sáliba
|
3265 |
+
knt Katukína, Panoan
|
3266 |
+
rwa Rawo
|
3267 |
+
kyk Kamayo
|
3268 |
+
kli Kalumpang
|
3269 |
+
klq Rumu
|
3270 |
+
iqu Iquitu
|
3271 |
+
jku Labir
|
3272 |
+
bga Gwamhi-Wuri
|
3273 |
+
amo Amo
|
3274 |
+
gou Gavar
|
3275 |
+
kdz Kwaja
|
3276 |
+
nzm Naga, Zeme
|
3277 |
+
mgk Mawes
|
3278 |
+
sjr Siar-Lak
|
3279 |
+
aqg Arigidi
|
3280 |
+
ghl Ghulfan
|
3281 |
+
oso Ososo
|
3282 |
+
rei Reli
|
3283 |
+
tiw Tiwi
|
3284 |
+
kdq Koch
|
3285 |
+
zbu Bu
|
3286 |
+
wem Gbe, Weme
|
3287 |
+
gig Goaria
|
3288 |
+
tsw Tsishingini
|
3289 |
+
gmz Mgbolizhia
|
3290 |
+
mfo Mbe
|
3291 |
+
anw Anaang
|
3292 |
+
mtu Mixtec, Tututepec
|
3293 |
+
ahb Axamb
|
3294 |
+
bub Bua
|
3295 |
+
jru Japreria
|
3296 |
+
ryu Okinawan, Central
|
3297 |
+
nuo Nguôn
|
3298 |
+
kdm Gyong
|
3299 |
+
due Agta, Umiray Dumaget
|
3300 |
+
boo Bozo, Tiemacèwè
|
3301 |
+
vmm Mixtec, Mitlatongo
|
3302 |
+
ydg Yadgha
|
3303 |
+
adz Adzera
|
3304 |
+
yaf Yaka
|
3305 |
+
mep Miriwoong
|
3306 |
+
kip Kham, Sheshi
|
3307 |
+
bvw Boga
|
3308 |
+
mqh Mixtec, Tlazoyaltepec
|
3309 |
+
bmj Bote
|
3310 |
+
dih Kumiai
|
3311 |
+
cib Gbe, Ci
|
3312 |
+
ggg Gurgula
|
3313 |
+
ldq Lufu
|
3314 |
+
scv Sheni
|
3315 |
+
siy Sivandi
|
3316 |
+
ktf Kwami
|
3317 |
+
gew Gera
|
3318 |
+
lan Laru
|
3319 |
+
kks Giiwo
|
3320 |
+
fun Iatê
|
3321 |
+
dtm Dogon, Tomo Kan
|
3322 |
+
thp Thompson
|
3323 |
+
gye Gyem
|
3324 |
+
zaf Zapotec, Ayoquesco
|
3325 |
+
kcs Koenoem
|
3326 |
+
yap Yapese
|
3327 |
+
bnv Beneraf
|
3328 |
+
src Sardinian, Logudorese
|
3329 |
+
brq Breri
|
3330 |
+
frc French, Cajun
|
3331 |
+
elk Elkei
|
3332 |
+
aad Amal
|
3333 |
+
kqj Koromira
|
3334 |
+
ael Ambele
|
3335 |
+
mku Maninka, Konyanka
|
3336 |
+
otm Otomi, Eastern Highland
|
3337 |
+
ldp Tso
|
3338 |
+
dbd Dadiya
|
3339 |
+
ttm Tutchone, Northern
|
3340 |
+
nen Nengone
|
3341 |
+
bit Berinomo
|
3342 |
+
wca Yanomámi
|
3343 |
+
jig Jingulu
|
3344 |
+
wss Wasa
|
3345 |
+
huh Huilliche
|
3346 |
+
xti Mixtec, Sinicahua
|
3347 |
+
nhv Nahuatl, Temascaltepec
|
3348 |
+
smy Semnani
|
3349 |
+
tak Tala
|
3350 |
+
hch Huichol
|
3351 |
+
kqa Mum
|
3352 |
+
spm Akukem
|
3353 |
+
kfz Koromfé
|
3354 |
+
ank Goemai
|
3355 |
+
ruz Ruma
|
3356 |
+
koh Koyo
|
3357 |
+
pdo Padoe
|
3358 |
+
kvd Kui
|
3359 |
+
fut Futuna-Aniwa
|
3360 |
+
wom Wom
|
3361 |
+
sor Soumraye
|
3362 |
+
gdx Godwari
|
3363 |
+
ttb Gaa
|
3364 |
+
iti Itneg, Inlaud
|
3365 |
+
tsp Toussian, Northern
|
3366 |
+
jkr Koro
|
3367 |
+
sct Katang, Southern
|
3368 |
+
laa Subanen, Southern
|
3369 |
+
auq Anus
|
3370 |
+
agy Alta, Southern
|
3371 |
+
tuq Tedaga
|
3372 |
+
acv Achumawi
|
3373 |
+
mbv Mbulungish
|
3374 |
+
orh Oroqen
|
3375 |
+
def Dezfuli
|
3376 |
+
gop Yeretuar
|
3377 |
+
nyg Nyindu
|
3378 |
+
liz Libinza
|
3379 |
+
tay Atayal
|
3380 |
+
dil Dilling
|
3381 |
+
mtf Murik
|
3382 |
+
jup Hupdë
|
3383 |
+
uuu U
|
3384 |
+
ncf Notsi
|
3385 |
+
hum Hungana
|
3386 |
+
vum Vumbu
|
3387 |
+
mfj Mefele
|
3388 |
+
afi Chini
|
3389 |
+
meh Mixtec, Southwestern Tlaxiaco
|
3390 |
+
tma Tama
|
3391 |
+
mkg Mak
|
3392 |
+
aik Akye
|
3393 |
+
ung Ngarinyin
|
3394 |
+
itt Itneg, Maeng
|
3395 |
+
akl Aklanon
|
3396 |
+
sti Stieng, Bulo
|
3397 |
+
gid Gidar
|
3398 |
+
ckl Kibaku
|
3399 |
+
spu Sapuan
|
3400 |
+
enn Engenni
|
3401 |
+
ebr Tchaman
|
3402 |
+
mcw Mawa
|
3403 |
+
ybe Yugur, West
|
3404 |
+
kni Kanufi
|
3405 |
+
kjr Kurudu
|
3406 |
+
bwm Biwat
|
3407 |
+
vra Vera’a
|
3408 |
+
duq Dusun Malang
|
3409 |
+
bpu Bongu
|
3410 |
+
mrz Marind
|
3411 |
+
sdh Kurdish, Southern
|
3412 |
+
cdn Chaudangsi
|
3413 |
+
vmp Mazatec, Soyaltepec
|
3414 |
+
zsm Malay, Standard
|
3415 |
+
szg Sengele
|
3416 |
+
yun Bena
|
3417 |
+
kcd Kanum, Ngkâlmpw
|
3418 |
+
ala Alago
|
3419 |
+
ywn Yawanawa
|
3420 |
+
nfl Äiwoo
|
3421 |
+
pbl Mak
|
3422 |
+
pyu Puyuma
|
3423 |
+
zrg Mirgan
|
3424 |
+
aif Agi
|
3425 |
+
kmj Kumarbhag Paharia
|
3426 |
+
njj Njen
|
3427 |
+
ahs Ashe
|
3428 |
+
kwu Kwakum
|
3429 |
+
mxh Mvuba
|
3430 |
+
chp Dene
|
3431 |
+
iko Olulumo-Ikom
|
3432 |
+
krh Kurama
|
3433 |
+
bux Boghom
|
3434 |
+
udl Wuzlam
|
3435 |
+
one Oneida
|
3436 |
+
akq Ak
|
3437 |
+
fla Kalispel-Pend d’Oreille
|
3438 |
+
zpr Zapotec, Santiago Xanica
|
3439 |
+
tvt Naga, Tutsa
|
3440 |
+
awe Awetí
|
3441 |
+
bqx Baangi
|
3442 |
+
yns Iyansi
|
3443 |
+
dya Dyan
|
3444 |
+
hkk Hunjara-Kaina Ke
|
3445 |
+
clc Chilcotin
|
3446 |
+
kpa Kutto
|
3447 |
+
ldk Leelau
|
3448 |
+
dak Dakota
|
3449 |
+
vls West Flemish
|
3450 |
+
xnz Mattokki
|
3451 |
+
ccj Kasanga
|
3452 |
+
kzc Kulango, Bondoukou
|
3453 |
+
dkx Mazagway-Hidi
|
3454 |
+
leq Lembena
|
3455 |
+
saz Saurashtra
|
3456 |
+
mqz Pano
|
3457 |
+
akr Araki
|
3458 |
+
fap Paloor
|
3459 |
+
mef Megam
|
3460 |
+
rat Razajerdi
|
3461 |
+
kmk Kalinga, Limos
|
3462 |
+
ike Inuktitut, Eastern Canadian
|
3463 |
+
see Seneca
|
3464 |
+
nlo Ngul
|
3465 |
+
klk Kono
|
3466 |
+
rcf Réunion French Creole
|
3467 |
+
bof Bolon
|
3468 |
+
rwk Rwa
|
3469 |
+
smt Simte
|
3470 |
+
jma Dima
|
3471 |
+
mmn Minamanwa
|
3472 |
+
mhk Mungaka
|
3473 |
+
whg Yuwei
|
3474 |
+
zro Záparo
|
3475 |
+
sob Sobei
|
3476 |
+
mtp Weenhayek
|
3477 |
+
zuy Zumaya
|
3478 |
+
ocu Matlatzinca, Atzingo
|
3479 |
+
xtt Mixtec, Tacahua
|
3480 |
+
mek Mekeo
|
3481 |
+
ctt Chetti, Wayanad
|
3482 |
+
bni Bangi
|
3483 |
+
ogb Ogbia
|
3484 |
+
orx Oro
|
3485 |
+
kot Lagwan
|
3486 |
+
itr Yawuno Teneyo
|
3487 |
+
kic Kickapoo
|
3488 |
+
skd Miwok, Southern Sierra
|
3489 |
+
nhg Nahuatl, Tetelcingo
|
3490 |
+
bvi Belanda Viri
|
3491 |
+
tny Tongwe
|
3492 |
+
rui Rufiji
|
3493 |
+
dor Dori’o
|
3494 |
+
lmk Lamkang
|
3495 |
+
ncb Nicobarese, Central
|
3496 |
+
msg Moraid
|
3497 |
+
snq Sangu
|
3498 |
+
eme Tekó
|
3499 |
+
amc Amahuaca
|
3500 |
+
msn Vurës
|
3501 |
+
hdn Haida, Northern
|
3502 |
+
com Comanche
|
3503 |
+
sgd Surigaonon
|
3504 |
+
cdf Chiru
|
3505 |
+
ttj Tooro
|
3506 |
+
skv Skou
|
3507 |
+
twp Ere
|
3508 |
+
gek Ywom
|
3509 |
+
cob Chicomuceltec
|
3510 |
+
fll Fali, North
|
3511 |
+
mne Naba
|
3512 |
+
coc Cocopa
|
3513 |
+
mph Maung
|
3514 |
+
gaf Gende
|
3515 |
+
agh Ngelima
|
3516 |
+
epi Epie
|
3517 |
+
aaw Solong
|
3518 |
+
sok Sokoro
|
3519 |
+
piu Pintupi-Luritja
|
3520 |
+
dyg Agta, Villa Viciosa
|
3521 |
+
mla Malo
|
3522 |
+
dof Domu
|
3523 |
+
klx Koluwawa
|
3524 |
+
gab Gabri
|
3525 |
+
scn Sicilian
|
3526 |
+
mat Matlatzinca, San Francisco
|
3527 |
+
bja Budza
|
3528 |
+
kcj Kobiana
|
3529 |
+
kwb Kwa
|
3530 |
+
tsu Tsou
|
3531 |
+
kev Kanikkaran
|
3532 |
+
ksj Uare
|
3533 |
+
zrs Mairasi
|
3534 |
+
bcv Shoo-Minda-Nye
|
3535 |
+
sug Suganga
|
3536 |
+
pcl Pardhi
|
3537 |
+
yim Naga, Yimchungru
|
3538 |
+
kqk Gbe, Kotafon
|
3539 |
+
bzx Bozo, Kelengaxo
|
3540 |
+
esh Eshtehardi
|
3541 |
+
fay Fars, Southwestern
|
3542 |
+
dee Dewoin
|
3543 |
+
eze Uzekwe
|
3544 |
+
bwt Bafaw-Balong
|
3545 |
+
nph Naga, Phom
|
3546 |
+
pmm Pol
|
3547 |
+
pdc German, Pennsylvania
|
3548 |
+
srz Shahmirzadi
|
3549 |
+
tug Tunia
|
3550 |
+
hux Witoto, Nipode
|
3551 |
+
soo Songo
|
3552 |
+
bcz Bainouk-Gunyaamolo
|
3553 |
+
bva Barein
|
3554 |
+
sky Sikaiana
|
3555 |
+
blc Bella Coola
|
3556 |
+
skq Sininkere
|
3557 |
+
yix Axi
|
3558 |
+
arx Aruá
|
3559 |
+
msl Molof
|
3560 |
+
aqt Angaité
|
3561 |
+
gcr Guianese French Creole
|
3562 |
+
mtb Anyin Morofo
|
3563 |
+
lrl Lari
|
3564 |
+
tiy Teduray
|
3565 |
+
iwm Iwam
|
3566 |
+
bhg Binandere
|
3567 |
+
pbv Pnar
|
3568 |
+
gmm Gbaya-Mbodomo
|
3569 |
+
apy Apalaí
|
3570 |
+
iow Iowa-Oto
|
3571 |
+
cku Koasati
|
3572 |
+
sry Sera
|
3573 |
+
zcd Zapotec, Las Delicias
|
3574 |
+
toj Tojolabal
|
3575 |
+
idi Idi
|
3576 |
+
kqw Kandas
|
3577 |
+
irr Ir
|
3578 |
+
bif Biafada
|
3579 |
+
akf Akpa
|
3580 |
+
arw Arawak
|
3581 |
+
lor Téén
|
3582 |
+
was Washo
|
3583 |
+
nrg Narango
|
3584 |
+
knz Kalamsé
|
3585 |
+
anf Animere
|
3586 |
+
goz Gozarkhani
|
3587 |
+
vmh Maraghei
|
3588 |
+
arp Arapaho
|
3589 |
+
glr Glaro-Twabo
|
3590 |
+
big Biangai
|
3591 |
+
tou Tho
|
3592 |
+
lie Likila
|
3593 |
+
hol Holu
|
3594 |
+
dbn Duriankere
|
3595 |
+
asu Asurini, Tocantins
|
3596 |
+
xvi Kamviri
|
3597 |
+
aaf Aranadan
|
3598 |
+
mii Mixtec, Chigmecatitlán
|
3599 |
+
xkj Kajali
|
3600 |
+
bez Bena
|
3601 |
+
trv Seediq
|
3602 |
+
bqs Bosmun
|
3603 |
+
yax Yauma
|
3604 |
+
ykg Yukaghir, Northern
|
3605 |
+
hgm Hai|ǁom
|
3606 |
+
sgr Sangisari
|
3607 |
+
vaf Vafsi
|
3608 |
+
anl Chin, Anu-Khongso
|
3609 |
+
mdh Maguindanaon
|
3610 |
+
bbv Karnai
|
3611 |
+
wbb Wabo
|
3612 |
+
shc Sonde
|
3613 |
+
nsa Naga, Sangtam
|
3614 |
+
rtm Rotuman
|
3615 |
+
kvg Kuni-Boazi
|
3616 |
+
cgg Chiga
|
3617 |
+
mdn Mbati
|
3618 |
+
job Joba
|
3619 |
+
bxl Jalkunan
|
3620 |
+
jrt Jakattoe
|
3621 |
+
ilp Iranun
|
3622 |
+
njh Naga, Lotha
|
3623 |
+
sek Sekani
|
3624 |
+
avi Avikam
|
3625 |
+
nmh Naga, Monsang
|
3626 |
+
cos Corsican
|
3627 |
+
ctz Chatino, Zacatepec
|
3628 |
+
wbj Alagwa
|
3629 |
+
sbg Seget
|
3630 |
+
tyy Tiyaa
|
3631 |
+
bea Beaver
|
3632 |
+
chd Chontal, Highland Oaxaca
|
3633 |
+
ado Abu
|
3634 |
+
mnv Rennell-Bellona
|
3635 |
+
dbb Deno
|
3636 |
+
mti Maiwa
|
3637 |
+
ekp Ekpeye
|
3638 |
+
plr Sénoufo, Palaka
|
3639 |
+
nev Nyaheun
|
3640 |
+
cra Chara
|
3641 |
+
tla Tepehuan, Southwestern
|
3642 |
+
xmf Mingrelian
|
3643 |
+
nyw Nyaw
|
3644 |
+
dis Dimasa
|
3645 |
+
zpy Zapotec, Mazaltepec
|
3646 |
+
dgx Doghoro
|
3647 |
+
ifm Teke-Wuumu
|
3648 |
+
ngz Ngungwel
|
3649 |
+
yra Yerakai
|
3650 |
+
sau Saleman
|
3651 |
+
psw Port Sandwich
|
3652 |
+
kbm Iwal
|
3653 |
+
mye Myene
|
3654 |
+
tiq Tiéfo
|
3655 |
+
kkh Khün
|
3656 |
+
kjt Karen, Phrae Pwo
|
3657 |
+
gox Gobu
|
3658 |
+
kzm Kais
|
3659 |
+
pac Pacoh
|
3660 |
+
gua Shiki
|
3661 |
+
too Totonac, Xicotepec de Juárez
|
3662 |
+
nre Naga, Southern Rengma
|
3663 |
+
pqm Malecite-Passamaquoddy
|
3664 |
+
gul Sea Island English Creole
|
3665 |
+
cte Chinantec, Tepinapa
|
3666 |
+
buf Bushoong
|
3667 |
+
bws Bomboma
|
3668 |
+
tlq Tai Loi
|
3669 |
+
asi Buruwai
|
3670 |
+
bpv Marind, Bian
|
3671 |
+
atk Ati
|
3672 |
+
gar Galeya
|
3673 |
+
plv Palawano, Southwest
|
3674 |
+
sev Sénoufo, Nyarafolo
|
3675 |
+
vem Vemgo-Mabas
|
3676 |
+
hla Halia
|
3677 |
+
mna Mbula
|
3678 |
+
pcb Pear
|
3679 |
+
lih Lihir
|
3680 |
+
ksv Kusu
|
3681 |
+
iby Ibani
|
3682 |
+
yrb Yareba
|
3683 |
+
nge Ngemba
|
3684 |
+
ney Neyo
|
3685 |
+
keb Kélé
|
3686 |
+
nuq Nukumanu
|
3687 |
+
okh Koresh-e Rostam
|
3688 |
+
ity Itneg, Moyadan
|
3689 |
+
van Walman
|
3690 |
+
ijs Ijo, Southeast
|
3691 |
+
shs Shuswap
|
3692 |
+
mkb Mal Paharia
|
3693 |
+
kit Agob
|
3694 |
+
nyj Nyanga
|
3695 |
+
tti Tobati
|
3696 |
+
agb Legbo
|
3697 |
+
twr Tarahumara, Southwestern
|
3698 |
+
cae Laalaa
|
3699 |
+
biu Biate
|
3700 |
+
grs Gresi
|
3701 |
+
brp Barapasi
|
3702 |
+
tdv Atoro
|
3703 |
+
crv Chaura
|
3704 |
+
njs Nisa
|
3705 |
+
oke Okpe
|
3706 |
+
tdl Kusur-Myet
|
3707 |
+
mlx Na’ahai
|
3708 |
+
zte Zapotec, Elotepec
|
3709 |
+
ivb Ibatan
|
3710 |
+
chy Cheyenne
|
3711 |
+
mbf Malay, Baba
|
3712 |
+
nal Nalik
|
3713 |
+
lwl Lawa, Eastern
|
3714 |
+
buw Bubi
|
3715 |
+
qus Quichua, Santiago del Estero
|
3716 |
+
lik Lika
|
3717 |
+
lna Langbashe
|
3718 |
+
dem Dem
|
3719 |
+
ldo Loo
|
3720 |
+
pbg Paraujano
|
3721 |
+
mic Mi’kmaq
|
3722 |
+
wdj Wadjiginy
|
3723 |
+
tol Tolowa
|
3724 |
+
sns Nahavaq
|
3725 |
+
luz Luri, Southern
|
3726 |
+
tgy Togoyo
|
3727 |
+
sha Shall-Zwall
|
3728 |
+
mtl Tehl
|
3729 |
+
scw Sya
|
3730 |
+
hna Mina
|
3731 |
+
moe Innu
|
3732 |
+
mae Bo-Rukul
|
3733 |
+
avd Alviri-Vidari
|
3734 |
+
bsy Bisaya, Sabah
|
3735 |
+
kfe Kota
|
3736 |
+
dsn Dusner
|
3737 |
+
kst Winyé
|
3738 |
+
bid Bidiyo
|
3739 |
+
erg Sie
|
3740 |
+
tls Tambotalo
|
3741 |
+
nkw Nkutu
|
3742 |
+
zia Zia
|
3743 |
+
bdw Baham
|
3744 |
+
une Uneme
|
3745 |
+
ykk Yakaikeke
|
3746 |
+
plu Palikúr
|
3747 |
+
pfe Pere
|
3748 |
+
blq Paluai
|
3749 |
+
sao Sause
|
3750 |
+
tsa Tsaangi
|
3751 |
+
uni Uni
|
3752 |
+
irn Irántxe
|
3753 |
+
pos Popoluca, Sayula
|
3754 |
+
mot Barí
|
3755 |
+
lki Laki
|
3756 |
+
gbn Mo’da
|
3757 |
+
chk Chuukese
|
3758 |
+
kmz Khorasani Turkish
|
3759 |
+
orz Ormu
|
3760 |
+
bfe Betaf
|
3761 |
+
nlj Nyali
|
3762 |
+
bnn Bunun
|
3763 |
+
aba Abé
|
3764 |
+
abu Abure
|
3765 |
+
iai Iaai
|
3766 |
+
knn Konkani
|
3767 |
+
biy Birhor
|
3768 |
+
yog Yogad
|
3769 |
+
gnb Gangte
|
3770 |
+
bou Bondei
|
3771 |
+
zmq Mituku
|
3772 |
+
tto Ta’oih, Lower
|
3773 |
+
abm Abanyom
|
3774 |
+
dhv Drehu
|
3775 |
+
brl Birwa
|
3776 |
+
shh Shoshoni
|
3777 |
+
zbc Berawan, Central
|
3778 |
+
oyd Oyda
|
3779 |
+
pek Penchal
|
3780 |
+
tbj Tiang
|
3781 |
+
ema Emai-Iuleha-Ora
|
3782 |
+
bgi Bagobo-Klata
|
3783 |
+
tkq Tee
|
3784 |
+
nmm Nyeshangte
|
3785 |
+
kkk Kokota
|
3786 |
+
djn Djauan
|
3787 |
+
tow Jemez
|
3788 |
+
nwm Nyamusa-Molo
|
3789 |
+
tef Teressa
|
3790 |
+
daw Davawenyo
|
3791 |
+
kpx Koiali, Mountain
|
3792 |
+
mtq Muong
|
3793 |
+
mwe Mwera
|
3794 |
+
stv Silt’e
|
3795 |
+
lum Luimbi
|
3796 |
+
phq Phana’
|
3797 |
+
tdc Embera Tadó
|
3798 |
+
pcj Parenga
|
3799 |
+
vnk Lovono
|
3800 |
+
kdd Yankunytjatjara
|
3801 |
+
aul Aulua
|
3802 |
+
mnp Chinese, Min Bei
|
3803 |
+
tdo Teme
|
3804 |
+
mwg Aiklep
|
3805 |
+
dma Duma
|
3806 |
+
coz Chocholtec
|
3807 |
+
owi Owiniga
|
3808 |
+
rji Raji
|
3809 |
+
aey Amele
|
3810 |
+
dge Degenang
|
3811 |
+
nil Nila
|
3812 |
+
ler Lenkau
|
3813 |
+
agt Agta, Central Cagayan
|
3814 |
+
kof Kubi
|
3815 |
+
okx Okpe
|
3816 |
+
ogg Ogbogolo
|
3817 |
+
xes Koromu
|
3818 |
+
hur Halkomelem
|
3819 |
+
bgx Balkan Gagauz Turkish
|
3820 |
+
anx Andra-Hus
|
3821 |
+
rwo Rawa
|
3822 |
+
caz Canichana
|
3823 |
+
kuh Kushi
|
3824 |
+
bks Sorsoganon, Northern
|
3825 |
+
ztx Zapotec, Zaachila
|
3826 |
+
axk Yaka
|
3827 |
+
umm Umon
|
3828 |
+
mmy Migaama
|
3829 |
+
aee Pashai, Northeast
|
3830 |
+
lil Lillooet
|
3831 |
+
lvk Lavukaleve
|
3832 |
+
ibd Iwaidja
|
3833 |
+
azt Atta, Faire
|
3834 |
+
usa Usarufa
|
3835 |
+
saa Saba
|
3836 |
+
bar Bavarian
|
3837 |
+
mzn Mazandarani
|
3838 |
+
unx Munda
|
3839 |
+
puo Puoc
|
3840 |
+
lek Leipon
|
3841 |
+
pkg Pak-Tong
|
3842 |
+
niu Niue
|
3843 |
+
oni Onin
|
3844 |
+
jaf Jara
|
3845 |
+
dwa Diri
|
3846 |
+
lmg Lamogai
|
3847 |
+
tau Tanana, Upper
|
3848 |
+
zoh Zoque, Chimalapa
|
3849 |
+
cbg Chimila
|
3850 |
+
gla Scottish Gaelic
|
3851 |
+
yur Yurok
|
3852 |
+
peb Pomo, Eastern
|
3853 |
+
kbb Kaxuiâna
|
3854 |
+
ivv Ivatan
|
3855 |
+
oka Okanagan
|
3856 |
+
ral Ralte
|
3857 |
+
nun Anong
|
3858 |
+
soz Temi
|
3859 |
+
ndh Ndali
|
3860 |
+
kdy Keijar
|
3861 |
+
bjo Banda, Mid-Southern
|
3862 |
+
env Enwan
|
3863 |
+
nds Saxon, Low
|
3864 |
+
kyy Asa’a
|
3865 |
+
moy Shekkacho
|
3866 |
+
mnm Mapena
|
3867 |
+
sbh Sori-Harengan
|
3868 |
+
bek Bebeli
|
3869 |
+
pdn Fedan
|
3870 |
+
mxm Meramera
|
3871 |
+
moj Monzombo
|
3872 |
+
tul Tula
|
3873 |
+
oks Oko-Eni-Osayen
|
3874 |
+
bjc Bariji
|
3875 |
+
hvv Huave, Santa María del Mar
|
3876 |
+
dme Dugwor
|
3877 |
+
plk Shina, Kohistani
|
3878 |
+
lal Lalia
|
3879 |
+
sir Siri
|
3880 |
+
yhd Arabic, Judeo-Iraqi
|
3881 |
+
zmp Mpuono
|
3882 |
+
ofu Efutop
|
3883 |
+
iki Iko
|
3884 |
+
sjg Assangori
|
3885 |
+
gae Guarequena
|
3886 |
+
hei Heiltsuk
|
3887 |
+
dmr Damar, East
|
3888 |
+
lti Leti
|
3889 |
+
ipo Ipiko
|
3890 |
+
dva Duau
|
3891 |
+
yaw Yawalapití
|
3892 |
+
dgg Doga
|
3893 |
+
mdw Mbosi
|
3894 |
+
mzv Mandja
|
3895 |
+
tkp Tikopia
|
3896 |
+
snv Sa’ban
|
3897 |
+
bte Gamo-Ningi
|
3898 |
+
nqt Nteng
|
3899 |
+
etx Iten
|
3900 |
+
gwa Mbato
|
3901 |
+
aji Ajië
|
3902 |
+
gni Gooniyandi
|
3903 |
+
blm Beli
|
3904 |
+
hid Hidatsa
|
3905 |
+
tof Gizrra
|
3906 |
+
kos Kosraean
|
3907 |
+
cja Cham, Western
|
3908 |
+
yki Yoke
|
3909 |
+
haa Han
|
3910 |
+
gad Gaddang
|
3911 |
+
mbs Manobo, Sarangani
|
3912 |
+
kkd Kinuku
|
3913 |
+
kol Kol
|
3914 |
+
noz Nayi
|
3915 |
+
kms Kamasau
|
3916 |
+
kfu Katkari
|
3917 |
+
mps Dadibi
|
3918 |
+
sbc Kele
|
3919 |
+
pon Pohnpeian
|
3920 |
+
sos Seeku
|
3921 |
+
kwt Kwesten
|
3922 |
+
diz Ding
|
3923 |
+
buj Basa-Gurmana
|
3924 |
+
onj Onjob
|
3925 |
+
cbo Izora
|
3926 |
+
pic Pinji
|
3927 |
+
zpw Zapotec, Zaniza
|
3928 |
+
sro Sardinian, Campidanese
|
3929 |
+
psi Pashai, Southeast
|
3930 |
+
kyh Karok
|
3931 |
+
msk Mansaka
|
3932 |
+
kxn Melanau, Kanowit-Tanjong
|
3933 |
+
end Ende
|
3934 |
+
mgf Maklew
|
3935 |
+
wgb Wagawaga
|
3936 |
+
mhc Mocho
|
3937 |
+
niw Nimo
|
3938 |
+
gyd Kayardild
|
3939 |
+
crc Lonwolwol
|
3940 |
+
bwf Boselewa
|
3941 |
+
huf Humene
|
3942 |
+
lad Ladino
|
3943 |
+
dei Demisa
|
3944 |
+
kgb Kawe
|
3945 |
+
prc Parachi
|
3946 |
+
ttw Long Wat
|
3947 |
+
ilu Ili’uun
|
3948 |
+
mnu Mer
|
3949 |
+
mbo Mbo
|
3950 |
+
glo Galambu
|
3951 |
+
sys Sinyar
|
3952 |
+
sgy Sanglechi
|
3953 |
+
poo Pomo, Central
|
3954 |
+
tsi Tsimshian
|
3955 |
+
svb Ulau-Suain
|
3956 |
+
wsa Warembori
|
3957 |
+
kkz Kaska
|
3958 |
+
gsw German, Swiss
|
3959 |
+
skb Saek
|
3960 |
+
ano Andoque
|
3961 |
+
zun Zuni
|
3962 |
+
tnm Tabla
|
3963 |
+
sbb Simbo
|
3964 |
+
wkd Mo
|
3965 |
+
sby Soli
|
3966 |
+
xok Xokleng
|
3967 |
+
chj Chinantec, Ojitlán
|
3968 |
+
jge Judeo-Georgian
|
3969 |
+
ugo Ugong
|
3970 |
+
lmi Lombi
|
3971 |
+
nkh Naga, Khezha
|
3972 |
+
huz Hunzib
|
3973 |
+
tft Ternate
|
3974 |
+
mrp Morouas
|
3975 |
+
mrf Elseng
|
3976 |
+
yot Yotti
|
3977 |
+
gbv Gbanu
|
3978 |
+
ayt Ayta, Magbukun
|
3979 |
+
hgw Haigwai
|
3980 |
+
swr Saweru
|
3981 |
+
lcc Legenyem
|
3982 |
+
zpe Zapotec, Petapa
|
3983 |
+
zpd Zapotec, Southeastern Ixtlán
|
3984 |
+
kep Kaikadi
|
3985 |
+
vmj Mixtec, Ixtayutla
|
3986 |
+
clu Caluyanun
|
3987 |
+
cma Maa
|
3988 |
+
qun Quinault
|
3989 |
+
kcf Ukaan
|
3990 |
+
fry Frisian
|
3991 |
+
har Harari
|
3992 |
+
bta Bata
|
3993 |
+
wro Worrorra
|
3994 |
+
mwp Kala Lagaw Ya
|
3995 |
+
sny Saniyo-Hiyewe
|
3996 |
+
nzk Nzakara
|
3997 |
+
knw Kung-Ekoka
|
3998 |
+
wbk Waigali
|
3999 |
+
smn Saami, Inari
|
4000 |
+
shq Sala
|
4001 |
+
zat Zapotec, Tabaa
|
4002 |
+
ngj Ngie
|
4003 |
+
psh Pashai, Southwest
|
4004 |
+
amt Amto
|
4005 |
+
xgu Unggumi
|
4006 |
+
qui Quileute
|
4007 |
+
gww Kwini
|
4008 |
+
agi Agariya
|
4009 |
+
caf Carrier, Southern
|
4010 |
+
pay Pech
|
4011 |
+
cbd Carijona
|
4012 |
+
mwa Mwatebu
|
4013 |
+
gcn Gaina
|
4014 |
+
suq Suri, Tirmaga-Chai
|
4015 |
+
djc Daju, Dar Daju
|
4016 |
+
aaa Ghotuo
|
4017 |
+
etn Eton
|
data/tts/all_langs.tsv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tuk-script_latin Turkmen
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
librosa
|
2 |
+
git+https://github.com/huggingface/transformers.git
|
3 |
+
torch
|
4 |
+
Cython==0.29.21
|
5 |
+
phonemizer==2.2.1
|
6 |
+
scipy
|
7 |
+
numpy
|
8 |
+
torchvision
|
9 |
+
matplotlib
|
10 |
+
Unidecode==1.1.1
|
11 |
+
monotonic-align
|
tts.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import tempfile
|
9 |
+
import torch
|
10 |
+
import sys
|
11 |
+
import gradio as gr
|
12 |
+
|
13 |
+
from huggingface_hub import hf_hub_download
|
14 |
+
|
15 |
+
# Setup TTS env
|
16 |
+
if "vits" not in sys.path:
|
17 |
+
sys.path.append("vits")
|
18 |
+
|
19 |
+
from vits import commons, utils
|
20 |
+
from vits.models import SynthesizerTrn
|
21 |
+
|
22 |
+
|
23 |
+
class TextMapper(object):
    """Map text to the integer symbol IDs used by a TTS checkpoint.

    The vocabulary is read from a text file holding one symbol per line; the
    ID of a symbol is its zero-based line index in that file.
    """

    def __init__(self, vocab_file):
        """Load the symbol table.

        Args:
            vocab_file: path to a UTF-8 text file with one symbol per line.

        Raises:
            ValueError: if the vocabulary contains no single-space symbol
                (raised by ``list.index``).
        """
        # A context manager guarantees the file handle is closed; the
        # previous bare ``open(...).readlines()`` leaked it.
        with open(vocab_file, encoding="utf-8") as vocab:
            self.symbols = [line.replace("\n", "") for line in vocab]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        """Convert a string to the list of IDs of its characters.

        Args:
            text: string to convert; leading/trailing whitespace is stripped.
            cleaner_names: accepted for interface compatibility; not used here.

        Returns:
            List of integers, one per character of the stripped text.

        Raises:
            KeyError: if a character is not in the vocabulary (run
                ``filter_oov`` first to drop such characters).
        """
        return [self._symbol_to_id[symbol] for symbol in text.strip()]

    def uromanize(self, text, uroman_pl):
        """Romanize ``text`` by piping it through the uroman Perl script.

        Args:
            text: text to romanize.
            uroman_pl: path to ``uroman.pl``.

        Returns:
            The first line of uroman's output with runs of whitespace
            collapsed to single spaces, or "" if uroman printed nothing.

        Raises:
            FileNotFoundError: if the ``perl`` executable cannot be found.
            subprocess.CalledProcessError: if uroman exits with a non-zero
                status.
        """
        import subprocess

        iso = "xxx"
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact, and feeding the text over stdin with an
        # explicit UTF-8 encoding avoids locale-dependent temp files.
        completed = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text.encode("utf-8"),
            stdout=subprocess.PIPE,
            check=True,
        )
        first_line = completed.stdout.decode("utf-8").split("\n", 1)[0]
        return re.sub(r"\s+", " ", first_line).strip()

    def get_text(self, text, hps):
        """Encode ``text`` as a ``torch.LongTensor`` of symbol IDs.

        Args:
            text: already filtered text (every character must be in the
                vocabulary).
            hps: hyper-parameter object; ``hps.data.text_cleaners`` and
                ``hps.data.add_blank`` are read.

        Returns:
            1-D ``torch.LongTensor`` of symbol IDs; when
            ``hps.data.add_blank`` is truthy the ID sequence is first passed
            through ``commons.intersperse(..., 0)``.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text, lang=None):
        """Drop every character of ``text`` that is not in the vocabulary.

        Args:
            text: input text.
            lang: language code forwarded to ``preprocess_char``.

        Returns:
            The pre-processed text restricted to in-vocabulary characters.
        """
        text = self.preprocess_char(text, lang=lang)
        val_chars = self._symbol_to_id
        return "".join(char for char in text if char in val_chars)

    def preprocess_char(self, text, lang=None):
        """Apply special treatment of characters in certain languages.

        Args:
            text: input text.
            lang: language code; only "ron" triggers a substitution.

        Returns:
            The text, with "ț" replaced by "ţ" when ``lang`` is "ron".
        """
        if lang == "ron":
            text = text.replace("ț", "ţ")
            print(f"{lang} (ț -> ţ): {text}")
        return text
|
85 |
+
|
86 |
+
|
87 |
+
def synthesize(text, lang, speed):
    """Synthesize speech for ``text`` with the MMS-TTS checkpoint of ``lang``.

    The vocabulary, config and generator checkpoint for the language are
    fetched from the ``facebook/mms-tts`` hub repository on every call.

    Args:
        text: text to speak.
        lang: language label formatted as "<code>: <name>"; only the part
            before the first colon (stripped) is used as the language code.
        speed: speaking-rate factor; ``None`` means 1.0. The model is called
            with ``length_scale = 1.0 / speed``.

    Returns:
        Tuple ``(audio_update, filtered_text)`` where ``audio_update`` is
        ``gr.Audio.update(value=(sampling_rate, waveform))`` and
        ``filtered_text`` is the lower-cased, vocabulary-filtered text that
        was actually fed to the model.

    Raises:
        ValueError: if ``speed`` is not positive.
        FileNotFoundError: if the downloaded config file or the local
            ``uroman`` directory is missing.
    """
    if speed is None:
        speed = 1.0
    if speed <= 0:
        # 1.0 / speed below would divide by zero or yield a negative scale.
        raise ValueError(f"speed must be positive, got {speed}")

    lang_code = lang.split(":")[0].strip()

    vocab_file = hf_hub_download(
        repo_id="facebook/mms-tts",
        filename="vocab.txt",
        subfolder=f"models/{lang_code}",
    )
    config_file = hf_hub_download(
        repo_id="facebook/mms-tts",
        filename="config.json",
        subfolder=f"models/{lang_code}",
    )
    g_pth = hf_hub_download(
        repo_id="facebook/mms-tts",
        filename="G_100000.pth",
        subfolder=f"models/{lang_code}",
    )

    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif (
        hasattr(torch.backends, "mps")
        and torch.backends.mps.is_available()
        and torch.backends.mps.is_built()
    ):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    print(f"Run inference with {device}")

    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not os.path.isfile(config_file):
        raise FileNotFoundError(f"{config_file} doesn't exist")
    hps = utils.get_hparams_from_file(config_file)
    text_mapper = TextMapper(vocab_file)
    net_g = SynthesizerTrn(
        len(text_mapper.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model,
    )
    net_g.to(device)
    _ = net_g.eval()

    _ = utils.load_checkpoint(g_pth, net_g, None)

    # The config's training file extension tells whether the model was
    # trained on uroman-romanized text.
    is_uroman = hps.data.training_files.split(".")[-1] == "uroman"

    if is_uroman:
        uroman_dir = "uroman"
        if not os.path.exists(uroman_dir):
            raise FileNotFoundError(f"{uroman_dir} directory doesn't exist")
        uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
        text = text_mapper.uromanize(text, uroman_pl)

    text = text.lower()
    # Pass the bare language code, not the "<code>: <name>" label: the
    # per-language handling in TextMapper.preprocess_char compares against
    # codes such as "ron" and never matched the full label.
    text = text_mapper.filter_oov(text, lang=lang_code)
    stn_tst = text_mapper.get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        hyp = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                noise_scale=0.667,
                noise_scale_w=0.8,
                length_scale=1.0 / speed,
            )[0][0, 0]
            .cpu()
            .float()
            .numpy()
        )

    return gr.Audio.update(value=(hps.data.sampling_rate, hyp)), text
|
165 |
+
|
166 |
+
|
167 |
+
# Example rows: each entry is [sentence, language label]; every sample
# sentence is paired with the same Turkmen (Latin script) label.
TTS_EXAMPLES = [
    [sentence, "tuk-script_latin: Turkmen"]
    for sentence in (
        "Salam. Men indi ýuwaş ýuwaşdan size düşünip başladym",
        "Türkmençe bir bilýäňmow sen?",
        "Iň gowy adamlar, yzyny özüň bilýäň.",
        "Siz bilen tanyşanyma örän şat.",
        "Esasy zat jan saglyk.",
    )
]
|
uroman/.gitignore
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
!Build/
|
2 |
+
.last_cover_stats
|
3 |
+
/META.yml
|
4 |
+
/META.json
|
5 |
+
/MYMETA.*
|
6 |
+
*.o
|
7 |
+
*.pm.tdy
|
8 |
+
*.bs
|
9 |
+
|
10 |
+
# Devel::Cover
|
11 |
+
cover_db/
|
12 |
+
|
13 |
+
# Devel::NYTProf
|
14 |
+
nytprof.out
|
15 |
+
|
16 |
+
# Dist::Zilla
|
17 |
+
/.build/
|
18 |
+
|
19 |
+
# Module::Build
|
20 |
+
_build/
|
21 |
+
Build
|
22 |
+
Build.bat
|
23 |
+
|
24 |
+
# Module::Install
|
25 |
+
inc/
|
26 |
+
|
27 |
+
# ExtUtils::MakeMaker
|
28 |
+
/blib/
|
29 |
+
/_eumm/
|
30 |
+
/*.gz
|
31 |
+
/Makefile
|
32 |
+
/Makefile.old
|
33 |
+
/MANIFEST.bak
|
34 |
+
/pm_to_blib
|
35 |
+
/*.zip
|
uroman/LICENSE.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright (C) 2015-2020 Ulf Hermjakob, USC Information Sciences Institute
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4 |
+
|
5 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6 |
+
|
7 |
+
Any publication of projects using uroman shall acknowledge its use: "This project uses the universal romanizer software 'uroman' written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)".
|
8 |
+
Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track.
|
9 |
+
|
10 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
11 |
+
|
uroman/README.md
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# uroman
|
2 |
+
|
3 |
+
*uroman* is a *universal romanizer*. It converts text in any script to the Latin alphabet.
|
4 |
+
|
5 |
+
Version: 1.2.8
|
6 |
+
Release date: April 23, 2021
|
7 |
+
Author: Ulf Hermjakob, USC Information Sciences Institute
|
8 |
+
|
9 |
+
|
10 |
+
### Usage
|
11 |
+
```bash
|
12 |
+
$ uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
|
13 |
+
where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
|
14 |
+
grc, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
|
15 |
+
--chart specifies chart output (in JSON format) to represent alternative romanizations.
|
16 |
+
--no-cache disables caching.
|
17 |
+
```
|
18 |
+
### Examples
|
19 |
+
```bash
|
20 |
+
$ bin/uroman.pl < text/zho.txt
|
21 |
+
$ bin/uroman.pl -l tur < text/tur.txt
|
22 |
+
$ bin/uroman.pl -l heb --chart < text/heb.txt
|
23 |
+
$ bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
|
24 |
+
```
|
25 |
+
|
26 |
+
Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
|
27 |
+
Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
|
28 |
+
Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or
|
29 |
+
Yiddish will improve romanization for those languages as some letters in those
|
30 |
+
languages have different sound values from other languages using the same script
|
31 |
+
(French, Russian, Hebrew respectively).
|
32 |
+
No effect for other languages in this version.
|
33 |
+
|
34 |
+
### Bibliography
|
35 |
+
Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. ACL-2018 Best Demo Paper Award. [Paper in ACL Anthology](https://www.aclweb.org/anthology/P18-4003) | [Poster](https://www.isi.edu/~ulf/papers/poster-uroman-acl2018.pdf) | [BibTex](https://www.aclweb.org/anthology/P18-4003.bib)
|
36 |
+
|
37 |
+
### Change History
|
38 |
+
Changes in version 1.2.8
|
39 |
+
* Updated to Unicode 13.0 (2021), which supports several new scripts (10% larger UnicodeData.txt).
|
40 |
+
* Improved support for Georgian.
|
41 |
+
* Preserve various symbols (as opposed to mapping to the symbols' names).
|
42 |
+
* Various small improvements.
|
43 |
+
|
44 |
+
Changes in version 1.2.7
|
45 |
+
* Improved support for Pashto.
|
46 |
+
|
47 |
+
Changes in version 1.2.6
|
48 |
+
* Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
|
49 |
+
* Added support for English Braille.
|
50 |
+
* Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
|
51 |
+
reflecting a casual style that many native speakers of those languages use
|
52 |
+
when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
|
53 |
+
rather than phonetically motivated combinations of letters (e.g. "sh").
|
54 |
+
* When a line starts with "::lcode xyz ", the new uroman version will switch to
|
55 |
+
that language for that line. This is used for the new reference test file.
|
56 |
+
* Various small improvements.
|
57 |
+
|
58 |
+
Changes in version 1.2.5
|
59 |
+
* Improved support for Armenian and eight languages using Cyrillic scripts.
|
60 |
+
-- For Serbian and Macedonian, which are often written in both Cyrillic
|
61 |
+
and Latin scripts, uroman will map both official versions to the same
|
62 |
+
romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
|
63 |
+
properly reflects the pronunciation of the city's name).
|
64 |
+
For both Serbian and Macedonian, casual writers often use a simplified
|
65 |
+
Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
|
66 |
+
and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
|
67 |
+
other such pairs. The casual romanization can be simulated by using
|
68 |
+
alternative uroman language codes "srp2" and "mkd2", which romanize
|
69 |
+
both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
|
70 |
+
* Various small improvements.
|
71 |
+
|
72 |
+
Changes in version 1.2.4
|
73 |
+
* Fixed a bug that generated two empty lines for each empty line in cache mode.
|
74 |
+
|
75 |
+
Changes in version 1.2
|
76 |
+
* Run-time improvement based on (1) token-based caching and (2) shortcut
|
77 |
+
romanization (identity) of ASCII strings for default 1-best (non-chart)
|
78 |
+
output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
|
79 |
+
large size texts.
|
80 |
+
* Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
|
81 |
+
languages.
|
82 |
+
* Richer lattice structure (more alternatives) for "Romanization" of English
|
83 |
+
to support better matching to romanizations of other languages.
|
84 |
+
Changes output only when --chart option is specified. No change in output for
|
85 |
+
default 1-best output, which for ASCII characters is always the input string.
|
86 |
+
|
87 |
+
Changes in version 1.1 (major upgrade)
|
88 |
+
* Offers chart output (in JSON format) to represent alternative romanizations.
|
89 |
+
-- Location of first character is defined to be "line: 1, start:0, end:0".
|
90 |
+
* Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
|
91 |
+
* Improved web-interface at http://www.isi.edu/~ulf/uroman.html
|
92 |
+
-- Shows corresponding original and romanization text in red
|
93 |
+
when hovering over a text segment.
|
94 |
+
-- Shows alternative romanizations when hovering over romanized text
|
95 |
+
marked by dotted underline.
|
96 |
+
-- Added right-to-left script detection and improved display for right-to-left
|
97 |
+
script text (as determined line by line).
|
98 |
+
-- On-page support for some scripts that are often not pre-installed on users'
|
99 |
+
computers (Burmese, Egyptian, Klingon).
|
100 |
+
|
101 |
+
Changes in version 1.0 (major upgrade)
|
102 |
+
* Upgraded principal internal data structure from string to lattice.
|
103 |
+
* Improvements mostly in vowelization of South and Southeast Asian languages.
|
104 |
+
* Vocalic 'r' more consistently treated as vowel (no additional vowel added).
|
105 |
+
* Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
|
106 |
+
* Japanese Katakana middle dots now mapped to ASCII space.
|
107 |
+
* Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
|
108 |
+
* Some corrections regarding analysis of Chinese numbers.
|
109 |
+
* Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
|
110 |
+
* Zero-width characters dropped, except line/sentence-initial byte order marks.
|
111 |
+
* Spaces normalized to ASCII space.
|
112 |
+
* Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
|
113 |
+
* Tested against previous version of uroman with a new uroman visual diff tool.
|
114 |
+
* Almost an order of magnitude faster.
|
115 |
+
|
116 |
+
Changes in version 0.7 (minor upgrade)
|
117 |
+
* Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
|
118 |
+
Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
|
119 |
+
Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
|
120 |
+
or Chinese characters in Uyghur texts.
|
121 |
+
|
122 |
+
Changes in version 0.6 (minor upgrade)
|
123 |
+
* Added support for two letter characters used in Uzbek:
|
124 |
+
(1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
|
125 |
+
(2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
|
126 |
+
Both are now mapped to "'" (plain ASCII apostrophe).
|
127 |
+
* Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
|
128 |
+
even when they are not preceded by "ئ" (yeh with hamza above).
|
129 |
+
* Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
|
130 |
+
("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
|
131 |
+
* Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
|
132 |
+
However, it is strongly recommended to normalize any presentation form Arabic letters
|
133 |
+
to their non-presentation form before calling uroman.
|
134 |
+
* Added force flush directive ($|=1;).
|
135 |
+
|
136 |
+
Changes in version 0.5 (minor upgrade)
|
137 |
+
* Improvements for Uyghur (make sure to use language option: -l uig)
|
138 |
+
|
139 |
+
Changes in version 0.4 (minor upgrade)
|
140 |
+
* Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
|
141 |
+
* Minor change for Arabic (added "alef+fathatan" = "an")
|
142 |
+
|
143 |
+
New features in version 0.3
|
144 |
+
* Covers Mandarin (Chinese)
|
145 |
+
* Improved romanization for numerous languages
|
146 |
+
* Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
|
147 |
+
* Maps from native digits to Western numbers
|
148 |
+
* Faster for South Asian languages
|
149 |
+
|
150 |
+
### Other features
|
151 |
+
* Web interface: http://www.isi.edu/~ulf/uroman.html
|
152 |
+
* Vowelization is provided when locally computable, e.g. for many South Asian languages and Tibetan.
|
153 |
+
|
154 |
+
### Limitations
|
155 |
+
* The current version of uroman has a few limitations, some of which we plan to address in future versions.
|
156 |
+
For Japanese, *uroman* currently romanizes hiragana and katakana as expected, but kanji are interpreted as Chinese characters and romanized as such.
|
157 |
+
For Egyptian hieroglyphs, only single-sound phonetic characters and numbers are currently romanized.
|
158 |
+
For Linear B, only phonetic syllabic characters are romanized.
|
159 |
+
For some other extinct scripts such as cuneiform, no romanization is provided.
|
160 |
+
* A romanizer is not a full transliterator. For example, this version of
|
161 |
+
uroman does not vowelize text that lacks explicit vowelization such as
|
162 |
+
normal text in Arabic and Hebrew (without diacritics/points).
|
163 |
+
|
164 |
+
### Acknowledgments
|
165 |
+
This research is based upon work supported in part by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via contract # FA8650-17-C-9116, and by research sponsored by Air Force Research Laboratory (AFRL) under agreement number FA8750-19-1-1000. The views and conclusions contained herein are those of the authors and should not be interpreted as necessarily representing the official policies, either expressed or implied, of ODNI, IARPA, Air Force Laboratory, DARPA, or the U.S. Government. The U.S. Government is authorized to reproduce and distribute reprints for governmental purposes notwithstanding any copyright annotation therein.
|
uroman/README.txt
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
uroman version 1.2.8
|
2 |
+
Release date: April 23, 2021
|
3 |
+
Author: Ulf Hermjakob, USC Information Sciences Institute
|
4 |
+
|
5 |
+
uroman is a universal romanizer. It converts text in any script to the Latin alphabet.
|
6 |
+
|
7 |
+
Usage: uroman.pl [-l <lang-code>] [--chart] [--no-cache] < STDIN
|
8 |
+
where the optional <lang-code> is a 3-letter language code, e.g. ara, bel, bul, deu, ell, eng, fas,
|
9 |
+
grc, heb, kaz, kir, lav, lit, mkd, mkd2, oss, pnt, pus, rus, srp, srp2, tur, uig, ukr, yid.
|
10 |
+
--chart specifies chart output (in JSON format) to represent alternative romanizations.
|
11 |
+
--no-cache disables caching.
|
12 |
+
Examples: bin/uroman.pl < text/zho.txt
|
13 |
+
bin/uroman.pl -l tur < text/tur.txt
|
14 |
+
bin/uroman.pl -l heb --chart < text/heb.txt
|
15 |
+
bin/uroman.pl < test/multi-script.txt > test/multi-script.uroman.txt
|
16 |
+
|
17 |
+
Identifying the input as Arabic, Belarusian, Bulgarian, English, Farsi, German,
|
18 |
+
Ancient Greek, Modern Greek, Pontic Greek, Hebrew, Kazakh, Kyrgyz, Latvian,
|
19 |
+
Lithuanian, North Macedonian, Russian, Serbian, Turkish, Ukrainian, Uyghur or Yiddish
|
20 |
+
will improve romanization for those languages as some letters in those languages
|
21 |
+
have different sound values from other languages using the same script.
|
22 |
+
No effect for other languages in this version.
|
23 |
+
|
24 |
+
Bibliography: Ulf Hermjakob, Jonathan May, and Kevin Knight. 2018. Out-of-the-box universal romanization tool uroman. In Proceedings of the 56th Annual Meeting of Association for Computational Linguistics, Demo Track. [Best Demo Paper Award]
|
25 |
+
|
26 |
+
Changes in version 1.2.8
|
27 |
+
* Improved support for Georgian.
|
28 |
+
* Updated UnicodeData.txt to version 13 (2021) with several new scripts (10% larger).
|
29 |
+
* Preserve various symbols (as opposed to mapping to the symbols' names).
|
30 |
+
* Various small improvements.
|
31 |
+
Changes in version 1.2.7
|
32 |
+
* Improved support for Pashto.
|
33 |
+
Changes in version 1.2.6
|
34 |
+
* Improved support for Ukrainian, Russian and Ogham (ancient Irish script).
|
35 |
+
* Added support for English Braille.
|
36 |
+
* Added alternative Romanization for North Macedonian and Serbian (mkd2/srp2)
|
37 |
+
reflecting a casual style that many native speakers of those languages use
|
38 |
+
when writing text in Latin script, e.g. non-accented single letters (e.g. "s")
|
39 |
+
rather than phonetically motivated combinations of letters (e.g. "sh").
|
40 |
+
* When a line starts with "::lcode xyz ", the new uroman version will switch to
|
41 |
+
that language for that line. This is used for the new reference test file.
|
42 |
+
* Various small improvements.
|
43 |
+
Changes in version 1.2.5
|
44 |
+
* Improved support for Armenian and eight languages using Cyrillic scripts.
|
45 |
+
-- For Serbian and Macedonian, which are often written in both Cyrillic
|
46 |
+
and Latin scripts, uroman will map both official versions to the same
|
47 |
+
romanized text, e.g. both "Ниш" and "Niš" will be mapped to "Nish" (which
|
48 |
+
properly reflects the pronunciation of the city's name).
|
49 |
+
For both Serbian and Macedonian, casual writers often use a simplified
|
50 |
+
Latin form without diacritics, e.g. "s" to represent not only Cyrillic "с"
|
51 |
+
and Latin "s", but also "ш" or "š", even if this conflates "s" and "sh" and
|
52 |
+
other such pairs. The casual romanization can be simulated by using
|
53 |
+
alternative uroman language codes "srp2" and "mkd2", which romanize
|
54 |
+
both "Ниш" and "Niš" to "Nis" to reflect the casual Latin spelling.
|
55 |
+
* Various small improvements.
|
56 |
+
Changes in version 1.2.4
|
57 |
+
* Added support for Tifinagh (a script used for Berber languages).
|
58 |
+
* Fixed a bug that generated two empty lines for each empty line in cache mode.
|
59 |
+
Changes in version 1.2.3
|
60 |
+
* Exclude emojis, dingbats, many other pictographs from being romanized (e.g. to "face")
|
61 |
+
Changes in version 1.2
|
62 |
+
* Run-time improvement based on (1) token-based caching and (2) shortcut
|
63 |
+
romanization (identity) of ASCII strings for default 1-best (non-chart)
|
64 |
+
output. Speed-up by a factor of 10 for Bengali and Uyghur on medium and
|
65 |
+
large size texts.
|
66 |
+
* Incremental improvements for Farsi, Amharic, Russian, Hebrew and related
|
67 |
+
languages.
|
68 |
+
* Richer lattice structure (more alternatives) for "Romanization" of English
|
69 |
+
to support better matching to romanizations of other languages.
|
70 |
+
Changes output only when --chart option is specified. No change in output for
|
71 |
+
default 1-best output, which for ASCII characters is always the input string.
|
72 |
+
Changes in version 1.1 (major upgrade)
|
73 |
+
* Offers chart output (in JSON format) to represent alternative romanizations.
|
74 |
+
-- Location of first character is defined to be "line: 1, start:0, end:0".
|
75 |
+
* Incremental improvements of Hebrew and Greek romanization; Chinese numbers.
|
76 |
+
* Improved web-interface at http://www.isi.edu/~ulf/uroman.html
|
77 |
+
-- Shows corresponding original and romanization text in red
|
78 |
+
when hovering over a text segment.
|
79 |
+
-- Shows alternative romanizations when hovering over romanized text
|
80 |
+
marked by dotted underline.
|
81 |
+
-- Added right-to-left script detection and improved display for right-to-left
|
82 |
+
script text (as determined line by line).
|
83 |
+
-- On-page support for some scripts that are often not pre-installed on users'
|
84 |
+
computers (Burmese, Egyptian, Klingon).
|
85 |
+
Changes in version 1.0 (major upgrade)
|
86 |
+
* Upgraded principal internal data structure from string to lattice.
|
87 |
+
* Improvements mostly in vowelization of South and Southeast Asian languages.
|
88 |
+
* Vocalic 'r' more consistently treated as vowel (no additional vowel added).
|
89 |
+
* Repetition signs (Japanese/Chinese/Thai/Khmer/Lao) are mapped to superscript 2.
|
90 |
+
* Japanese Katakana middle dots now mapped to ASCII space.
|
91 |
+
* Tibetan intersyllabic mark now mapped to middle dot (U+00B7).
|
92 |
+
* Some corrections regarding analysis of Chinese numbers.
|
93 |
+
* Many more foreign diacritics and punctuation marks dropped or mapped to ASCII.
|
94 |
+
* Zero-width characters dropped, except line/sentence-initial byte order marks.
|
95 |
+
* Spaces normalized to ASCII space.
|
96 |
+
* Fixed bug that in some cases mapped signs (such as dagger or bullet) to their verbal descriptions.
|
97 |
+
* Tested against previous version of uroman with a new uroman visual diff tool.
|
98 |
+
* Almost an order of magnitude faster.
|
99 |
+
Changes in version 0.7 (minor upgrade)
|
100 |
+
* Added script uroman-quick.pl for Arabic script languages, incl. Uyghur.
|
101 |
+
Much faster, pre-caching mapping of Arabic to Latin characters, simple greedy processing.
|
102 |
+
Will not convert material from non-Arabic blocks such as any (somewhat unusual) Cyrillic
|
103 |
+
or Chinese characters in Uyghur texts.
|
104 |
+
Changes in version 0.6 (minor upgrade)
|
105 |
+
* Added support for two letter characters used in Uzbek:
|
106 |
+
(1) character "ʻ" ("modifier letter turned comma", which modifies preceding "g" and "u" letters)
|
107 |
+
(2) character "ʼ" ("modifier letter apostrophe", which Uzbek uses to mark a glottal stop).
|
108 |
+
Both are now mapped to "'" (plain ASCII apostrophe).
|
109 |
+
* Added support for Uyghur vowel characters such as "ې" (Arabic e) and "ۆ" (Arabic oe)
|
110 |
+
even when they are not preceded by "ئ" (yeh with hamza above).
|
111 |
+
* Added support for Arabic semicolon "؛", Arabic ligature forms for phrases such as "ﷺ"
|
112 |
+
("sallallahou alayhe wasallam" = "prayer of God be upon him and his family and peace")
|
113 |
+
* Added robustness for Arabic letter presentation forms (initial/medial/final/isolated).
|
114 |
+
However, it is strongly recommended to normalize any presentation form Arabic letters
|
115 |
+
to their non-presentation form before calling uroman.
|
116 |
+
* Added force flush directive ($|=1;).
|
117 |
+
Changes in version 0.5 (minor upgrade)
|
118 |
+
* Improvements for Uyghur (make sure to use language option: -l uig)
|
119 |
+
Changes in version 0.4 (minor upgrade)
|
120 |
+
* Improvements for Thai (special cases for vowel/consonant reordering, e.g. for "sara o"; dropped some aspiration 'h's)
|
121 |
+
* Minor change for Arabic (added "alef+fathatan" = "an")
|
122 |
+
New features in version 0.3
|
123 |
+
* Covers Mandarin (Chinese)
|
124 |
+
* Improved romanization for numerous languages
|
125 |
+
* Preserves capitalization (e.g. from Latin, Cyrillic, Greek scripts)
|
126 |
+
* Maps from native digits to Western numbers
|
127 |
+
* Faster for South Asian languages
|
128 |
+
|
129 |
+
Other features
|
130 |
+
* Web interface: http://www.isi.edu/~ulf/uroman.html
|
131 |
+
* Vowelization is provided when locally computable, e.g. for many South Asian
|
132 |
+
languages and Tibetan.
|
133 |
+
|
134 |
+
Limitations
|
135 |
+
* This version of uroman assumes all CJK ideographs to be Mandarin (Chinese).
|
136 |
+
This means that Japanese kanji are incorrectly romanized; however, Japanese
|
137 |
+
hiragana and katakana are properly romanized.
|
138 |
+
* A romanizer is not a full transliterator. For example, this version of
|
139 |
+
uroman does not vowelize text that lacks explicit vowelization such as
|
140 |
+
normal text in Arabic and Hebrew (without diacritics/points).
|
141 |
+
|
uroman/bin/de-accent.pl
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
sub print_version {
|
4 |
+
print STDERR "$0 version 1.1\n";
|
5 |
+
print STDERR " Author: Ulf Hermjakob\n";
|
6 |
+
print STDERR " Last changed: March 14, 2011\n";
|
7 |
+
}
|
8 |
+
|
9 |
+
sub print_usage {
|
10 |
+
print STDERR "$0 [options] < with_accents.txt > without_accents.txt\n";
|
11 |
+
print STDERR " -h or -help\n";
|
12 |
+
print STDERR " -v or -version\n";
|
13 |
+
}
|
14 |
+
|
15 |
+
sub de_accent_string {
|
16 |
+
local($s) = @_;
|
17 |
+
|
18 |
+
# $s =~ tr/A-Z/a-z/;
|
19 |
+
unless (0) {
|
20 |
+
# Latin-1
|
21 |
+
if ($s =~ /\xC3[\x80-\xBF]/) {
|
22 |
+
$s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g;
|
23 |
+
$s =~ s/Æ/Ae/g;
|
24 |
+
$s =~ s/Ç/C/g;
|
25 |
+
$s =~ s/Ð/D/g;
|
26 |
+
$s =~ s/(È|É|Ê|Ë)/E/g;
|
27 |
+
$s =~ s/(Ì|Í|Î|Ï)/I/g;
|
28 |
+
$s =~ s/Ñ/N/g;
|
29 |
+
$s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g;
|
30 |
+
$s =~ s/(Ù|Ú|Û|Ü)/U/g;
|
31 |
+
$s =~ s/Þ/Th/g;
|
32 |
+
$s =~ s/Ý/Y/g;
|
33 |
+
$s =~ s/(à|á|â|ã|ä|å)/a/g;
|
34 |
+
$s =~ s/æ/ae/g;
|
35 |
+
$s =~ s/ç/c/g;
|
36 |
+
$s =~ s/(è|é|ê|ë)/e/g;
|
37 |
+
$s =~ s/(ì|í|î|ï)/i/g;
|
38 |
+
$s =~ s/ð/d/g;
|
39 |
+
$s =~ s/ñ/n/g;
|
40 |
+
$s =~ s/(ò|ó|ô|õ|ö)/o/g;
|
41 |
+
$s =~ s/ß/ss/g;
|
42 |
+
$s =~ s/þ/th/g;
|
43 |
+
$s =~ s/(ù|ú|û|ü)/u/g;
|
44 |
+
$s =~ s/(ý|ÿ)/y/g;
|
45 |
+
}
|
46 |
+
# Latin Extended-A
|
47 |
+
if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
|
48 |
+
$s =~ s/(Ā|Ă|Ą)/A/g;
|
49 |
+
$s =~ s/(ā|ă|ą)/a/g;
|
50 |
+
$s =~ s/(Ć|Ĉ|Ċ|Č)/C/g;
|
51 |
+
$s =~ s/(ć|ĉ|ċ|č)/c/g;
|
52 |
+
$s =~ s/(Ď|Đ)/D/g;
|
53 |
+
$s =~ s/(ď|đ)/d/g;
|
54 |
+
$s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g;
|
55 |
+
$s =~ s/(ē|ĕ|ė|ę|ě)/e/g;
|
56 |
+
$s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g;
|
57 |
+
$s =~ s/(ĝ|ğ|ġ|ģ)/g/g;
|
58 |
+
$s =~ s/(Ĥ|Ħ)/H/g;
|
59 |
+
$s =~ s/(ĥ|ħ)/h/g;
|
60 |
+
$s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g;
|
61 |
+
$s =~ s/(ĩ|ī|ĭ|į|ı)/i/g;
|
62 |
+
$s =~ s/\xC4\xB2/Ij/g;  # U+0132 LATIN CAPITAL LIGATURE IJ (UTF-8 bytes C4 B2); plain ASCII "IJ" must be left alone
|
63 |
+
$s =~ s/\xC4\xB3/ij/g;  # U+0133 LATIN SMALL LIGATURE IJ (UTF-8 bytes C4 B3)
|
64 |
+
$s =~ s/Ĵ/J/g;
|
65 |
+
$s =~ s/ĵ/j/g;
|
66 |
+
$s =~ s/Ķ/K/g;
|
67 |
+
$s =~ s/(ķ|ĸ)/k/g;
|
68 |
+
$s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g;
|
69 |
+
$s =~ s/(ļ|ľ|ŀ|ł)/l/g;
|
70 |
+
$s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g;
|
71 |
+
$s =~ s/(ń|ņ|ň|ʼn|ŋ)/n/g;
|
72 |
+
$s =~ s/(Ō|Ŏ|Ő)/O/g;
|
73 |
+
$s =~ s/(ō|ŏ|ő)/o/g;
|
74 |
+
$s =~ s/Œ/Oe/g;
|
75 |
+
$s =~ s/œ/oe/g;
|
76 |
+
$s =~ s/(Ŕ|Ŗ|Ř)/R/g;
|
77 |
+
$s =~ s/(ŕ|ŗ|ř)/r/g;
|
78 |
+
$s =~ s/(Ś|Ŝ|Ş|Š)/S/g;
|
79 |
+
$s =~ s/(ś|ŝ|ş|š|ſ)/s/g;
|
80 |
+
$s =~ s/(Ţ|Ť|Ŧ)/T/g;
|
81 |
+
$s =~ s/(ţ|ť|ŧ)/t/g;
|
82 |
+
$s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g;
|
83 |
+
$s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g;
|
84 |
+
$s =~ s/Ŵ/W/g;
|
85 |
+
$s =~ s/ŵ/w/g;
|
86 |
+
$s =~ s/(Ŷ|Ÿ)/Y/g;
|
87 |
+
$s =~ s/ŷ/y/g;
|
88 |
+
$s =~ s/(Ź|Ż|Ž)/Z/g;
|
89 |
+
$s =~ s/(ź|ż|ž)/z/g;
|
90 |
+
}
|
91 |
+
# Latin Extended Additional
|
92 |
+
if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) {
|
93 |
+
$s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g;
|
94 |
+
$s =~ s/(ḃ|ḅ|ḇ)/b/g;
|
95 |
+
$s =~ s/(ḉ)/c/g;
|
96 |
+
$s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g;
|
97 |
+
$s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g;
|
98 |
+
$s =~ s/(ḟ)/f/g;
|
99 |
+
$s =~ s/(ḡ)/g/g;
|
100 |
+
$s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g;
|
101 |
+
$s =~ s/(ḭ|ḯ|ỉ|ị)/i/g;
|
102 |
+
$s =~ s/(ḱ|ḳ|ḵ)/k/g;
|
103 |
+
$s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g;
|
104 |
+
$s =~ s/(ḿ|ṁ|ṃ)/m/g;
|
105 |
+
$s =~ s/(ṅ|ṇ|ṉ|ṋ)/n/g;  # accented n maps to plain n (was mistakenly mapped to m)
|
106 |
+
$s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g;
|
107 |
+
$s =~ s/(ṕ|ṗ)/p/g;
|
108 |
+
$s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g;
|
109 |
+
$s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g;
|
110 |
+
$s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g;
|
111 |
+
$s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g;
|
112 |
+
$s =~ s/(ṽ|ṿ)/v/g;
|
113 |
+
$s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g;
|
114 |
+
$s =~ s/(ẋ|ẍ)/x/g;
|
115 |
+
$s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g;
|
116 |
+
$s =~ s/(ẑ|ẓ|ẕ)/z/g;
|
117 |
+
$s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g;
|
118 |
+
$s =~ s/(Ḃ|Ḅ|Ḇ)/B/g;
|
119 |
+
$s =~ s/(Ḉ)/C/g;
|
120 |
+
$s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g;
|
121 |
+
$s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g;
|
122 |
+
$s =~ s/(Ḟ)/F/g;
|
123 |
+
$s =~ s/(Ḡ)/G/g;
|
124 |
+
$s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g;
|
125 |
+
$s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g;
|
126 |
+
$s =~ s/(Ḱ|Ḳ|Ḵ)/K/g;
|
127 |
+
$s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g;
|
128 |
+
$s =~ s/(Ḿ|Ṁ|Ṃ)/M/g;
|
129 |
+
$s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g;
|
130 |
+
$s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g;
|
131 |
+
$s =~ s/(Ṕ|Ṗ)/P/g;
|
132 |
+
$s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g;
|
133 |
+
$s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g;
|
134 |
+
$s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g;
|
135 |
+
$s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g;
|
136 |
+
$s =~ s/(Ṽ|Ṿ)/V/g;
|
137 |
+
$s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g;
|
138 |
+
$s =~ s/(Ẍ)/X/g;
|
139 |
+
$s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g;
|
140 |
+
$s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g;
|
141 |
+
}
|
142 |
+
# Greek letters
|
143 |
+
# Guard covers U+0386..U+03B0 (CE 86..B0) and U+03CA..U+03CE (CF 8A..8E) so that
# lowercase accented letters (CE AC..B0, CF 8A..8E) also trigger the substitutions below.
if ($s =~ /\xCE[\x86-\xB0]|\xCF[\x8A-\x8E]/) {
|
144 |
+
$s =~ s/ά/α/g;
|
145 |
+
$s =~ s/έ/ε/g;
|
146 |
+
$s =~ s/ί/ι/g;
|
147 |
+
$s =~ s/ϊ/ι/g;
|
148 |
+
$s =~ s/ΐ/ι/g;
|
149 |
+
$s =~ s/ό/ο/g;
|
150 |
+
$s =~ s/ύ/υ/g;
|
151 |
+
$s =~ s/ϋ/υ/g;
|
152 |
+
$s =~ s/ΰ/υ/g;
|
153 |
+
$s =~ s/ώ/ω/g;
|
154 |
+
$s =~ s/Ά/Α/g;
|
155 |
+
$s =~ s/Έ/Ε/g;
|
156 |
+
$s =~ s/Ή/Η/g;
|
157 |
+
$s =~ s/Ί/Ι/g;
|
158 |
+
$s =~ s/Ϊ/Ι/g;
|
159 |
+
$s =~ s/Ύ/Υ/g;
|
160 |
+
$s =~ s/Ϋ/Υ/g;
|
161 |
+
$s =~ s/Ώ/Ω/g;
|
162 |
+
}
|
163 |
+
# Cyrillic letters
|
164 |
+
# Guard covers U+0400..U+043F (D0 80..BF) and U+0440..U+045F (D1 80..9F) so that
# lowercase letters (D0 B9; D1 90..9D) also trigger the substitutions below.
if ($s =~ /\xD0[\x80-\xBF]|\xD1[\x80-\x9F]/) {
|
165 |
+
$s =~ s/Ѐ/Е/g;
|
166 |
+
$s =~ s/Ё/Е/g;
|
167 |
+
$s =~ s/Ѓ/Г/g;
|
168 |
+
$s =~ s/Ќ/К/g;
|
169 |
+
$s =~ s/Ѝ/И/g;
|
170 |
+
$s =~ s/Й/И/g;
|
171 |
+
$s =~ s/ѐ/е/g;
|
172 |
+
$s =~ s/ё/е/g;
|
173 |
+
$s =~ s/ѓ/г/g;
|
174 |
+
$s =~ s/ќ/к/g;
|
175 |
+
$s =~ s/ѝ/и/g;
|
176 |
+
$s =~ s/й/и/g;
|
177 |
+
}
|
178 |
+
}
|
179 |
+
return $s;
|
180 |
+
}
|
181 |
+
|
182 |
+
while (@ARGV) {
|
183 |
+
$arg = shift @ARGV;
|
184 |
+
if ($arg =~ /^-*(h|help)$/i) {
|
185 |
+
&print_usage;
|
186 |
+
exit 1;
|
187 |
+
} elsif ($arg =~ /^-*(v|version)$/i) {
|
188 |
+
&print_version;
|
189 |
+
exit 1;
|
190 |
+
} else {
|
191 |
+
print STDERR "Ignoring unrecognized argument $arg\n";
|
192 |
+
}
|
193 |
+
}
|
194 |
+
|
195 |
+
$line_number = 0;
|
196 |
+
while (<>) {
|
197 |
+
$line_number++;
|
198 |
+
print &de_accent_string($_);
|
199 |
+
}
|
200 |
+
exit 0;
|
201 |
+
|
uroman/bin/string-distance.pl
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
# Author: Ulf Hermjakob
|
4 |
+
# Release date: October 13, 2019
|
5 |
+
|
6 |
+
# Usage: string-distance.pl {-lc1 <language-code>} {-lc2 <language-code>} < STDIN > STDOUT
|
7 |
+
# Example: string-distance.pl -lc1 rus -lc2 ukr < STDIN > STDOUT
|
8 |
+
# Example: string-distance.pl < ../test/string-similarity-test-input.txt
|
9 |
+
# Input format: two strings per line (tab-separated, in Latin script)
|
10 |
+
# Strings in non-Latin scripts should first be romanized. (Recommended script: uroman.pl)
|
11 |
+
# Output format: repetition of the two input strings, plus the string distance between them (tab-separated).
|
12 |
+
# Additional output meta info lines at the top are marked with an initial #.
|
13 |
+
#
|
14 |
+
# The script uses data from a string-distance-cost-rules file that lists costs,
|
15 |
+
# where the default cost is "1" with lower costs for differences in vowels,
|
16 |
+
# duplicate consonants, "f" vs. "ph" etc.
|
17 |
+
# Language cost rules can be language-specific and context-sensitive.
|
18 |
+
|
19 |
+
$|=1;
|
20 |
+
|
21 |
+
use FindBin;
|
22 |
+
use Cwd "abs_path";
|
23 |
+
use File::Basename qw(dirname);
|
24 |
+
use File::Spec;
|
25 |
+
|
26 |
+
my $bin_dir = abs_path(dirname($0));
|
27 |
+
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
|
28 |
+
my $data_dir = File::Spec->catfile($root_dir, "data");
|
29 |
+
my $lib_dir = File::Spec->catfile($root_dir, "lib");
|
30 |
+
|
31 |
+
use lib "$FindBin::Bin/../lib";
|
32 |
+
use List::Util qw(min max);
|
33 |
+
use NLP::utilities;
|
34 |
+
use NLP::stringDistance;
|
35 |
+
$util = NLP::utilities;
|
36 |
+
$sd = NLP::stringDistance;
|
37 |
+
$verbose = 0;
|
38 |
+
$separator = "\t";
|
39 |
+
|
40 |
+
$cost_rule_filename = File::Spec->catfile($data_dir, "string-distance-cost-rules.txt");
|
41 |
+
|
42 |
+
$lang_code1 = "eng";
|
43 |
+
$lang_code2 = "eng";
|
44 |
+
%ht = ();
|
45 |
+
|
46 |
+
while (@ARGV) {
|
47 |
+
$arg = shift @ARGV;
|
48 |
+
if ($arg =~ /^-+lc1$/) {
|
49 |
+
$lang_code_candidate = shift @ARGV;
|
50 |
+
$lang_code1 = $lang_code_candidate if $lang_code_candidate =~ /^[a-z]{3,3}$/;
|
51 |
+
} elsif ($arg =~ /^-+lc2$/) {
|
52 |
+
$lang_code_candidate = shift @ARGV;
|
53 |
+
$lang_code2 = $lang_code_candidate if $lang_code_candidate =~ /^[a-z]{3,3}$/;
|
54 |
+
} elsif ($arg =~ /^-+(v|verbose)$/) {
|
55 |
+
$verbose = shift @ARGV;
|
56 |
+
} else {
|
57 |
+
print STDERR "Ignoring unrecognized arg $arg\n";
|
58 |
+
}
|
59 |
+
}
|
60 |
+
|
61 |
+
$sd->load_string_distance_data($cost_rule_filename, *ht, $verbose);
|
62 |
+
print STDERR "Loaded resources.\n" if $verbose;
|
63 |
+
|
64 |
+
my $chart_id = 0;
|
65 |
+
my $line_number = 0;
|
66 |
+
print "# Lang-code-1: $lang_code1 Lang-code-2: $lang_code2\n";
|
67 |
+
while (<>) {
|
68 |
+
$line_number++;
|
69 |
+
if ($verbose) {
|
70 |
+
if ($line_number =~ /000$/) {
|
71 |
+
if ($line_number =~ /0000$/) {
|
72 |
+
print STDERR $line_number;
|
73 |
+
} else {
|
74 |
+
print STDERR ".";
|
75 |
+
}
|
76 |
+
}
|
77 |
+
}
|
78 |
+
my $line = $_;
|
79 |
+
$line =~ s/^\xEF\xBB\xBF//;
|
80 |
+
next if $line =~ /^\s*(\#.*)?$/;
|
81 |
+
my $s1;
|
82 |
+
my $s2;
|
83 |
+
if (($s1, $s2) = ($line =~ /^("(?:\\"|[^"])*"|\S+)$separator("(?:\\"|[^"])*"|\S+)\s*$/)) {
|
84 |
+
$s1 = $util->dequote_string($s1);
|
85 |
+
$s2 = $util->dequote_string($s2);
|
86 |
+
} elsif ($line =~ /^\s*(#.*)$/) {
|
87 |
+
} else {
|
88 |
+
print STDERR "Could not process line $line_number: $line" if $verbose;
|
89 |
+
print "\n";
|
90 |
+
next;
|
91 |
+
}
|
92 |
+
|
93 |
+
$cost = $sd->quick_romanized_string_distance_by_chart($s1, $s2, *ht, "", $lang_code1, $lang_code2);
|
94 |
+
print "$s1\t$s2\t$cost\n";
|
95 |
+
}
|
96 |
+
print STDERR "\n" if $verbose;
|
97 |
+
|
98 |
+
exit 0;
|
99 |
+
|
uroman/bin/uroman-quick.pl
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
# uroman Nov. 12, 2015 - July 25, 2016
|
4 |
+
# version v0.7
|
5 |
+
# Author: Ulf Hermjakob
|
6 |
+
|
7 |
+
# Usage: uroman-quick.pl {-l [tur|uig|ukr|yid]} < STDIN
|
8 |
+
# currently only for Arabic script languages, incl. Uyghur
|
9 |
+
|
10 |
+
$|=1;
|
11 |
+
|
12 |
+
use FindBin;
|
13 |
+
use Cwd "abs_path";
|
14 |
+
use File::Basename qw(dirname);
|
15 |
+
use File::Spec;
|
16 |
+
|
17 |
+
my $bin_dir = abs_path(dirname($0));
|
18 |
+
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
|
19 |
+
my $data_dir = File::Spec->catfile($root_dir, "data");
|
20 |
+
my $lib_dir = File::Spec->catfile($root_dir, "lib");
|
21 |
+
|
22 |
+
use lib "$FindBin::Bin/../lib";
|
23 |
+
use NLP::Romanizer;
|
24 |
+
use NLP::UTF8;
|
25 |
+
$romanizer = NLP::Romanizer;
|
26 |
+
%ht = ();
|
27 |
+
$lang_code = "";
|
28 |
+
|
29 |
+
while (@ARGV) {
|
30 |
+
$arg = shift @ARGV;
|
31 |
+
if ($arg =~ /^-+(l|lc|lang-code)$/) {
|
32 |
+
$lang_code = lc (shift @ARGV || "")
|
33 |
+
} else {
|
34 |
+
print STDERR "Ignoring unrecognized arg $arg\n";
|
35 |
+
}
|
36 |
+
}
|
37 |
+
|
38 |
+
$romanization_table_arabic_block_filename = File::Spec->catfile($data_dir, "romanization-table-arabic-block.txt");
|
39 |
+
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
|
40 |
+
|
41 |
+
$romanizer->load_romanization_table(*ht, $romanization_table_arabic_block_filename);
|
42 |
+
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
|
43 |
+
|
44 |
+
$line_number = 0;
|
45 |
+
while (<>) {
|
46 |
+
$line_number++;
|
47 |
+
my $line = $_;
|
48 |
+
print $romanizer->quick_romanize($line, $lang_code, *ht) . "\n";
|
49 |
+
if ($line_number =~ /0000$/) {
|
50 |
+
print STDERR $line_number;
|
51 |
+
} elsif ($line_number =~ /000$/) {
|
52 |
+
print STDERR ".";
|
53 |
+
}
|
54 |
+
}
|
55 |
+
print STDERR "\n";
|
56 |
+
|
57 |
+
exit 0;
|
58 |
+
|
uroman/bin/uroman-tsv.sh
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash
|
2 |
+
# Created by Thamme Gowda on June 17, 2019
|
3 |
+
|
4 |
+
DIR=$(dirname "${BASH_SOURCE[0]}") # get the directory name
|
5 |
+
# DIR=$(realpath "${DIR}") # resolve its full path if need be
|
6 |
+
|
7 |
+
if [[ $# -lt 1 || $# -gt 2 ]]; then
|
8 |
+
>&2 echo "ERROR: invalid args"
|
9 |
+
>&2 echo "Usage: <input.tsv> [<output.tsv>]"
|
10 |
+
exit 2
|
11 |
+
fi
|
12 |
+
|
13 |
+
INP=$1
|
14 |
+
OUT=$2
|
15 |
+
|
16 |
+
CMD=$DIR/uroman.pl
|
17 |
+
|
18 |
+
function romanize(){
|
19 |
+
paste <(cut -f1 "$INP") <(cut -f2 "$INP" | "$CMD")  # quoted so paths with spaces/globs survive word splitting
|
20 |
+
}
|
21 |
+
|
22 |
+
if [[ -n $OUT ]]; then
|
23 |
+
romanize > "$OUT"  # quoted: an unquoted redirect target with spaces is an "ambiguous redirect"
|
24 |
+
else
|
25 |
+
romanize
|
26 |
+
fi
|
27 |
+
|
28 |
+
|
uroman/bin/uroman.pl
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/perl -w
|
2 |
+
|
3 |
+
# uroman Nov. 12, 2015 - Apr. 23, 2021
|
4 |
+
$version = "v1.2.8";
|
5 |
+
# Author: Ulf Hermjakob
|
6 |
+
|
7 |
+
# Usage: uroman.pl {-l [ara|bel|bul|deu|ell|eng|fas|grc|heb|kaz|kir|lav|lit|mkd|mkd2|oss|pnt|rus|srp|srp2|tur|uig|ukr|yid]} {--chart|--offset-mapping} {--no-cache} {--workset} < STDIN
|
8 |
+
# Example: cat workset.txt | uroman.pl --offset-mapping --workset
|
9 |
+
|
10 |
+
$|=1;
|
11 |
+
|
12 |
+
use FindBin;
|
13 |
+
use Cwd "abs_path";
|
14 |
+
use File::Basename qw(dirname);
|
15 |
+
use File::Spec;
|
16 |
+
|
17 |
+
my $bin_dir = abs_path(dirname($0));
|
18 |
+
my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir());
|
19 |
+
my $data_dir = File::Spec->catfile($root_dir, "data");
|
20 |
+
my $lib_dir = File::Spec->catfile($root_dir, "lib");
|
21 |
+
|
22 |
+
use lib "$FindBin::Bin/../lib";
|
23 |
+
use NLP::Chinese;
|
24 |
+
use NLP::Romanizer;
|
25 |
+
use NLP::UTF8;
|
26 |
+
use NLP::utilities;
|
27 |
+
use JSON;
|
28 |
+
$chinesePM = NLP::Chinese;
|
29 |
+
$romanizer = NLP::Romanizer;
|
30 |
+
$util = NLP::utilities;
|
31 |
+
%ht = ();
|
32 |
+
%pinyin_ht = ();
|
33 |
+
$lang_code = "";
|
34 |
+
$return_chart_p = 0;
|
35 |
+
$return_offset_mappings_p = 0;
|
36 |
+
$workset_p = 0;
|
37 |
+
$cache_rom_tokens_p = 1;
|
38 |
+
|
39 |
+
$script_data_filename = File::Spec->catfile($data_dir, "Scripts.txt");
|
40 |
+
$unicode_data_overwrite_filename = File::Spec->catfile($data_dir, "UnicodeDataOverwrite.txt");
|
41 |
+
$unicode_data_filename = File::Spec->catfile($data_dir, "UnicodeData.txt");
|
42 |
+
$romanization_table_filename = File::Spec->catfile($data_dir, "romanization-table.txt");
|
43 |
+
$chinese_tonal_pinyin_filename = File::Spec->catfile($data_dir, "Chinese_to_Pinyin.txt");
|
44 |
+
|
45 |
+
while (@ARGV) {
|
46 |
+
$arg = shift @ARGV;
|
47 |
+
if ($arg =~ /^-+(l|lc|lang-code)$/) {
|
48 |
+
$lang_code = lc (shift @ARGV || "")
|
49 |
+
} elsif ($arg =~ /^-+chart$/i) {
|
50 |
+
$return_chart_p = 1;
|
51 |
+
} elsif ($arg =~ /^-+workset$/i) {
|
52 |
+
$workset_p = 1;
|
53 |
+
} elsif ($arg =~ /^-+offset[-_]*map/i) {
|
54 |
+
$return_offset_mappings_p = 1;
|
55 |
+
} elsif ($arg =~ /^-+unicode[-_]?data/i) {
|
56 |
+
$filename = shift @ARGV;
|
57 |
+
if (-r $filename) {
|
58 |
+
$unicode_data_filename = $filename;
|
59 |
+
} else {
|
60 |
+
print STDERR "Ignoring invalid UnicodeData filename $filename\n";
|
61 |
+
}
|
62 |
+
} elsif ($arg =~ /^-+(no-tok-cach|no-cach)/i) {
|
63 |
+
$cache_rom_tokens_p = 0;
|
64 |
+
} else {
|
65 |
+
print STDERR "Ignoring unrecognized arg $arg\n";
|
66 |
+
}
|
67 |
+
}
|
68 |
+
|
69 |
+
$romanizer->load_script_data(*ht, $script_data_filename);
|
70 |
+
$romanizer->load_unicode_data(*ht, $unicode_data_filename);
|
71 |
+
$romanizer->load_unicode_overwrite_romanization(*ht, $unicode_data_overwrite_filename);
|
72 |
+
$romanizer->load_romanization_table(*ht, $romanization_table_filename);
|
73 |
+
$chinese_to_pinyin_not_yet_loaded_p = 1;
|
74 |
+
$current_date = $util->datetime("dateTtime");
|
75 |
+
$lang_code_clause = ($lang_code) ? " \"lang-code\":\"$lang_code\",\n" : "";
|
76 |
+
|
77 |
+
print "{\n \"romanizer\":\"uroman $version (Ulf Hermjakob, USC/ISI)\",\n \"date\":\"$current_date\",\n$lang_code_clause \"romanization\": [\n" if $return_chart_p;
|
78 |
+
my $line_number = 0;
|
79 |
+
my $chart_result = "";
|
80 |
+
while (<>) {
|
81 |
+
$line_number++;
|
82 |
+
my $line = $_;
|
83 |
+
my $snt_id = "";
|
84 |
+
if ($workset_p) {
|
85 |
+
next if $line =~ /^#/;
|
86 |
+
if (($i_value, $s_value) = ($line =~ /^(\S+\.\d+)\s(.*)$/)) {
|
87 |
+
$snt_id = $i_value;
|
88 |
+
$line = "$s_value\n";
|
89 |
+
} else {
|
90 |
+
next;
|
91 |
+
}
|
92 |
+
}
|
93 |
+
if ($chinese_to_pinyin_not_yet_loaded_p && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($line)) {
|
94 |
+
$chinesePM->read_chinese_tonal_pinyin_files(*pinyin_ht, $chinese_tonal_pinyin_filename);
|
95 |
+
$chinese_to_pinyin_not_yet_loaded_p = 0;
|
96 |
+
}
|
97 |
+
if ($return_chart_p) {
|
98 |
+
print $chart_result;
|
99 |
+
*chart_ht = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return chart", $line_number);
|
100 |
+
$chart_result = $romanizer->chart_to_json_romanization_elements(0, $chart_ht{N_CHARS}, *chart_ht, $line_number);
|
101 |
+
} elsif ($return_offset_mappings_p) {
|
102 |
+
($best_romanization, $offset_mappings) = $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "return offset mappings", $line_number, 0);
|
103 |
+
print "::snt-id $snt_id\n" if $workset_p;
|
104 |
+
print "::orig $line";
|
105 |
+
print "::rom $best_romanization\n";
|
106 |
+
print "::align $offset_mappings\n\n";
|
107 |
+
} elsif ($cache_rom_tokens_p) {
|
108 |
+
print $romanizer->romanize_by_token_with_caching($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
|
109 |
+
} else {
|
110 |
+
print $romanizer->romanize($line, $lang_code, "", *ht, *pinyin_ht, 0, "", $line_number) . "\n";
|
111 |
+
}
|
112 |
+
}
|
113 |
+
$chart_result =~ s/,(\s*)$/$1/;
|
114 |
+
print $chart_result;
|
115 |
+
print " ]\n}\n" if $return_chart_p;
|
116 |
+
|
117 |
+
$dev_test_p = 0;
|
118 |
+
if ($dev_test_p) {
|
119 |
+
$n_suspicious_code_points = 0;
|
120 |
+
$n_instances = 0;
|
121 |
+
foreach $char_name (sort { hex($ht{UTF_NAME_TO_UNICODE}->{$a}) <=> hex($ht{UTF_NAME_TO_UNICODE}->{$b}) }
|
122 |
+
keys %{$ht{SUSPICIOUS_ROMANIZATION}}) {
|
123 |
+
$unicode_value = $ht{UTF_NAME_TO_UNICODE}->{$char_name};
|
124 |
+
$utf8_string = $ht{UTF_NAME_TO_CODE}->{$char_name};
|
125 |
+
foreach $romanization (sort keys %{$ht{SUSPICIOUS_ROMANIZATION}->{$char_name}}) {
|
126 |
+
$count = $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization};
|
127 |
+
$s = ($count == 1) ? "" : "s";
|
128 |
+
print STDERR "*** Suspiciously lengthy romanization:\n" unless $n_suspicious_code_points;
|
129 |
+
print STDERR "::s $utf8_string ::t $romanization ::comment $char_name (U+$unicode_value)\n";
|
130 |
+
$n_suspicious_code_points++;
|
131 |
+
$n_instances += $count;
|
132 |
+
}
|
133 |
+
}
|
134 |
+
print STDERR " *** Total of $n_suspicious_code_points suspicious code points ($n_instances instance$s)\n" if $n_suspicious_code_points;
|
135 |
+
}
|
136 |
+
|
137 |
+
exit 0;
|
138 |
+
|
uroman/data/Chinese_to_Pinyin.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/data/Scripts.txt
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::script-name Aegean
|
2 |
+
::script-name Ahom
|
3 |
+
::script-name Anatolian Hieroglyph
|
4 |
+
::script-name Arabic ::direction right-to-left
|
5 |
+
::script-name Armenian
|
6 |
+
::script-name Avestan
|
7 |
+
::script-name Balinese
|
8 |
+
::script-name Bamum
|
9 |
+
::script-name Bassa Vah
|
10 |
+
::script-name Batak
|
11 |
+
::script-name Bengali ::abugida-default-vowel a
|
12 |
+
::script-name Bhaiksuki
|
13 |
+
::script-name Bopomofo ::language Chinese
|
14 |
+
::script-name Brahmi ::abugida-default-vowel a
|
15 |
+
::script-name Braille
|
16 |
+
::script-name Buginese
|
17 |
+
::script-name Buhid
|
18 |
+
::script-name Canadian Syllabics
|
19 |
+
::script-name Carian
|
20 |
+
::script-name Caucasian Albanian
|
21 |
+
::script-name Chakma
|
22 |
+
::script-name Cham
|
23 |
+
::script-name Cherokee
|
24 |
+
::script-name Coptic
|
25 |
+
::script-name Cuneiform
|
26 |
+
::script-name Cypriot
|
27 |
+
::script-name Cyrillic
|
28 |
+
::script-name CJK ::alt-script-name Chinese, Kanji ::language Chinese, Japanese, Korean, Mandarin
|
29 |
+
::script-name Deseret
|
30 |
+
::script-name Devanagari ::abugida-default-vowel a
|
31 |
+
::script-name Duployan
|
32 |
+
::script-name Egyptian Hieroglyph
|
33 |
+
::script-name Elbasan
|
34 |
+
::script-name Ethiopic
|
35 |
+
::script-name Georgian
|
36 |
+
::script-name Glagolitic
|
37 |
+
::script-name Gothic
|
38 |
+
::script-name Grantha
|
39 |
+
::script-name Greek
|
40 |
+
::script-name Gujarati ::abugida-default-vowel a
|
41 |
+
::script-name Gurmukhi ::abugida-default-vowel a
|
42 |
+
::script-name Hangul ::language Korean
|
43 |
+
::script-name Hanunoo
|
44 |
+
::script-name Hatran
|
45 |
+
::script-name Hebrew ::direction right-to-left
|
46 |
+
::script-name Hiragana ::language Japanese
|
47 |
+
::script-name Imperial Aramaic
|
48 |
+
::script-name Inscriptional Pahlavi
|
49 |
+
::script-name Inscriptional Parthian
|
50 |
+
::script-name Javanese
|
51 |
+
::script-name Kaithi
|
52 |
+
::script-name Kannada ::abugida-default-vowel a
|
53 |
+
::script-name Katakana ::language Japanese
|
54 |
+
::script-name Kayah Li
|
55 |
+
::script-name Kharoshthi
|
56 |
+
::script-name Khmer ::abugida-default-vowel a, o
|
57 |
+
::script-name Khojki
|
58 |
+
::script-name Khudawadi
|
59 |
+
::script-name Klingon
|
60 |
+
::script-name Lao
|
61 |
+
::script-name Lepcha
|
62 |
+
::script-name Latin
|
63 |
+
::script-name Limbu
|
64 |
+
::script-name Linear A
|
65 |
+
::script-name Linear B
|
66 |
+
::script-name Lycian
|
67 |
+
::script-name Lydian
|
68 |
+
::script-name Mahajani
|
69 |
+
::script-name Malayalam ::abugida-default-vowel a
|
70 |
+
::script-name Mandaic
|
71 |
+
::script-name Manichaean
|
72 |
+
::script-name Marchen
|
73 |
+
::script-name Meetei Mayek
|
74 |
+
::script-name Meroitic Cursive
|
75 |
+
::script-name Meroitic Hieroglyphic
|
76 |
+
::script-name Miao
|
77 |
+
::script-name Modi ::abugida-default-vowel a
|
78 |
+
::script-name Mongolian
|
79 |
+
::script-name Mro
|
80 |
+
::script-name Multani
|
81 |
+
::script-name Myanmar ::alt-script-name Burmese ::abugida-default-vowel a
|
82 |
+
::script-name Nabataean
|
83 |
+
::script-name New Tai Lue
|
84 |
+
::script-name Newa
|
85 |
+
::script-name Nko ::direction right-to-left
|
86 |
+
::script-name Ogham
|
87 |
+
::script-name Ol Chiki
|
88 |
+
::script-name Old Hungarian
|
89 |
+
::script-name Old Italic
|
90 |
+
::script-name Old Permic
|
91 |
+
::script-name Old Persian
|
92 |
+
::script-name Old North Arabian
|
93 |
+
::script-name Old South Arabian
|
94 |
+
::script-name Old Turkic
|
95 |
+
::script-name Oriya ::alt-script-name Odia ::abugida-default-vowel a
|
96 |
+
::script-name Osage
|
97 |
+
::script-name Osmanya
|
98 |
+
::script-name Pahawh Hmong
|
99 |
+
::script-name Palmyrene
|
100 |
+
::script-name Pau Cin Hau
|
101 |
+
::script-name Phags-pa
|
102 |
+
::script-name Phaistos Disc
|
103 |
+
::script-name Phoenician
|
104 |
+
::script-name Psalter Pahlavi
|
105 |
+
::script-name Rejang
|
106 |
+
::script-name Runic
|
107 |
+
::script-name Samaritan
|
108 |
+
::script-name Saurashtra
|
109 |
+
::script-name Sharada
|
110 |
+
::script-name Shavian
|
111 |
+
::script-name Siddham
|
112 |
+
::script-name Sinhala ::abugida-default-vowel a
|
113 |
+
::script-name Sora Sompeng
|
114 |
+
::script-name Sundanese ::abugida-default-vowel a
|
115 |
+
::script-name Syloti Nagri
|
116 |
+
::script-name Syriac
|
117 |
+
::script-name Tagalog
|
118 |
+
::script-name Tagbanwa
|
119 |
+
::script-name Tai Le
|
120 |
+
::script-name Tai Tham
|
121 |
+
::script-name Tai Viet
|
122 |
+
::script-name Takri
|
123 |
+
::script-name Tamil ::abugida-default-vowel a
|
124 |
+
::script-name Tangut
|
125 |
+
::script-name Telugu ::abugida-default-vowel a
|
126 |
+
::script-name Thaana ::direction right-to-left
|
127 |
+
::script-name Thai
|
128 |
+
::script-name Tibetan ::abugida-default-vowel a
|
129 |
+
::script-name Tifinagh
|
130 |
+
::script-name Tirhuta
|
131 |
+
::script-name Ugaritic
|
132 |
+
::script-name Vai
|
133 |
+
::script-name Vedic
|
134 |
+
::script-name Warang Citi
|
135 |
+
::script-name Yi
|
uroman/data/UnicodeData.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/data/UnicodeDataOverwrite.txt
ADDED
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## UnicodeDataOverwrite.txt
|
2 |
+
::u 00A0 ::r " " ::comment no-break space
|
3 |
+
::u 01BF ::r w ::comment ƿ Latin Character Wynn (Old English)
|
4 |
+
::u 0294 ::r ' ::comment glottal stop
|
5 |
+
::u 0295 ::r ' ::comment ʕ voiced pharyngeal fricative
|
6 |
+
::u 0305 ::r "" ::comment ̅ Combining overline
|
7 |
+
::u 0306 ::r "" ::comment ̆ Combining breve
|
8 |
+
::u 0307 ::r "" ::comment ̇ Combining dot above
|
9 |
+
::u 030A ::r "" ::comment ̊ Combining ring above
|
10 |
+
::u 030C ::r "" ::comment ̌ Combining caron
|
11 |
+
::u 0311 ::r "" ::comment ̑ Combining inverted breve
|
12 |
+
::u 031D ::r "" ::comment ̝ Combining up tack below
|
13 |
+
::u 031E ::r "" ::comment ̞ Combining down tack below
|
14 |
+
::u 031F ::r "" ::comment ̟ Combining plus sign below
|
15 |
+
::u 0323 ::r "" ::comment ̣ Combining dot below
|
16 |
+
::u 0325 ::r "" ::comment ̥ Combining ring below
|
17 |
+
::u 0329 ::r "" ::comment ̩ Combining vertical line below
|
18 |
+
::u 032A ::r "" ::comment ̪ Combining bridge below
|
19 |
+
::u 032F ::r "" ::comment ̯ Combining inverted breve below
|
20 |
+
::u 0342 ::r "" ::comment ͂ Combining Greek perispomeni (circumflex accent)
|
21 |
+
::u 0343 ::r "" ::comment ̓ Combining Greek koronis
|
22 |
+
::u 0361 ::r "" ::comment Combining double inverted breve
|
23 |
+
::u 0384 ::r "" ::comment ΄ Greek tonos
|
24 |
+
::u 0482 ::r 1000· ::comment ҂ Cyrillic thousands sign
|
25 |
+
::u 0483 ::r "" ::comment ҃ Combining Cyrillic Titlo ::annotation titlo
|
26 |
+
::u 0484 ::r "" ::comment ҄ Combining Cyrillic Palatalization ::annotation palatalization
|
27 |
+
::u 055B ::r "" ::comment ՛ Armenian emphasis mark
|
28 |
+
::u 055F ::r "" ::comment ՟ Armenian abbreviation mark ::annotation abbreviation
|
29 |
+
|
30 |
+
::u 0901 ::r +m ::comment Devanagari sign candrabindu
|
31 |
+
::u 0902 ::r +m ::comment Devanagari sign anusvara
|
32 |
+
::u 0903 ::r +h ::comment Devanagari sign visarga
|
33 |
+
::u 093D ::r ' ::comment Devanagari sign avagraha
|
34 |
+
::u 0950 ::r om ::comment ॐ Devanagari om symbol
|
35 |
+
::u 0951 ::r "" ::comment ॑ Devanagari stress sign "udatta"
|
36 |
+
::u 0952 ::r "" ::comment ॒ Devanagari stress sign "anudatta"
|
37 |
+
::u 0981 ::r +n ::comment Bengali sign candrabindu ("chôndrôbindu")
|
38 |
+
::u 0982 ::r +ng ::comment Bengali sign anusvara ("ônushar")
|
39 |
+
::u 0983 ::r +h ::comment Bengali sign visarga ("bishôrgô")
|
40 |
+
::u 099A ::r ch ::comment instead of Bengali C(A)
|
41 |
+
::u 099B ::r chh ::comment instead of Bengali CC(A)
|
42 |
+
::u 0A02 ::r +m ::comment Gurmukhi sign bindi
|
43 |
+
::u 0A70 ::r +m ::comment Gurmukhi tippi
|
44 |
+
# ::u 0A72 ::r "" ::comment Gurmukhi addak
|
45 |
+
::u 0A72 ::r "" ::comment Gurmukhi iri
|
46 |
+
::u 0A73 ::r "" ::comment Gurmukhi ura
|
47 |
+
::u 0B01 ::r +m ::comment Oriya sign candrabindu
|
48 |
+
::u 0B03 ::r +h ::comment Oriya sign visarga
|
49 |
+
::u 0B5F ::r ya ::comment ୟ Oriya letter yya
|
50 |
+
::u 0B82 ::r +m ::comment Tamil sign anusvara (not to be used?)
|
51 |
+
::u 0B83 ::r +h ::comment Tamil sign visarga ("āytam")
|
52 |
+
::u 0B9F ::r t ::comment instead of Tamil TT(A)
|
53 |
+
::u 0BA3 ::r n ::comment instead of Tamil NN(A)
|
54 |
+
::u 0BA9 ::r n ::comment instead of Tamil NNN(A)
|
55 |
+
::u 0BB1 ::r r ::comment instead of Tamil RR(A)
|
56 |
+
::u 0BB3 ::r l ::comment instead of Tamil LL(A)
|
57 |
+
::u 0BB4 ::r l ::comment instead of Tamil LLL(A)
|
58 |
+
::u 0C03 ::r +h ::comment ః Telugu sign visarga
|
59 |
+
::u 0C83 ::r +h ::comment Kannada sign visarga
|
60 |
+
::u 0D02 ::r +m ::comment Malayalam sign anusvara
|
61 |
+
::u 0D03 ::r +h ::comment Malayalam sign visarga
|
62 |
+
::u 0D82 ::r +n ::comment Sinhala sign anusvaraya
|
63 |
+
::u 0DA4 ::r ny ::comment Sinhala ඤ
|
64 |
+
::u 0DA5 ::r gn ::comment Sinhala ඥ
|
65 |
+
::u 0DCA ::r "" ::comment Sinhala sign al-lakuna (virama = no vowel)
|
66 |
+
::u 0DCF ::r aa ::comment Sinhala ා
|
67 |
+
::u 0DD0 ::r ae ::comment Sinhala ැ
|
68 |
+
::u 0DD1 ::r ae ::comment Sinhala ෑ
|
69 |
+
::u 0DD2 ::r i ::comment Sinhala ි
|
70 |
+
::u 0DD3 ::r ii ::comment Sinhala ී
|
71 |
+
::u 0DD4 ::r u ::comment Sinhala ු
|
72 |
+
::u 0DD6 ::r uu ::comment Sinhala ූ
|
73 |
+
::u 0DD8 ::r r ::comment Sinhala ෘ
|
74 |
+
::u 0DD9 ::r e ::comment Sinhala ෙ
|
75 |
+
::u 0DDA ::r ee ::comment Sinhala ේ
|
76 |
+
::u 0DDB ::r ai ::comment Sinhala ෛ
|
77 |
+
::u 0DDC ::r o ::comment Sinhala ො
|
78 |
+
::u 0DDD ::r oo ::comment Sinhala ෝ
|
79 |
+
::u 0DDE ::r au ::comment Sinhala ෞ
|
80 |
+
::u 0DDF ::r aa ::comment Sinhala ා
|
81 |
+
::u 0DF2 ::r rr ::comment Sinhala ෲ
|
82 |
+
|
83 |
+
::u 0E02 ::r k ::comment Thai character KHO KHAI
|
84 |
+
::u 0E03 ::r k ::comment Thai character KHO KHUAT
|
85 |
+
::u 0E04 ::r k ::comment Thai character KHO KHWAI
|
86 |
+
::u 0E05 ::r k ::comment Thai character KHO KHON
|
87 |
+
::u 0E06 ::r k ::comment Thai character KHO RAKHANG
|
88 |
+
::u 0E10 ::r t ::comment Thai character THO THAN
|
89 |
+
::u 0E11 ::r t ::comment Thai character THO NANGMONTHO
|
90 |
+
::u 0E12 ::r t ::comment Thai character THO PHUTHAO
|
91 |
+
::u 0E16 ::r t ::comment Thai character THO THUNG
|
92 |
+
::u 0E17 ::r t ::comment Thai character THO THAHAN
|
93 |
+
::u 0E18 ::r t ::comment Thai character THO THONG
|
94 |
+
::u 0E1C ::r p ::comment Thai character PHO PHUNG
|
95 |
+
::u 0E1E ::r p ::comment Thai character PHO PHAN
|
96 |
+
::u 0E20 ::r p ::comment Thai character PHO SAMPHAO
|
97 |
+
::u 0E2D ::r o ::comment Thai character O ANG
|
98 |
+
::u 0E2F ::r ... ::comment ฯ Thai character PAIYANNOI (ellipsis, abbreviation)
|
99 |
+
::u 0E31 ::r a ::comment Thai character MAI HAN-AKAT
|
100 |
+
::u 0E3A ::r "" ::comment Thai character PHINTHU (Pali virama)
|
101 |
+
::u 0E40 ::r e ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA E
|
102 |
+
::u 0E41 ::r ae ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AE
|
103 |
+
::u 0E42 ::r o ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA O
|
104 |
+
::u 0E43 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMUAN
|
105 |
+
::u 0E44 ::r ai ::syllable-info written-pre-consonant-spoken-post-consonant ::comment Thai character SARA AI MAIMALAI
|
106 |
+
::u 0E45 ::r "" ::comment Thai character LAKKHANGYAO vowel lengthener
|
107 |
+
::u 0E47 ::r o ::comment Thai character MAITAIKHU vowel shortener
|
108 |
+
::u 0E48 ::r "" ::tone-mark non-standard ::comment Thai tone mark MAI EK
|
109 |
+
::u 0E49 ::r "" ::tone-mark standard ::comment Thai tone mark MAI THO
|
110 |
+
::u 0E4A ::r "" ::tone-mark high ::comment Thai tone mark MAI TRI
|
111 |
+
::u 0E4B ::r "" ::tone-mark rising ::comment Thai tone mark MAI CHATTAWA
|
112 |
+
::u 0E4C ::r "" ::comment Thai character THANTHAKHAT cancellation mark (cf. virama)
|
113 |
+
::u 0E4D ::r +m ::comment ํ Thai character NIKHAHIT final nasal (cf. anusvara)
|
114 |
+
::u 0ECC ::r "" ::comment ໌ Lao cancellation mark ::annotation cancellation
|
115 |
+
::u 0F0B ::r · ::comment ་ Tibetan mark intersyllabic tsheg
|
116 |
+
::u 0F0C ::r "" ::comment ༌ Tibetan mark delimiter tsheg bstar
|
117 |
+
::u 0F84 ::r "" ::comment ྄ Tibetan halanta
|
118 |
+
::u 1036 ::r +n ::comment Myanmar sign anusvara ("auk myit")
|
119 |
+
::u 1037 ::r "" ::tone-mark creaky ::comment Myanmar sign dot below
|
120 |
+
::u 1038 ::r "" ::tone-mark high ::comment Myanmar sign visarga
|
121 |
+
|
122 |
+
::u 16A0 ::r f ::comment ᚠ RUNIC LETTER FEHU FEOH FE F
|
123 |
+
::u 16A1 ::r v ::comment ᚡ RUNIC LETTER V
|
124 |
+
::u 16A2 ::r u ::comment ᚢ RUNIC LETTER URUZ UR U
|
125 |
+
::u 16A3 ::r y ::comment ᚣ RUNIC LETTER YR
|
126 |
+
::u 16A4 ::r y ::comment ᚤ RUNIC LETTER Y
|
127 |
+
::u 16A5 ::r w ::comment ᚥ RUNIC LETTER W
|
128 |
+
::u 16A6 ::r th ::comment ᚦ RUNIC LETTER THURISAZ THURS THORN
|
129 |
+
::u 16A7 ::r th ::comment ᚧ RUNIC LETTER ETH
|
130 |
+
::u 16A8 ::r a ::comment ᚨ RUNIC LETTER ANSUZ A
|
131 |
+
::u 16A9 ::r o ::comment ᚩ RUNIC LETTER OS O
|
132 |
+
::u 16AA ::r a ::comment ᚪ RUNIC LETTER AC A
|
133 |
+
::u 16AB ::r ae ::comment ᚫ RUNIC LETTER AESC
|
134 |
+
::u 16AC ::r o ::comment ᚬ RUNIC LETTER LONG-BRANCH-OSS O
|
135 |
+
::u 16AD ::r o ::comment ᚭ RUNIC LETTER SHORT-TWIG-OSS O
|
136 |
+
::u 16AE ::r o ::comment ᚮ RUNIC LETTER O
|
137 |
+
::u 16AF ::r oe ::comment ᚯ RUNIC LETTER OE
|
138 |
+
::u 16B0 ::r on ::comment ᚰ RUNIC LETTER ON
|
139 |
+
::u 16B1 ::r r ::comment ᚱ RUNIC LETTER RAIDO RAD REID R
|
140 |
+
::u 16B2 ::r k ::comment ᚲ RUNIC LETTER KAUNA
|
141 |
+
::u 16B3 ::r c ::comment ᚳ RUNIC LETTER CEN
|
142 |
+
::u 16B4 ::r k ::comment ᚴ RUNIC LETTER KAUN K
|
143 |
+
::u 16B5 ::r g ::comment ᚵ RUNIC LETTER G
|
144 |
+
::u 16B6 ::r ng ::comment ᚶ RUNIC LETTER ENG
|
145 |
+
::u 16B7 ::r g ::comment ᚷ RUNIC LETTER GEBO GYFU G
|
146 |
+
::u 16B8 ::r g ::comment ᚸ RUNIC LETTER GAR
|
147 |
+
::u 16B9 ::r w ::comment ᚹ RUNIC LETTER WUNJO WYNN W
|
148 |
+
::u 16BA ::r h ::comment ᚺ RUNIC LETTER HAGLAZ H
|
149 |
+
::u 16BB ::r h ::comment ᚻ RUNIC LETTER HAEGL H
|
150 |
+
::u 16BC ::r h ::comment ᚼ RUNIC LETTER LONG-BRANCH-HAGALL H
|
151 |
+
::u 16BD ::r h ::comment ᚽ RUNIC LETTER SHORT-TWIG-HAGALL H
|
152 |
+
::u 16BE ::r n ::comment ᚾ RUNIC LETTER NAUDIZ NYD NAUD N
|
153 |
+
::u 16BF ::r n ::comment ᚿ RUNIC LETTER SHORT-TWIG-NAUD N
|
154 |
+
::u 16C0 ::r n ::comment ᛀ RUNIC LETTER DOTTED-N
|
155 |
+
::u 16C1 ::r i ::comment ᛁ RUNIC LETTER ISAZ IS ISS I
|
156 |
+
::u 16C2 ::r e ::comment ᛂ RUNIC LETTER E
|
157 |
+
::u 16C3 ::r j ::comment ᛃ RUNIC LETTER JERAN J
|
158 |
+
::u 16C4 ::r j ::comment ᛄ RUNIC LETTER GER
|
159 |
+
::u 16C5 ::r ae ::comment ᛅ RUNIC LETTER LONG-BRANCH-AR AE
|
160 |
+
::u 16C6 ::r a ::comment ᛆ RUNIC LETTER SHORT-TWIG-AR A
|
161 |
+
::u 16C7 ::r i ::comment ᛇ RUNIC LETTER IWAZ EOH
|
162 |
+
::u 16C8 ::r p ::comment ᛈ RUNIC LETTER PERTHO PEORTH P
|
163 |
+
::u 16C9 ::r z ::comment ᛉ RUNIC LETTER ALGIZ EOLHX
|
164 |
+
::u 16CA ::r s ::comment ᛊ RUNIC LETTER SOWILO S
|
165 |
+
::u 16CB ::r s ::comment ᛋ RUNIC LETTER SIGEL LONG-BRANCH-SOL S
|
166 |
+
::u 16CC ::r s ::comment ᛌ RUNIC LETTER SHORT-TWIG-SOL S
|
167 |
+
::u 16CD ::r c ::comment ᛍ RUNIC LETTER C
|
168 |
+
::u 16CE ::r z ::comment ᛎ RUNIC LETTER Z
|
169 |
+
::u 16CF ::r t ::comment ᛏ RUNIC LETTER TIWAZ TIR TYR T
|
170 |
+
::u 16D0 ::r t ::comment ᛐ RUNIC LETTER SHORT-TWIG-TYR T
|
171 |
+
::u 16D1 ::r d ::comment ᛑ RUNIC LETTER D
|
172 |
+
::u 16D2 ::r b ::comment ᛒ RUNIC LETTER BERKANAN BEORC BJARKAN B
|
173 |
+
::u 16D3 ::r b ::comment ᛓ RUNIC LETTER SHORT-TWIG-BJARKAN B
|
174 |
+
::u 16D4 ::r p ::comment ᛔ RUNIC LETTER DOTTED-P
|
175 |
+
::u 16D5 ::r p ::comment ᛕ RUNIC LETTER OPEN-P
|
176 |
+
::u 16D6 ::r e ::comment ᛖ RUNIC LETTER EHWAZ EH E
|
177 |
+
::u 16D7 ::r m ::comment ᛗ RUNIC LETTER MANNAZ MAN M
|
178 |
+
::u 16D8 ::r m ::comment ᛘ RUNIC LETTER LONG-BRANCH-MADR M
|
179 |
+
::u 16D9 ::r m ::comment ᛙ RUNIC LETTER SHORT-TWIG-MADR M
|
180 |
+
::u 16DA ::r l ::comment ᛚ RUNIC LETTER LAUKAZ LAGU LOGR L
|
181 |
+
::u 16DB ::r l ::comment ᛛ RUNIC LETTER DOTTED-L
|
182 |
+
::u 16DC ::r ng ::comment ᛜ RUNIC LETTER INGWAZ
|
183 |
+
::u 16DD ::r ng ::comment ᛝ RUNIC LETTER ING
|
184 |
+
::u 16DE ::r d ::comment ᛞ RUNIC LETTER DAGAZ DAEG D
|
185 |
+
::u 16DF ::r o ::comment ᛟ RUNIC LETTER OTHALAN ETHEL O
|
186 |
+
::u 16E0 ::r ea ::comment ᛠ RUNIC LETTER EAR
|
187 |
+
::u 16E1 ::r io ::comment ᛡ RUNIC LETTER IOR
|
188 |
+
::u 16E2 ::r q ::comment ᛢ RUNIC LETTER CWEORTH
|
189 |
+
::u 16E3 ::r k ::comment ᛣ RUNIC LETTER CALC
|
190 |
+
::u 16E4 ::r k ::comment ᛤ RUNIC LETTER CEALC
|
191 |
+
::u 16E5 ::r st ::comment ᛥ RUNIC LETTER STAN
|
192 |
+
::u 16E6 ::r r ::comment ᛦ RUNIC LETTER LONG-BRANCH-YR
|
193 |
+
::u 16E7 ::r r ::comment ᛧ RUNIC LETTER SHORT-TWIG-YR
|
194 |
+
::u 16E8 ::r r ::comment ᛨ RUNIC LETTER ICELANDIC-YR
|
195 |
+
::u 16E9 ::r q ::comment ᛩ RUNIC LETTER Q
|
196 |
+
::u 16EA ::r x ::comment ᛪ RUNIC LETTER X
|
197 |
+
|
198 |
+
::u 17B9 ::r oe ::comment Khmer vowel sign y (short)
|
199 |
+
::u 17BA ::r oe ::comment Khmer vowel sign yy (long)
|
200 |
+
::u 17C6 ::r +m ::comment Khmer sign nikahit (cf. anusvara)
|
201 |
+
::u 17C7 ::r +h ::comment Khmer sign reahmuk (cf. visarga)
|
202 |
+
::u 17C8 ::r ' ::comment Khmer sign yuukaleapintu (short vowel and glottal stop)
|
203 |
+
::u 17C9 ::r "" ::comment Khmer sign muusikatoan: changes the second register to the first
|
204 |
+
::u 17CA ::r "" ::comment Khmer sign triisap: changes the first register to the second
|
205 |
+
::u 17CB ::r "" ::comment Khmer sign bantoc (vowel shortener)
|
206 |
+
::u 17D2 ::r "" ::comment Khmer sign coeng (foot/subscript, cf. virama = no vowel)
|
207 |
+
::u 17D5 ::r . ::comment Khmer sign bariyoosan; period ending entire text or chapter
|
208 |
+
|
209 |
+
::u 180E ::r ' ::comment Mongolian vowel separator
|
210 |
+
|
211 |
+
::u 1B80 ::r +ng ::comment ᮀ Sundanese sign panyecek
|
212 |
+
::u 1B81 ::r +r ::comment ᮁ Sundanese sign panglayar
|
213 |
+
::u 1B82 ::r +h ::comment ᮂ Sundanese sign pangwisad
|
214 |
+
::u 1BA1 ::r ya ::comment ᮡ Sundanese consonant sign pamingkal
|
215 |
+
::u 1BA2 ::r ra ::comment ᮢ Sundanese consonant sign panyakr
|
216 |
+
::u 1BA3 ::r la ::comment ᮣ Sundanese consonant sign panyiku
|
217 |
+
::u 1BA4 ::r i ::comment ᮤ Sundanese consonant sign panghulu
|
218 |
+
::u 1BA5 ::r u ::comment ᮥ Sundanese consonant sign panyuku
|
219 |
+
::u 1BA6 ::r e ::comment ᮦ Sundanese vowel sign panaelaeng
|
220 |
+
::u 1BA7 ::r o ::comment ᮧ Sundanese vowel sign panolong
|
221 |
+
::u 1BA8 ::r e ::comment ᮨ Sundanese vowel sign pamepet
|
222 |
+
::u 1BA9 ::r eu ::comment ᮩ Sundanese vowel sign paneuleung
|
223 |
+
::u 1BAA ::r "" ::comment ᮪ Sundanese sign pamaaeh or patén (no vowel/virama)
|
224 |
+
|
225 |
+
::u 1FBD ::r "" ::comment ᾽ Greek koronis
|
226 |
+
::u 1FFE ::r "" ::comment Greek dasia (rough breathing)
|
227 |
+
|
228 |
+
::u 2002 ::r " " ::comment en space
|
229 |
+
::u 2003 ::r " " ::comment em space
|
230 |
+
::u 2004 ::r " " ::comment three-per-em space
|
231 |
+
::u 2005 ::r " " ::comment four-per-em space
|
232 |
+
::u 2006 ::r " " ::comment six-per-em space
|
233 |
+
::u 2007 ::r " " ::comment figure space
|
234 |
+
::u 2008 ::r " " ::comment punctuation space
|
235 |
+
::u 2009 ::r " " ::comment thin space
|
236 |
+
::u 200A ::r " " ::comment hair space
|
237 |
+
::u 202F ::r " " ::comment narrow no-break space
|
238 |
+
|
239 |
+
::u 2D30 ::r a ::comment TIFINAGH LETTER YA ⴰ
|
240 |
+
::u 2D31 ::r b ::comment TIFINAGH LETTER YAB ⴱ
|
241 |
+
::u 2D32 ::r bh ::comment TIFINAGH LETTER YABH ⴲ
|
242 |
+
::u 2D33 ::r g ::comment TIFINAGH LETTER YAG ⴳ
|
243 |
+
::u 2D34 ::r ghh ::comment TIFINAGH LETTER YAGHH ⴴ
|
244 |
+
::u 2D35 ::r j ::comment TIFINAGH LETTER BERBER ACADEMY YAJ ⴵ
|
245 |
+
::u 2D36 ::r j ::comment TIFINAGH LETTER YAJ ⴶ
|
246 |
+
::u 2D37 ::r d ::comment TIFINAGH LETTER YAD ⴷ
|
247 |
+
::u 2D38 ::r dh ::comment TIFINAGH LETTER YADH ⴸ
|
248 |
+
::u 2D39 ::r dd ::comment TIFINAGH LETTER YADD ⴹ
|
249 |
+
::u 2D3A ::r ddh ::comment TIFINAGH LETTER YADDH ⴺ
|
250 |
+
::u 2D3B ::r e ::comment TIFINAGH LETTER YEY ⴻ
|
251 |
+
::u 2D3C ::r f ::comment TIFINAGH LETTER YAF ⴼ
|
252 |
+
::u 2D3D ::r k ::comment TIFINAGH LETTER YAK ⴽ
|
253 |
+
::u 2D3E ::r k ::comment TIFINAGH LETTER TUAREG YAK ⴾ
|
254 |
+
::u 2D3F ::r khh ::comment TIFINAGH LETTER YAKHH ⴿ
|
255 |
+
::u 2D40 ::r h ::comment TIFINAGH LETTER YAH ⵀ
|
256 |
+
::u 2D41 ::r h ::comment TIFINAGH LETTER BERBER ACADEMY YAH ⵁ
|
257 |
+
::u 2D42 ::r h ::comment TIFINAGH LETTER TUAREG YAH ⵂ
|
258 |
+
::u 2D43 ::r hh ::comment TIFINAGH LETTER YAHH ⵃ
|
259 |
+
::u 2D44 ::r ' ::comment TIFINAGH LETTER YAA ⵄ
|
260 |
+
::u 2D45 ::r kh ::comment TIFINAGH LETTER YAKH ⵅ
|
261 |
+
::u 2D46 ::r kh ::comment TIFINAGH LETTER TUAREG YAKH ⵆ
|
262 |
+
::u 2D47 ::r q ::comment TIFINAGH LETTER YAQ ⵇ
|
263 |
+
::u 2D48 ::r q ::comment TIFINAGH LETTER TUAREG YAQ ⵈ
|
264 |
+
::u 2D49 ::r i ::comment TIFINAGH LETTER YI ⵉ
|
265 |
+
::u 2D4A ::r zh ::comment TIFINAGH LETTER YAZH ⵊ
|
266 |
+
::u 2D4B ::r zh ::comment TIFINAGH LETTER AHAGGAR YAZH ⵋ
|
267 |
+
::u 2D4C ::r zh ::comment TIFINAGH LETTER TUAREG YAZH ⵌ
|
268 |
+
::u 2D4D ::r l ::comment TIFINAGH LETTER YAL ⵍ
|
269 |
+
::u 2D4E ::r m ::comment TIFINAGH LETTER YAM ⵎ
|
270 |
+
::u 2D4F ::r n ::comment TIFINAGH LETTER YAN ⵏ
|
271 |
+
::u 2D50 ::r gn ::comment TIFINAGH LETTER TUAREG YAGN ⵐ
|
272 |
+
::u 2D51 ::r ng ::comment TIFINAGH LETTER TUAREG YANG ⵑ
|
273 |
+
::u 2D52 ::r p ::comment TIFINAGH LETTER YAP ⵒ
|
274 |
+
::u 2D53 ::r u ::comment TIFINAGH LETTER YU ⵓ
|
275 |
+
::u 2D54 ::r r ::comment TIFINAGH LETTER YAR ⵔ
|
276 |
+
::u 2D55 ::r rr ::comment TIFINAGH LETTER YARR ⵕ
|
277 |
+
::u 2D56 ::r gh ::comment TIFINAGH LETTER YAGH ⵖ
|
278 |
+
::u 2D57 ::r gh ::comment TIFINAGH LETTER TUAREG YAGH ⵗ
|
279 |
+
::u 2D58 ::r gh ::comment TIFINAGH LETTER AYER YAGH ⵘ
|
280 |
+
::u 2D59 ::r s ::comment TIFINAGH LETTER YAS ⵙ
|
281 |
+
::u 2D5A ::r ss ::comment TIFINAGH LETTER YASS ⵚ
|
282 |
+
::u 2D5B ::r sh ::comment TIFINAGH LETTER YASH ⵛ
|
283 |
+
::u 2D5C ::r t ::comment TIFINAGH LETTER YAT ⵜ
|
284 |
+
::u 2D5D ::r th ::comment TIFINAGH LETTER YATH ⵝ
|
285 |
+
::u 2D5E ::r ch ::comment TIFINAGH LETTER YACH ⵞ
|
286 |
+
::u 2D5F ::r tt ::comment TIFINAGH LETTER YATT ⵟ
|
287 |
+
::u 2D60 ::r v ::comment TIFINAGH LETTER YAV ⵠ
|
288 |
+
::u 2D61 ::r w ::comment TIFINAGH LETTER YAW ⵡ
|
289 |
+
::u 2D62 ::r y ::comment TIFINAGH LETTER YAY ⵢ
|
290 |
+
::u 2D63 ::r z ::comment TIFINAGH LETTER YAZ ⵣ
|
291 |
+
::u 2D64 ::r z ::comment TIFINAGH LETTER TAWELLEMET YAZ ⵤ
|
292 |
+
::u 2D65 ::r zz ::comment TIFINAGH LETTER YAZZ ⵥ
|
293 |
+
::u 2D66 ::r ye ::comment TIFINAGH LETTER YE ⵦ
|
294 |
+
::u 2D67 ::r yo ::comment TIFINAGH LETTER YO ⵧ
|
295 |
+
::u 2D6F ::r "" ::comment TIFINAGH MODIFIER LETTER LABIALIZATION MARK ⵯ
|
296 |
+
::u 2D70 ::r "" ::comment TIFINAGH SEPARATOR MARK ⵰
|
297 |
+
::u 2D7F ::r "" ::comment TIFINAGH CONSONANT JOINER ⵿
|
298 |
+
|
299 |
+
::u 3063 ::r tsu ::comment Hiragana letter small tsu
|
300 |
+
::u 30C3 ::r tsu ::comment Katakana letter small tsu
|
301 |
+
|
302 |
+
::u ABE3 ::r o ::comment ꯣ Meetei Mayek vowel sign onap
|
303 |
+
::u ABE7 ::r ou ::comment ꯧ Meetei Mayek vowel sign sounap
|
304 |
+
|
305 |
+
::u F008 ::r "" ::comment Yoruba diacritic in private use area
|
306 |
+
::u F00F ::r "" ::comment Yoruba diacritic in private use area
|
307 |
+
::u F023 ::r "" ::comment Yoruba diacritic in private use area
|
308 |
+
::u F025 ::r "" ::comment Yoruba diacritic in private use area
|
309 |
+
|
310 |
+
::u F8D0 ::r a ::name KLINGON LETTER A
|
311 |
+
::u F8D1 ::r b ::name KLINGON LETTER B
|
312 |
+
::u F8D2 ::r ch ::name KLINGON LETTER CH
|
313 |
+
::u F8D3 ::r D ::name KLINGON LETTER D
|
314 |
+
::u F8D4 ::r e ::name KLINGON LETTER E
|
315 |
+
::u F8D5 ::r gh ::name KLINGON LETTER GH
|
316 |
+
::u F8D6 ::r H ::name KLINGON LETTER H
|
317 |
+
::u F8D7 ::r I ::name KLINGON LETTER I
|
318 |
+
::u F8D8 ::r j ::name KLINGON LETTER J
|
319 |
+
::u F8D9 ::r l ::name KLINGON LETTER L
|
320 |
+
::u F8DA ::r m ::name KLINGON LETTER M
|
321 |
+
::u F8DB ::r n ::name KLINGON LETTER N
|
322 |
+
::u F8DC ::r ng ::name KLINGON LETTER NG
|
323 |
+
::u F8DD ::r o ::name KLINGON LETTER O
|
324 |
+
::u F8DE ::r p ::name KLINGON LETTER P
|
325 |
+
::u F8DF ::r q ::name KLINGON LETTER Q
|
326 |
+
::u F8E0 ::r Q ::name KLINGON LETTER Q
|
327 |
+
::u F8E1 ::r r ::name KLINGON LETTER R
|
328 |
+
::u F8E2 ::r S ::name KLINGON LETTER S
|
329 |
+
::u F8E3 ::r t ::name KLINGON LETTER T
|
330 |
+
::u F8E4 ::r tlh ::name KLINGON LETTER TLH
|
331 |
+
::u F8E5 ::r u ::name KLINGON LETTER U
|
332 |
+
::u F8E6 ::r v ::name KLINGON LETTER V
|
333 |
+
::u F8E7 ::r w ::name KLINGON LETTER W
|
334 |
+
::u F8E8 ::r y ::name KLINGON LETTER Y
|
335 |
+
::u F8E9 ::r ' ::name KLINGON LETTER GLOTTAL STOP
|
336 |
+
::u F8F0 ::num 0 ::name KLINGON DIGIT ZERO
|
337 |
+
::u F8F1 ::num 1 ::name KLINGON DIGIT ONE
|
338 |
+
::u F8F2 ::num 2 ::name KLINGON DIGIT TWO
|
339 |
+
::u F8F3 ::num 3 ::name KLINGON DIGIT THREE
|
340 |
+
::u F8F4 ::num 4 ::name KLINGON DIGIT FOUR
|
341 |
+
::u F8F5 ::num 5 ::name KLINGON DIGIT FIVE
|
342 |
+
::u F8F6 ::num 6 ::name KLINGON DIGIT SIX
|
343 |
+
::u F8F7 ::num 7 ::name KLINGON DIGIT SEVEN
|
344 |
+
::u F8F8 ::num 8 ::name KLINGON DIGIT EIGHT
|
345 |
+
::u F8F9 ::num 9 ::name KLINGON DIGIT NINE
|
346 |
+
::u F8FD ::r , ::name KLINGON COMMA
|
347 |
+
::u F8FE ::r . ::name KLINGON FULL STOP
|
348 |
+
::u F8FF ::name KLINGON MUMMIFICATION GLYPH
|
349 |
+
|
350 |
+
::u 1163D ::r +m ::comment Modi sign anusvara
|
351 |
+
::u 1163E ::r +h ::comment Modi sign visarga
|
352 |
+
|
353 |
+
::u 13068 ::num 1000000 ::comment Egyptian Hieroglyph
|
354 |
+
::u 1308B ::r r ::comment Egyptian Hieroglyph ::pic mouth
|
355 |
+
::u 1309D ::r ' ::comment Egyptian Hieroglyph (ayn) ::pic forearm
|
356 |
+
::u 130A7 ::r d ::comment Egyptian Hieroglyph ::pic hand
|
357 |
+
::u 130AD ::num 10000 ::comment Egyptian Hieroglyph
|
358 |
+
::u 130AE ::num 20000 ::comment Egyptian Hieroglyph
|
359 |
+
::u 130AF ::num 30000 ::comment Egyptian Hieroglyph
|
360 |
+
::u 130B0 ::num 40000 ::comment Egyptian Hieroglyph
|
361 |
+
::u 130B1 ::num 50000 ::comment Egyptian Hieroglyph
|
362 |
+
::u 130B2 ::num 60000 ::comment Egyptian Hieroglyph
|
363 |
+
::u 130B3 ::num 70000 ::comment Egyptian Hieroglyph
|
364 |
+
::u 130B4 ::num 80000 ::comment Egyptian Hieroglyph
|
365 |
+
::u 130B5 ::num 90000 ::comment Egyptian Hieroglyph
|
366 |
+
::u 130B6 ::num 50000 ::comment Egyptian Hieroglyph
|
367 |
+
::u 130C0 ::r b ::comment Egyptian Hieroglyph ::pic foot
|
368 |
+
::u 130ED ::r l ::comment Egyptian Hieroglyph [also rw] ::pic lion recumbent
|
369 |
+
::u 13121 ::r h ::comment Egyptian Hieroglyph (f-underscore) ::pic aninal's belly and udder
|
370 |
+
::u 1313F ::r a ::comment Egyptian Hieroglyph (alef) ::pic vulture
|
371 |
+
::u 13153 ::r m ::comment Egyptian Hieroglyph ::pic owl
|
372 |
+
::u 13171 ::r w ::comment Egyptian Hieroglyph ::pic quail chick
|
373 |
+
::u 13187 ::r ::comment Egyptian Hieroglyph (determinative/son) H8 ::pic egg
|
374 |
+
::u 13190 ::num 100000 ::comment Egyptian Hieroglyph
|
375 |
+
::u 13191 ::r f ::comment Egyptian Hieroglyph ::pic horned viper
|
376 |
+
::u 13193 ::r d ::comment Egyptian Hieroglyph (J) ::pic cobra
|
377 |
+
::u 131BC ::num 1000 ::comment Egyptian Hieroglyph
|
378 |
+
::u 131BD ::num 2000 ::comment Egyptian Hieroglyph
|
379 |
+
::u 131BE ::num 3000 ::comment Egyptian Hieroglyph
|
380 |
+
::u 131BF ::num 4000 ::comment Egyptian Hieroglyph
|
381 |
+
::u 131C0 ::num 5000 ::comment Egyptian Hieroglyph
|
382 |
+
::u 131C1 ::num 6000 ::comment Egyptian Hieroglyph
|
383 |
+
::u 131C2 ::num 7000 ::comment Egyptian Hieroglyph
|
384 |
+
::u 131C3 ::num 8000 ::comment Egyptian Hieroglyph
|
385 |
+
::u 131C4 ::num 9000 ::comment Egyptian Hieroglyph
|
386 |
+
::u 131CB ::r i ::comment Egyptian Hieroglyph (yod) ::pic single reed
|
387 |
+
::u 131CC ::r y ::comment Egyptian Hieroglyph ::pic double reed
|
388 |
+
::u 1320E ::r q ::comment Egyptian Hieroglyph (qaf) ::pic sandy slope
|
389 |
+
::u 13209 ::comment Egyptian Hieroglyph ::pic desert hills
|
390 |
+
::u 13216 ::r n ::comment Egyptian Hieroglyph ::pic ripple of water
|
391 |
+
::u 13219 ::r sh ::comment Egyptian Hieroglyph (š) ::pic basin
|
392 |
+
::u 13254 ::r h ::comment Egyptian Hieroglyph ::pic reed shelter
|
393 |
+
::u 13283 ::r z ::comment Egyptian Hieroglyph [also S?] ::pic door bolt
|
394 |
+
::u 132AA ::r p ::comment Egyptian Hieroglyph ::pic stool
|
395 |
+
::u 132D4 ::r n ::comment Egyptian Hieroglyph ::pic red crown
|
396 |
+
::u 132F4 ::r s ::comment Egyptian Hieroglyph [also Z?] ::pic folded cloth
|
397 |
+
::u 13319 ::comment Egyptian Hieroglyph ::pic throw stick
|
398 |
+
::u 13362 ::num 100 ::comment Egyptian Hieroglyph
|
399 |
+
::u 13363 ::num 200 ::comment Egyptian Hieroglyph
|
400 |
+
::u 13364 ::num 300 ::comment Egyptian Hieroglyph
|
401 |
+
::u 13365 ::num 400 ::comment Egyptian Hieroglyph
|
402 |
+
::u 13366 ::num 500 ::comment Egyptian Hieroglyph
|
403 |
+
::u 13367 ::num 600 ::comment Egyptian Hieroglyph
|
404 |
+
::u 13368 ::num 700 ::comment Egyptian Hieroglyph
|
405 |
+
::u 13369 ::num 800 ::comment Egyptian Hieroglyph
|
406 |
+
::u 1336A ::num 900 ::comment Egyptian Hieroglyph
|
407 |
+
::u 1336B ::num 500 ::comment Egyptian Hieroglyph
|
408 |
+
::u 1336F ::r o ::comment Egyptian Hieroglyph ::pic lasso
|
409 |
+
::u 1337F ::r t ::comment Egyptian Hieroglyph (ṯ) ::pic hobble
|
410 |
+
::u 13386 ::num 10 ::comment Egyptian Hieroglyph
|
411 |
+
::u 13387 ::num 20 ::comment Egyptian Hieroglyph
|
412 |
+
::u 13388 ::num 30 ::comment Egyptian Hieroglyph
|
413 |
+
::u 13389 ::num 40 ::comment Egyptian Hieroglyph
|
414 |
+
::u 1338A ::num 50 ::comment Egyptian Hieroglyph
|
415 |
+
::u 1338B ::num 60 ::comment Egyptian Hieroglyph
|
416 |
+
::u 1338C ::num 70 ::comment Egyptian Hieroglyph
|
417 |
+
::u 1338D ::num 80 ::comment Egyptian Hieroglyph
|
418 |
+
::u 1338E ::num 90 ::comment Egyptian Hieroglyph
|
419 |
+
::u 1338F ::num 20 ::comment Egyptian Hieroglyph
|
420 |
+
::u 13390 ::num 30 ::comment Egyptian Hieroglyph
|
421 |
+
::u 13391 ::num 40 ::comment Egyptian Hieroglyph
|
422 |
+
::u 13392 ::num 50 ::comment Egyptian Hieroglyph
|
423 |
+
::u 1339B ::r h ::comment Egyptian Hieroglyph ::pic twisted flax
|
424 |
+
::u 133A1 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle
|
425 |
+
::u 133A2 ::r k ::comment Egyptian Hieroglyph ::pic basket with handle, variant
|
426 |
+
::u 133A4 ::r g ::comment Egyptian Hieroglyph ::pic bag
|
427 |
+
::u 133BC ::r g ::comment Egyptian Hieroglyph ::pic stand
|
428 |
+
::u 133CF ::r t ::comment Egyptian Hieroglyph ::pic loaf
|
429 |
+
::u 133ED ::r y ::comment Egyptian Hieroglyph ::pic two strokes
|
430 |
+
::u 133F2 ::r w ::comment Egyptian Hieroglyph ::pic quail chick, hieratic variant
|
431 |
+
::u 133FA ::num 1 ::comment Egyptian Hieroglyph
|
432 |
+
::u 133FB ::num 2 ::comment Egyptian Hieroglyph
|
433 |
+
::u 133FC ::num 3 ::comment Egyptian Hieroglyph
|
434 |
+
::u 133FD ::num 4 ::comment Egyptian Hieroglyph
|
435 |
+
::u 133FE ::num 5 ::comment Egyptian Hieroglyph
|
436 |
+
::u 133FF ::num 6 ::comment Egyptian Hieroglyph
|
437 |
+
::u 13400 ::num 7 ::comment Egyptian Hieroglyph
|
438 |
+
::u 13401 ::num 8 ::comment Egyptian Hieroglyph
|
439 |
+
::u 13402 ::num 9 ::comment Egyptian Hieroglyph
|
440 |
+
::u 13403 ::num 5 ::comment Egyptian Hieroglyph
|
441 |
+
::u 1340D ::r kh ::comment Egyptian Hieroglyph (ḫ, khah) ::pic placenta?
|
442 |
+
::u 1341D ::r m ::comment Egyptian Hieroglyph (also jm)
|
uroman/data/romanization-table-arabic-block.txt
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::s ، ::t , ::comment ARABIC COMMA
|
2 |
+
::s ؛ ::t ; ::comment ARABIC SEMICOLON
|
3 |
+
::s ؟ ::t ? ::comment ARABIC QUESTION MARK
|
4 |
+
::s ء ::t ' ::comment ARABIC LETTER HAMZA
|
5 |
+
::s آ ::t a ::comment ARABIC LETTER ALEF WITH MADDA ABOVE
|
6 |
+
::s أ ::t a ::comment ARABIC LETTER ALEF WITH HAMZA ABOVE
|
7 |
+
::s ؤ ::t w ::comment ARABIC LETTER WAW WITH HAMZA ABOVE
|
8 |
+
::s إ ::t i ::comment ARABIC LETTER ALEF WITH HAMZA BELOW
|
9 |
+
::s ئ ::t ye ::comment ARABIC LETTER YEH WITH HAMZA ABOVE
|
10 |
+
::s ا ::t a ::comment ARABIC LETTER ALEF
|
11 |
+
::s ب ::t b ::comment ARABIC LETTER BEH
|
12 |
+
::s ة ::t a ::comment ARABIC LETTER TEH MARBUTA
|
13 |
+
::s ت ::t t ::comment ARABIC LETTER TEH
|
14 |
+
::s ث ::t th ::comment ARABIC LETTER THEH
|
15 |
+
::s ج ::t j ::comment ARABIC LETTER JEEM
|
16 |
+
::s ح ::t h ::comment ARABIC LETTER HAH
|
17 |
+
::s خ ::t kh ::comment ARABIC LETTER KHAH
|
18 |
+
::s د ::t d ::comment ARABIC LETTER DAL
|
19 |
+
::s ذ ::t th ::comment ARABIC LETTER THAL
|
20 |
+
::s ر ::t r ::comment ARABIC LETTER REH
|
21 |
+
::s ز ::t z ::comment ARABIC LETTER ZAIN
|
22 |
+
::s س ::t s ::comment ARABIC LETTER SEEN
|
23 |
+
::s ش ::t sh ::comment ARABIC LETTER SHEEN
|
24 |
+
::s ص ::t s ::comment ARABIC LETTER SAD
|
25 |
+
::s ض ::t d ::comment ARABIC LETTER DAD
|
26 |
+
::s ط ::t t ::comment ARABIC LETTER TAH
|
27 |
+
::s ظ ::t z ::comment ARABIC LETTER ZAH
|
28 |
+
::s ع ::t ' ::comment ARABIC LETTER AIN
|
29 |
+
::s غ ::t gh ::comment ARABIC LETTER GHAIN
|
30 |
+
::s ـ ::t - ::comment ARABIC TATWEEL
|
31 |
+
::s ف ::t f ::comment ARABIC LETTER FEH
|
32 |
+
::s ق ::t q ::comment ARABIC LETTER QAF
|
33 |
+
::s ك ::t k ::comment ARABIC LETTER KAF
|
34 |
+
::s ل ::t l ::comment ARABIC LETTER LAM
|
35 |
+
::s م ::t m ::comment ARABIC LETTER MEEM
|
36 |
+
::s ن ::t n ::comment ARABIC LETTER NOON
|
37 |
+
::s ه ::t h ::comment ARABIC LETTER HEH
|
38 |
+
::s و ::t w ::comment ARABIC LETTER WAW
|
39 |
+
::s ى ::t a ::comment ARABIC LETTER ALEF MAKSURA
|
40 |
+
::s ي ::t y ::comment ARABIC LETTER YEH
|
41 |
+
::s َ ::t a ::comment ARABIC FATHA
|
42 |
+
::s ُ ::t u ::comment ARABIC DAMMA
|
43 |
+
::s ِ ::t i ::comment ARABIC KASRA
|
44 |
+
::s ْ ::t ::comment ARABIC SUKUN
|
45 |
+
::s ٔ ::t ' ::comment ARABIC HAMZA ABOVE
|
46 |
+
::s ٕ ::t ' ::comment ARABIC HAMZA BELOW
|
47 |
+
::s ٠ ::t 0 ::comment ARABIC-INDIC DIGIT ZERO
|
48 |
+
::s ١ ::t 1 ::comment ARABIC-INDIC DIGIT ONE
|
49 |
+
::s ٢ ::t 2 ::comment ARABIC-INDIC DIGIT TWO
|
50 |
+
::s ٣ ::t 3 ::comment ARABIC-INDIC DIGIT THREE
|
51 |
+
::s ٤ ::t 4 ::comment ARABIC-INDIC DIGIT FOUR
|
52 |
+
::s ٥ ::t 5 ::comment ARABIC-INDIC DIGIT FIVE
|
53 |
+
::s ٦ ::t 6 ::comment ARABIC-INDIC DIGIT SIX
|
54 |
+
::s ٧ ::t 7 ::comment ARABIC-INDIC DIGIT SEVEN
|
55 |
+
::s ٨ ::t 8 ::comment ARABIC-INDIC DIGIT EIGHT
|
56 |
+
::s ٩ ::t 9 ::comment ARABIC-INDIC DIGIT NINE
|
57 |
+
::s ٪ ::t % ::comment ARABIC PERCENT SIGN
|
58 |
+
::s ٫ ::t , ::comment ARABIC DECIMAL SEPARATOR
|
59 |
+
::s ٬ ::t , ::comment ARABIC THOUSANDS SEPARATOR
|
60 |
+
::s ٮ ::t b ::comment ARABIC LETTER DOTLESS BEH
|
61 |
+
::s ٯ ::t q ::comment ARABIC LETTER DOTLESS QAF
|
62 |
+
::s ٰ ::t a ::comment ARABIC LETTER SUPERSCRIPT ALEF
|
63 |
+
::s ٱ ::t a ::comment ARABIC LETTER ALEF WASLA
|
64 |
+
::s ٲ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA ABOVE
|
65 |
+
::s ٳ ::t a ::comment ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
|
66 |
+
::s ٷ ::t u ::comment ARABIC LETTER U WITH HAMZA ABOVE
|
67 |
+
::s ٹ ::t tt ::comment ARABIC LETTER TTEH
|
68 |
+
::s ٺ ::t tt ::comment ARABIC LETTER TTEHEH
|
69 |
+
::s ٻ ::t b ::comment ARABIC LETTER BEEH
|
70 |
+
::s ټ ::t t ::comment ARABIC LETTER TEH WITH RING
|
71 |
+
::s ٽ ::t t ::comment ARABIC LETTER TEH WITH THREE DOTS ABOVE DOWNWARDS
|
72 |
+
::s پ ::t p ::comment ARABIC LETTER PEH
|
73 |
+
::s ٿ ::t t ::comment ARABIC LETTER TEHEH
|
74 |
+
::s ڀ ::t b ::comment ARABIC LETTER BEHEH
|
75 |
+
::s ځ ::t h ::comment ARABIC LETTER HAH WITH HAMZA ABOVE
|
76 |
+
::s ڂ ::t h ::comment ARABIC LETTER HAH WITH TWO DOTS VERTICAL ABOVE
|
77 |
+
::s ڃ ::t ny ::comment ARABIC LETTER NYEH
|
78 |
+
::s ڄ ::t dy ::comment ARABIC LETTER DYEH
|
79 |
+
::s څ ::t h ::comment ARABIC LETTER HAH WITH THREE DOTS ABOVE
|
80 |
+
::s چ ::t tch ::comment ARABIC LETTER TCHEH
|
81 |
+
::s ڇ ::t tch ::comment ARABIC LETTER TCHEHEH
|
82 |
+
::s ڈ ::t dd ::comment ARABIC LETTER DDAL
|
83 |
+
::s ډ ::t d ::comment ARABIC LETTER DAL WITH RING
|
84 |
+
::s ڊ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW
|
85 |
+
::s ڋ ::t d ::comment ARABIC LETTER DAL WITH DOT BELOW AND SMALL TAH
|
86 |
+
::s ڌ ::t d ::comment ARABIC LETTER DAHAL
|
87 |
+
::s ڍ ::t dd ::comment ARABIC LETTER DDAHAL
|
88 |
+
::s ڎ ::t d ::comment ARABIC LETTER DUL
|
89 |
+
::s ڏ ::t d ::comment ARABIC LETTER DAL WITH THREE DOTS ABOVE DOWNWARDS
|
90 |
+
::s ڐ ::t d ::comment ARABIC LETTER DAL WITH FOUR DOTS ABOVE
|
91 |
+
::s ڑ ::t rr ::comment ARABIC LETTER RREH
|
92 |
+
::s ڒ ::t r ::comment ARABIC LETTER REH WITH SMALL V
|
93 |
+
::s ړ ::t r ::comment ARABIC LETTER REH WITH RING
|
94 |
+
::s ڔ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW
|
95 |
+
::s ڕ ::t r ::comment ARABIC LETTER REH WITH SMALL V BELOW
|
96 |
+
::s ږ ::t r ::comment ARABIC LETTER REH WITH DOT BELOW AND DOT ABOVE
|
97 |
+
::s ڗ ::t r ::comment ARABIC LETTER REH WITH TWO DOTS ABOVE
|
98 |
+
::s ژ ::t j ::comment ARABIC LETTER JEH
|
99 |
+
::s ڙ ::t r ::comment ARABIC LETTER REH WITH FOUR DOTS ABOVE
|
100 |
+
::s ښ ::t s ::comment ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
|
101 |
+
::s ڛ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW
|
102 |
+
::s ڜ ::t s ::comment ARABIC LETTER SEEN WITH THREE DOTS BELOW AND THREE DOTS ABOVE
|
103 |
+
::s ڝ ::t s ::comment ARABIC LETTER SAD WITH TWO DOTS BELOW
|
104 |
+
::s ڞ ::t s ::comment ARABIC LETTER SAD WITH THREE DOTS ABOVE
|
105 |
+
::s ڟ ::t t ::comment ARABIC LETTER TAH WITH THREE DOTS ABOVE
|
106 |
+
::s ڠ ::t n ::comment ARABIC LETTER AIN WITH THREE DOTS ABOVE
|
107 |
+
::s ڡ ::t f ::comment ARABIC LETTER DOTLESS FEH
|
108 |
+
::s ڢ ::t f ::comment ARABIC LETTER FEH WITH DOT MOVED BELOW
|
109 |
+
::s ڣ ::t f ::comment ARABIC LETTER FEH WITH DOT BELOW
|
110 |
+
::s ڤ ::t v ::comment ARABIC LETTER VEH
|
111 |
+
::s ڥ ::t f ::comment ARABIC LETTER FEH WITH THREE DOTS BELOW
|
112 |
+
::s ڦ ::t p ::comment ARABIC LETTER PEHEH
|
113 |
+
::s ڧ ::t q ::comment ARABIC LETTER QAF WITH DOT ABOVE
|
114 |
+
::s ڨ ::t q ::comment ARABIC LETTER QAF WITH THREE DOTS ABOVE
|
115 |
+
::s ک ::t k ::comment ARABIC LETTER KEHEH
|
116 |
+
::s ڪ ::t k ::comment ARABIC LETTER SWASH KAF
|
117 |
+
::s ګ ::t k ::comment ARABIC LETTER KAF WITH RING
|
118 |
+
::s ڬ ::t k ::comment ARABIC LETTER KAF WITH DOT ABOVE
|
119 |
+
::s ڭ ::t ng ::comment ARABIC LETTER NG
|
120 |
+
::s ڮ ::t k ::comment ARABIC LETTER KAF WITH THREE DOTS BELOW
|
121 |
+
::s گ ::t g ::comment ARABIC LETTER GAF
|
122 |
+
::s ڰ ::t g ::comment ARABIC LETTER GAF WITH RING
|
123 |
+
::s ڱ ::t ng ::comment ARABIC LETTER NGOEH
|
124 |
+
::s ڲ ::t g ::comment ARABIC LETTER GAF WITH TWO DOTS BELOW
|
125 |
+
::s ڳ ::t g ::comment ARABIC LETTER GUEH
|
126 |
+
::s ڴ ::t g ::comment ARABIC LETTER GAF WITH THREE DOTS ABOVE
|
127 |
+
::s ڵ ::t l ::comment ARABIC LETTER LAM WITH SMALL V
|
128 |
+
::s ڶ ::t l ::comment ARABIC LETTER LAM WITH DOT ABOVE
|
129 |
+
::s ڷ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS ABOVE
|
130 |
+
::s ڸ ::t l ::comment ARABIC LETTER LAM WITH THREE DOTS BELOW
|
131 |
+
::s ڹ ::t n ::comment ARABIC LETTER NOON WITH DOT BELOW
|
132 |
+
::s ں ::t n ::comment ARABIC LETTER NOON GHUNNA
|
133 |
+
::s ڻ ::t rn ::comment ARABIC LETTER RNOON
|
134 |
+
::s ڼ ::t n ::comment ARABIC LETTER NOON WITH RING
|
135 |
+
::s ڽ ::t n ::comment ARABIC LETTER NOON WITH THREE DOTS ABOVE
|
136 |
+
::s ھ ::t h ::comment ARABIC LETTER HEH DOACHASHMEE
|
137 |
+
::s ڿ ::t tch ::comment ARABIC LETTER TCHEH WITH DOT ABOVE
|
138 |
+
::s ۀ ::t h ::comment ARABIC LETTER HEH WITH YEH ABOVE
|
139 |
+
::s ہ ::t h ::comment ARABIC LETTER HEH GOAL
|
140 |
+
::s ۂ ::t h ::comment ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
|
141 |
+
::s ۃ ::t a ::comment ARABIC LETTER TEH MARBUTA GOAL
|
142 |
+
::s ۄ ::t w ::comment ARABIC LETTER WAW WITH RING
|
143 |
+
::s ۅ ::t oe ::comment ARABIC LETTER KIRGHIZ OE
|
144 |
+
::s ۆ ::t oe ::comment ARABIC LETTER OE
|
145 |
+
::s ۇ ::t u ::comment ARABIC LETTER U
|
146 |
+
::s ۈ ::t yu ::comment ARABIC LETTER YU
|
147 |
+
::s ۉ ::t yu ::comment ARABIC LETTER KIRGHIZ YU
|
148 |
+
::s ۊ ::t w ::comment ARABIC LETTER WAW WITH TWO DOTS ABOVE
|
149 |
+
::s ۋ ::t v ::comment ARABIC LETTER VE
|
150 |
+
::s ی ::t y ::comment ARABIC LETTER FARSI YEH
|
151 |
+
::s ۍ ::t y ::comment ARABIC LETTER YEH WITH TAIL
|
152 |
+
::s ێ ::t y ::comment ARABIC LETTER YEH WITH SMALL V
|
153 |
+
::s ۏ ::t w ::comment ARABIC LETTER WAW WITH DOT ABOVE
|
154 |
+
::s ې ::t e ::comment ARABIC LETTER E
|
155 |
+
::s ۑ ::t y ::comment ARABIC LETTER YEH WITH THREE DOTS BELOW
|
156 |
+
::s ے ::t y ::comment ARABIC LETTER YEH BARREE
|
157 |
+
::s ۓ ::t y ::comment ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
|
158 |
+
::s ۔ ::t . ::comment ARABIC FULL STOP
|
159 |
+
::s ە ::t ae ::comment ARABIC LETTER AE
|
160 |
+
::s ۮ ::t d ::comment ARABIC LETTER DAL WITH INVERTED V
|
161 |
+
::s ۯ ::t r ::comment ARABIC LETTER REH WITH INVERTED V
|
162 |
+
::s ۰ ::t 0 ::comment EXTENDED ARABIC-INDIC DIGIT ZERO
|
163 |
+
::s ۱ ::t 1 ::comment EXTENDED ARABIC-INDIC DIGIT ONE
|
164 |
+
::s ۲ ::t 2 ::comment EXTENDED ARABIC-INDIC DIGIT TWO
|
165 |
+
::s ۳ ::t 3 ::comment EXTENDED ARABIC-INDIC DIGIT THREE
|
166 |
+
::s ۴ ::t 4 ::comment EXTENDED ARABIC-INDIC DIGIT FOUR
|
167 |
+
::s ۵ ::t 5 ::comment EXTENDED ARABIC-INDIC DIGIT FIVE
|
168 |
+
::s ۶ ::t 6 ::comment EXTENDED ARABIC-INDIC DIGIT SIX
|
169 |
+
::s ۷ ::t 7 ::comment EXTENDED ARABIC-INDIC DIGIT SEVEN
|
170 |
+
::s ۸ ::t 8 ::comment EXTENDED ARABIC-INDIC DIGIT EIGHT
|
171 |
+
::s ۹ ::t 9 ::comment EXTENDED ARABIC-INDIC DIGIT NINE
|
172 |
+
::s ۺ ::t sh ::comment ARABIC LETTER SHEEN WITH DOT BELOW
|
173 |
+
::s ۻ ::t d ::comment ARABIC LETTER DAD WITH DOT BELOW
|
174 |
+
::s ۼ ::t gh ::comment ARABIC LETTER GHAIN WITH DOT BELOW
|
175 |
+
::s ۽ ::t & ::comment ARABIC SIGN SINDHI AMPERSAND
|
176 |
+
::s ﷲ ::t allah ::comment ARABIC LIGATURE ALLAH ISOLATED FORM
|
177 |
+
|
178 |
+
::s ::t ::comment ZERO WIDTH NON-JOINER
|
179 |
+
::s ::t ::comment ZERO WIDTH JOINER
|
uroman/data/romanization-table.txt
ADDED
@@ -0,0 +1,2019 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## European Latin extensions
|
3 |
+
# Vowels
|
4 |
+
::s Ä ::t Ae
|
5 |
+
::s Ö ::t Oe
|
6 |
+
::s Ü ::t Ue
|
7 |
+
::s Å ::t Aa
|
8 |
+
::s Æ ::t Ae
|
9 |
+
::s Ø ::t oe
|
10 |
+
::s Œ ::t Oe
|
11 |
+
::s ä ::t ae
|
12 |
+
::s ö ::t oe
|
13 |
+
::s ü ::t ue
|
14 |
+
::s å ::t aa
|
15 |
+
::s æ ::t ae
|
16 |
+
::s ø ::t oe
|
17 |
+
::s œ ::t oe
|
18 |
+
# Consonants
|
19 |
+
::s Ç ::t S
|
20 |
+
::s ç ::t s
|
21 |
+
::s Ç ::t Ch ::lcode tur
|
22 |
+
::s ç ::t ch ::lcode tur
|
23 |
+
::s Ş ::t Sh
|
24 |
+
::s ş ::t sh
|
25 |
+
::s Ș ::t Sh
|
26 |
+
::s ș ::t sh
|
27 |
+
::s ß ::t ss
|
28 |
+
::s Ț ::t Ts
|
29 |
+
::s ț ::t ts
|
30 |
+
|
31 |
+
# Digraphs
|
32 |
+
# ::s ʣ ::t dz
|
33 |
+
::s ʤ ::t dzh ::comment Latin small letter dezh digraph
|
34 |
+
# ::s ʥ ::t dz
|
35 |
+
# ::s ʦ ::t ts
|
36 |
+
::s ʧ ::t tsh ::comment Latin small letter tesh digraph
|
37 |
+
# ::s ʨ ::t tc
|
38 |
+
|
39 |
+
# Miscellaneous
|
40 |
+
::s ə ::t e
|
41 |
+
|
42 |
+
# English
|
43 |
+
::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
|
44 |
+
::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
|
45 |
+
::s eight ::t eight ::t-alt eit ::example eight, weight
|
46 |
+
::s Eight ::t Eight ::t-alt Eit ::example Eighteen
|
47 |
+
::s ight ::t ight ::t-alt ait ::example Knight
|
48 |
+
::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
|
49 |
+
::s high ::t high ::t-alt hai ::example highlight
|
50 |
+
::s High ::t High ::t-alt Hai ::example High School
|
51 |
+
::s Isle ::t Isle ::t-alt Ail ::use-only-for-whole-word ::example Isle
|
52 |
+
::s Island ::t Island ::t-alt Ailand ::use-only-for-whole-word ::example Island
|
53 |
+
::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
|
54 |
+
::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
|
55 |
+
::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
|
56 |
+
::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
|
57 |
+
::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
|
58 |
+
::s ph ::t ph ::t-alt f ::example alpha
|
59 |
+
::s Ph ::t Ph ::t-alt F ::example Philip
|
60 |
+
::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
|
61 |
+
::s tion ::t tion ::t-alt shen ::example
|
62 |
+
::s Sean ::t Sean ::t-alt Shawn ::use-only-for-whole-word
|
63 |
+
::s ssion ::t ssion ::t-alt shen ::example Sessions
|
64 |
+
::s St ::t St ::t-alt Saint ::use-only-for-whole-word
|
65 |
+
::s St. ::t St. ::t-alt Saint ::use-only-for-whole-word
|
66 |
+
::s Wr ::t Wr ::t-alt R ::example Wren
|
67 |
+
::s wr ::t wr ::t-alt r ::example Cartwright
|
68 |
+
::s x ::t x ::t-alt ks ::example Mexico
|
69 |
+
::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
|
70 |
+
|
71 |
+
# French
|
72 |
+
::s â ::t a ::t-alt as ::example pâte/paste, pastry
|
73 |
+
::s ê ::t e ::t-alt es ::example fête/feast
|
74 |
+
::s î ::t i ::t-alt is ::example île/isle
|
75 |
+
::s ô ::t o ::t-alt os ::example côte/coast
|
76 |
+
::s û ::t u ::t-alt us ::example août/August
|
77 |
+
::s eaux ::t eaux ::t-alt o ::example Bordeaux
|
78 |
+
::s eau ::t eau ::t-alt o ::example Chateau
|
79 |
+
::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
|
80 |
+
::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
|
81 |
+
::s oux ::t oux ::t-alt u
|
82 |
+
::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
|
83 |
+
|
84 |
+
# German
|
85 |
+
::s Sch ::t Sch ::t-alt Sh
|
86 |
+
::s sch ::t sch ::t-alt sh
|
87 |
+
::s stein ::t stein ::t-alt shtain
|
88 |
+
::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
|
89 |
+
|
90 |
+
# Dutch
|
91 |
+
::s ij ::t ij ::t-alt ai
|
92 |
+
::s Ij ::t Ij ::t-alt Ai
|
93 |
+
|
94 |
+
# Latvian
|
95 |
+
::s Ā ::t A ::t-alt Aa ::lcode lav
|
96 |
+
::s ā ::t a ::t-alt aa ::lcode lav
|
97 |
+
::s Ē ::t E ::t-alt Ee ::lcode lav
|
98 |
+
::s ē ::t e ::t-alt ee ::lcode lav
|
99 |
+
::s Ī ::t I ::t-alt Ii ::lcode lav
|
100 |
+
::s ī ::t i ::t-alt ii ::lcode lav
|
101 |
+
::s Ū ::t U ::t-alt Uu ::lcode lav
|
102 |
+
::s ū ::t u ::t-alt uu ::lcode lav
|
103 |
+
::s Ģ ::t G ::t-alt Gj ::lcode lav
|
104 |
+
::s ģ ::t g ::t-alt gj ::lcode lav
|
105 |
+
::s Ķ ::t K ::t-alt Kj ::lcode lav
|
106 |
+
::s ķ ::t k ::t-alt kj ::lcode lav
|
107 |
+
::s Ļ ::t L ::t-alt Lj ::lcode lav
|
108 |
+
::s ļ ::t l ::t-alt lj ::lcode lav
|
109 |
+
::s Ņ ::t N ::t-alt Nj ::lcode lav
|
110 |
+
::s ņ ::t n ::t-alt nj ::lcode lav
|
111 |
+
::s C ::t C ::t-alt Ts ::lcode lav
|
112 |
+
::s c ::t c ::t-alt ts ::lcode lav
|
113 |
+
::s Č ::t C ::t-alt Tsh ::lcode lav
|
114 |
+
::s č ::t c ::t-alt tsh ::lcode lav
|
115 |
+
::s Š ::t Sh ::t-alt s ::lcode lav
|
116 |
+
::s š ::t sh ::t-alt s ::lcode lav
|
117 |
+
::s Ž ::t Z ::t-alt Zh ::lcode lav
|
118 |
+
::s ž ::t z ::t-alt zh ::lcode lav
|
119 |
+
|
120 |
+
# Lithuanian
|
121 |
+
::s C ::t C ::t-alt Ts ::lcode lit
|
122 |
+
::s c ::t c ::t-alt ts ::lcode lit
|
123 |
+
::s Č ::t C ::t-alt Tsh ::lcode lit
|
124 |
+
::s č ::t c ::t-alt tsh ::lcode lit
|
125 |
+
::s Š ::t Sh ::t-alt s ::lcode lit
|
126 |
+
::s š ::t sh ::t-alt s ::lcode lit
|
127 |
+
::s Ž ::t Z ::t-alt Zh ::lcode lit
|
128 |
+
::s ž ::t z ::t-alt zh ::lcode lit
|
129 |
+
|
130 |
+
# International Greek (e.g. as used in chemical compounds)
|
131 |
+
::s β ::t b
|
132 |
+
::s Β ::t B
|
133 |
+
::s ϐ ::t b
|
134 |
+
|
135 |
+
# Ancient Greek
|
136 |
+
::s β ::t b ::lcode grc
|
137 |
+
::s Β ::t B ::lcode grc
|
138 |
+
::s γγ ::t ng ::lcode grc
|
139 |
+
::s γκ ::t nk ::lcode grc
|
140 |
+
::s γξ ::t nx ::lcode grc
|
141 |
+
::s γχ ::t nch ::lcode grc
|
142 |
+
::s ϱ ::t r ::lcode grc
|
143 |
+
|
144 |
+
# Pontic Greek
|
145 |
+
::s β ::t v ::t-alt b ::lcode pnt
|
146 |
+
::s Β ::t V ::t-alt B ::lcode pnt
|
147 |
+
::s ϐ ::t v ::t-alt b ::lcode pnt
|
148 |
+
|
149 |
+
# Modern Greek (generally the default)
|
150 |
+
::s β ::t v ::t-alt b ::lcode ell
|
151 |
+
::s Β ::t V ::t-alt B ::lcode ell
|
152 |
+
::s ϐ ::t v ::t-alt b ::lcode ell
|
153 |
+
::s Ι ::t I
|
154 |
+
::s ι ::t i
|
155 |
+
::s ί ::t i
|
156 |
+
::s ἶ ::t i
|
157 |
+
::s Υ ::t Y
|
158 |
+
::s υ ::t y
|
159 |
+
::s Ρ ::t R
|
160 |
+
::s ρ ::t r
|
161 |
+
::s ϱ ::t r
|
162 |
+
::s Χ ::t Ch ::t-alt Kh
|
163 |
+
::s χ ::t ch ::t-alt kh
|
164 |
+
::s φ ::t f ::t-alt ph
|
165 |
+
::s Φ ::t F ::t-alt Ph
|
166 |
+
::s Ντ ::t D
|
167 |
+
::s ντ ::t nd ::t-alt d, nt
|
168 |
+
# ::s ντζ ::t ntz
|
169 |
+
::s Μπ ::t B
|
170 |
+
::s μπ ::t b ::use-only-at-start-of-word
|
171 |
+
::s μπ ::t mb ::t-alt b, mp ::dont-use-at-start-of-word
|
172 |
+
::s λμπ ::t lb
|
173 |
+
::s νμπ ::t nb
|
174 |
+
::s ρμπ ::t rb
|
175 |
+
::s γγ ::t ng
|
176 |
+
::s Γκ ::t G
|
177 |
+
::s γκ ::t ng ::t-alt g ::dont-use-at-start-of-word
|
178 |
+
::s γκ ::t g ::use-only-at-start-of-word
|
179 |
+
::s γξ ::t nx ::lcode grc
|
180 |
+
::s γχ ::t nch ::lcode grc
|
181 |
+
::s ει ::t ei ::t-alt i
|
182 |
+
::s Ει ::t Ei ::t-alt I
|
183 |
+
::s ευ ::t eu ::t-alt ev ::comment donated by Constantine
|
184 |
+
::s Ευ ::t Eu ::t-alt Ev ::comment donated by Constantine
|
185 |
+
::s αυ ::t au ::t-alt av
|
186 |
+
::s Αυ ::t Au ::t-alt Av
|
187 |
+
::s ου ::t ou ::t-alt u
|
188 |
+
::s Ου ::t Ou ::t-alt U
|
189 |
+
::s ηυ ::t eu
|
190 |
+
::s Ηυ ::t Eu
|
191 |
+
::s υι ::t ui
|
192 |
+
::s Υι ::t Ui
|
193 |
+
::s ωυ ::t ou
|
194 |
+
::s Ωυ ::t Ou
|
195 |
+
::s ͺ ::t ::comment GREEK YPOGEGRAMMENI (U+037A)
|
196 |
+
::s ϒ ::t Y ::comment GREEK UPSILON WITH HOOK SYMBOL (U+03D2)
|
197 |
+
::s ϓ ::t Y ::comment GREEK UPSILON WITH ACUTE AND HOOK SYMBOL (U+03D3)
|
198 |
+
::s ϔ ::t Y ::comment GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL (U+03D4)
|
199 |
+
::s ι ::t ::comment GREEK PROSGEGRAMMENI (U+1FBE)
|
200 |
+
::s ᾿ ::t ::comment GREEK PSILI (U+1FBF)
|
201 |
+
::s ῀ ::t ::comment GREEK PERISPOMENI (U+1FC0)
|
202 |
+
::s ` ::t ::comment GREEK VARIA (U+1FEF)
|
203 |
+
::s ´ ::t ::comment GREEK OXIA (U+1FFD)
|
204 |
+
|
205 |
+
# Glagolitic
|
206 |
+
::s Ⰿ ::t M ::comment GLAGOLITIC CAPITAL LETTER MYSLITE (U+2C0F)
|
207 |
+
::s Ⱞ ::t M ::comment GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE (U+2C2E)
|
208 |
+
::s ⰿ ::t m ::comment GLAGOLITIC SMALL LETTER MYSLITE (U+2C3F)
|
209 |
+
::s ⱞ ::t m ::comment GLAGOLITIC SMALL LETTER LATINATE MYSLITE (U+2C5E)
|
210 |
+
::s 𞀏 ::t m ::comment COMBINING GLAGOLITIC LETTER MYSLITE (U+1E00F)
|
211 |
+
|
212 |
+
# Cyrillic
|
213 |
+
::s Г ::t G ::t-alt H ::comment Cyrillic capital ghe
|
214 |
+
::s г ::t g ::t-alt h ::comment Cyrillic small ghe
|
215 |
+
::s Е ::t E ::t-alt Ye ::comment Cyrillic capital ie
|
216 |
+
::s е ::t e ::t-alt ye ::comment Cyrillic small ie
|
217 |
+
::s Ё ::t E ::t-alt Yo
|
218 |
+
::s ё ::t e ::t-alt yo
|
219 |
+
::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
|
220 |
+
::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
|
221 |
+
::s Щ ::t Shch ::t-alt Sh
|
222 |
+
::s щ ::t shch ::t-alt sh
|
223 |
+
::s Ъ ::t ::comment Cyrillic capital hard sign
|
224 |
+
::s ъ ::t ::comment Cyrillic small hard sign
|
225 |
+
::s ᲆ ::t ::comment CYRILLIC SMALL LETTER TALL HARD SIGN
|
226 |
+
::s Ы ::t Y ::comment Cyrillic capital yeru
|
227 |
+
::s ы ::t y ::comment Cyrillic small yeru
|
228 |
+
::s Ь ::t ::comment Cyrillic capital soft sign
|
229 |
+
::s ь ::t ::comment Cyrillic small soft sign
|
230 |
+
::s Ж ::t Zh ::comment Cyrillic capital letter zhe
|
231 |
+
::s Ш ::t Sh ::comment Cyrillic capital letter sha
|
232 |
+
::s Ч ::t Ch ::comment Cyrillic capital letter che
|
233 |
+
::s Џ ::t Dzh ::comment Cyrillic capital letter dzhe
|
234 |
+
::s Є ::t Ie ::comment Cyrillic capital letter Ukrainian ie
|
235 |
+
::s Ю ::t Yu ::comment Cyrillic capital letter yu
|
236 |
+
::s Я ::t Ya ::comment Cyrillic capital letter ya
|
237 |
+
|
238 |
+
::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
|
239 |
+
::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
|
240 |
+
::s Ә ::t e ::comment Cyrillic capital schwa
|
241 |
+
::s ә ::t e ::comment Cyrillic small schwa
|
242 |
+
::s Ӏ ::t ' ::comment Cyrillic palochka
|
243 |
+
::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
|
244 |
+
::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
|
245 |
+
::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
|
246 |
+
::s ӕ ::t ae ::comment Cyrillic small ligature a ie
|
247 |
+
::s ʹ ::t "'" ::comment modifier letter prime
|
248 |
+
::s ʺ ::t '"' ::comment modifier letter double prime
|
249 |
+
::s ий ::t iy ::dont-use-at-end-of-word
|
250 |
+
::s ий ::t y ::use-only-at-end-of-word
|
251 |
+
|
252 |
+
::s ᲈ ::t u ::comment CYRILLIC SMALL LETTER UNBLENDED UK ligature ou
|
253 |
+
|
254 |
+
# Russian
|
255 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter ghe
|
256 |
+
::s г ::t g ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter ghe
|
257 |
+
::s Й ::t Y ::t-alt I, J ::lcode rus ::comment Cyrillic capital letter short i
|
258 |
+
::s й ::t y ::t-alt i, j ::lcode rus ::comment Cyrillic small letter short i
|
259 |
+
::s Ц ::t Ts ::t-alt C ::lcode rus ::comment Cyrillic capital letter tse
|
260 |
+
::s ц ::t ts ::t-alt c ::lcode rus ::comment Cyrillic small letter tse
|
261 |
+
::s Щ ::t Shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic capital letter shcha
|
262 |
+
::s щ ::t shch ::t-alt _NONE_ ::lcode rus ::comment Cyrillic small letter shcha
|
263 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode rus ::comment archaic Cyrillic capital letter yat
|
264 |
+
::s ѣ ::t e ::t-alt ie ::lcode rus ::comment archaic Cyrillic small letter yat
|
265 |
+
::s Е ::t E ::t-alt Ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic capital ie
|
266 |
+
::s Е ::t Ye ::t-alt E ::use-only-at-start-of-word ::lcode rus
|
267 |
+
::s е ::t e ::t-alt ye ::dont-use-at-start-of-word ::lcode rus ::comment Cyrillic small ie
|
268 |
+
::s е ::t ye ::t-alt e ::use-only-at-start-of-word ::lcode rus
|
269 |
+
::s ае ::t aye ::lcode rus
|
270 |
+
::s а́е ::t aye ::lcode rus
|
271 |
+
::s ее ::t eye ::lcode rus
|
272 |
+
::s е́е ::t eye ::lcode rus
|
273 |
+
::s ие ::t iye ::lcode rus
|
274 |
+
::s и́е ::t iye ::lcode rus
|
275 |
+
::s ое ::t oye ::lcode rus
|
276 |
+
::s о́е ::t oye ::lcode rus
|
277 |
+
::s уе ::t uye ::lcode rus
|
278 |
+
::s у́е ::t uye ::lcode rus
|
279 |
+
::s ье ::t ye ::lcode rus
|
280 |
+
::s ъе ::t ye ::lcode rus
|
281 |
+
::s Ё ::t Yo ::t-alt E ::lcode rus ::comment Cyrillic capital io
|
282 |
+
::s ё ::t yo ::t-alt e ::lcode rus
|
283 |
+
::s аё ::t ayo ::lcode rus
|
284 |
+
::s а́ё ::t ayo ::lcode rus
|
285 |
+
::s её ::t eyo ::lcode rus
|
286 |
+
::s е́ё ::t eyo ::lcode rus
|
287 |
+
::s иё ::t iyo ::lcode rus
|
288 |
+
::s и́ё ::t iyo ::lcode rus
|
289 |
+
::s оё ::t oyo ::lcode rus
|
290 |
+
::s о́ё ::t oyo ::lcode rus
|
291 |
+
::s уё ::t uyo ::lcode rus
|
292 |
+
::s у́ё ::t uyo ::lcode rus
|
293 |
+
::s ьё ::t yo ::lcode rus
|
294 |
+
::s ъё ::t yo ::lcode rus
|
295 |
+
::s ий ::t y ::lcode rus
|
296 |
+
|
297 |
+
# Ukrainian
|
298 |
+
::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
|
299 |
+
::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
|
300 |
+
::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
|
301 |
+
::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
|
302 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital ie
|
303 |
+
::s е ::t e ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small ie
|
304 |
+
::s И ::t Y ::lcode ukr ::comment Ukrainian capital letter i
|
305 |
+
::s и ::t y ::lcode ukr ::comment Ukrainian small letter i
|
306 |
+
::s Ї ::t Yi ::lcode ukr ::comment Ukrainian capital letter yi
|
307 |
+
::s ї ::t yi ::lcode ukr ::comment Ukrainian small letter yi
|
308 |
+
::s Й ::t I ::t-alt Y ::lcode ukr ::comment Cyrillic capital letter short i
|
309 |
+
::s й ::t i ::t-alt y ::lcode ukr ::comment Cyrillic small letter short i
|
310 |
+
::s Ц ::t Ts ::t-alt C ::lcode ukr ::comment Cyrillic capital letter tse
|
311 |
+
::s ц ::t ts ::t-alt c ::lcode ukr ::comment Cyrillic small letter tse
|
312 |
+
::s Щ ::t Shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic capital letter shcha
|
313 |
+
::s щ ::t shch ::t-alt _NONE_ ::lcode ukr ::comment Cyrillic small letter shcha
|
314 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode ukr ::comment archaic Cyrillic capital letter yat
|
315 |
+
::s ѣ ::t e ::t-alt ie ::lcode ukr ::comment archaic Cyrillic small letter yat
|
316 |
+
::s Иї ::t Yi ::lcode ukr ::comment avoid Yyi
|
317 |
+
::s иї ::t yi ::lcode ukr ::comment avoid yyi
|
318 |
+
::s ій ::t iy ::lcode ukr
|
319 |
+
::s і́й ::t iy ::lcode ukr
|
320 |
+
::s ий ::t y ::lcode ukr ::comment Зеленський/Zelensky
|
321 |
+
|
322 |
+
# Belarusian
|
323 |
+
::s Г ::t H ::t-alt G ::lcode bel ::comment capital letter he
|
324 |
+
::s г ::t h ::t-alt g ::lcode bel ::comment small letter he
|
325 |
+
::s Ґ ::t G ::lcode bel ::comment capital letter ghe
|
326 |
+
::s ґ ::t g ::lcode bel ::comment small letter ghe
|
327 |
+
::s Й ::t J ::t-alt Y ::lcode bel ::comment Cyrillic capital letter short i
|
328 |
+
::s й ::t j ::t-alt y ::lcode bel ::comment Cyrillic small letter short i
|
329 |
+
::s Ц ::t Ts ::t-alt C ::lcode bel ::comment Cyrillic capital letter tse
|
330 |
+
::s ц ::t ts ::t-alt c ::lcode bel ::comment Cyrillic small letter tse
|
331 |
+
::s Щ ::t Shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic capital letter shcha
|
332 |
+
::s щ ::t shch ::t-alt _NONE_ ::lcode bel ::comment Cyrillic small letter shcha
|
333 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode bel ::comment archaic Cyrillic capital letter yat
|
334 |
+
::s ѣ ::t e ::t-alt ie ::lcode bel ::comment archaic Cyrillic small letter yat
|
335 |
+
::s 'я ::t ya ::lcode bel
|
336 |
+
::s ’я ::t ya ::lcode bel
|
337 |
+
::s 'і ::t i ::lcode bel
|
338 |
+
::s ’і ::t i ::lcode bel
|
339 |
+
::s Ё ::t Yo ::t-alt E ::lcode bel ::comment Cyrillic capital io
|
340 |
+
::s ё ::t yo ::t-alt e ::lcode bel
|
341 |
+
::s ёў ::t you ::lcode bel
|
342 |
+
::s ий ::t y ::lcode bel
|
343 |
+
|
344 |
+
# Serbian
|
345 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ghe
|
346 |
+
::s г ::t g ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ghe
|
347 |
+
::s Х ::t H ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ha
|
348 |
+
::s х ::t h ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ha
|
349 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode srp ::comment Cyrillic capital ie
|
350 |
+
::s е ::t e ::t-alt _NONE_ ::lcode srp ::comment Cyrillic small ie
|
351 |
+
::s Ђ ::t Dj ::lcode srp ::comment Cyrillic capital dje
|
352 |
+
::s Љ ::t Lj ::lcode srp ::comment Cyrillic capital lje
|
353 |
+
::s Ћ ::t Tsh ::lcode srp ::comment Cyrillic capital tshe
|
354 |
+
::s Ж ::t Zh ::lcode srp ::comment Cyrillic capital zhe
|
355 |
+
::s Ц ::t C ::t-alt Ts ::lcode srp ::comment Cyrillic capital tse
|
356 |
+
::s ц ::t c ::t-alt ts ::lcode srp ::comment Cyrillic small tse
|
357 |
+
::s Đ ::t Dj ::lcode srp ::comment Latin capital d with stroke
|
358 |
+
::s đ ::t dj ::lcode srp ::comment Latin small d with stroke
|
359 |
+
::s Ž ::t Zh ::lcode srp ::comment Latin capital z with caron
|
360 |
+
::s ž ::t zh ::lcode srp ::comment Latin small z with caron
|
361 |
+
::s Ć ::t Tsh ::lcode srp ::comment Latin capital c with acute
|
362 |
+
::s ć ::t tsh ::lcode srp ::comment Latin small c with acute
|
363 |
+
::s Č ::t Ch ::lcode srp ::comment Latin capital c with caron
|
364 |
+
::s č ::t ch ::lcode srp ::comment Latin small c with caron
|
365 |
+
::s Š ::t Sh ::lcode srp ::comment Latin capital s with caron
|
366 |
+
::s š ::t sh ::lcode srp ::comment Latin small s with caron
|
367 |
+
|
368 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ghe
|
369 |
+
::s г ::t g ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ghe
|
370 |
+
::s Х ::t H ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ha
|
371 |
+
::s х ::t h ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ha
|
372 |
+
::s Ц ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter tse
|
373 |
+
::s ц ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter tse
|
374 |
+
::s Ч ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter che
|
375 |
+
::s ч ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter che
|
376 |
+
::s Џ ::t Dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital letter dzhe
|
377 |
+
::s џ ::t dz ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small letter dzhe
|
378 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital ie
|
379 |
+
::s е ::t e ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small ie
|
380 |
+
::s Ш ::t S ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital sha
|
381 |
+
::s ш ::t s ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small sha
|
382 |
+
::s Ж ::t Z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital zhe
|
383 |
+
::s ж ::t z ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small zhe
|
384 |
+
::s Љ ::t Lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital lje
|
385 |
+
::s љ ::t lj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small lje
|
386 |
+
::s Њ ::t Nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital nje
|
387 |
+
::s њ ::t nj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small nje
|
388 |
+
::s Ђ ::t Dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital dje
|
389 |
+
::s ђ ::t dj ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small dje
|
390 |
+
::s Ћ ::t C ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic capital tshe
|
391 |
+
::s ћ ::t c ::t-alt _NONE_ ::lcode srp2 ::comment Cyrillic small tshe
|
392 |
+
::s Đ ::t Dj ::lcode srp2 ::comment Latin capital d with stroke
|
393 |
+
::s đ ::t dj ::lcode srp2 ::comment Latin small d with stroke
|
394 |
+
|
395 |
+
# Montenegrin extension (controversial)
|
396 |
+
::s З́ ::t Zj ::lcode srp ::comment Cyrillic capital zje
|
397 |
+
::s з́ ::t zj ::lcode srp ::comment Cyrillic small zje
|
398 |
+
::s С́ ::t Sj ::lcode srp ::comment Cyrillic capital sje
|
399 |
+
::s с́ ::t sj ::lcode srp ::comment Cyrillic small sje
|
400 |
+
::s Ź ::t Zj ::lcode srp ::comment Latin capital z with acute
|
401 |
+
::s ź ::t zj ::lcode srp ::comment Latin small z with acute
|
402 |
+
::s Ś ::t Sj ::lcode srp ::comment Latin capital s with acute
|
403 |
+
::s ś ::t sj ::lcode srp ::comment Latin small s with acute
|
404 |
+
|
405 |
+
::s З́ ::t Z ::lcode srp2 ::comment Cyrillic capital zje
|
406 |
+
::s з́ ::t z ::lcode srp2 ::comment Cyrillic small zje
|
407 |
+
::s С́ ::t S ::lcode srp2 ::comment Cyrillic capital sje
|
408 |
+
::s с́ ::t s ::lcode srp2 ::comment Cyrillic small sje
|
409 |
+
::s Ź ::t Z ::lcode srp2 ::comment Latin capital z with acute
|
410 |
+
::s ź ::t z ::lcode srp2 ::comment Latin small z with acute
|
411 |
+
::s Ś ::t S ::lcode srp2 ::comment Latin capital s with acute
|
412 |
+
::s ś ::t s ::lcode srp2 ::comment Latin small s with acute
|
413 |
+
|
414 |
+
# Bulgarian
|
415 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ghe
|
416 |
+
::s г ::t g ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ghe
|
417 |
+
::s Х ::t H ::t-alt Kh ::lcode bul ::comment Cyrillic capital letter ha
|
418 |
+
::s х ::t h ::t-alt kh ::lcode bul ::comment Cyrillic small letter ha
|
419 |
+
::s Ц ::t C ::t-alt Ts ::lcode bul ::comment Cyrillic capital letter tse
|
420 |
+
::s ц ::t c ::t-alt ts ::lcode bul ::comment Cyrillic small letter tse
|
421 |
+
::s Щ ::t Sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital letter shcha
|
422 |
+
::s щ ::t sht ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small letter shcha
|
423 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode bul ::comment Cyrillic capital ie
|
424 |
+
::s е ::t e ::t-alt _NONE_ ::lcode bul ::comment Cyrillic small ie
|
425 |
+
::s Ж ::t Zh ::t-alt Z, J ::lcode bul ::comment Cyrillic capital zhe
|
426 |
+
::s ж ::t zh ::t-alt z, j ::lcode bul ::comment Cyrillic small zhe
|
427 |
+
::s Й ::t I ::t-alt Y, J ::lcode bul ::comment Cyrillic capital letter short i
|
428 |
+
::s й ::t i ::t-alt y, j ::lcode bul ::comment Cyrillic small letter short i
|
429 |
+
::s Ю ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Cyrillic capital letter yu
|
430 |
+
::s ю ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Cyrillic small letter yu
|
431 |
+
::s Ъ ::t U ::t-alt A ::lcode bul ::comment Cyrillic capital letter hard sign
|
432 |
+
::s ъ ::t u ::t-alt a ::lcode bul ::comment Cyrillic small letter hard sign
|
433 |
+
::s Ѣ ::t E ::t-alt Ie ::lcode bul ::comment archaic Cyrillic capital letter yat
|
434 |
+
::s ѣ ::t e ::t-alt ie ::lcode bul ::comment archaic Cyrillic small letter yat
|
435 |
+
::s Ѫ ::t U ::lcode bul ::comment archaic Cyrillic capital letter yus
|
436 |
+
::s ѫ ::t u ::lcode bul ::comment archaic Cyrillic small letter yus
|
437 |
+
::s ИЯ ::t IA ::lcode bul ::use-only-at-end-of-word
|
438 |
+
::s ия ::t ia ::lcode bul ::use-only-at-end-of-word
|
439 |
+
|
440 |
+
::s Ž ::t Zh ::lcode bul ::comment Latin capital z with caron
|
441 |
+
::s ž ::t zh ::lcode bul ::comment Latin small z with caron
|
442 |
+
::s Č ::t Ch ::lcode bul ::comment Latin capital c with caron
|
443 |
+
::s č ::t ch ::lcode bul ::comment Latin small c with caron
|
444 |
+
::s Š ::t Sh ::lcode bul ::comment Latin capital s with caron
|
445 |
+
::s š ::t sh ::lcode bul ::comment Latin small s with caron
|
446 |
+
::s Ŝ ::t Sht ::lcode bul ::comment Latin capital s with circumflex
|
447 |
+
::s ŝ ::t sht ::lcode bul ::comment Latin small s with circumflex
|
448 |
+
::s Û ::t Yu ::t-alt U, Ju, Iu ::lcode bul ::comment Latin capital u with circumflex
|
449 |
+
::s û ::t yu ::t-alt u, ju, iu ::lcode bul ::comment Latin small u with circumflex
|
450 |
+
::s  ::t Ya ::t-alt _NONE_ ::lcode bul ::comment Latin capital a with circumflex
|
451 |
+
::s â ::t ya ::t-alt _NONE_ ::lcode bul ::comment Latin small a with circumflex
|
452 |
+
::s Ŭ ::t U ::t-alt A ::lcode bul ::comment Latin capital u with breve (for hard sign)
|
453 |
+
::s ŭ ::t u ::t-alt a ::lcode bul ::comment Latin small u with breve (for hard sign)
|
454 |
+
::s Ǎ ::t U ::t-alt A ::lcode bul ::comment Latin capital a with caron (for hard sign)
|
455 |
+
::s ǎ ::t u ::t-alt a ::lcode bul ::comment Latin small a with caron (for hard sign)
|
456 |
+
|
457 |
+
# Macedonian
|
458 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ghe
|
459 |
+
::s г ::t g ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ghe
|
460 |
+
::s Х ::t H ::lcode mkd ::comment Cyrillic capital ha
|
461 |
+
::s х ::t h ::lcode mkd ::comment Cyrillic small ha
|
462 |
+
::s Ц ::t C ::t-alt Ts ::lcode mkd ::comment Cyrillic capital letter tse
|
463 |
+
::s ц ::t c ::t-alt ts ::lcode mkd ::comment Cyrillic small letter tse
|
464 |
+
::s Џ ::t Dzh ::t-alt Dj, Dz ::lcode mkd ::comment Cyrillic capital letter dzhe
|
465 |
+
::s џ ::t dzh ::t-alt dj, dz ::lcode mkd ::comment Cyrillic small letter dzhe
|
466 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic capital ie
|
467 |
+
::s е ::t e ::t-alt _NONE_ ::lcode mkd ::comment Cyrillic small ie
|
468 |
+
::s Ž ::t Zh ::lcode mkd ::comment Latin capital z with caron
|
469 |
+
::s ž ::t zh ::lcode mkd ::comment Latin small z with caron
|
470 |
+
::s Č ::t Ch ::lcode mkd ::comment Latin capital c with caron
|
471 |
+
::s č ::t ch ::lcode mkd ::comment Latin small c with caron
|
472 |
+
::s Š ::t Sh ::lcode mkd ::comment Latin capital s with caron
|
473 |
+
::s š ::t sh ::lcode mkd ::comment Latin small s with caron
|
474 |
+
::s Ǵ ::t Gj ::lcode mkd
|
475 |
+
::s ǵ ::t gj ::lcode mkd
|
476 |
+
::s Đ ::t Gj ::lcode mkd
|
477 |
+
::s đ ::t gj ::lcode mkd
|
478 |
+
::s Ẑ ::t Dz ::lcode mkd
|
479 |
+
::s ẑ ::t dz ::lcode mkd
|
480 |
+
::s J̌ ::t J ::lcode mkd
|
481 |
+
::s ǰ ::t j ::lcode mkd
|
482 |
+
::s L̂ ::t Lj ::lcode mkd
|
483 |
+
::s l̂ ::t lj ::lcode mkd
|
484 |
+
::s N̂ ::t Nj ::lcode mkd
|
485 |
+
::s n̂ ::t nj ::lcode mkd
|
486 |
+
::s Ḱ ::t Kj ::lcode mkd
|
487 |
+
::s ḱ ::t kj ::lcode mkd
|
488 |
+
::s Ć ::t Kj ::lcode mkd
|
489 |
+
::s ć ::t kj ::lcode mkd
|
490 |
+
::s D̂ ::t Dzh ::lcode mkd
|
491 |
+
::s d̂ ::t dzh ::lcode mkd
|
492 |
+
|
493 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ghe
|
494 |
+
::s г ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ghe
|
495 |
+
::s Х ::t H ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ha
|
496 |
+
::s х ::t h ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ha
|
497 |
+
::s Ц ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter tse
|
498 |
+
::s ц ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter tse
|
499 |
+
::s Ч ::t C ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter che
|
500 |
+
::s ч ::t c ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter che
|
501 |
+
::s Џ ::t D ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital letter dzhe
|
502 |
+
::s џ ::t d ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small letter dzhe
|
503 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital ie
|
504 |
+
::s е ::t e ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small ie
|
505 |
+
::s Ш ::t S ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital sha
|
506 |
+
::s ш ::t s ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small sha
|
507 |
+
::s Ѓ ::t G ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital gje
|
508 |
+
::s ѓ ::t g ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small gje
|
509 |
+
::s Ж ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital zhe
|
510 |
+
::s ж ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small zhe
|
511 |
+
::s Ѕ ::t Z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital dze
|
512 |
+
::s ѕ ::t z ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small dze
|
513 |
+
::s Ќ ::t K ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital kje
|
514 |
+
::s ќ ::t k ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small kje
|
515 |
+
::s Љ ::t L ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital lje
|
516 |
+
::s љ ::t l ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small lje
|
517 |
+
::s Њ ::t N ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic capital nje
|
518 |
+
::s њ ::t n ::t-alt _NONE_ ::lcode mkd2 ::comment Cyrillic small nje
|
519 |
+
::s Ž ::t Z ::lcode mkd2 ::comment Latin capital z with caron
|
520 |
+
::s ž ::t z ::lcode mkd2 ::comment Latin small z with caron
|
521 |
+
::s Č ::t C ::lcode mkd2 ::comment Latin capital c with caron
|
522 |
+
::s č ::t c ::lcode mkd2 ::comment Latin small c with caron
|
523 |
+
::s Š ::t S ::lcode mkd2 ::comment Latin capital s with caron
|
524 |
+
::s š ::t s ::lcode mkd2 ::comment Latin small s with caron
|
525 |
+
::s Ǵ ::t G ::lcode mkd2
|
526 |
+
::s ǵ ::t g ::lcode mkd2
|
527 |
+
::s Đ ::t G ::lcode mkd2
|
528 |
+
::s đ ::t g ::lcode mkd2
|
529 |
+
::s Ẑ ::t D ::lcode mkd2
|
530 |
+
::s ẑ ::t d ::lcode mkd2
|
531 |
+
::s J̌ ::t J ::lcode mkd2
|
532 |
+
::s ǰ ::t j ::lcode mkd2
|
533 |
+
::s L̂ ::t L ::lcode mkd2
|
534 |
+
::s l̂ ::t l ::lcode mkd2
|
535 |
+
::s N̂ ::t N ::lcode mkd2
|
536 |
+
::s n̂ ::t n ::lcode mkd2
|
537 |
+
::s Ḱ ::t K ::lcode mkd2
|
538 |
+
::s ḱ ::t k ::lcode mkd2
|
539 |
+
::s Ć ::t K ::lcode mkd2
|
540 |
+
::s ć ::t k ::lcode mkd2
|
541 |
+
::s D̂ ::t D ::lcode mkd2
|
542 |
+
::s d̂ ::t d ::lcode mkd2
|
543 |
+
|
544 |
+
# Kazakh
|
545 |
+
::s Ә ::t A ::lcode kaz
|
546 |
+
::s ә ::t a ::lcode kaz
|
547 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe
|
548 |
+
::s г ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe
|
549 |
+
::s Ғ ::t G ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ghe with stroke
|
550 |
+
::s ғ ::t g ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ghe with stroke
|
551 |
+
::s Е ::t E ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital ie
|
552 |
+
::s е ::t e ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small ie
|
553 |
+
::s Ё ::t Yo ::t-alt _NONE_ ::lcode kaz
|
554 |
+
::s ё ::t yo ::t-alt _NONE_ ::lcode kaz
|
555 |
+
::s Х ::t H ::t-alt X ::lcode kaz ::comment Cyrillic capital ha
|
556 |
+
::s х ::t h ::t-alt x ::lcode kaz ::comment Cyrillic small ha
|
557 |
+
::s Һ ::t H ::lcode kaz ::comment Cyrillic capital shha
|
558 |
+
::s һ ::t h ::lcode kaz ::comment Cyrillic small shha
|
559 |
+
::s Қ ::t Q ::t-alt K ::lcode kaz
|
560 |
+
::s қ ::t q ::t-alt k ::lcode kaz
|
561 |
+
::s Ц ::t Ts ::t-alt C ::lcode kaz ::comment Cyrillic capital letter tse
|
562 |
+
::s ц ::t ts ::t-alt c ::lcode kaz ::comment Cyrillic small letter tse
|
563 |
+
::s Щ ::t Sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital letter shcha
|
564 |
+
::s щ ::t sh ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small letter shcha
|
565 |
+
::s У ::t U ::t-alt Y ::lcode kaz
|
566 |
+
::s у ::t u ::t-alt y ::lcode kaz
|
567 |
+
::s уы ::t wy ::lcode kaz
|
568 |
+
::s Ж ::t J ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic capital zhe
|
569 |
+
::s ж ::t j ::t-alt _NONE_ ::lcode kaz ::comment Cyrillic small zhe
|
570 |
+
::s Ю ::t Yw ::t-alt Yuw, Yiw ::lcode kaz ::comment Cyrillic capital letter yu
|
571 |
+
::s ю ::t yw ::t-alt yuw, yiw ::lcode kaz ::comment Cyrillic small letter yu
|
572 |
+
|
573 |
+
# Kyrgyz
|
574 |
+
::s Г ::t G ::t-alt _NONE_ ::lcode kir ::comment Cyrillic capital ghe
|
575 |
+
::s г ::t g ::t-alt _NONE_ ::lcode kir ::comment Cyrillic small ghe
|
576 |
+
::s Е ::t E ::t-alt Ye ::lcode kir ::comment Cyrillic capital ie
|
577 |
+
::s е ::t e ::t-alt ye ::lcode kir ::comment Cyrillic small ie
|
578 |
+
::s Ё ::t Yo ::t-alt _NONE_ ::lcode kir
|
579 |
+
::s ё ::t yo ::t-alt _NONE_ ::lcode kir
|
580 |
+
::s Х ::t Kh ::t-alt X, H ::lcode kir ::comment Cyrillic capital ha
|
581 |
+
::s х ::t kh ::t-alt x, h ::lcode kir ::comment Cyrillic small ha
|
582 |
+
::s Ж ::t Zh ::t-alt J ::lcode kir ::comment Cyrillic capital zhe
|
583 |
+
::s ж ::t zh ::t-alt j ::lcode kir ::comment Cyrillic small zhe
|
584 |
+
::s Й ::t Y ::t-alt I ::lcode kir ::comment Cyrillic capital letter short i
|
585 |
+
::s й ::t y ::t-alt i ::lcode kir ::comment Cyrillic small letter short i
|
586 |
+
::s Ц ::t Ts ::t-alt C ::lcode kir ::comment Cyrillic capital letter tse
|
587 |
+
::s ц ::t ts ::t-alt c ::lcode kir ::comment Cyrillic small letter tse
|
588 |
+
::s Ң ::t Ng ::lcode kir
|
589 |
+
::s ң ::t ng ::lcode kir
|
590 |
+
::s Ө ::t O ::t-alt Oe ::lcode kir
|
591 |
+
::s ө ::t o ::t-alt oe ::lcode kir
|
592 |
+
::s Ү ::t U ::t-alt Y, Ue ::lcode kir
|
593 |
+
::s ү ::t u ::t-alt y, ue ::lcode kir
|
594 |
+
::s Ы ::t I ::t-alt Y ::lcode kir
|
595 |
+
::s ы ::t i ::t-alt y ::lcode kir
|
596 |
+
::s йы ::t yi ::lcode kir
|
597 |
+
::s ый ::t iy ::lcode kir
|
598 |
+
|
599 |
+
# Ossetian
|
600 |
+
::s ийы ::t iy ::lcode oss
|
601 |
+
|
602 |
+
# Gothic
|
603 |
+
::s 𐌴 ::t e ::comment Gothic letter aihvus
|
604 |
+
::s 𐌹 ::t i ::comment Gothic letter eis
|
605 |
+
::s 𐍇 ::t x ::comment Gothic letter iggws
|
606 |
+
|
607 |
+
# Runic
|
608 |
+
::s ᛫ ::t " " ::comment Runic single punctuation, used as word separator
|
609 |
+
::s ᛬ ::t . ::comment Runic multiple punctuation, used as sentence separator
|
610 |
+
|
611 |
+
# Ogham
|
612 |
+
::s ᚁ ::t b ::comment Ogham letter Beith
|
613 |
+
::s ᚂ ::t l ::comment Ogham letter Luis
|
614 |
+
::s ᚃ ::t f ::comment Ogham letter Fearn
|
615 |
+
::s ᚄ ::t s ::comment Ogham letter Sail
|
616 |
+
::s ᚅ ::t n ::comment Ogham letter Nion
|
617 |
+
::s ᚋ ::t m ::comment Ogham letter Muin
|
618 |
+
::s ᚌ ::t g ::comment Ogham letter Gort
|
619 |
+
::s ᚍ ::t v ::t-alt ng ::comment Ogham letter nGéadal
|
620 |
+
::s ᚎ ::t z ::comment Ogham letter Straif
|
621 |
+
::s ᚏ ::t r ::comment Ogham letter Ruis
|
622 |
+
::s ᚆ ::t h ::t-alt j ::comment Ogham letter Uath
|
623 |
+
::s ᚇ ::t d ::comment Ogham letter Dair
|
624 |
+
::s ᚈ ::t t ::comment Ogham letter Tinne
|
625 |
+
::s ᚉ ::t k ::comment Ogham letter Coll
|
626 |
+
::s ᚊ ::t q ::t-alt kw ::comment Ogham letter Ceirt
|
627 |
+
::s ᚐ ::t a ::comment Ogham letter Ailm
|
628 |
+
::s ᚑ ::t o ::comment Ogham letter Onn
|
629 |
+
::s ᚒ ::t u ::comment Ogham letter Úr
|
630 |
+
::s ᚓ ::t e ::comment Ogham letter Eadhadh
|
631 |
+
::s ᚔ ::t i ::comment Ogham letter Iodhadh
|
632 |
+
::s ᚚ ::t p ::comment Ogham letter Peith
|
633 |
+
# Additional Ogham letters (outside standard alphabet)
|
634 |
+
::s ᚕ ::t eo ::t-alt ea ::comment Ogham additional letter Éabhadh
|
635 |
+
::s ᚖ ::t oi ::t-alt oe ::comment Ogham additional letter Ór
|
636 |
+
::s ᚗ ::t ui ::t-alt ua ::comment Ogham additional letter Uilleann
|
637 |
+
::s ᚘ ::t p ::t-alt io ::comment Ogham additional letter Ifín
|
638 |
+
::s ᚙ ::t ch ::t-alt x, ai ::comment Ogham additional letter Eamhancholl
|
639 |
+
::s ::t " " ::comment Ogham space mark
|
640 |
+
::s ᚛ ::t "" ::comment Ogham feather mark
|
641 |
+
::s ᚜ ::t "" ::comment Ogham feather mark
|
642 |
+
|
643 |
+
# Georgian
|
644 |
+
::s ა ::t a ::comment Georgian letter an
|
645 |
+
::s ე ::t e ::comment Georgian letter en
|
646 |
+
::s ი ::t i ::comment Georgian letter in
|
647 |
+
::s ო ::t o ::comment Georgian letter on
|
648 |
+
::s უ ::t u ::comment Georgian letter un
|
649 |
+
::s ჱ ::t ey ::comment archaic Georgian letter he
|
650 |
+
::s ჲ ::t i ::comment archaic Georgian letter hie
|
651 |
+
::s ჳ ::t w ::comment archaic Georgian letter we
|
652 |
+
::s ჴ ::t q ::comment archaic Georgian letter har
|
653 |
+
::s ჵ ::t o ::comment archaic Georgian letter hoe
|
654 |
+
::s ჶ ::t f ::comment Georgian letter fi (Greek phi)
|
655 |
+
::s ჷ ::t e ::comment Georgian letter yn (schwa)
|
656 |
+
::s ჸ ::t a ::comment Georgian letter elifi
|
657 |
+
::s ჹ ::t g ::comment Georgian letter gan
|
658 |
+
::s ჺ ::t ' ::comment Georgian letter ain
|
659 |
+
::s ჼ ::t n ::comment Georgian letter nar
|
660 |
+
::s ჽ ::t e ::comment Georgian letter aen
|
661 |
+
::s ჾ ::t ::comment Georgian letter hard sign
|
662 |
+
::s ჿ ::t w ::comment Georgian letter labial sign
|
663 |
+
|
664 |
+
::s Ⴚ ::t TS ::comment GEORGIAN CAPITAL LETTER CAN
|
665 |
+
::s ც ::t ts ::comment GEORGIAN LETTER CAN
|
666 |
+
::s Ც ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CAN
|
667 |
+
::s ⴚ ::t ts ::comment GEORGIAN SMALL LETTER CAN
|
668 |
+
::s Ⴜ ::t TS ::comment GEORGIAN CAPITAL LETTER CIL
|
669 |
+
::s წ ::t ts ::comment GEORGIAN LETTER CIL
|
670 |
+
::s Წ ::t TS ::comment GEORGIAN MTAVRULI CAPITAL LETTER CIL
|
671 |
+
::s ⴜ ::t ts ::comment GEORGIAN SMALL LETTER CIL
|
672 |
+
::s Ⴛ ::t DZ ::comment GEORGIAN CAPITAL LETTER JIL
|
673 |
+
::s ძ ::t dz ::comment GEORGIAN LETTER JIL
|
674 |
+
::s Ძ ::t DZ ::comment GEORGIAN MTAVRULI CAPITAL LETTER JIL
|
675 |
+
::s ⴛ ::t dz ::comment GEORGIAN SMALL LETTER JIL
|
676 |
+
::s Ⴟ ::t J ::comment GEORGIAN CAPITAL LETTER JHAN
|
677 |
+
::s ჯ ::t j ::comment GEORGIAN LETTER JHAN
|
678 |
+
::s Ჯ ::t J ::comment GEORGIAN MTAVRULI CAPITAL LETTER JHAN
|
679 |
+
::s ⴟ ::t j ::comment GEORGIAN SMALL LETTER JHAN
|
680 |
+
|
681 |
+
|
682 |
+
::s Ⴀ ::t A ::comment Georgian capital letter an
|
683 |
+
::s Ⴄ ::t E ::comment Georgian capital letter en
|
684 |
+
::s Ⴈ ::t I ::comment Georgian capital letter in
|
685 |
+
::s Ⴍ ::t O ::comment Georgian capital letter on
|
686 |
+
::s Ⴓ ::t U ::comment Georgian capital letter un
|
687 |
+
::s Ⴡ ::t EY ::comment archaic Georgian capital letter he
|
688 |
+
::s Ⴢ ::t I ::comment archaic Georgian capital letter hie
|
689 |
+
::s Ⴣ ::t W ::comment archaic Georgian capital letter we
|
690 |
+
::s Ⴤ ::t Q ::comment archaic Georgian capital letter har
|
691 |
+
::s Ⴥ ::t O ::comment archaic Georgian capital letter hoe
|
692 |
+
::s Ⴧ ::t E ::comment archaic Georgian capital letter yn (schwa)
|
693 |
+
::s Ⴭ ::t E ::comment archaic Georgian capital letter aen
|
694 |
+
|
695 |
+
::s Ა ::t A ::comment Georgian Mtavruli capital letter an
|
696 |
+
::s Ე ::t E ::comment Georgian Mtavruli capital letter en
|
697 |
+
::s Ი ::t I ::comment Georgian Mtavruli capital letter in
|
698 |
+
::s Ო ::t O ::comment Georgian Mtavruli capital letter on
|
699 |
+
::s Უ ::t U ::comment Georgian Mtavruli capital letter un
|
700 |
+
::s Ჱ ::t EY ::comment archaic Georgian Mtavruli capital letter he
|
701 |
+
::s Ჲ ::t I ::comment archaic Georgian Mtavruli capital letter hie
|
702 |
+
::s Ჳ ::t W ::comment archaic Georgian Mtavruli capital letter we
|
703 |
+
::s Ჴ ::t Q ::comment archaic Georgian Mtavruli capital letter har
|
704 |
+
::s Ჵ ::t O ::comment archaic Georgian Mtavruli capital letter hoe
|
705 |
+
::s Ჶ ::t F ::comment Georgian Mtavruli capital letter fi (Greek phi)
|
706 |
+
::s Ჷ ::t E ::comment Georgian Mtavruli capital letter yn (schwa)
|
707 |
+
::s Ჸ ::t A ::comment Georgian Mtavruli capital letter elifi
|
708 |
+
::s Ჹ ::t G ::comment Georgian Mtavruli capital letter gan
|
709 |
+
::s Ჺ ::t ' ::comment Georgian Mtavruli capital letter ain
|
710 |
+
::s Ჽ ::t E ::comment Georgian Mtavruli capital letter aen
|
711 |
+
::s Ჾ ::t ::comment Georgian Mtavruli capital letter hard sign
|
712 |
+
::s Ჿ ::t W ::comment Georgian Mtavruli capital letter labial sign
|
713 |
+
|
714 |
+
::s ⴀ ::t a ::comment Georgian small letter an
|
715 |
+
::s ⴄ ::t e ::comment Georgian small letter en
|
716 |
+
::s ⴈ ::t i ::comment Georgian small letter in
|
717 |
+
::s ⴍ ::t o ::comment Georgian small letter on
|
718 |
+
::s ⴓ ::t u ::comment Georgian small letter un
|
719 |
+
::s ⴡ ::t ey ::comment archaic Georgian small letter he
|
720 |
+
::s ⴢ ::t i ::comment archaic Georgian small letter hie
|
721 |
+
::s ⴣ ::t w ::comment archaic Georgian small letter we
|
722 |
+
::s ⴤ ::t q ::comment archaic Georgian small letter har
|
723 |
+
::s ⴥ ::t o ::comment archaic Georgian small letter hoe
|
724 |
+
::s ⴧ ::t e ::comment Georgian small letter yn (schwa)
|
725 |
+
::s ⴭ ::t e ::comment Georgian small letter aen
|
726 |
+
|
727 |
+
# Armenian
|
728 |
+
::s Ա ::t A ::comment Armenian capital letter ayb
|
729 |
+
::s ա ::t a ::comment Armenian small letter ayb
|
730 |
+
::s ՠ ::t a ::comment ARMENIAN SMALL LETTER TURNED AYB (CHECK)
|
731 |
+
::s Ե ::t E ::comment Armenian capital letter ech ::dont-use-at-start-of-word
|
732 |
+
::s ե ::t e ::comment Armenian small letter ech ::dont-use-at-start-of-word
|
733 |
+
::s Ե ::t Ye ::comment Armenian capital letter ech ::use-only-at-start-of-word
|
734 |
+
::s ե ::t ye ::comment Armenian small letter ech ::use-only-at-start-of-word
|
735 |
+
::s Է ::t E ::comment Armenian capital letter eh
|
736 |
+
::s է ::t e ::comment Armenian small letter eh
|
737 |
+
::s Ը ::t E ::comment Armenian capital letter et
|
738 |
+
::s ը ::t e ::comment Armenian small letter et
|
739 |
+
::s Ի ::t I ::comment Armenian capital letter ini
|
740 |
+
::s ի ::t i ::comment Armenian small letter ini
|
741 |
+
::s Յ ::t Y ::comment Armenian capital letter yi
|
742 |
+
::s յ ::t y ::comment Armenian small letter yi
|
743 |
+
::s ֈ ::t y ::comment ARMENIAN SMALL LETTER YI WITH STROKE (CHECK)
|
744 |
+
::s Ո ::t Vo ::comment Armenian capital letter vo ::use-only-at-start-of-word
|
745 |
+
::s ո ::t vo ::comment Armenian small letter vo ::use-only-at-start-of-word
|
746 |
+
::s Ո ::t O ::comment Armenian capital letter vo ::dont-use-at-start-of-word
|
747 |
+
::s ո ::t o ::comment Armenian small letter vo ::dont-use-at-start-of-word
|
748 |
+
::s Ւ ::t W ::comment Armenian capital letter yiwn
|
749 |
+
::s ւ ::t w ::comment Armenian small letter yiwn
|
750 |
+
::s Օ ::t O ::comment Armenian capital letter oh
|
751 |
+
::s օ ::t o ::comment Armenian small letter oh
|
752 |
+
::s Խ ::t Kh ::comment Armenian capital letter xeh
|
753 |
+
::s խ ::t kh ::comment Armenian small letter xeh
|
754 |
+
|
755 |
+
::s Ժ ::t Zh ::comment Armenian capital letter zhe
|
756 |
+
::s Ղ ::t Gh ::comment Armenian capital letter ghad
|
757 |
+
::s Ճ ::t Tch ::comment Armenian capital letter cheh
|
758 |
+
::s ճ ::t tch ::comment Armenian small letter cheh
|
759 |
+
::s Շ ::t Sh ::comment Armenian capital letter sha
|
760 |
+
::s Չ ::t Ch ::comment Armenian capital letter cha
|
761 |
+
::s Ջ ::t J ::comment Armenian capital letter jheh
|
762 |
+
::s ջ ::t j ::comment Armenian small letter jheh
|
763 |
+
::s Վ ::t V ::comment Armenian capital letter vew
|
764 |
+
::s վ ::t v ::comment Armenian small letter vew
|
765 |
+
::s Ձ ::t Dz ::comment Armenian capital letter ja
|
766 |
+
::s ձ ::t dz ::comment Armenian small letter ja
|
767 |
+
::s Ծ ::t Ts ::comment Armenian capital letter ca
|
768 |
+
::s ծ ::t ts ::comment Armenian small letter ca
|
769 |
+
::s Ք ::t K ::t-alt Q ::comment Armenian capital letter keh - sometimes romanized as K' or Q
|
770 |
+
::s ք ::t k ::t-alt q ::comment Armenian small letter keh - sometimes romanized as k' or q
|
771 |
+
|
772 |
+
::s են ::t en ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
773 |
+
::s եմ ::t em ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
774 |
+
::s ենք ::t enk ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
775 |
+
::s ես ::t es ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
776 |
+
::s եք ::t ek ::use-only-for-whole-word ::comment exception (auxiliary verb)
|
777 |
+
|
778 |
+
::s և ::t ev ::comment Armenian small ligature ech yiwn
|
779 |
+
::s ՈՒ ::t U ::comment Armenian capital vo+yiwn
|
780 |
+
::s Ու ::t U ::comment Armenian capital/small vo+yiwn
|
781 |
+
::s ու ::t u ::comment Armenian small vo+yiwn
|
782 |
+
|
783 |
+
::s իւ ::t yu
|
784 |
+
|
785 |
+
## Japanese
|
786 |
+
# Katakana
|
787 |
+
::s シ ::t shi
|
788 |
+
::s チ ::t chi
|
789 |
+
::s フ ::t fu
|
790 |
+
::s ジ ::t ji
|
791 |
+
::s ヂ ::t ji
|
792 |
+
::s ヅ ::t zu
|
793 |
+
::s シャ ::t sha
|
794 |
+
::s シュ ::t shu
|
795 |
+
::s ショ ::t sho
|
796 |
+
::s チャ ::t cha
|
797 |
+
::s チェ ::t che
|
798 |
+
::s チュ ::t chu
|
799 |
+
::s チョ ::t cho
|
800 |
+
::s ジャ ::t ja
|
801 |
+
::s ジュ ::t ju
|
802 |
+
::s ジョ ::t jo
|
803 |
+
::s ジェ ::t je
|
804 |
+
::s ヂャ ::t ja
|
805 |
+
::s ヂュ ::t ju
|
806 |
+
::s ヂョ ::t jo
|
807 |
+
::s フェ ::t fe
|
808 |
+
::s ヴェ ::t ve
|
809 |
+
::s フィ ::t fi
|
810 |
+
::s ウィ ::t wi
|
811 |
+
::s ヴィ ::t vi
|
812 |
+
::s ティ ::t ti
|
813 |
+
::s ディ ::t di
|
814 |
+
::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
|
815 |
+
::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
|
816 |
+
::s 𛅤 ::t i ::comment KATAKANA LETTER SMALL WI
|
817 |
+
::s 𛅥 ::t e ::comment KATAKANA LETTER SMALL WE
|
818 |
+
::s 𛅦 ::t o ::comment KATAKANA LETTER SMALL WO
|
819 |
+
# Hiragana
|
820 |
+
::s し ::t shi
|
821 |
+
::s ち ::t chi
|
822 |
+
::s つ ::t tsu
|
823 |
+
::s ふ ::t fu
|
824 |
+
::s を ::t o
|
825 |
+
::s じ ::t ji
|
826 |
+
::s ぢ ::t ji
|
827 |
+
::s づ ::t zu
|
828 |
+
::s しゃ ::t sha
|
829 |
+
::s しゅ ::t shu
|
830 |
+
::s しょ ::t sho
|
831 |
+
::s ちゃ ::t cha
|
832 |
+
::s ちゅ ::t chu
|
833 |
+
::s ちょ ::t cho
|
834 |
+
::s じゃ ::t ja
|
835 |
+
::s じゅ ::t ju
|
836 |
+
::s じょ ::t jo
|
837 |
+
::s ぢゃ ::t ja
|
838 |
+
::s ぢゅ ::t ju
|
839 |
+
::s ぢょ ::t jo
|
840 |
+
::s 𛅐 ::t i ::comment HIRAGANA LETTER SMALL WI
|
841 |
+
::s 𛅑 ::t e ::comment HIRAGANA LETTER SMALL WE
|
842 |
+
::s 𛅒 ::t o ::comment HIRAGANA LETTER SMALL WO
|
843 |
+
::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
|
844 |
+
::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
|
845 |
+
|
846 |
+
::s フ ::t fu ::t-alt f
|
847 |
+
::s キ ::t ki ::t-alt k
|
848 |
+
::s ク ::t ku ::t-alt k
|
849 |
+
::s ラ ::t ra ::t-alt la
|
850 |
+
::s リ ::t ri ::t-alt li
|
851 |
+
::s ル ::t ru ::t-alt lu, l, r
|
852 |
+
::s レ ::t re ::t-alt le
|
853 |
+
::s ロ ::t ro ::t-alt lo
|
854 |
+
::s ム ::t mu ::t-alt m ::example キム = Kim
|
855 |
+
::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
|
856 |
+
::s ス ::t su ::t-alt s
|
857 |
+
::s ト ::t to ::t-alt t
|
858 |
+
::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
|
859 |
+
|
860 |
+
::s ㋿ ::t Reiwa ::comment SQUARE ERA NAME REIWA
|
861 |
+
|
862 |
+
# Chinese
|
863 |
+
::s 邦 ::t bang ::t-alt bon, bum, bun, pon
|
864 |
+
::s 鲍 ::t bao ::t-alt bow
|
865 |
+
::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
|
866 |
+
::s 贝 ::t bei ::t-alt ber
|
867 |
+
::s 本 ::t ben ::t-alt bern, bon, bourn, burn
|
868 |
+
::s 彼得 ::t bide ::t-alt peter, pet
|
869 |
+
::s 伯 ::t bo ::t-alt ber
|
870 |
+
::s 波 ::t bo ::t-alt po
|
871 |
+
::s 布 ::t bu ::t-alt b
|
872 |
+
::s 策 ::t ce ::t-alt tze, tzer
|
873 |
+
::s 曾 ::t ceng ::t-alt tzen, zen
|
874 |
+
::s 彻 ::t che ::t-alt tche
|
875 |
+
::s 茨 ::t ci ::t-alt ts, tz, z
|
876 |
+
::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
|
877 |
+
::s 蒂 ::t di ::t-alt ti, tti
|
878 |
+
::s 丁 ::t ding ::t-alt din, tin
|
879 |
+
::s 顿 ::t dun ::t-alt ton
|
880 |
+
::s 多 ::t duo ::t-alt do, dor, to
|
881 |
+
::s 尔 ::t er ::t-alt l, le, ll, r
|
882 |
+
::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
|
883 |
+
::s 夫 ::t fu ::t-alt f, v, v
|
884 |
+
::s 福 ::t fu ::t-alt faw, for, ford
|
885 |
+
::s 哥 ::t ge ::t-alt go, co
|
886 |
+
::s 戈 ::t ge ::t-alt go
|
887 |
+
::s 各 ::t ge ::t-alt go, co
|
888 |
+
::s 赫 ::t he ::t-alt ch, che, cher, ge
|
889 |
+
::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
|
890 |
+
::s 怀 ::t huai ::t-alt whi, wi, wy
|
891 |
+
::s 惠 ::t hui ::t-alt wha, whea
|
892 |
+
::s 基 ::t ji ::t-alt ki, chi
|
893 |
+
::s 吉 ::t ji ::t-alt gi, gui
|
894 |
+
::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
|
895 |
+
::s 杰 ::t jie ::t-alt ger
|
896 |
+
::s 金 ::t jin ::t-alt kin, gin
|
897 |
+
::s 斤 ::t jin ::t-alt zin
|
898 |
+
::s 康 ::t kang ::t-alt con, corn
|
899 |
+
::s 考 ::t kao ::t-alt cow, cour
|
900 |
+
::s 克 ::t ke ::t-alt k, che, cher
|
901 |
+
::s 科 ::t ke ::t-alt ko
|
902 |
+
::s 拉 ::t la ::t-alt ra ::example Tirana
|
903 |
+
::s 朗 ::t lang ::t-alt lon, ron
|
904 |
+
::s 赖 ::t lai ::t-alt ri
|
905 |
+
::s 劳 ::t lao ::t-alt low
|
906 |
+
::s 勒 ::t lei ::t-alt ler
|
907 |
+
::s 伦 ::t lun ::t-alt lon, ran, ron
|
908 |
+
::s 里 ::t li ::t-alt ri
|
909 |
+
::s 利 ::t li ::t-alt ri ::example Ferrari
|
910 |
+
::s 隆 ::t long ::t-alt lon, lum, lund
|
911 |
+
::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
|
912 |
+
::s 洛 ::t luo ::t-alt lo, low, ro
|
913 |
+
::s 默 ::t mo ::t-alt mer
|
914 |
+
::s 纳 ::t na ::t-alt ne, ner
|
915 |
+
::s 珀 ::t po ::t-alt per
|
916 |
+
::s 奇 ::t qi ::t-alt chi, dge, ge, tch
|
917 |
+
::s 齐 ::t qi ::t-alt tsi, zi
|
918 |
+
::s 乔 ::t qiao ::t-alt jo
|
919 |
+
::s 青 ::t qing ::t-alt tsing
|
920 |
+
::s 琼 ::t qiong ::t-alt jon, jum, jun
|
921 |
+
::s 瑟 ::t se ::t-alt the
|
922 |
+
::s 什 ::t shen ::t-alt sh
|
923 |
+
::s 圣 ::t sheng ::t-alt san, sao, saint
|
924 |
+
::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
|
925 |
+
::s 索 ::t suo ::t-alt tho
|
926 |
+
::s 特 ::t te ::t-alt t
|
927 |
+
::s 翁 ::t weng ::t-alt on
|
928 |
+
::s 沃 ::t wo ::t-alt ver, vo, war, wer
|
929 |
+
::s 乌 ::t wu ::t-alt ou, u
|
930 |
+
::s 希 ::t xi ::t-alt chi, hi, shi
|
931 |
+
::s 西 ::t xi ::t-alt s, si
|
932 |
+
::s 锡 ::t xi ::t-alt ci, si, thi, zi
|
933 |
+
::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
|
934 |
+
::s 香 ::t xiang ::t-alt chan, cham
|
935 |
+
::s 歇 ::t xie ::t-alt she
|
936 |
+
::s 谢 ::t xie ::t-alt che, she
|
937 |
+
::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
|
938 |
+
::s 欣 ::t xin ::t-alt hin, shin
|
939 |
+
::s 休 ::t xiu ::t-alt hu, hue
|
940 |
+
::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
|
941 |
+
::s 许 ::t xu ::t-alt hue, schue
|
942 |
+
::s 逊 ::t xun ::t-alt son
|
943 |
+
::s 耶 ::t ye ::t-alt yer, ier
|
944 |
+
::s 泽 ::t ze ::t-alt ser
|
945 |
+
::s 扎 ::t zha ::t-alt za
|
946 |
+
::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
|
947 |
+
::s 治 ::t zhi ::t-alt ge ::example George
|
948 |
+
|
949 |
+
## Numbers
|
950 |
+
# Chinese and Japanese numbers
|
951 |
+
::s 零 ::num 0
|
952 |
+
::s 〇 ::num 0
|
953 |
+
::s 一 ::num 1
|
954 |
+
::s 二 ::num 2
|
955 |
+
::s 三 ::num 3
|
956 |
+
::s 四 ::num 4
|
957 |
+
::s 五 ::num 5
|
958 |
+
::s 六 ::num 6
|
959 |
+
::s 七 ::num 7
|
960 |
+
::s 八 ::num 8
|
961 |
+
::s 九 ::num 9
|
962 |
+
::s 十 ::num 10
|
963 |
+
::s 百 ::num 100
|
964 |
+
::s 千 ::num 1000
|
965 |
+
::s 万 ::num 10000
|
966 |
+
::s 萬 ::num 10000
|
967 |
+
::s 亿 ::num 100000000
|
968 |
+
::s 億 ::num 100000000
|
969 |
+
::s 兆 ::num 1000000000000
|
970 |
+
::s 京 ::num 10000000000000000
|
971 |
+
|
972 |
+
# numbers in non-number words (to be extended)
|
973 |
+
::s 一贯 ::t yiguan ::comment consistent
|
974 |
+
|
975 |
+
::s 红十字会 ::t hongshizihui ::comment Red Cross
|
976 |
+
|
977 |
+
::s 百度 ::t baidu ::comment Baidu (company)
|
978 |
+
::s 百分 ::t baifen ::comment percent
|
979 |
+
::s 百合 ::t baihe ::comment lily
|
980 |
+
::s 百货 ::t baihuo ::comment general merchandise
|
981 |
+
::s 百科 ::t baike ::comment encyclopedia
|
982 |
+
::s 百老汇 ::t bailaohui
|
983 |
+
::s 百灵 ::t bailing
|
984 |
+
::s 百慕大 ::t baimuda
|
985 |
+
::s 百日咳 ::t bairike
|
986 |
+
::s 百色市 ::t baiseshi
|
987 |
+
::s 百事可乐 ::t baishikele ::comment Pepsi Cola
|
988 |
+
::s 百無 ::t baiwu
|
989 |
+
::s 百香 ::t baixiang
|
990 |
+
::s 百姓 ::t baixing
|
991 |
+
::s 百叶 ::t baiye
|
992 |
+
::s 百色 ::t bose
|
993 |
+
::s 杨百翰 ::t yangbaihan ::comment Brigham Young
|
994 |
+
|
995 |
+
::s 北京 ::t beijing
|
996 |
+
::s 京都 ::t jingdu
|
997 |
+
::s 东京 ::t dongjing
|
998 |
+
::s 京胡 ::t jinghu
|
999 |
+
::s 南京 ::t nanjing
|
1000 |
+
::s 普京 ::t pujing ::comment Putin
|
1001 |
+
::s 東京 ::t dongjing ::comment Tokyo
|
1002 |
+
::s 京兆 ::t jingzhao
|
1003 |
+
|
1004 |
+
::s ㎢ ::t km²
|
1005 |
+
::s ㎥ ::t m³
|
1006 |
+
::s ㎝ ::t cm
|
1007 |
+
|
1008 |
+
## Indian
|
1009 |
+
# see mostly under UnicodeDataOverwrite.txt
|
1010 |
+
|
1011 |
+
# Malayalam
|
1012 |
+
::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
|
1013 |
+
|
1014 |
+
# Tamil
|
1015 |
+
::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
|
1016 |
+
::s ஃப ::t f ::comment h+p=f
|
1017 |
+
::s ஃஜ ::t z ::comment h+j=z
|
1018 |
+
|
1019 |
+
# Myanmar/Burmese
|
1020 |
+
# ::s ့ ::t ::comment dot below, denotes creaky tone
|
1021 |
+
# ::s း ::t ::comment visarga, denotes high tone
|
1022 |
+
::s ၌ ::t -nai ::comment locative
|
1023 |
+
::s ၍ ::t -jwe ::comment completed
|
1024 |
+
::s ၎ ::t legau ::comment aforementioned
|
1025 |
+
::s ၏ ::t -i ::comment genitive
|
1026 |
+
|
1027 |
+
# Lao
|
1028 |
+
::s ັ ::t a ::comment vowel sign mai kan
|
1029 |
+
::s ົ ::t o ::comment vowel sign mai kon
|
1030 |
+
::s ູ ::t uu ::comment vowel sign uu
|
1031 |
+
::s ຽ ::t y ::comment semivowel sign nyo
|
1032 |
+
::s ຼ ::t l ::comment semivowel sign lo
|
1033 |
+
::s ລ ::t l ::comment lo loot
|
1034 |
+
::s ຣ ::t l ::comment lo ling
|
1035 |
+
::s ໝ ::t m ::comment ho mo
|
1036 |
+
::s ໜ ::t n ::comment ho no
|
1037 |
+
::s ຢ ::t y ::comment yo
|
1038 |
+
::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
|
1039 |
+
::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
|
1040 |
+
::s ຯ ::t ... ::comment Lao ellipsis
|
1041 |
+
|
1042 |
+
# Thai
|
1043 |
+
::s ออ ::t o
|
1044 |
+
::s อั ::t a
|
1045 |
+
::s อิ ::t i
|
1046 |
+
::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
|
1047 |
+
|
1048 |
+
# Khmer
|
1049 |
+
::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
|
1050 |
+
::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
|
1051 |
+
::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
|
1052 |
+
::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
|
1053 |
+
::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
|
1054 |
+
::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
|
1055 |
+
|
1056 |
+
## Semitic languages
|
1057 |
+
# Arabic
|
1058 |
+
::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
|
1059 |
+
::s ء ::t ' ::comment hamza
|
1060 |
+
::s ٔ ::t ' ::comment hamza above
|
1061 |
+
::s ٕ ::t ' ::comment hamza below
|
1062 |
+
::s ع ::t ' ::comment ain
|
1063 |
+
::s آ ::t a ::comment alef madda
|
1064 |
+
::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
|
1065 |
+
::s إ ::t i ::comment alef with hamza below
|
1066 |
+
::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
|
1067 |
+
::s ة ::t a ::comment teh marbuta
|
1068 |
+
::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
|
1069 |
+
::s ي ::t y ::comment Arabic yeh
|
1070 |
+
::s ى ::t a ::comment alef maksura
|
1071 |
+
::s ﻯ ::t a ::comment alef maksura isolated form
|
1072 |
+
::s ﻰ ::t a ::comment alef maksura final form
|
1073 |
+
::s ﯨ ::t a ::comment Uighur Kazach Kirghiz alef maksura initial form
|
1074 |
+
::s ﯩ ::t a ::comment Uighur Kazach Kirghiz alef maksura medial form
|
1075 |
+
::s ٰ ::t a ::comment Arabic letter superscript alef
|
1076 |
+
::s ـ ::t ::comment tatweel (filler)
|
1077 |
+
::s َ ::t a ::comment fatha ("-a")
|
1078 |
+
::s ُ ::t u ::comment damma ("-u")
|
1079 |
+
::s ِ ::t i ::comment kasra ("-i")
|
1080 |
+
::s ْ ::t ::comment sukun (no vowel)
|
1081 |
+
::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
|
1082 |
+
::s ً ::t ::comment fathatan ("-an")
|
1083 |
+
::s اً ::t an ::comment alef + fathatan
|
1084 |
+
::s ٌ ::t ::comment dammatan ("-un")
|
1085 |
+
::s ٍ ::t ::comment kasratan ("-in")
|
1086 |
+
::s ّ ::t ::comment shadda (consonant doubler)
|
1087 |
+
::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
|
1088 |
+
::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
|
1089 |
+
::s ۾ ::t men ::comment Sindhi postposition men
|
1090 |
+
::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
|
1091 |
+
::s ﷴ ::t mohammad ::comment "Mohammad"
|
1092 |
+
::s ﷸ ::t wasallam ::comment "and peace"
|
1093 |
+
::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
|
1094 |
+
|
1095 |
+
::s ࣓ ::t waw ::comment ARABIC SMALL LOW WAW
|
1096 |
+
::s ࣔ ::t al-rub ::comment ARABIC SMALL HIGH WORD AR-RUB
|
1097 |
+
::s ࣕ ::t s ::comment ARABIC SMALL HIGH SAD
|
1098 |
+
::s ࣖ ::t ' ::comment ARABIC SMALL HIGH AIN
|
1099 |
+
::s ࣗ ::t q ::comment ARABIC SMALL HIGH QAF
|
1100 |
+
::s ࣘ ::t n ::comment ARABIC SMALL HIGH NOON WITH KASRA
|
1101 |
+
::s ࣙ ::t n ::comment ARABIC SMALL LOW NOON WITH KASRA
|
1102 |
+
::s ࣚ ::t al-thalatha ::comment ARABIC SMALL HIGH WORD ATH-THALATHA
|
1103 |
+
::s ࣛ ::t al-sajda ::comment ARABIC SMALL HIGH WORD AS-SAJDA
|
1104 |
+
::s ࣜ ::t al-nisf ::comment ARABIC SMALL HIGH WORD AN-NISF
|
1105 |
+
::s ࣝ ::t sakta ::comment ARABIC SMALL HIGH WORD SAKTA
|
1106 |
+
::s ࣞ ::t qif ::comment ARABIC SMALL HIGH WORD QIF
|
1107 |
+
::s ࣟ ::t waqfa ::comment ARABIC SMALL HIGH WORD WAQFA
|
1108 |
+
::s ࣠ ::t ::comment ARABIC SMALL HIGH FOOTNOTE MARKER (CHECK)
|
1109 |
+
::s ࣡ ::t ::comment ARABIC SMALL HIGH SIGN SAFHA (CHECK)
|
1110 |
+
::s ::t ::comment ARABIC DISPUTED END OF AYAH (CHECK)
|
1111 |
+
|
1112 |
+
# Farsi
|
1113 |
+
::s ی ::t i ::t-alt y ::comment Contributed by Nima
|
1114 |
+
::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
|
1115 |
+
::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
1116 |
+
::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
|
1117 |
+
::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
|
1118 |
+
::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
|
1119 |
+
::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
|
1120 |
+
::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
|
1121 |
+
::s عا ::t a ::lcode fas ::comment Contributed by Nima
|
1122 |
+
::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
|
1123 |
+
::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
|
1124 |
+
::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
|
1125 |
+
::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
1126 |
+
::s ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
|
1127 |
+
::s غ ::t gh ::t-alt g ::lcode fas
|
1128 |
+
::s آئی ::t ai ::t-alt ae ::lcode fas
|
1129 |
+
::s ائی ::t ai ::t-alt ae ::lcode fas
|
1130 |
+
::s آئو ::t au ::t-alt ao ::lcode fas
|
1131 |
+
::s ائو ::t au ::t-alt ao ::lcode fas
|
1132 |
+
|
1133 |
+
# Kashmiri (so far: educated guesses)
|
1134 |
+
::s ٖ ::t a ::comment Arabic subscript alef U+0656
|
1135 |
+
::s ٗ ::t u ::comment Arabic inverted damma U+0657
|
1136 |
+
::s ۚ ::t j ::comment Arabic small high jeem U+06DA
|
1137 |
+
::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
|
1138 |
+
::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
|
1139 |
+
|
1140 |
+
# Pashto
|
1141 |
+
::s ٙ ::t e ::comment Arabic zwarakay
|
1142 |
+
::s ځ ::t z ::t-alt dz ::comment Pashto letter zim; Arabic letter "hah with hamza above"
|
1143 |
+
::s څ ::t ts ::t-alt c ::comment Pashto letter tsim; Arabic letter "hah with three dots above"
|
1144 |
+
::s ګ ::t g ::comment Pashto letter gaf; Arabic letter "kaf with ring"
|
1145 |
+
::s ڼ ::t n ::comment Arabic letter "noon with ring"
|
1146 |
+
::s ږ ::t g ::t-alt z, zh, j ::comment pronunciation varies regionally
|
1147 |
+
::s ښ ::t kh ::t-alt sh ::comment pronunciation varies regionally
|
1148 |
+
::s ه ::t h ::t-alt a ::lcode pus
|
1149 |
+
::s ۀ ::t e ::lcode pus ::comment Arabic letter "heh with yeh above"
|
1150 |
+
::s و ::t w ::t-alt o, u ::lcode pus
|
1151 |
+
::s ی ::t ay ::t-alt y ::lcode pus
|
1152 |
+
::s وی ::t wy ::t-alt oy, uy ::lcode pus
|
1153 |
+
::s ای ::t ay ::lcode pus
|
1154 |
+
::s ۍ ::t ay ::lcode pus
|
1155 |
+
::s ئ ::t ay ::t-alt y ::lcode pus
|
1156 |
+
::s ژ ::t zh ::t-alt z ::lcode pus ::comment [ʒ]
|
1157 |
+
::s ض ::t z ::t-alt d ::lcode pus
|
1158 |
+
::s ث ::t s ::lcode pus ::t-alt th ::comment Arabic letter theh (unvoiced th/θ)
|
1159 |
+
::s ذ ::t z ::lcode pus ::t-alt th ::comment Arabic letter thal (voiced th/ð)
|
1160 |
+
|
1161 |
+
# Hebrew
|
1162 |
+
::s ב ::t v ::comment Hebrew letter bet ::t-alt b
|
1163 |
+
::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
|
1164 |
+
::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
|
1165 |
+
::s פ ::t f ::comment Hebrew letter pe ::t-alt p
|
1166 |
+
::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
|
1167 |
+
::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
|
1168 |
+
::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
|
1169 |
+
::s ק ::t q ::t-alt k ::use-alt-in-pointed
|
1170 |
+
::s וֹ ::t o
|
1171 |
+
::s וּ ::t u
|
1172 |
+
::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
|
1173 |
+
::s י ::t y
|
1174 |
+
::s יּ ::t y
|
1175 |
+
::s יָּ ::t ya
|
1176 |
+
::s ײ ::t yy ::comment Hebrew ligature Yiddish double Yod (CHECK)
|
1177 |
+
::s ׯ ::t yyy ::comment HEBREW YOD TRIANGLE (CHECK)
|
1178 |
+
::s ע ::t '
|
1179 |
+
::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
|
1180 |
+
::s ֵי ::t e
|
1181 |
+
::s ִיּ ::t iy
|
1182 |
+
::s ִיָּ ::t iya
|
1183 |
+
::s ױ ::t oy
|
1184 |
+
::s א ::t a ::t-alt '
|
1185 |
+
::s אָ ::t a
|
1186 |
+
::s ֹא ::t o
|
1187 |
+
::s אַ ::t 'a
|
1188 |
+
::s אֲ ::t 'a
|
1189 |
+
::s אֶ ::t e
|
1190 |
+
::s אֱ ::t e
|
1191 |
+
::s פ ::t f
|
1192 |
+
::s פּ ::t p
|
1193 |
+
::s פַּ ::t pa
|
1194 |
+
::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
|
1195 |
+
::s שׁ ::t sh
|
1196 |
+
::s שָׁ ::t sha
|
1197 |
+
::s שָּׁ ::t sha ::comment ?
|
1198 |
+
::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
|
1199 |
+
::s שֶׁ ::t she
|
1200 |
+
::s שִׁ ::t shi
|
1201 |
+
::s שֻׁ ::t shu
|
1202 |
+
::s שׂ ::t s
|
1203 |
+
::s שָׂ ::t sa
|
1204 |
+
::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
|
1205 |
+
::s כּ ::t k
|
1206 |
+
::s כֶּ ::t ke
|
1207 |
+
::s כֹּ ::t ko
|
1208 |
+
::s בּ ::t b
|
1209 |
+
::s בַּ ::t ba
|
1210 |
+
::s בָּ ::t ba
|
1211 |
+
::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
|
1212 |
+
::s בֶּ ::t be
|
1213 |
+
::s תּ ::t t
|
1214 |
+
::s תַּ ::t ta
|
1215 |
+
::s תֵּ ::t te
|
1216 |
+
::s תִּ ::t ti
|
1217 |
+
::s דָּ ::t da
|
1218 |
+
::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
|
1219 |
+
::s גּ ::t g
|
1220 |
+
::s לֵּ ::t le
|
1221 |
+
::s ד׳ ::t dh
|
1222 |
+
::s ג׳ ::t j
|
1223 |
+
::s ת׳ ::t th
|
1224 |
+
::s ז׳ ::t zh
|
1225 |
+
::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
|
1226 |
+
::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
|
1227 |
+
::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
|
1228 |
+
::s ַ ::t a ::comment Hebrew point patah
|
1229 |
+
::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
|
1230 |
+
::s ֳ ::t o ::comment Hebrew point hataf qamats
|
1231 |
+
::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
|
1232 |
+
::s ֶ ::t e ::comment Hebrew point segol
|
1233 |
+
::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
|
1234 |
+
::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
|
1235 |
+
::s ֵ ::t e ::comment Hebrew point tsere
|
1236 |
+
::s ִ ::t i ::comment Hebrew point hiriq
|
1237 |
+
::s ֹ ::t o ::comment Hebrew point holam
|
1238 |
+
::s ֻ ::t u ::comment Hebrew point qubuts
|
1239 |
+
# ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
|
1240 |
+
|
1241 |
+
# Yiddish
|
1242 |
+
::s א ::t a ::lcode yid ::comment called "silent" alef
|
1243 |
+
::s אי ::t y ::lcode yid
|
1244 |
+
::s איי ::t ey ::lcode yid
|
1245 |
+
::s או ::t u ::lcode yid
|
1246 |
+
::s אוי ::t oy ::lcode yid
|
1247 |
+
::s אַ ::t a ::lcode yid
|
1248 |
+
::s אָ ::t o ::lcode yid
|
1249 |
+
::s ב ::t b ::lcode yid
|
1250 |
+
::s בֿ ::t v ::lcode yid
|
1251 |
+
::s דזש ::t dzh ::lcode yid
|
1252 |
+
::s ו ::t u ::lcode yid
|
1253 |
+
::s וּ ::t u ::lcode yid
|
1254 |
+
::s וֹ ::t o ::lcode yid
|
1255 |
+
::s װ ::t v ::lcode yid
|
1256 |
+
::s ווא ::t wa ::lcode yid
|
1257 |
+
::s וואַ ::t wa ::lcode yid
|
1258 |
+
::s ווע ::t we ::lcode yid
|
1259 |
+
::s ווי ::t wi ::lcode yid
|
1260 |
+
::s וואוי ::t wo ::lcode yid
|
1261 |
+
::s וי ::t oy ::lcode yid
|
1262 |
+
::s זש ::t zh ::lcode yid
|
1263 |
+
::s ח ::t ch ::lcode yid
|
1264 |
+
::s טש ::t tsh ::lcode yid
|
1265 |
+
::s יִ ::t i ::lcode yid
|
1266 |
+
::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
|
1267 |
+
::s ײַ ::t ay ::lcode yid
|
1268 |
+
::s כּ ::t k ::lcode yid
|
1269 |
+
::s כ ::t ch ::lcode yid
|
1270 |
+
::s ך ::t ch ::lcode yid
|
1271 |
+
::s ע ::t e ::lcode yid
|
1272 |
+
::s פּ ::t p ::lcode yid
|
1273 |
+
::s פֿ ::t f ::lcode yid
|
1274 |
+
::s ף ::t f ::lcode yid ::comment sometimes p
|
1275 |
+
::s ק ::t k ::lcode yid
|
1276 |
+
::s ת ::t s ::lcode yid
|
1277 |
+
|
1278 |
+
# Syriac/Aramaic (should be vetted by expert)
|
1279 |
+
::s ܰ ::t a ::comment Syriac pthaha above
|
1280 |
+
::s ܲ ::t a ::comment Syriac pthaha dotted
|
1281 |
+
::s ܳ ::t aa ::comment Syriac zqapha above
|
1282 |
+
::s ܴ ::t aa ::comment Syriac zqapha below
|
1283 |
+
::s ܵ ::t aa ::comment Syriac zqapha dotted
|
1284 |
+
::s ܶ ::t e ::comment Syriac rbasa above
|
1285 |
+
::s ܷ ::t e ::comment Syriac rbasa below
|
1286 |
+
::s ܿ ::t o ::comment Syriac rwaha
|
1287 |
+
::s ܸ ::t e ::comment Syriac dotted zlama horizontal
|
1288 |
+
::s ܹ ::t e ::comment Syriac dotted zlama angular
|
1289 |
+
::s ܺ ::t i ::comment Syriac hbasa above
|
1290 |
+
::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
|
1291 |
+
::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
|
1292 |
+
::s ܽ ::t o ::comment Syriac esasa above
|
1293 |
+
::s ܾ ::t u ::comment Syriac esasa below
|
1294 |
+
::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
|
1295 |
+
|
1296 |
+
::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
|
1297 |
+
::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
|
1298 |
+
::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
|
1299 |
+
::s ܒ̥ ::t v ::comment Syriac beth + ring-below
|
1300 |
+
::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
|
1301 |
+
::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
|
1302 |
+
::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
|
1303 |
+
::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
|
1304 |
+
::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
|
1305 |
+
::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
|
1306 |
+
::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
|
1307 |
+
::s ܦ̥ ::t f ::comment Syriac pe + ring-below
|
1308 |
+
::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
|
1309 |
+
::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
|
1310 |
+
::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
|
1311 |
+
|
1312 |
+
::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications
|
1313 |
+
::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
|
1314 |
+
::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
|
1315 |
+
|
1316 |
+
# Uzbek
|
1317 |
+
::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
|
1318 |
+
::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
|
1319 |
+
|
1320 |
+
# Uyghur
|
1321 |
+
::s ئا ::t a ::lcode uig
|
1322 |
+
::s ە ::t e ::lcode uig
|
1323 |
+
::s ئې ::t e ::lcode uig ::latinplus ë
|
1324 |
+
::s ې ::t e ::lcode uig ::latinplus ë
|
1325 |
+
::s ئە ::t e ::lcode uig
|
1326 |
+
::s يە ::t e ::lcode uig
|
1327 |
+
::s ئى ::t i ::lcode uig
|
1328 |
+
::s ى ::t i ::lcode uig
|
1329 |
+
::s ئو ::t o ::lcode uig
|
1330 |
+
::s و ::t o ::lcode uig
|
1331 |
+
::s ئۇ ::t u ::lcode uig
|
1332 |
+
::s ۇ ::t u ::lcode uig
|
1333 |
+
::s چ ::t ch ::t-alt q ::lcode uig
|
1334 |
+
::s خ ::t x ::lcode uig
|
1335 |
+
::s ژ ::t zh ::lcode uig
|
1336 |
+
::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
1337 |
+
::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
1338 |
+
::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
1339 |
+
::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
1340 |
+
::s ۋ ::t w ::lcode uig
|
1341 |
+
|
1342 |
+
# Maldivian
|
1343 |
+
::s ް ::t ::comment thaana sukun
|
1344 |
+
::s ަ ::t a ::comment thaana abafili
|
1345 |
+
::s ާ ::t aa ::comment thaana aabaafili
|
1346 |
+
::s ި ::t i ::comment thaana ibifili
|
1347 |
+
::s ީ ::t ee ::comment thaana eebeefili
|
1348 |
+
::s ު ::t u ::comment thaana ubufili
|
1349 |
+
::s ޫ ::t oo ::comment thaana ooboofili
|
1350 |
+
::s ެ ::t e ::comment thaana ebefili
|
1351 |
+
::s ޭ ::t ey ::comment thaana eybeyfili
|
1352 |
+
::s ޮ ::t o ::comment thaana obofili
|
1353 |
+
::s ޯ ::t oa ::comment thaana oaboafili
|
1354 |
+
|
1355 |
+
# Canadian syllabics (Inuktitut)
|
1356 |
+
::s ᑊ ::t p ::comment syllable final
|
1357 |
+
::s ᐟ ::t t ::comment syllable final
|
1358 |
+
::s ᐠ ::t k ::comment syllable final
|
1359 |
+
::s ᐨ ::t c ::comment syllable final
|
1360 |
+
::s ᒼ ::t m ::comment syllable final
|
1361 |
+
::s ᐣ ::t n ::comment syllable final
|
1362 |
+
::s ᐢ ::t s ::comment syllable final
|
1363 |
+
::s ᐧ ::t y ::comment syllable final
|
1364 |
+
::s ᐤ ::t w ::comment syllable final
|
1365 |
+
::s ᐦ ::t h ::comment syllable final
|
1366 |
+
::s ᕽ ::t hk ::comment syllable final
|
1367 |
+
::s ᓫ ::t l ::comment syllable final
|
1368 |
+
::s ᕑ ::t r ::comment syllable final
|
1369 |
+
|
1370 |
+
# Mongolian
|
1371 |
+
::s ᢅ ::t ::comment MONGOLIAN LETTER ALI GALI BALUDA (CHECK) indicates assimilation
|
1372 |
+
::s ᢆ ::t ::comment MONGOLIAN LETTER ALI GALI THREE BALUDA (CHECK) indicates assimilation
|
1373 |
+
|
1374 |
+
# Tibetan
|
1375 |
+
::s ྅ ::t ::comment TIBETAN MARK PALUTA (CHECK) indicates assimilation
|
1376 |
+
|
1377 |
+
## Punctuation
|
1378 |
+
# delete
|
1379 |
+
::s ¿ ::t "" ::comment inverted question mark
|
1380 |
+
::s ¡ ::t "" ::comment inverted exclamation mark
|
1381 |
+
# decompose double-punctuation
|
1382 |
+
::s ‼ ::t !!
|
1383 |
+
::s ⁇ ::t ??
|
1384 |
+
::s ⁉ ::t !?
|
1385 |
+
::s ⁈ ::t ?!
|
1386 |
+
# preserve
|
1387 |
+
::s ′ ::t ′
|
1388 |
+
::s ∩ ::t ∩
|
1389 |
+
::s ‡ ::t ‡
|
1390 |
+
# Cyrillic
|
1391 |
+
::s ⁙ ::t . ::comment five dot punctuation
|
1392 |
+
# Amharic/Ethiopian
|
1393 |
+
::s ። ::t .
|
1394 |
+
::s ፣ ::t ,
|
1395 |
+
::s ፤ ::t ;
|
1396 |
+
::s ፥ ::t :
|
1397 |
+
::s ፧ ::t ? ::comment Ethiopic question mark
|
1398 |
+
::s ፡ ::t " " ::comment Ethiopic wordspace
|
1399 |
+
::s ፦ ::t : ::comment Ethiopic preface colon
|
1400 |
+
# Ethiopic wordspace often appropriated for other purposes:
|
1401 |
+
::s ፡፡ ::t .
|
1402 |
+
::s ፡- ::t :
|
1403 |
+
::s "፡ " ::t ", "
|
1404 |
+
::s ቸ ::t cha ::comment Ethiopic syllable ca
|
1405 |
+
::s ቹ ::t chu ::comment Ethiopic syllable cu
|
1406 |
+
::s ቺ ::t chi ::comment Ethiopic syllable ci
|
1407 |
+
::s ቻ ::t chaa ::comment Ethiopic syllable caa
|
1408 |
+
::s ቼ ::t chee ::comment Ethiopic syllable cee
|
1409 |
+
::s ች ::t che ::comment Ethiopic syllable ce
|
1410 |
+
::s ቾ ::t cho ::comment Ethiopic syllable co
|
1411 |
+
::s ሠ ::t sa ::comment Ethiopic syllable sza
|
1412 |
+
::s ሡ ::t su ::comment Ethiopic syllable szu
|
1413 |
+
::s ሢ ::t si ::comment Ethiopic syllable szi
|
1414 |
+
::s ሣ ::t saa ::comment Ethiopic syllable szaa
|
1415 |
+
::s ሤ ::t see ::comment Ethiopic syllable szee
|
1416 |
+
::s ሥ ::t se ::comment Ethiopic syllable sze
|
1417 |
+
::s ሦ ::t so ::comment Ethiopic syllable szo
|
1418 |
+
::s ጠ ::t ta ::comment Ethiopic syllable tha with ejective 't'
|
1419 |
+
::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
|
1420 |
+
::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
|
1421 |
+
::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
|
1422 |
+
::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
|
1423 |
+
::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
|
1424 |
+
::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
|
1425 |
+
|
1426 |
+
# Devanagari (Hindi etc.)
|
1427 |
+
::s । ::t . ::comment danda
|
1428 |
+
::s ॥ ::t . ::comment double danda
|
1429 |
+
::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
|
1430 |
+
::s ॰ ::t . ::comment Devanagari abbreviation sign
|
1431 |
+
# Bengali
|
1432 |
+
::s ৽ ::t . ::comment BENGALI ABBREVIATION SIGN
|
1433 |
+
::s ৾ ::t ::comment BENGALI SANDHI MARK (CHECK)
|
1434 |
+
# Gurmukhi
|
1435 |
+
::s ੶ ::t . ::comment GURMUKHI ABBREVIATION SIGN
|
1436 |
+
# Oriya/Odia (India)
|
1437 |
+
::s ::t . ::comment danda (deprecated, should use Devanagari danda ।)
|
1438 |
+
::s ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
|
1439 |
+
# Tibetan
|
1440 |
+
::s ། ::t ,
|
1441 |
+
::s །: ::t :
|
1442 |
+
::s ༏ ::t ;
|
1443 |
+
::s ༎ ::t .
|
1444 |
+
::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
|
1445 |
+
::s ༼ ::t ( ::comment Tibetan open roof punctuation
|
1446 |
+
::s ༽ ::t ) ::comment Tibetan close roof punctuation
|
1447 |
+
::s ༈ ::t "" ::comment Tibetan mark srbul shad
|
1448 |
+
::s 【 ::t [ ::comment left black lenticular bracket
|
1449 |
+
::s 】 ::t ] ::comment right black lenticular bracket
|
1450 |
+
::s ༄ ::t "" ::comment Tibetan head mark
|
1451 |
+
::s ༄༅ ::t "" ::comment Tibetan head mark
|
1452 |
+
::s ༆ ::t "" ::comment Tibetan head mark
|
1453 |
+
# Myanmar/Burmese
|
1454 |
+
::s ၊ ::t ,
|
1455 |
+
::s ။ ::t .
|
1456 |
+
# Khmer
|
1457 |
+
::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
|
1458 |
+
::s ។ ::t . ::comment Khmer sign khan
|
1459 |
+
# Arabic
|
1460 |
+
::s ، ::t ,
|
1461 |
+
::s ؛ ::t ;
|
1462 |
+
::s ٬ ::t ,
|
1463 |
+
::s ۔ ::t .
|
1464 |
+
::s ؟ ::t ?
|
1465 |
+
::s ٪ ::t %
|
1466 |
+
::s ٫ ::t , ::comment Arabic decimal separator
|
1467 |
+
::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
|
1468 |
+
# Aramaic
|
1469 |
+
::s ܀ ::t .
|
1470 |
+
::s ܂ ::t .
|
1471 |
+
# Hebrew
|
1472 |
+
::s ־ ::t - ::comment maqaf
|
1473 |
+
# Armenian
|
1474 |
+
::s ։ ::t .
|
1475 |
+
::s ՝ ::t , ::comment Armenian comma
|
1476 |
+
# Chinese
|
1477 |
+
::s , ::t ", "
|
1478 |
+
::s 、 ::t ", "
|
1479 |
+
::s 。 ::t ". "
|
1480 |
+
::s ! ::t "! "
|
1481 |
+
::s ? ::t "? "
|
1482 |
+
::s 「 ::t ' "'
|
1483 |
+
::s 」 ::t '" '
|
1484 |
+
::s 《 ::t ' "'
|
1485 |
+
::s 》 ::t '" '
|
1486 |
+
::s ( ::t " ("
|
1487 |
+
::s ) ::t ") "
|
1488 |
+
::s ; ::t ;
|
1489 |
+
::s : ::t ": "
|
1490 |
+
::s ︰ ::t ": "
|
1491 |
+
::s - ::t -
|
1492 |
+
::s / ::t /
|
1493 |
+
::s = ::t =
|
1494 |
+
::s ~ ::t ~
|
1495 |
+
::s & ::t &
|
1496 |
+
::s < ::t <
|
1497 |
+
::s > ::t >
|
1498 |
+
::s % ::t %
|
1499 |
+
::s _ ::t _ ::comment FULLWIDTH LOW LINE (U+FF3F)
|
1500 |
+
::s { ::t { ::comment FULLWIDTH LEFT CURLY BRACKET (U+FF5B)
|
1501 |
+
::s } ::t } ::comment FULLWIDTH RIGHT CURLY BRACKET (U+FF5D)
|
1502 |
+
::s ::t " " ::comment ideographic space
|
1503 |
+
# Japanese
|
1504 |
+
::s 『 ::t ' "'
|
1505 |
+
::s 』 ::t '" '
|
1506 |
+
::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
|
1507 |
+
# N'ko
|
1508 |
+
::s ߽ ::t . ::comment NKO DANTAYALAN used to abbreviate units of measure
|
1509 |
+
# Medefaidrin
|
1510 |
+
::s 𖺗 ::t , ::comment MEDEFAIDRIN COMMA
|
1511 |
+
::s 𖺘 ::t . ::comment MEDEFAIDRIN FULL STOP
|
1512 |
+
# Khitan
|
1513 |
+
::s 𖿤 ::t ::comment KHITAN SMALL SCRIPT FILLER
|
1514 |
+
|
1515 |
+
# Symbols
|
1516 |
+
::s ∞ ::t ∞ ::comment infinity
|
1517 |
+
::s ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
|
1518 |
+
::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
|
1519 |
+
::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
|
1520 |
+
::s ﹐ ::t , ::comment small comma; map to regular comma
|
1521 |
+
::s ˚ ::t ° ::comment ring above; map to degree sign
|
1522 |
+
::s ⇒ ::t ⇒ ::comment rightwards double arrow
|
1523 |
+
::s † ::t † ::comment dagger
|
1524 |
+
::s • ::t • ::comment bullet
|
1525 |
+
::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
|
1526 |
+
::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
|
1527 |
+
::s ― ::t ― ::comment horizontal bar
|
1528 |
+
::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
|
1529 |
+
::s ″ ::t ″ ::comment double prime
|
1530 |
+
::s ﴾ ::t ( ::comment ornate left parenthesis
|
1531 |
+
::s ﴿ ::t ) ::comment ornate right parenthesis
|
1532 |
+
::s 〔 ::t [ ::comment left tortoise shell bracket
|
1533 |
+
::s 〕 ::t ] ::comment right tortoise shell bracket
|
1534 |
+
::s ﹝ ::t ( ::comment small left tortoise shell bracket
|
1535 |
+
::s ﹞ ::t ) ::comment small right tortoise shell bracket
|
1536 |
+
::s ¦ ::t ¦ ::comment BROKEN BAR (U+00A6)
|
1537 |
+
::s ¨ ::t ::comment DIAERESIS (U+00A8)
|
1538 |
+
::s ¯ ::t ::comment MACRON (U+00AF)
|
1539 |
+
::s ¸ ::t ::comment CEDILLA (U+00B8)
|
1540 |
+
::s Ƿ ::t W ::comment LATIN CAPITAL LETTER WYNN (U+01F7)
|
1541 |
+
::s ˘ ::t ::comment BREVE (U+02D8)
|
1542 |
+
::s ˛ ::t ::comment OGONEK (U+02DB)
|
1543 |
+
::s ˜ ::t ~ ::comment SMALL TILDE (U+02DC)
|
1544 |
+
::s ̒ ::t ::comment COMBINING TURNED COMMA ABOVE (U+0312)
|
1545 |
+
::s ̔ ::t ::comment COMBINING REVERSED COMMA ABOVE (U+0314)
|
1546 |
+
::s ̜ ::t ::comment COMBINING LEFT HALF RING BELOW (U+031C)
|
1547 |
+
::s ̧ ::t ::comment COMBINING CEDILLA (U+0327)
|
1548 |
+
::s ̫ ::t ::comment COMBINING INVERTED DOUBLE ARCH BELOW (U+032B)
|
1549 |
+
::s ̲ ::t ::comment COMBINING LOW LINE (U+0332)
|
1550 |
+
::s ̳ ::t ::comment COMBINING DOUBLE LOW LINE (U+0333)
|
1551 |
+
::s ̹ ::t ::comment COMBINING RIGHT HALF RING BELOW (U+0339)
|
1552 |
+
::s ̺ ::t ::comment COMBINING INVERTED BRIDGE BELOW (U+033A)
|
1553 |
+
::s ̿ ::t ::comment COMBINING DOUBLE OVERLINE (U+033F)
|
1554 |
+
::s ͅ ::t ::comment COMBINING GREEK YPOGEGRAMMENI (U+0345)
|
1555 |
+
::s ͑ ::t ::comment COMBINING LEFT HALF RING ABOVE (U+0351)
|
1556 |
+
::s ͗ ::t ::comment COMBINING RIGHT HALF RING ABOVE (U+0357)
|
1557 |
+
::s ͚ ::t ::comment COMBINING DOUBLE RING BELOW (U+035A)
|
1558 |
+
::s ͜ ::t ::comment COMBINING DOUBLE BREVE BELOW (U+035C)
|
1559 |
+
::s ͝ ::t ::comment COMBINING DOUBLE BREVE (U+035D)
|
1560 |
+
::s ͞ ::t ::comment COMBINING DOUBLE MACRON (U+035E)
|
1561 |
+
::s ͟ ::t ::comment COMBINING DOUBLE MACRON BELOW (U+035F)
|
1562 |
+
::s ͠ ::t ::comment COMBINING DOUBLE TILDE (U+0360)
|
1563 |
+
|
1564 |
+
::s ‐ ::t - ::comment HYPHEN (U+2010)
|
1565 |
+
::s ‗ ::t ‗ ::comment DOUBLE LOW LINE (U+2017)
|
1566 |
+
::s ‵ ::t ‵ ::comment REVERSED PRIME (U+2035)
|
1567 |
+
::s ‶ ::t ‶ ::comment REVERSED DOUBLE PRIME (U+2036)
|
1568 |
+
::s ‸ ::t ‸ ::comment CARET (U+2038)
|
1569 |
+
::s ‽ ::t ?! ::comment INTERROBANG (U+203D)
|
1570 |
+
::s ‾ ::t ‾ ::comment OVERLINE (U+203E)
|
1571 |
+
::s ‿ ::t ‿ ::comment UNDERTIE (U+203F)
|
1572 |
+
::s ⁂ ::t ⁂ ::comment ASTERISM (U+2042)
|
1573 |
+
::s ⁎ ::t * ::comment LOW ASTERISK (U+204E)
|
1574 |
+
::s ⁏ ::t ; ::comment REVERSED SEMICOLON (U+204F)
|
1575 |
+
::s ⁔ ::t ⁔ ::comment INVERTED UNDERTIE (U+2054)
|
1576 |
+
::s ⁝ ::t ⁝ ::comment TRICOLON (U+205D)
|
1577 |
+
::s ::t " " ::comment MEDIUM MATHEMATICAL SPACE (U+205F)
|
1578 |
+
::s ₋ ::t - ::comment SUBSCRIPT MINUS (U+208B)
|
1579 |
+
::s ⃩ ::t ::comment COMBINING WIDE BRIDGE ABOVE (U+20E9)
|
1580 |
+
|
1581 |
+
::s ﹔ ::t ; ::comment SMALL SEMICOLON (U+FE54)
|
1582 |
+
::s ﹕ ::t : ::comment SMALL COLON (U+FE55)
|
1583 |
+
::s ﹛ ::t { ::comment SMALL LEFT CURLY BRACKET (U+FE5B)
|
1584 |
+
::s ﹜ ::t } ::comment SMALL RIGHT CURLY BRACKET (U+FE5C)
|
1585 |
+
::s ﹠ ::t & ::comment SMALL AMPERSAND (U+FE60)
|
1586 |
+
::s ﹡ ::t * ::comment SMALL ASTERISK (U+FE61)
|
1587 |
+
::s ﹣ ::t - ::comment SMALL HYPHEN-MINUS (U+FE63)
|
1588 |
+
|
1589 |
+
::s ℈ ::t ℈ ::comment SCRUPLE (U+2108)
|
1590 |
+
::s ℟ ::t ℟ ::comment RESPONSE (U+211F)
|
1591 |
+
::s ℣ ::t ℣ ::comment VERSICLE (U+2123)
|
1592 |
+
::s ℽ ::t ℽ ::comment DOUBLE-STRUCK SMALL GAMMA (U+213D)
|
1593 |
+
::s ℾ ::t ℾ ::comment DOUBLE-STRUCK CAPITAL GAMMA (U+213E)
|
1594 |
+
::s ⅋ ::t ⅋ ::comment TURNED AMPERSAND (U+214B)
|
1595 |
+
::s ⅍ ::t A/S ::comment AKTIESELSKAB (U+214D)
|
1596 |
+
|
1597 |
+
::s ⑃ ::t ⑃ ::comment OCR INVERTED FORK (U+2443)
|
1598 |
+
::s ⑊ ::t \\ ::comment OCR DOUBLE BACKSLASH (U+244A)
|
1599 |
+
::s ⟮ ::t ( ::comment MATHEMATICAL LEFT FLATTENED PARENTHESIS (U+27EE)
|
1600 |
+
::s ⟯ ::t ) ::comment MATHEMATICAL RIGHT FLATTENED PARENTHESIS (U+27EF)
|
1601 |
+
::s ⸨ ::t (( ::comment LEFT DOUBLE PARENTHESIS (U+2E28)
|
1602 |
+
::s ⸩ ::t )) ::comment RIGHT DOUBLE PARENTHESIS (U+2E29)
|
1603 |
+
|
1604 |
+
# kavyka indicates alternative reading
|
1605 |
+
::s ᷶ ::t ::comment COMBINING KAVYKA ABOVE RIGHT (U+1DF6)
|
1606 |
+
::s ᷷ ::t ::comment COMBINING KAVYKA ABOVE LEFT (U+1DF7)
|
1607 |
+
::s ⹅ ::t ::comment INVERTED LOW KAVYKA (U+2E45)
|
1608 |
+
::s ⹆ ::t ::comment INVERTED LOW KAVYKA WITH KAVYKA ABOVE (U+2E46)
|
1609 |
+
::s ⹇ ::t ::comment LOW KAVYKA (U+2E47)
|
1610 |
+
::s ⹈ ::t ::comment LOW KAVYKA WITH DOT (U+2E48)
|
1611 |
+
::s ꙾ ::t ::comment CYRILLIC KAVYKA (U+A67E)
|
1612 |
+
|
1613 |
+
# Braille
|
1614 |
+
::s ⠁ ::t a
|
1615 |
+
::s ⠃ ::t b
|
1616 |
+
::s ⠉ ::t c
|
1617 |
+
::s ⠙ ::t d
|
1618 |
+
::s ⠑ ::t e
|
1619 |
+
::s ⠋ ::t f
|
1620 |
+
::s ⠛ ::t g
|
1621 |
+
::s ⠓ ::t h
|
1622 |
+
::s ⠊ ::t i
|
1623 |
+
::s ⠚ ::t j
|
1624 |
+
::s ⠅ ::t k
|
1625 |
+
::s ⠇ ::t l
|
1626 |
+
::s ⠍ ::t m
|
1627 |
+
::s ⠝ ::t n
|
1628 |
+
::s ⠕ ::t o
|
1629 |
+
::s ⠏ ::t p
|
1630 |
+
::s ⠟ ::t q
|
1631 |
+
::s ⠗ ::t r
|
1632 |
+
::s ⠎ ::t s
|
1633 |
+
::s ⠞ ::t t
|
1634 |
+
::s ⠥ ::t u
|
1635 |
+
::s ⠧ ::t v
|
1636 |
+
::s ⠺ ::t w
|
1637 |
+
::s ⠭ ::t x
|
1638 |
+
::s ⠽ ::t y
|
1639 |
+
::s ⠵ ::t z
|
1640 |
+
|
1641 |
+
::s ⠜ ::t ae
|
1642 |
+
::s ⠪ ::t oe
|
1643 |
+
::s ⠳ ::t ue
|
1644 |
+
::s ⠷ ::t a ::comment à
|
1645 |
+
::s ⠡ ::t a ::comment â
|
1646 |
+
::s ⠿ ::t e ::comment é
|
1647 |
+
::s ⠮ ::t e ::comment è
|
1648 |
+
::s ⠣ ::t e ::comment ê
|
1649 |
+
::s ⠫ ::t e ::comment ë
|
1650 |
+
::s ⠩ ::t i ::comment î
|
1651 |
+
::s ⠻ ::t i ::comment ï
|
1652 |
+
::s ⠹ ::t o ::comment ô
|
1653 |
+
::s ⠾ ::t u ::comment ù
|
1654 |
+
::s ⠱ ::t u ::comment û
|
1655 |
+
|
1656 |
+
::s ⠡ ::t au ::lcode deu
|
1657 |
+
::s ⠌ ::t aeu ::lcode deu
|
1658 |
+
::s ⠹ ::t ch ::lcode deu
|
1659 |
+
::s ⠩ ::t ei ::lcode deu
|
1660 |
+
::s ⠣ ::t eu ::lcode deu
|
1661 |
+
::s ⠬ ::t ie ::lcode deu
|
1662 |
+
::s ⠱ ::t sch ::lcode deu
|
1663 |
+
::s ⠮ ::t ss ::lcode deu
|
1664 |
+
::s ⠾ ::t st ::lcode deu
|
1665 |
+
|
1666 |
+
::s ⠠⠠ ::t "" ::comment start of word all-caps mode
|
1667 |
+
# ::s ⠠⠁ ::t A
|
1668 |
+
# ::s ⠠⠃ ::t B
|
1669 |
+
# ::s ⠠⠉ ::t C
|
1670 |
+
# ::s ⠠⠙ ::t D
|
1671 |
+
# ::s ⠠⠑ ::t E
|
1672 |
+
# ::s ⠠⠋ ::t F
|
1673 |
+
# ::s ⠠⠛ ::t G
|
1674 |
+
# ::s ⠠⠓ ::t H
|
1675 |
+
# ::s ⠠⠊ ::t I
|
1676 |
+
# ::s ⠠⠚ ::t J
|
1677 |
+
# ::s ⠠⠅ ::t K
|
1678 |
+
# ::s ⠠⠇ ::t L
|
1679 |
+
# ::s ⠠⠍ ::t M
|
1680 |
+
# ::s ⠠⠝ ::t N
|
1681 |
+
# ::s ⠠⠕ ::t O
|
1682 |
+
# ::s ⠠⠏ ::t P
|
1683 |
+
# ::s ⠠⠟ ::t Q
|
1684 |
+
# ::s ⠠⠗ ::t R
|
1685 |
+
# ::s ⠠⠎ ::t S
|
1686 |
+
# ::s ⠠⠞ ::t T
|
1687 |
+
# ::s ⠠⠥ ::t U
|
1688 |
+
# ::s ⠠⠧ ::t V
|
1689 |
+
# ::s ⠠⠺ ::t W
|
1690 |
+
# ::s ⠠⠭ ::t X
|
1691 |
+
# ::s ⠠⠽ ::t Y
|
1692 |
+
# ::s ⠠⠵ ::t Z
|
1693 |
+
|
1694 |
+
::s ⠼⠁ ::t 1
|
1695 |
+
::s ⠼⠃ ::t 2
|
1696 |
+
::s ⠼⠉ ::t 3
|
1697 |
+
::s ⠼⠙ ::t 4
|
1698 |
+
::s ⠼⠑ ::t 5
|
1699 |
+
::s ⠼⠋ ::t 6
|
1700 |
+
::s ⠼⠛ ::t 7
|
1701 |
+
::s ⠼⠓ ::t 8
|
1702 |
+
::s ⠼⠊ ::t 9
|
1703 |
+
::s ⠼⠚ ::t 0
|
1704 |
+
|
1705 |
+
::s ⠂ ::t ,
|
1706 |
+
::s ⠆ ::t ;
|
1707 |
+
::s ⠒ ::t :
|
1708 |
+
::s ⠲ ::t .
|
1709 |
+
::s ⠦ ::t ?
|
1710 |
+
::s ⠖ ::t !
|
1711 |
+
::s ⠄ ::t '
|
1712 |
+
::s ⠤ ::t -
|
1713 |
+
::s ⠨⠤ ::t _
|
1714 |
+
|
1715 |
+
::s ⠀ ::t " " ::comment blank
|
1716 |
+
# ::s ⠐ ::t " " ::comment blank in numeric mode
|
1717 |
+
::s ⠈ ::t "" ::comment accent
|
1718 |
+
# ::s ⠌ ::t / ::comment in numeric mode only
|
1719 |
+
# ::s ⠐ ::comment abbreviation sign
|
1720 |
+
# ::s ⠘ ::comment abbreviation sign
|
1721 |
+
# ::s ⠠ ::comment capital indicator
|
1722 |
+
::s ⠨ ::t . ::comment decimal point; emphasis
|
1723 |
+
::s ⠰ ::t "" ::comment letter indicator
|
1724 |
+
# ::s ⠴ ::t ”
|
1725 |
+
# ::s ⠶ ::t ()
|
1726 |
+
# ::s ⠸ ::comment abbreviation sign
|
1727 |
+
::s ⠼ ::t "" ::comment number indicator
|
1728 |
+
::s ⠘⠚ ::t ° ::word-external-punctuation
|
1729 |
+
::s ⠘⠚⠠⠉ ::t °C
|
1730 |
+
::s ⠘⠚⠉ ::t °C
|
1731 |
+
::s ⠘⠚⠠⠋ ::t °F
|
1732 |
+
::s ⠘⠚⠋ ::t °F
|
1733 |
+
|
1734 |
+
::s ⠠⠶ ::t " ::word-external-punctuation
|
1735 |
+
::s ⠘⠦ ::t “ ::word-external-punctuation
|
1736 |
+
::s ⠘⠴ ::t ” ::word-external-punctuation
|
1737 |
+
::s ⠄⠦ ::t ‘
|
1738 |
+
::s ⠄⠴ ::t ’
|
1739 |
+
::s ⠠⠴ ::t ’ ::comment closing single quotation mark (target reconstructed from garbled text; verify)
|
1740 |
+
::s ⠐⠣ ::t ( ::word-external-punctuation
|
1741 |
+
::s ⠐⠜ ::t ) ::word-external-punctuation
|
1742 |
+
::s ⠨⠣ ::t [ ::word-external-punctuation
|
1743 |
+
::s ⠨⠜ ::t ] ::word-external-punctuation
|
1744 |
+
::s ⠸⠣ ::t { ::word-external-punctuation
|
1745 |
+
::s ⠸⠜ ::t } ::word-external-punctuation
|
1746 |
+
::s ⠈⠣ ::t < ::word-external-punctuation
|
1747 |
+
::s ⠈⠜ ::t > ::word-external-punctuation
|
1748 |
+
::s ⠸⠌ ::t / ::word-external-punctuation
|
1749 |
+
::s ⠸⠡ ::t \ ::word-external-punctuation
|
1750 |
+
::s ⠠⠤ ::t – ::word-external-punctuation
|
1751 |
+
::s ⠐⠠⠤ ::t — ::word-external-punctuation
|
1752 |
+
::s ⠈⠯ ::t & ::word-external-punctuation
|
1753 |
+
::s ⠐⠔ ::t * ::word-external-punctuation
|
1754 |
+
::s ⠨⠦ ::t ∩ ::word-external-punctuation
|
1755 |
+
::s ⠨⠴ ::t % ::word-external-punctuation
|
1756 |
+
::s ⠐⠖ ::t + ::word-external-punctuation
|
1757 |
+
::s ⠐⠤ ::t − ::word-external-punctuation
|
1758 |
+
::s ⠐⠶ ::t = ::word-external-punctuation
|
1759 |
+
::s ⠈⠎ ::t $ ::word-external-punctuation
|
1760 |
+
::s ⠈⠉ ::t ¢ ::word-external-punctuation
|
1761 |
+
::s ⠈⠇ ::t £ ::word-external-punctuation
|
1762 |
+
::s ⠈⠽ ::t ¥ ::word-external-punctuation
|
1763 |
+
::s ⠈⠁ ::t @ ::word-external-punctuation
|
1764 |
+
::s ⠸⠹ ::t # ::word-external-punctuation
|
1765 |
+
::s ⠸⠲ ::t • ::word-external-punctuation
|
1766 |
+
::s ⠈⠢ ::t ^ ::word-external-punctuation
|
1767 |
+
::s ⠈⠔ ::t ~ ::word-external-punctuation
|
1768 |
+
::s ⠘⠉ ::t © ::word-external-punctuation
|
1769 |
+
::s ⠐⠌ ::t ÷ ::word-external-punctuation
|
1770 |
+
::s ⠐⠦ ::t × ::word-external-punctuation
|
1771 |
+
::s ⠈⠠⠹ ::t † ::word-external-punctuation
|
1772 |
+
::s ⠈⠠⠻ ::t ‡ ::word-external-punctuation
|
1773 |
+
::s ⠘⠏ ::t ¶ ::word-external-punctuation
|
1774 |
+
::s ⠘⠎ ::t § ::word-external-punctuation
|
1775 |
+
::s ⠘⠗ ::t ® ::word-external-punctuation
|
1776 |
+
::s ⠘⠞ ::t ™ ::word-external-punctuation
|
1777 |
+
|
1778 |
+
# English Braille
|
1779 |
+
::s ⠁⠃ ::t about ::lcode eng ::use-only-for-whole-word
|
1780 |
+
::s ⠁⠃⠧ ::t above ::lcode eng ::use-only-for-whole-word
|
1781 |
+
::s ⠁⠉ ::t according ::lcode eng ::use-only-for-whole-word
|
1782 |
+
::s ⠁⠉⠗ ::t across ::lcode eng ::use-only-for-whole-word
|
1783 |
+
::s ⠁⠋ ::t after ::lcode eng ::use-only-for-whole-word
|
1784 |
+
::s ⠁⠋⠝ ::t afternoon ::lcode eng ::use-only-for-whole-word
|
1785 |
+
::s ⠁⠋⠺ ::t afterward ::lcode eng ::use-only-for-whole-word
|
1786 |
+
::s ⠁⠛ ::t again ::lcode eng ::use-only-for-whole-word
|
1787 |
+
::s ⠁⠛⠌ ::t against ::lcode eng ::use-only-for-whole-word
|
1788 |
+
::s ⠠⠽ ::t ally ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
|
1789 |
+
::s ⠁⠇⠍ ::t almost ::lcode eng ::use-only-for-whole-word
|
1790 |
+
::s ⠁⠇⠗ ::t already ::lcode eng ::use-only-for-whole-word
|
1791 |
+
::s ⠁⠇ ::t also ::lcode eng ::use-only-for-whole-word
|
1792 |
+
::s ⠁⠇⠹ ::t although ::lcode eng ::use-only-for-whole-word
|
1793 |
+
::s ⠁⠇⠞ ::t altogether ::lcode eng ::use-only-for-whole-word
|
1794 |
+
::s ⠁⠇⠺ ::t always ::lcode eng ::use-only-for-whole-word
|
1795 |
+
::s ⠨⠑ ::t ance ::lcode eng
|
1796 |
+
::s ⠯ ::t and ::lcode eng
|
1797 |
+
::s ⠜ ::t ar ::lcode eng
|
1798 |
+
::s ⠵ ::t as ::lcode eng ::use-only-for-whole-word
|
1799 |
+
::s ⠠⠝ ::t ation ::lcode eng ::use-only-at-end-of-word ::use-only-in-lower-case-environment
|
1800 |
+
::s ⠃ ::t b ::lcode eng
|
1801 |
+
::s ⠆ ::t bb ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1802 |
+
::s ⠆ ::t be ::lcode eng ::use-only-at-start-of-word
|
1803 |
+
::s ⠆⠉ ::t because ::lcode eng ::use-only-for-whole-word
|
1804 |
+
::s ⠆⠋ ::t before ::lcode eng ::use-only-for-whole-word
|
1805 |
+
::s ⠆⠓ ::t behind ::lcode eng ::use-only-for-whole-word
|
1806 |
+
::s ⠆⠇ ::t below ::lcode eng ::use-only-for-whole-word
|
1807 |
+
::s ⠆⠝ ::t beneath ::lcode eng ::use-only-for-whole-word
|
1808 |
+
::s ⠆⠎ ::t beside ::lcode eng ::use-only-for-whole-word
|
1809 |
+
::s ⠆⠞ ::t between ::lcode eng ::use-only-for-whole-word
|
1810 |
+
::s ⠆⠽ ::t beyond ::lcode eng ::use-only-for-whole-word
|
1811 |
+
::s ⠃⠇ ::t blind ::lcode eng ::use-only-for-whole-word
|
1812 |
+
::s ⠃⠗⠇ ::t Braille ::lcode eng ::use-only-for-whole-word
|
1813 |
+
::s ⠃ ::t but ::lcode eng ::use-only-for-whole-word
|
1814 |
+
::s ⠉ ::t c ::lcode eng
|
1815 |
+
::s ⠉ ::t can ::lcode eng ::use-only-for-whole-word
|
1816 |
+
::s ⠸⠉ ::t cannot ::lcode eng
|
1817 |
+
::s ⠒ ::t cc ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1818 |
+
::s ⠉⠧ ::t ceive ::lcode eng ::use-only-at-end-of-word
|
1819 |
+
::s ⠉⠧⠙ ::t ceived ::lcode eng ::use-only-at-end-of-word
|
1820 |
+
::s ⠉⠧⠎ ::t ceives ::lcode eng ::use-only-at-end-of-word
|
1821 |
+
::s ⠉⠧⠛ ::t ceiving ::lcode eng
|
1822 |
+
::s ⠡ ::t ch ::lcode eng
|
1823 |
+
::s ⠐⠡ ::t character ::lcode eng
|
1824 |
+
::s ⠡ ::t child ::lcode eng ::use-only-for-whole-word
|
1825 |
+
::s ⠡⠝ ::t children ::lcode eng ::use-only-for-whole-word
|
1826 |
+
::s ⠒ ::t con ::lcode eng ::use-only-at-start-of-word
|
1827 |
+
::s ⠒ ::t : ::lcode eng ::use-only-at-end-of-word
|
1828 |
+
::s ⠉⠙ ::t could ::lcode eng ::use-only-for-whole-word
|
1829 |
+
::s ⠙ ::t d ::lcode eng
|
1830 |
+
::s ⠙ ::t do ::lcode eng ::use-only-for-whole-word
|
1831 |
+
::s ⠐⠙ ::t day ::lcode eng
|
1832 |
+
# ::s ⠲ ::t dd ::t-alt . ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word ::comment abolished; interferes with period in abbreviations such as U.S.
|
1833 |
+
::s ⠙⠉⠇ ::t declare ::lcode eng
|
1834 |
+
::s ⠙⠉⠇⠛ ::t declaring ::lcode eng
|
1835 |
+
::s ⠲ ::t dis ::lcode eng ::use-only-at-start-of-word
|
1836 |
+
::s ⠲ ::t . ::lcode eng ::dont-use-at-start-of-word
|
1837 |
+
::s ⠑ ::t e ::lcode eng
|
1838 |
+
::s ⠂ ::t ea ::lcode eng ::dont-use-at-end-of-word
|
1839 |
+
::s ⠂ ::t , ::lcode eng ::use-only-at-end-of-word
|
1840 |
+
::s ⠫ ::t ed ::lcode eng
|
1841 |
+
::s ⠑⠊ ::t either ::lcode eng ::use-only-for-whole-word
|
1842 |
+
::s ⠢ ::t en ::lcode eng
|
1843 |
+
::s ⠰⠑ ::t ence ::lcode eng ::dont-use-at-start-of-word
|
1844 |
+
::s ⠢ ::t enough ::lcode eng ::use-only-for-whole-word
|
1845 |
+
::s ⠻ ::t er ::lcode eng
|
1846 |
+
::s ⠐⠑ ::t ever ::lcode eng
|
1847 |
+
::s ⠑ ::t every ::lcode eng ::use-only-for-whole-word
|
1848 |
+
::s ⠋ ::t f ::lcode eng
|
1849 |
+
::s ⠐⠋ ::t father ::lcode eng
|
1850 |
+
::s ⠖ ::t ff ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1851 |
+
::s ⠋⠌ ::t first ::lcode eng
|
1852 |
+
::s ⠿ ::t for ::lcode eng
|
1853 |
+
::s ⠋⠗ ::t friend ::lcode eng ::use-only-for-whole-word
|
1854 |
+
::s ⠋⠗⠎ ::t friends ::lcode eng ::use-only-for-whole-word
|
1855 |
+
::s ⠋ ::t from ::lcode eng ::use-only-for-whole-word
|
1856 |
+
::s ⠰⠇ ::t ful ::lcode eng ::dont-use-at-start-of-word
|
1857 |
+
::s ⠛ ::t g ::lcode eng
|
1858 |
+
::s ⠶ ::t gg ::lcode eng ::dont-use-at-start-of-word ::dont-use-at-end-of-word
|
1859 |
+
::s ⠣ ::t gh ::lcode eng
|
1860 |
+
::s ⠛ ::t go ::lcode eng ::use-only-for-whole-word
|
1861 |
+
::s ⠛⠙ ::t good ::lcode eng ::use-only-at-start-of-word
|
1862 |
+
::s ⠛⠗⠞ ::t great ::lcode eng
|
1863 |
+
::s ⠓ ::t h ::lcode eng
|
1864 |
+
::s ⠸⠓ ::t had ::lcode eng
|
1865 |
+
::s ⠓ ::t have ::lcode eng ::use-only-for-whole-word
|
1866 |
+
::s ⠐⠓ ::t here ::lcode eng
|
1867 |
+
::s ⠓⠻⠋ ::t herself ::lcode eng ::use-only-for-whole-word
|
1868 |
+
::s ⠓⠍ ::t him ::lcode eng ::use-only-for-whole-word
|
1869 |
+
::s ⠓⠍⠋ ::t himself ::lcode eng ::use-only-for-whole-word
|
1870 |
+
::s ⠦ ::t ? ::lcode eng
|
1871 |
+
::s ⠦ ::t his ::lcode eng ::use-only-for-whole-word
|
1872 |
+
::s ⠊⠍⠍ ::t immediate ::lcode eng ::use-only-for-whole-word
|
1873 |
+
::s ⠊⠍⠍⠇⠽ ::t immediately ::lcode eng ::use-only-for-whole-word
|
1874 |
+
::s ⠔ ::t in ::lcode eng
|
1875 |
+
::s ⠔⠒ ::t incon ::lcode eng ::use-only-at-start-of-word
|
1876 |
+
::s ⠬ ::t ing ::lcode eng
|
1877 |
+
::s ⠭ ::t it ::lcode eng ::use-only-for-whole-word
|
1878 |
+
::s ⠭⠎ ::t its ::lcode eng ::use-only-for-whole-word
|
1879 |
+
::s ⠭⠋ ::t itself ::lcode eng ::use-only-for-whole-word
|
1880 |
+
::s ⠰⠽ ::t ity ::lcode eng ::dont-use-at-start-of-word
|
1881 |
+
::s ⠚ ::t j ::lcode eng
|
1882 |
+
::s ⠚ ::t just ::lcode eng ::use-only-for-whole-word
|
1883 |
+
::s ⠅ ::t k ::lcode eng
|
1884 |
+
::s ⠐⠅ ::t know ::lcode eng
|
1885 |
+
::s ⠅ ::t knowledge ::lcode eng ::use-only-for-whole-word
|
1886 |
+
::s ⠇ ::t l ::lcode eng
|
1887 |
+
::s ⠨⠎ ::t less ::lcode eng ::dont-use-at-start-of-word
|
1888 |
+
::s ⠇⠗ ::t letter ::lcode eng ::use-only-for-whole-word
|
1889 |
+
::s ⠇⠗⠎ ::t letters ::lcode eng ::use-only-for-whole-word
|
1890 |
+
::s ⠇ ::t like ::lcode eng ::use-only-for-whole-word
|
1891 |
+
::s ⠇⠇ ::t little ::lcode eng ::use-only-for-whole-word
|
1892 |
+
::s ⠐⠇ ::t lord ::lcode eng
|
1893 |
+
::s ⠍ ::t m ::lcode eng
|
1894 |
+
::s ⠸⠍ ::t many ::lcode eng
|
1895 |
+
::s ⠰⠞ ::t ment ::lcode eng ::dont-use-at-start-of-word
|
1896 |
+
::s ⠍ ::t more ::lcode eng ::use-only-for-whole-word
|
1897 |
+
::s ⠐⠍ ::t mother ::lcode eng
|
1898 |
+
::s ⠍⠡ ::t much ::lcode eng ::use-only-for-whole-word
|
1899 |
+
::s ⠍⠌ ::t must ::lcode eng ::use-only-for-whole-word
|
1900 |
+
::s ⠍⠽⠋ ::t myself ::lcode eng ::use-only-for-whole-word
|
1901 |
+
::s ⠝ ::t n ::lcode eng
|
1902 |
+
::s ⠐⠝ ::t name ::lcode eng
|
1903 |
+
::s ⠝⠑⠉ ::t necessary ::lcode eng ::use-only-for-whole-word
|
1904 |
+
::s ⠝⠑⠊ ::t neither ::lcode eng ::use-only-for-whole-word
|
1905 |
+
::s ⠰⠎ ::t ness ::lcode eng ::dont-use-at-start-of-word
|
1906 |
+
::s ⠝ ::t not ::lcode eng ::use-only-for-whole-word
|
1907 |
+
::s ⠕⠄⠉ ::t o'clock ::lcode eng ::use-only-for-whole-word
|
1908 |
+
::s ⠷ ::t of ::lcode eng
|
1909 |
+
::s ⠐⠕ ::t one ::lcode eng
|
1910 |
+
::s ⠰⠛ ::t ong ::lcode eng ::dont-use-at-start-of-word
|
1911 |
+
::s ⠳ ::t ou ::lcode eng
|
1912 |
+
::s ⠨⠙ ::t ound ::lcode eng
|
1913 |
+
::s ⠨⠞ ::t ount ::lcode eng
|
1914 |
+
::s ⠐⠳ ::t ought ::lcode eng
|
1915 |
+
::s ⠳⠗⠧⠎ ::t ourselves ::lcode eng ::use-only-for-whole-word
|
1916 |
+
::s ⠳ ::t out ::lcode eng ::use-only-for-whole-word
|
1917 |
+
::s ⠪ ::t ow ::lcode eng
|
1918 |
+
::s ⠏ ::t p ::lcode eng
|
1919 |
+
::s ⠏⠙ ::t paid ::lcode eng ::use-only-for-whole-word
|
1920 |
+
::s ⠐⠏ ::t part ::lcode eng
|
1921 |
+
::s ⠏ ::t people ::lcode eng ::use-only-for-whole-word
|
1922 |
+
::s ⠏⠻⠓ ::t perhaps ::lcode eng ::use-only-for-whole-word
|
1923 |
+
::s ⠟ ::t q ::lcode eng
|
1924 |
+
::s ⠐⠟ ::t question ::lcode eng
|
1925 |
+
::s ⠟⠅ ::t quick ::lcode eng ::use-only-for-whole-word
|
1926 |
+
::s ⠟⠅⠻ ::t quicker ::lcode eng ::use-only-for-whole-word
|
1927 |
+
::s ⠟⠅⠑⠌ ::t quickest ::lcode eng ::use-only-for-whole-word
|
1928 |
+
::s ⠟ ::t quite ::lcode eng ::use-only-for-whole-word
|
1929 |
+
::s ⠗ ::t r ::lcode eng
|
1930 |
+
::s ⠗ ::t rather ::lcode eng ::use-only-for-whole-word
|
1931 |
+
::s ⠐⠗ ::t right ::lcode eng
|
1932 |
+
::s ⠗⠚⠉ ::t rejoice ::lcode eng
|
1933 |
+
::s ⠗⠚⠉⠛ ::t rejoicing ::lcode eng
|
1934 |
+
::s ⠎ ::t s ::lcode eng
|
1935 |
+
::s ⠎⠙ ::t said ::lcode eng ::use-only-for-whole-word
|
1936 |
+
::s ⠩ ::t sh ::lcode eng
|
1937 |
+
::s ⠩ ::t shall ::lcode eng ::use-only-for-whole-word
|
1938 |
+
::s ⠩⠙ ::t should ::lcode eng ::use-only-for-whole-word
|
1939 |
+
::s ⠨⠝ ::t sion ::lcode eng
|
1940 |
+
::s ⠎ ::t so ::lcode eng ::use-only-for-whole-word
|
1941 |
+
::s ⠐⠎ ::t some ::lcode eng
|
1942 |
+
::s ⠸⠎ ::t spirit ::lcode eng
|
1943 |
+
::s ⠌ ::t st ::lcode eng
|
1944 |
+
::s ⠌ ::t still ::lcode eng ::use-only-for-whole-word
|
1945 |
+
::s ⠎⠡ ::t such ::lcode eng ::use-only-for-whole-word
|
1946 |
+
::s ⠞ ::t t ::lcode eng
|
1947 |
+
::s ⠹ ::t th ::lcode eng
|
1948 |
+
::s ⠞ ::t that ::lcode eng ::use-only-for-whole-word
|
1949 |
+
::s ⠹ ::t this ::lcode eng ::use-only-for-whole-word
|
1950 |
+
::s ⠮ ::t the ::lcode eng
|
1951 |
+
::s ⠸⠮ ::t their ::lcode eng
|
1952 |
+
::s ⠮⠍⠧⠎ ::t themselves ::lcode eng ::use-only-for-whole-word
|
1953 |
+
::s ⠐⠮ ::t there ::lcode eng
|
1954 |
+
::s ⠘⠮ ::t these ::lcode eng
|
1955 |
+
::s ⠘⠹ ::t those ::lcode eng
|
1956 |
+
::s ⠐⠹ ::t through ::lcode eng
|
1957 |
+
::s ⠐⠞ ::t time ::lcode eng
|
1958 |
+
::s ⠰⠝ ::t tion ::lcode eng ::dont-use-at-start-of-word
|
1959 |
+
::s ⠖ ::t to ::lcode eng ::use-only-for-whole-word
|
1960 |
+
::s ⠞⠙ ::t today ::lcode eng ::use-only-for-whole-word
|
1961 |
+
::s ⠞⠛⠗ ::t together ::lcode eng ::use-only-for-whole-word
|
1962 |
+
::s ⠞⠍ ::t tomorrow ::lcode eng ::use-only-for-whole-word
|
1963 |
+
::s ⠞⠝ ::t tonight ::lcode eng ::use-only-for-whole-word
|
1964 |
+
::s ⠥ ::t u ::lcode eng
|
1965 |
+
::s ⠥⠝⠒ ::t uncon ::lcode eng ::use-only-at-start-of-word
|
1966 |
+
::s ⠥ ::t us ::lcode eng ::use-only-for-whole-word
|
1967 |
+
::s ⠠⠥⠲⠎⠲ ::t U.S. ::lcode eng
|
1968 |
+
::s ⠐⠥ ::t under ::lcode eng
|
1969 |
+
::s ⠘⠥ ::t upon ::lcode eng
|
1970 |
+
::s ⠧ ::t v ::lcode eng
|
1971 |
+
::s ⠧ ::t very ::lcode eng ::use-only-for-whole-word
|
1972 |
+
::s ⠺ ::t w ::lcode eng
|
1973 |
+
::s ⠴ ::t " ::lcode eng
|
1974 |
+
::s ⠴ ::t was ::lcode eng ::use-only-for-whole-word
|
1975 |
+
::s ⠶ ::t were ::lcode eng ::use-only-for-whole-word
|
1976 |
+
::s ⠱ ::t wh ::lcode eng
|
1977 |
+
::s ⠐⠱ ::t where ::lcode eng
|
1978 |
+
::s ⠱ ::t which ::lcode eng ::use-only-for-whole-word
|
1979 |
+
::s ⠘⠱ ::t whose ::lcode eng
|
1980 |
+
::s ⠺ ::t will ::lcode eng ::use-only-for-whole-word
|
1981 |
+
::s ⠾ ::t with ::lcode eng
|
1982 |
+
::s ⠘⠺ ::t word ::lcode eng
|
1983 |
+
::s ⠐⠺ ::t work ::lcode eng
|
1984 |
+
::s ⠸⠺ ::t world ::lcode eng
|
1985 |
+
::s ⠺⠙ ::t would ::lcode eng ::use-only-for-whole-word
|
1986 |
+
::s ⠭ ::t x ::lcode eng
|
1987 |
+
::s ⠽ ::t y ::lcode eng
|
1988 |
+
::s ⠽ ::t you ::lcode eng ::use-only-for-whole-word
|
1989 |
+
::s ⠽⠗ ::t your ::lcode eng ::use-only-for-whole-word
|
1990 |
+
::s ⠽⠗⠎ ::t yours ::lcode eng ::use-only-for-whole-word
|
1991 |
+
::s ⠽⠗⠋ ::t yourself ::lcode eng ::use-only-for-whole-word
|
1992 |
+
::s ⠽⠗⠧⠎ ::t yourselves ::lcode eng ::use-only-for-whole-word
|
1993 |
+
::s ⠐⠽ ::t young ::lcode eng
|
1994 |
+
::s ⠵ ::t z ::lcode eng
|
1995 |
+
::s ⠠⠴ ::t ’ ::lcode eng
|
1996 |
+
|
1997 |
+
::preserve ::from U+2190 ::to U+21FF ::comment Arrows
|
1998 |
+
::preserve ::from U+2200 ::to U+22FF ::comment Mathematical Operators
|
1999 |
+
::preserve ::from U+2300 ::to U+23FF ::comment Miscellaneous Technical
|
2000 |
+
::preserve ::from U+2500 ::to U+257F ::comment Box Drawing
|
2001 |
+
::preserve ::from U+2580 ::to U+259F ::comment Block Elements
|
2002 |
+
::preserve ::from U+25A0 ::to U+25FF ::comment Geometric Shapes
|
2003 |
+
::preserve ::from U+2600 ::to U+26FF ::comment Miscellaneous Symbols
|
2004 |
+
::preserve ::from U+27C0 ::to U+27ED ::comment Miscellaneous Mathematical Symbols-A
|
2005 |
+
::preserve ::from U+27F0 ::to U+27FF ::comment Supplemental Arrows-A
|
2006 |
+
::preserve ::from U+2900 ::to U+297F ::comment Supplemental Arrows-B
|
2007 |
+
::preserve ::from U+2980 ::to U+29FF ::comment Miscellaneous Mathematical Symbols-B
|
2008 |
+
::preserve ::from U+2A00 ::to U+2AFF ::comment Supplemental Mathematical Operators
|
2009 |
+
::preserve ::from U+2B00 ::to U+2BFF ::comment Miscellaneous Symbols and Arrows
|
2010 |
+
::preserve ::from U+2E00 ::to U+2E27 ::comment Supplemental Punctuation (excluding ⸨⸩)
|
2011 |
+
::preserve ::from U+2E2A ::to U+2E7F ::comment Supplemental Punctuation (cont'd)
|
2012 |
+
::preserve ::from U+18B00 ::to U+18CD5 ::comment Khitan Small Script
|
2013 |
+
::preserve ::from U+1D100 ::to U+1D1FF ::comment Musical Symbols
|
2014 |
+
::preserve ::from U+1D6A8 ::to U+1D7CB ::comment Mathematical Alphanumeric Symbols (Greek)
|
2015 |
+
::preserve ::from U+1D800 ::to U+1DAAF ::comment Sutton SignWriting
|
2016 |
+
::preserve ::from U+1F800 ::to U+1F8FF ::comment Supplemental Arrows-C
|
2017 |
+
::preserve ::from U+1FA00 ::to U+1FA6F ::comment Chess Symbols
|
2018 |
+
::preserve ::from U+1FB00 ::to U+1FBCF ::comment Symbols for Legacy Computing
|
2019 |
+
::preserve ::from U+1FA70 ::to U+1FAFF ::comment Symbols and Pictographs Extended-A
|
uroman/data/romanization-table.v1.2.1.txt
ADDED
@@ -0,0 +1,814 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
## European Latin extensions
|
3 |
+
# Vowels
|
4 |
+
::s Ä ::t Ae
|
5 |
+
::s Ö ::t Oe
|
6 |
+
::s Ü ::t Ue
|
7 |
+
::s Å ::t Aa
|
8 |
+
::s Æ ::t Ae
|
9 |
+
::s Ø ::t Oe
|
10 |
+
::s Œ ::t Oe
|
11 |
+
::s ä ::t ae
|
12 |
+
::s ö ::t oe
|
13 |
+
::s ü ::t ue
|
14 |
+
::s å ::t aa
|
15 |
+
::s æ ::t ae
|
16 |
+
::s ø ::t oe
|
17 |
+
::s œ ::t oe
|
18 |
+
# Consonants
|
19 |
+
::s Ç ::t S
|
20 |
+
::s ç ::t s
|
21 |
+
::s Ç ::t Ch ::lcode tur
|
22 |
+
::s ç ::t ch ::lcode tur
|
23 |
+
::s Ş ::t Sh
|
24 |
+
::s ş ::t sh
|
25 |
+
::s Ș ::t Sh
|
26 |
+
::s ș ::t sh
|
27 |
+
::s ß ::t ss
|
28 |
+
::s Ț ::t Ts
|
29 |
+
::s ț ::t ts
|
30 |
+
|
31 |
+
# Miscellaneous
|
32 |
+
::s ə ::t e
|
33 |
+
|
34 |
+
# English
|
35 |
+
::s chr ::t chr ::t-alt kr ::example chromosome, synchronize
|
36 |
+
::s Chr ::t Chr ::t-alt Kr ::example Christmas, Chrysler
|
37 |
+
::s eight ::t eight ::t-alt eit ::example eight, weight
|
38 |
+
::s Eight ::t Eight ::t-alt Eit ::example Eighteen
|
39 |
+
::s ight ::t ight ::t-alt ait ::example Knight
|
40 |
+
::s gh ::t gh ::t-alt f, ph, "" ::example laugh, daughter
|
41 |
+
::s high ::t high ::t-alt hai ::example highlight
|
42 |
+
::s High ::t High ::t-alt Hai ::example High School
|
43 |
+
::s Isle ::t Isle ::t-alt Ail ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Isle
|
44 |
+
::s Island ::t Island ::t-alt Ailand ::use-only-at-start-of-word ::use-only-at-end-of-word ::example Island
|
45 |
+
::s kn ::t kn ::t-alt n ::use-only-at-start-of-word ::example knowledge
|
46 |
+
::s Kn ::t Kn ::t-alt N ::use-only-at-start-of-word ::example Knight
|
47 |
+
::s Mc ::t Mc ::t-alt Mac ::use-only-at-start-of-word ::example McNulty
|
48 |
+
::s mc ::t mc ::t-alt mac ::use-only-at-start-of-word
|
49 |
+
::s oo ::t oo ::t-alt u ::lcode eng ::example Brooklyn; Goose Bay
|
50 |
+
::s ph ::t ph ::t-alt f ::example alpha
|
51 |
+
::s Ph ::t Ph ::t-alt F ::example Philip
|
52 |
+
::s Thom ::t Thom ::t-alt Tom ::use-only-at-start-of-word ::example Thomas, Thompson
|
53 |
+
::s tion ::t tion ::t-alt shen ::example
|
54 |
+
::s Sean ::t Sean ::t-alt Shawn ::use-only-at-start-of-word ::use-only-at-end-of-word
|
55 |
+
::s ssion ::t ssion ::t-alt shen ::example Sessions
|
56 |
+
::s St ::t St ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
|
57 |
+
::s St. ::t St. ::t-alt Saint ::use-only-at-start-of-word ::use-only-at-end-of-word
|
58 |
+
::s Wr ::t Wr ::t-alt R ::example Wren
|
59 |
+
::s wr ::t wr ::t-alt r ::example Cartwright
|
60 |
+
::s x ::t x ::t-alt ks ::example Mexico
|
61 |
+
::s x ::t x ::t-alt gz ::example example, anxiety, exhaust, exit
|
62 |
+
|
63 |
+
# French
|
64 |
+
::s â ::t a ::t-alt as ::example pâte/paste, pastry
|
65 |
+
::s ê ::t e ::t-alt es ::example fête/feast
|
66 |
+
::s î ::t i ::t-alt is ::example île/isle
|
67 |
+
::s ô ::t o ::t-alt os ::example côte/coast
|
68 |
+
::s û ::t u ::t-alt us ::example août/August
|
69 |
+
::s eaux ::t eaux ::t-alt o ::example Bordeaux
|
70 |
+
::s eau ::t eau ::t-alt o ::example Chateau
|
71 |
+
::s auld ::t auld ::t-alt o ::use-only-at-end-of-word ::example Renauld
|
72 |
+
::s ault ::t ault ::t-alt o ::use-only-at-end-of-word ::example Renault
|
73 |
+
::s oux ::t oux ::t-alt u
|
74 |
+
::s ois ::t ois ::t-alt oa ::use-only-at-end-of-word ::example Dubois
|
75 |
+
|
76 |
+
# German
|
77 |
+
::s Sch ::t Sch ::t-alt Sh
|
78 |
+
::s sch ::t sch ::t-alt sh
|
79 |
+
::s stein ::t stein ::t-alt shtain
|
80 |
+
::s dt ::t dt ::t-alt tt ::use-only-at-end-of-word ::example Schmidt
|
81 |
+
|
82 |
+
# Dutch
|
83 |
+
::s ij ::t ij ::t-alt ai
|
84 |
+
::s Ij ::t Ij ::t-alt Ai
|
85 |
+
|
86 |
+
# Greek
|
87 |
+
::s Ι ::t I
|
88 |
+
::s ι ::t i
|
89 |
+
::s ί ::t i
|
90 |
+
::s ἶ ::t i
|
91 |
+
::s Υ ::t Y
|
92 |
+
::s υ ::t y
|
93 |
+
::s Ρ ::t R
|
94 |
+
::s ρ ::t r
|
95 |
+
::s Ντ ::t D
|
96 |
+
::s ντ ::t nd ::t-alt d
|
97 |
+
# ::s ντζ ::t ntz
|
98 |
+
::s Μπ ::t B
|
99 |
+
::s μπ ::t mb ::t-alt b
|
100 |
+
::s γγ ::t ng
|
101 |
+
::s γκ ::t ng ::t-alt g
|
102 |
+
::s ει ::t ei ::t-alt i
|
103 |
+
::s ου ::t ou ::t-alt u
|
104 |
+
::s χ ::t ch ::t-alt kh
|
105 |
+
|
106 |
+
# Cyrillic
|
107 |
+
::s Г ::t G ::t-alt H
|
108 |
+
::s г ::t g ::t-alt h
|
109 |
+
::s Е ::t E ::t-alt Ye
|
110 |
+
::s е ::t e ::t-alt ye
|
111 |
+
::s Ё ::t E ::t-alt Yo
|
112 |
+
::s ё ::t e ::t-alt yo
|
113 |
+
::s Х ::t Kh ::t-alt Ch, H ::comment Cyrillic capital ha
|
114 |
+
::s х ::t kh ::t-alt ch, h ::comment Cyrillic small ha
|
115 |
+
::s Щ ::t Shch ::t-alt Sh
|
116 |
+
::s щ ::t shch ::t-alt sh
|
117 |
+
::s Ъ ::t ::comment Cyrillic capital hard sign
|
118 |
+
::s ъ ::t ::comment Cyrillic small hard sign
|
119 |
+
::s Ы ::t Y ::comment Cyrillic capital yeru
|
120 |
+
::s ы ::t y ::comment Cyrillic small yeru
|
121 |
+
::s Ь ::t ::comment Cyrillic capital soft sign
|
122 |
+
::s ь ::t ::comment Cyrillic small soft sign
|
123 |
+
|
124 |
+
::s Ҥ ::t Ng ::comment Cyrillic capital ligature EN GHE
|
125 |
+
::s ҥ ::t ng ::comment Cyrillic small ligature EN GHE
|
126 |
+
::s Ә ::t e ::comment Cyrillic capital schwa
|
127 |
+
::s ә ::t e ::comment Cyrillic small schwa
|
128 |
+
::s Ӏ ::t ' ::comment Cyrillic palochka
|
129 |
+
::s Ҵ ::t TS ::comment Cyrillic capital ligature te tse, used in Abkhasian
|
130 |
+
::s ҵ ::t ts ::comment Cyrillic small ligature te tse, used in Abkhasian
|
131 |
+
::s Ӕ ::t AE ::comment Cyrillic capital ligature a ie
|
132 |
+
::s ӕ ::t ae ::comment Cyrillic small ligature a ie
|
133 |
+
::s Г ::t H ::lcode ukr ::comment Ukrainian capital letter he
|
134 |
+
::s г ::t h ::lcode ukr ::comment Ukrainian small letter he
|
135 |
+
::s Ґ ::t G ::lcode ukr ::comment Ukrainian capital letter ghe
|
136 |
+
::s ґ ::t g ::lcode ukr ::comment Ukrainian small letter ghe
|
137 |
+
|
138 |
+
# Gothic
|
139 |
+
::s 𐌴 ::t e ::comment Gothic letter aihvus
|
140 |
+
::s 𐌹 ::t i ::comment Gothic letter eis
|
141 |
+
::s 𐍇 ::t x ::comment Gothic letter iggws
|
142 |
+
|
143 |
+
# Georgian
|
144 |
+
::s ა ::t a ::comment Georgian letter an
|
145 |
+
::s ე ::t e ::comment Georgian letter en
|
146 |
+
::s ი ::t i ::comment Georgian letter in
|
147 |
+
::s ო ::t o ::comment Georgian letter on
|
148 |
+
::s უ ::t u ::comment Georgian letter un
|
149 |
+
|
150 |
+
# Armenian
|
151 |
+
::s Ա ::t a ::comment Armenian capital letter ayb
|
152 |
+
::s ա ::t a ::comment Armenian small letter ayb
|
153 |
+
::s Ե ::t e ::comment Armenian capital letter ech
|
154 |
+
::s ե ::t e ::comment Armenian small letter ech
|
155 |
+
::s և ::t ev ::comment Armenian small ligature ech yiwn
|
156 |
+
::s Է ::t e ::comment Armenian capital letter eh
|
157 |
+
::s է ::t e ::comment Armenian small letter eh
|
158 |
+
::s Ի ::t i ::comment Armenian capital letter ini
|
159 |
+
::s ի ::t i ::comment Armenian small letter ini
|
160 |
+
::s Օ ::t o ::comment Armenian capital letter oh
|
161 |
+
::s օ ::t o ::comment Armenian small letter oh
|
162 |
+
|
163 |
+
## Japanese
|
164 |
+
# Katakana
|
165 |
+
::s シ ::t shi
|
166 |
+
::s チ ::t chi
|
167 |
+
::s フ ::t fu
|
168 |
+
::s ジ ::t ji
|
169 |
+
::s ヂ ::t ji
|
170 |
+
::s ヅ ::t zu
|
171 |
+
::s シャ ::t sha
|
172 |
+
::s シュ ::t shu
|
173 |
+
::s ショ ::t sho
|
174 |
+
::s チャ ::t cha
|
175 |
+
::s チェ ::t che
|
176 |
+
::s チュ ::t chu
|
177 |
+
::s チョ ::t cho
|
178 |
+
::s ジャ ::t ja
|
179 |
+
::s ジュ ::t ju
|
180 |
+
::s ジョ ::t jo
|
181 |
+
::s ジェ ::t je
|
182 |
+
::s ヂャ ::t ja
|
183 |
+
::s ヂュ ::t ju
|
184 |
+
::s ヂョ ::t jo
|
185 |
+
::s フェ ::t fe
|
186 |
+
::s ヴェ ::t ve
|
187 |
+
::s フィ ::t fi
|
188 |
+
::s ウィ ::t wi
|
189 |
+
::s ヴィ ::t vi
|
190 |
+
::s ティ ::t ti
|
191 |
+
::s ディ ::t di
|
192 |
+
::s ッ ::t (__SOKUON__) ::comment katakana double following consonant
|
193 |
+
::s ー ::t (__CHOONPU__) ::comment katakana prolonged sound mark
|
194 |
+
# Hiragana
|
195 |
+
::s し ::t shi
|
196 |
+
::s ち ::t chi
|
197 |
+
::s つ ::t tsu
|
198 |
+
::s ふ ::t fu
|
199 |
+
::s を ::t o
|
200 |
+
::s じ ::t ji
|
201 |
+
::s ぢ ::t ji
|
202 |
+
::s づ ::t zu
|
203 |
+
::s しゃ ::t sha
|
204 |
+
::s しゅ ::t shu
|
205 |
+
::s しょ ::t sho
|
206 |
+
::s ちゃ ::t cha
|
207 |
+
::s ちゅ ::t chu
|
208 |
+
::s ちょ ::t cho
|
209 |
+
::s じゃ ::t ja
|
210 |
+
::s じゅ ::t ju
|
211 |
+
::s じょ ::t jo
|
212 |
+
::s ぢゃ ::t ja
|
213 |
+
::s ぢゅ ::t ju
|
214 |
+
::s ぢょ ::t jo
|
215 |
+
::s っ ::t (__SOKUON__) ::comment hiragana double following consonant
|
216 |
+
::s 々 ::t ² ::comment ideographic iteration mark ::annotation repetition-sign
|
217 |
+
|
218 |
+
::s フ ::t fu ::t-alt f
|
219 |
+
::s キ ::t ki ::t-alt k
|
220 |
+
::s ク ::t ku ::t-alt k
|
221 |
+
::s ラ ::t ra ::t-alt la
|
222 |
+
::s リ ::t ri ::t-alt li
|
223 |
+
::s ル ::t ru ::t-alt lu, l, r
|
224 |
+
::s レ ::t re ::t-alt le
|
225 |
+
::s ロ ::t ro ::t-alt lo
|
226 |
+
::s ム ::t mu ::t-alt m ::example キム = Kim
|
227 |
+
::s シ ::t shi ::t-alt si ::example メキシコ = meksiko (Mexico)
|
228 |
+
::s ス ::t su ::t-alt s
|
229 |
+
::s ト ::t to ::t-alt t
|
230 |
+
::s ツ ::t tsu ::t-alt tu, ts ::example シュルツ = Schultz
|
231 |
+
|
232 |
+
# Chinese
|
233 |
+
::s 邦 ::t bang ::t-alt bon, bum, bun, pon
|
234 |
+
::s 鲍 ::t bao ::t-alt bow
|
235 |
+
::s 堡 ::t bao ::t-alt berg, burg, bourg, burgh
|
236 |
+
::s 贝 ::t bei ::t-alt ber
|
237 |
+
::s 本 ::t ben ::t-alt bern, bon, bourn, burn
|
238 |
+
::s 彼得 ::t bide ::t-alt peter, pet
|
239 |
+
::s 伯 ::t bo ::t-alt ber
|
240 |
+
::s 波 ::t bo ::t-alt po
|
241 |
+
::s 布 ::t bu ::t-alt b
|
242 |
+
::s 策 ::t ce ::t-alt tze, tzer
|
243 |
+
::s 曾 ::t ceng ::t-alt tzen, zen
|
244 |
+
::s 彻 ::t che ::t-alt tche
|
245 |
+
::s 茨 ::t ci ::t-alt ts, tz, z
|
246 |
+
::s 兹 ::t ci ::t-alt ds, dz, tz, z, zi
|
247 |
+
::s 蒂 ::t di ::t-alt ti, tti
|
248 |
+
::s 丁 ::t ding ::t-alt din, tin
|
249 |
+
::s 顿 ::t dun ::t-alt ton
|
250 |
+
::s 多 ::t duo ::t-alt do, dor, to
|
251 |
+
::s 尔 ::t er ::t-alt l, le, ll, r
|
252 |
+
::s 弗 ::t fu ::t-alt f, fer, pher, v, ver, vir
|
253 |
+
::s 夫 ::t fu ::t-alt f, v, v
|
254 |
+
::s 福 ::t fu ::t-alt faw, for, ford
|
255 |
+
::s 哥 ::t ge ::t-alt go, co
|
256 |
+
::s 戈 ::t ge ::t-alt go
|
257 |
+
::s 各 ::t ge ::t-alt go, co
|
258 |
+
::s 赫 ::t he ::t-alt ch, che, cher, ge
|
259 |
+
::s 华 ::t hua ::t-alt ver, wa, war, wer ::example Washington
|
260 |
+
::s 怀 ::t huai ::t-alt whi, wi, wy
|
261 |
+
::s 惠 ::t hui ::t-alt wha, whea
|
262 |
+
::s 基 ::t ji ::t-alt ki, chi
|
263 |
+
::s 吉 ::t ji ::t-alt gi, gui
|
264 |
+
::s 加 ::t jia ::t-alt ca, ga, ka ::example Canada
|
265 |
+
::s 杰 ::t jie ::t-alt ger
|
266 |
+
::s 金 ::t jin ::t-alt kin, gin
|
267 |
+
::s 斤 ::t jin ::t-alt zin
|
268 |
+
::s 康 ::t kang ::t-alt con, corn
|
269 |
+
::s 考 ::t kao ::t-alt cow, cour
|
270 |
+
::s 克 ::t ke ::t-alt k, che, cher
|
271 |
+
::s 科 ::t ke ::t-alt ko
|
272 |
+
::s 拉 ::t la ::t-alt ra ::example Tirana
|
273 |
+
::s 朗 ::t lang ::t-alt lon, ron
|
274 |
+
::s 赖 ::t lai ::t-alt ri
|
275 |
+
::s 劳 ::t lao ::t-alt low
|
276 |
+
::s 勒 ::t lei ::t-alt ler
|
277 |
+
::s 伦 ::t lun ::t-alt lon, ran, ron
|
278 |
+
::s 里 ::t li ::t-alt ri
|
279 |
+
::s 利 ::t li ::t-alt ri ::example Ferrari
|
280 |
+
::s 隆 ::t long ::t-alt lon, lum, lund
|
281 |
+
::s 罗 ::t luo ::t-alt l, lo, lu, ro, row, ru
|
282 |
+
::s 洛 ::t luo ::t-alt lo, low, ro
|
283 |
+
::s 默 ::t mo ::t-alt mer
|
284 |
+
::s 纳 ::t na ::t-alt ne, ner
|
285 |
+
::s 珀 ::t po ::t-alt per
|
286 |
+
::s 奇 ::t qi ::t-alt chi, dge, ge, tch
|
287 |
+
::s 齐 ::t qi ::t-alt tsi, zi
|
288 |
+
::s 乔 ::t qiao ::t-alt jo
|
289 |
+
::s 青 ::t qing ::t-alt tsing
|
290 |
+
::s 琼 ::t qiong ::t-alt jon, jum, jun
|
291 |
+
::s 瑟 ::t se ::t-alt the
|
292 |
+
::s 什 ::t shen ::t-alt sh
|
293 |
+
::s 圣 ::t sheng ::t-alt san, sao, saint
|
294 |
+
::s 斯 ::t si ::t-alt s, rth, th ::example Alaska
|
295 |
+
::s 索 ::t suo ::t-alt tho
|
296 |
+
::s 特 ::t te ::t-alt t
|
297 |
+
::s 翁 ::t weng ::t-alt on
|
298 |
+
::s 沃 ::t wo ::t-alt ver, vo, war, wer
|
299 |
+
::s 乌 ::t wu ::t-alt ou, u
|
300 |
+
::s 希 ::t xi ::t-alt chi, hi, shi
|
301 |
+
::s 西 ::t xi ::t-alt s, si
|
302 |
+
::s 锡 ::t xi ::t-alt ci, si, thi, zi
|
303 |
+
::s 夏 ::t xia ::t-alt ha, cha, cia, sha, tia
|
304 |
+
::s 香 ::t xiang ::t-alt chan, cham
|
305 |
+
::s 歇 ::t xie ::t-alt she
|
306 |
+
::s 谢 ::t xie ::t-alt che, she
|
307 |
+
::s 辛 ::t xin ::t-alt cin, sen, sin, sing, sun, zen
|
308 |
+
::s 欣 ::t xin ::t-alt hin, shin
|
309 |
+
::s 休 ::t xiu ::t-alt hu, hue
|
310 |
+
::s 修 ::t xiu ::t-alt ciu, siu, thew, tiu
|
311 |
+
::s 许 ::t xu ::t-alt hue, schue
|
312 |
+
::s 逊 ::t xun ::t-alt son
|
313 |
+
::s 耶 ::t ye ::t-alt yer, ier
|
314 |
+
::s 泽 ::t ze ::t-alt ser
|
315 |
+
::s 扎 ::t zha ::t-alt za
|
316 |
+
::s 詹 ::t zhan ::t-alt ja, jam, jan, jen, jon
|
317 |
+
::s 治 ::t zhi ::t-alt ge ::example George
|
318 |
+
|
319 |
+
## Numbers
|
320 |
+
# Chinese and Japanese numbers
|
321 |
+
::s 零 ::num 0
|
322 |
+
::s 〇 ::num 0
|
323 |
+
::s 一 ::num 1
|
324 |
+
::s 二 ::num 2
|
325 |
+
::s 三 ::num 3
|
326 |
+
::s 四 ::num 4
|
327 |
+
::s 五 ::num 5
|
328 |
+
::s 六 ::num 6
|
329 |
+
::s 七 ::num 7
|
330 |
+
::s 八 ::num 8
|
331 |
+
::s 九 ::num 9
|
332 |
+
::s 十 ::num 10
|
333 |
+
::s 百 ::num 100
|
334 |
+
::s 千 ::num 1000
|
335 |
+
::s 万 ::num 10000
|
336 |
+
::s 萬 ::num 10000
|
337 |
+
::s 亿 ::num 100000000
|
338 |
+
::s 億 ::num 100000000
|
339 |
+
::s 兆 ::num 1000000000000
|
340 |
+
::s 京 ::num 10000000000000000
|
341 |
+
|
342 |
+
::s 北京 ::t beijing
|
343 |
+
::s 京都 ::t jingdou
|
344 |
+
::s 东京 ::t dongjing
|
345 |
+
::s 京胡 ::t jinghu
|
346 |
+
::s 南京 ::t nanjing
|
347 |
+
::s 普京 ::t pujing ::comment Putin
|
348 |
+
::s 東京 ::t dongjing ::comment Tokyo
|
349 |
+
::s 京兆 ::t jingzhao
|
350 |
+
|
351 |
+
::s ㎢ ::t km²
|
352 |
+
::s ㎥ ::t m³
|
353 |
+
::s ㎝ ::t cm
|
354 |
+
|
355 |
+
## Indian
|
356 |
+
# see mostly under UnicodeDataOverwrite.txt
|
357 |
+
|
358 |
+
# Malayalam
|
359 |
+
::s ൗ ::t au ::comment MALAYALAM AU LENGTH MARK
|
360 |
+
|
361 |
+
# Tamil
|
362 |
+
::s ட ::t d ::comment most commonly d, but t when word-initial or in a doubled consonant
|
363 |
+
::s ஃப ::t f ::comment h+p=f
|
364 |
+
::s ஃஜ ::t z ::comment h+j=z
|
365 |
+
|
366 |
+
# Myanmar/Burmese
|
367 |
+
# ::s ့ ::t ::comment dot below, denotes creaky tone
|
368 |
+
# ::s း ::t ::comment visarga, denotes high tone
|
369 |
+
::s ၌ ::t -nai ::comment locative
|
370 |
+
::s ၍ ::t -jwe ::comment completed
|
371 |
+
::s ၎ ::t legau ::comment aforementioned
|
372 |
+
::s ၏ ::t -i ::comment genitive
|
373 |
+
|
374 |
+
# Lao
|
375 |
+
::s ັ ::t a ::comment vowel sign mai kan
|
376 |
+
::s ົ ::t o ::comment vowel sign mai kon
|
377 |
+
::s ູ ::t uu ::comment vowel sign uu
|
378 |
+
::s ຽ ::t y ::comment semivowel sign nyo
|
379 |
+
::s ຼ ::t l ::comment semivowel sign lo
|
380 |
+
::s ລ ::t l ::comment lo loot
|
381 |
+
::s ຣ ::t l ::comment lo ling
|
382 |
+
::s ໝ ::t m ::comment ho mo
|
383 |
+
::s ໜ ::t n ::comment ho no
|
384 |
+
::s ຢ ::t y ::comment yo
|
385 |
+
::s ໍ ::t oo ::comment niggahita (possibly also nasal -m in final position)
|
386 |
+
::s ໆ ::t ² ::comment Lao ko la ::annotation repetition-sign
|
387 |
+
::s ຯ ::t ... ::comment Lao ellipsis
|
388 |
+
|
389 |
+
# Thai
|
390 |
+
::s ออ ::t o
|
391 |
+
::s อั ::t a
|
392 |
+
::s อิ ::t i
|
393 |
+
::s ๆ ::t ² ::comment Thai character maiyamok ::annotation repetition-sign
|
394 |
+
|
395 |
+
# Khmer
|
396 |
+
::s ័ ::t "" ::comment Khmer samyok sannya: indicates deviation from the general rules of pronunciation
|
397 |
+
::s ៏ ::t "" ::comment Khmer sign ahsda: denotes stressed intonation in some single-consonant words
|
398 |
+
::s ៍ ::t "" ::comment Khmer sign toandakhiat: indicates that the base character is not pronounced
|
399 |
+
::s ៌ ::t "" ::comment Khmer sign robat: a diacritic historically corresponding to the repha form of ra in Devanagari
|
400 |
+
::s ប៉ ::t pa ::comment Khmer ba + musĕkâtônd -> pa
|
401 |
+
::s ៗ ::t ² ::comment Khmer sign lek too ::annotation repetition-sign
|
402 |
+
|
403 |
+
## Semitic languages
|
404 |
+
# Arabic
|
405 |
+
::s و ::t w ::comment Arabic letter waw ::t-alt o, u ::lcode ara
|
406 |
+
::s ء ::t ' ::comment hamza
|
407 |
+
::s ٔ ::t ' ::comment hamza above
|
408 |
+
::s ٕ ::t ' ::comment hamza below
|
409 |
+
::s ع ::t ' ::comment ain
|
410 |
+
::s آ ::t a ::comment alef madda
|
411 |
+
::s ٓا ::t a ::comment Arabic maddah above plus alef (presumably an ill-formed version of آ; found 1 instance in Urdu text)
|
412 |
+
::s إ ::t i ::comment alef with hamza below
|
413 |
+
::s ٱ ::t a ::comment alef wasla ::comment typically indicates liaison with preceding word
|
414 |
+
::s ة ::t a ::comment teh marbuta
|
415 |
+
::s ۃ ::t a ::comment teh marbuta goal ::comment Used in Punjabi, Sindhi. Different from plain 'teh marbuta'?
|
416 |
+
::s ي ::t y ::comment Arabic yeh
|
417 |
+
::s ى ::t a ::comment alef maksura
|
418 |
+
::s ﻯ ::t a ::comment alef maksura isolated form
|
419 |
+
::s ﻰ ::t a ::comment alef maksura final form
|
420 |
+
::s ﯨ ::t a ::comment Uighur Kazakh Kirghiz alef maksura initial form
|
421 |
+
::s ﯩ ::t a ::comment Uighur Kazakh Kirghiz alef maksura medial form
|
422 |
+
::s ٰ ::t a ::comment Arabic letter superscript alef
|
423 |
+
::s ـ ::t ::comment tatweel (filler)
|
424 |
+
::s َ ::t a ::comment fatha ("-a")
|
425 |
+
::s ُ ::t u ::comment damma ("-u")
|
426 |
+
::s ِ ::t i ::comment kasra ("-i")
|
427 |
+
::s ْ ::t ::comment sukun (no vowel)
|
428 |
+
::s ۡ ::t ::comment small high dotless head of khah; like sukun (no vowel); used in Kashmiri, Assamese
|
429 |
+
::s ً ::t ::comment fathatan ("-an")
|
430 |
+
::s اً ::t an ::comment alef + fathatan
|
431 |
+
::s ٌ ::t ::comment dammatan ("-un")
|
432 |
+
::s ٍ ::t ::comment kasratan ("-in")
|
433 |
+
::s ّ ::t ::comment shadda (consonant doubler)
|
434 |
+
::s ڃ ::t ny ::comment Arabic letter nyeh U+0683 (used in Sindhi (snd))
|
435 |
+
::s ڄ ::t dy ::comment Arabic letter dyeh U+0684 (used in Sindhi (snd))
|
436 |
+
::s ۾ ::t men ::comment Sindhi postposition men
|
437 |
+
::s ؑ ::t alayhe wasallam ::comment "upon him be peace"
|
438 |
+
::s ﷴ ::t mohammad ::comment "Mohammad"
|
439 |
+
::s ﷸ ::t wasallam ::comment "and peace"
|
440 |
+
::s ﷺ ::t sallallahou alayhe wasallam ::comment "prayer of God be upon him and his family and peace"
|
441 |
+
|
442 |
+
# Farsi
|
443 |
+
::s ی ::t i ::t-alt y ::comment Contributed by Nima
|
444 |
+
::s ای ::t i ::t-alt ai ::use-only-at-start-of-word ::comment Contributed by Nima
|
445 |
+
::s هٔ ::t eye ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
446 |
+
::s و ::t v ::t-alt o, u ::lcode fas ::comment Arabic letter waw
|
447 |
+
::s ض ::t z ::t-alt d ::lcode fas ::comment Contributed by Marjan
|
448 |
+
::s ث ::t s ::t-alt th ::lcode fas ::comment Contributed by Marjan
|
449 |
+
::s ذ ::t z ::t-alt th ::lcode fas ::comment Contributed by Nima
|
450 |
+
::s ع ::t a ::t-alt ' ::lcode fas ::comment Contributed by Nima
|
451 |
+
::s عا ::t a ::lcode fas ::comment Contributed by Nima
|
452 |
+
::s عی ::t i ::t-alt iy ::lcode fas ::comment Contributed by Nima
|
453 |
+
::s عو ::t u ::t-alt o, av ::lcode fas ::comment Contributed by Nima
|
454 |
+
::s چ ::t ch ::t-alt tch, tsh ::lcode fas ::comment Contributed by Nima
|
455 |
+
::s ه ::t e ::t-alt h ::use-only-at-end-of-word ::lcode fas ::comment Contributed by Nima
|
456 |
+
::s ::t "" ::t-alt " " ::lcode fas ::comment source is character "zero-width non-joiner" (U+200C); Contributed by Nima
|
457 |
+
::s غ ::t gh ::t-alt g ::lcode fas
|
458 |
+
::s آئی ::t ai ::t-alt ae ::lcode fas
|
459 |
+
::s ائی ::t ai ::t-alt ae ::lcode fas
|
460 |
+
::s آئو ::t au ::t-alt ao ::lcode fas
|
461 |
+
::s ائو ::t au ::t-alt ao ::lcode fas
|
462 |
+
|
463 |
+
# Kashmiri (so far: educated guesses)
|
464 |
+
::s ٖ ::t a ::comment Arabic subscript alef U+0656
|
465 |
+
::s ٗ ::t u ::comment Arabic inverted damma U+0657
|
466 |
+
::s ۚ ::t j ::comment Arabic small high jeem U+06DA
|
467 |
+
::s ۪ ::t ::comment Arabic empty centre low stop U+06EA
|
468 |
+
::s ۬ ::t ::comment Arabic rounded high stop with filled center U+06EC
|
469 |
+
|
470 |
+
# Pashto
|
471 |
+
::s ٙ ::t e
|
472 |
+
|
473 |
+
# Hebrew
|
474 |
+
::s ב ::t v ::comment Hebrew letter bet ::t-alt b
|
475 |
+
::s כ ::t k ::comment Hebrew letter kaf ::t-alt kh
|
476 |
+
::s ך ::t k ::comment Hebrew letter kaf ::t-alt kh
|
477 |
+
::s פ ::t f ::comment Hebrew letter pe ::t-alt p
|
478 |
+
::s ש ::t sh ::comment Hebrew letter shin ::t-alt s
|
479 |
+
::s ו ::t v ::comment Hebrew letter vav ::t-alt o, u
|
480 |
+
::s ח ::t ch ::comment Hebrew letter het ::t-alt h ::use-alt-in-pointed
|
481 |
+
::s ק ::t q ::t-alt k ::use-alt-in-pointed
|
482 |
+
::s וֹ ::t o
|
483 |
+
::s וּ ::t u
|
484 |
+
::s קְוָ ::t qva ::t-alt kva ::use-alt-in-pointed
|
485 |
+
::s י ::t y
|
486 |
+
::s יּ ::t y
|
487 |
+
::s יָּ ::t ya
|
488 |
+
::s ע ::t '
|
489 |
+
::s ִי ::t i ::t-alt iy ::use-alt-in-pointed
|
490 |
+
::s ֵי ::t e
|
491 |
+
::s ִיּ ::t iy
|
492 |
+
::s ִיָּ ::t iya
|
493 |
+
::s ױ ::t oy
|
494 |
+
::s א ::t a ::t-alt '
|
495 |
+
::s אָ ::t a
|
496 |
+
::s ֹא ::t o
|
497 |
+
::s אַ ::t 'a
|
498 |
+
::s אֲ ::t 'a
|
499 |
+
::s אֶ ::t e
|
500 |
+
::s אֱ ::t e
|
501 |
+
::s פ ::t f
|
502 |
+
::s פּ ::t p
|
503 |
+
::s פַּ ::t pa
|
504 |
+
::s פְּ ::t pe ::t-alt p ::use-alt-in-pointed
|
505 |
+
::s שׁ ::t sh
|
506 |
+
::s שָׁ ::t sha
|
507 |
+
::s שָּׁ ::t sha ::comment ?
|
508 |
+
::s שְׁ ::t she ::t-alt sh ::use-alt-in-pointed
|
509 |
+
::s שֶׁ ::t she
|
510 |
+
::s שִׁ ::t shi
|
511 |
+
::s שֻׁ ::t shu
|
512 |
+
::s שׂ ::t s
|
513 |
+
::s שָׂ ::t sa
|
514 |
+
::s שְׂ ::t s ::t-alt se ::use-alt-in-pointed
|
515 |
+
::s כּ ::t k
|
516 |
+
::s כֶּ ::t ke
|
517 |
+
::s כֹּ ::t ko
|
518 |
+
::s בּ ::t b
|
519 |
+
::s בַּ ::t ba
|
520 |
+
::s בָּ ::t ba
|
521 |
+
::s בְּ ::t be ::t-alt b ::use-alt-in-pointed
|
522 |
+
::s בֶּ ::t be
|
523 |
+
::s תּ ::t t
|
524 |
+
::s תַּ ::t ta
|
525 |
+
::s תֵּ ::t te
|
526 |
+
::s תִּ ::t ti
|
527 |
+
::s דָּ ::t da
|
528 |
+
::s דְּ ::t de ::t-alt d ::use-alt-in-pointed
|
529 |
+
::s גּ ::t g
|
530 |
+
::s לֵּ ::t le
|
531 |
+
::s ד׳ ::t dh
|
532 |
+
::s ג׳ ::t j
|
533 |
+
::s ת׳ ::t th
|
534 |
+
::s ז׳ ::t zh
|
535 |
+
::s חַ ::t ach ::comment furtive patah ::use-only-at-end-of-word
|
536 |
+
::s עַ ::t a' ::comment furtive patah ::use-only-at-end-of-word
|
537 |
+
::s הַּ ::t ah ::comment furtive patah ::use-only-at-end-of-word
|
538 |
+
::s ַ ::t a ::comment Hebrew point patah
|
539 |
+
::s ֲ ::t a ::comment Hebrew point hataf patah (hataf = reduced)
|
540 |
+
::s ֳ ::t o ::comment Hebrew point hataf qamats
|
541 |
+
::s ָ ::t a ::comment Hebrew point qamats ::t-alt o ::use-alt-in-pointed
|
542 |
+
::s ֶ ::t e ::comment Hebrew point segol
|
543 |
+
::s ֱ ::t e ::comment Hebrew point hataf segol (hataf = reduced)
|
544 |
+
::s ְ ::t e ::comment Hebrew point sheva ::t-alt "" ::use-alt-in-pointed
|
545 |
+
::s ֵ ::t e ::comment Hebrew point tsere
|
546 |
+
::s ִ ::t i ::comment Hebrew point hiriq
|
547 |
+
::s ֹ ::t o ::comment Hebrew point holam
|
548 |
+
::s ֻ ::t u ::comment Hebrew point qubuts
|
549 |
+
# ::s ּ ::t "" ::comment Hebrew point dagesh or mapiq
|
550 |
+
|
551 |
+
# Yiddish
|
552 |
+
::s א ::t a ::lcode yid ::comment called "silent" alef
|
553 |
+
::s אי ::t y ::lcode yid
|
554 |
+
::s איי ::t ey ::lcode yid
|
555 |
+
::s או ::t u ::lcode yid
|
556 |
+
::s אוי ::t oy ::lcode yid
|
557 |
+
::s אַ ::t a ::lcode yid
|
558 |
+
::s אָ ::t o ::lcode yid
|
559 |
+
::s ב ::t b ::lcode yid
|
560 |
+
::s בֿ ::t v ::lcode yid
|
561 |
+
::s דזש ::t dzh ::lcode yid
|
562 |
+
::s ו ::t u ::lcode yid
|
563 |
+
::s וּ ::t u ::lcode yid
|
564 |
+
::s וֹ ::t o ::lcode yid
|
565 |
+
::s װ ::t v ::lcode yid
|
566 |
+
::s ווא ::t wa ::lcode yid
|
567 |
+
::s וואַ ::t wa ::lcode yid
|
568 |
+
::s ווע ::t we ::lcode yid
|
569 |
+
::s ווי ::t wi ::lcode yid
|
570 |
+
::s וואוי ::t wo ::lcode yid
|
571 |
+
::s וי ::t oy ::lcode yid
|
572 |
+
::s זש ::t zh ::lcode yid
|
573 |
+
::s ח ::t ch ::lcode yid
|
574 |
+
::s טש ::t tsh ::lcode yid
|
575 |
+
::s יִ::t i ::lcode yid
|
576 |
+
::s יי ::t ey ::lcode yid ::comment maybe "yi" at beginning of word
|
577 |
+
::s ײַ ::t ay ::lcode yid
|
578 |
+
::s כּ ::t k ::lcode yid
|
579 |
+
::s כ ::t ch ::lcode yid
|
580 |
+
::s ך ::t ch ::lcode yid
|
581 |
+
::s ע ::t e ::lcode yid
|
582 |
+
::s פּ ::t p ::lcode yid
|
583 |
+
::s פֿ ::t f ::lcode yid
|
584 |
+
::s ף ::t f ::lcode yid ::comment sometimes p
|
585 |
+
::s ק ::t k ::lcode yid
|
586 |
+
::s ת ::t s ::lcode yid
|
587 |
+
|
588 |
+
# Syriac/Aramaic (should be vetted by expert)
|
589 |
+
::s ܰ ::t a ::comment Syriac pthaha above
|
590 |
+
::s ܲ ::t a ::comment Syriac pthaha dotted
|
591 |
+
::s ܳ ::t aa ::comment Syriac zqapha above
|
592 |
+
::s ܴ ::t aa ::comment Syriac zqapha below
|
593 |
+
::s ܵ ::t aa ::comment Syriac zqapha dotted
|
594 |
+
::s ܶ ::t e ::comment Syriac rbasa above
|
595 |
+
::s ܷ ::t e ::comment Syriac rbasa below
|
596 |
+
::s ܿ ::t o ::comment Syriac rwaha
|
597 |
+
::s ܸ ::t e ::comment Syriac dotted zlama horizontal
|
598 |
+
::s ܹ ::t e ::comment Syriac dotted zlama angular
|
599 |
+
::s ܺ ::t i ::comment Syriac hbasa above
|
600 |
+
::s ܝܺ ::t i ::comment Syriac yudh + hbasa above
|
601 |
+
::s ܼ ::t u ::comment Syriac hbasa-esasa dotted
|
602 |
+
::s ܽ ::t o ::comment Syriac esasa above
|
603 |
+
::s ܾ ::t u ::comment Syriac esasa below
|
604 |
+
::s ݇ ::t "" ::comment Syriac oblique line above; indication of a silent letter
|
605 |
+
|
606 |
+
::s ܖ ::t d ::comment Syriac letter dotless dalath rish; ambiguous form for undifferentiated early dalath/rish
|
607 |
+
::s ܜ ::t t ::comment Syriac letter teth garshuni; used in Garshuni documents
|
608 |
+
::s ܒ݂ ::t v ::comment Syriac beth + rukkakha
|
609 |
+
::s ܒ̥ ::t v ::comment Syriac beth + ring-below
|
610 |
+
::s ܓ݂ ::t g ::comment Syriac gammal + rukkakha [IPA: ɣ]
|
611 |
+
::s ܓ̥ ::t g ::comment Syriac gammal + ring-below [IPA: ɣ]
|
612 |
+
::s ܕ݂ ::t d ::comment Syriac dalath + rukkakha [IPA: ð]
|
613 |
+
::s ܕ̥ ::t d ::comment Syriac dalath + ring-below [IPA: ð]
|
614 |
+
::s ܟ݂ ::t kh ::comment Syriac kaph + rukkakha [IPA: x]
|
615 |
+
::s ܟ̥ ::t kh ::comment Syriac kaph + ring-below [IPA: x]
|
616 |
+
::s ܦ݂ ::t f ::comment Syriac pe + rukkakha
|
617 |
+
::s ܦ̥ ::t f ::comment Syriac pe + ring-below
|
618 |
+
::s ܦ݁ ::t p ::comment Syriac pe + qushshaya
|
619 |
+
::s ܬ݂ ::t th ::comment Syriac taw + rukkakha [IPA: θ]
|
620 |
+
::s ܬ̥ ::t th ::comment Syriac taw + ring-below [IPA: θ]
|
621 |
+
|
622 |
+
::s ܄ ::t : ::comment Syriac sublinear colon; used at the end of verses of supplications
|
623 |
+
::s ܆ ::t , ::comment Syriac colon skewed left; marks a dependent clause
|
624 |
+
::s ܇ ::t , ::comment Syriac colon skewed right; marks the end of a subdivision of the apodosis, or latter part of a Biblical verse
|
625 |
+
|
626 |
+
# Uzbek
|
627 |
+
::s ʻ ::t ' ::comment modifies pronunciation of preceding "o" and "g"
|
628 |
+
::s ʼ ::t ' ::comment glottal stop (tutuq belgisi)
|
629 |
+
|
630 |
+
# Uyghur
|
631 |
+
::s ئا ::t a ::lcode uig
|
632 |
+
::s ە ::t e ::lcode uig
|
633 |
+
::s ئې ::t e ::lcode uig ::latinplus ë
|
634 |
+
::s ې ::t e ::lcode uig ::latinplus ë
|
635 |
+
::s ئە ::t e ::lcode uig
|
636 |
+
::s يە ::t e ::lcode uig
|
637 |
+
::s ئى ::t i ::lcode uig
|
638 |
+
::s ى ::t i ::lcode uig
|
639 |
+
::s ئو ::t o ::lcode uig
|
640 |
+
::s و ::t o ::lcode uig
|
641 |
+
::s ئۇ ::t u ::lcode uig
|
642 |
+
::s ۇ ::t u ::lcode uig
|
643 |
+
::s چ ::t ch ::t-alt q ::lcode uig
|
644 |
+
::s خ ::t x ::lcode uig
|
645 |
+
::s ژ ::t zh ::lcode uig
|
646 |
+
::s ئۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
647 |
+
::s ۆ ::t oe ::t-alt o ::lcode uig ::latinplus ö
|
648 |
+
::s ئۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
649 |
+
::s ۈ ::t ue ::t-alt u ::lcode uig ::latinplus ü
|
650 |
+
::s ۋ ::t w ::lcode uig
|
651 |
+
|
652 |
+
# Maldivian
|
653 |
+
::s ް ::t ::comment thaana sukun
|
654 |
+
::s ަ ::t a ::comment thaana abafili
|
655 |
+
::s ާ ::t aa ::comment thaana aabaafili
|
656 |
+
::s ި ::t i ::comment thaana ibifili
|
657 |
+
::s ީ ::t ee ::comment thaana eebeefili
|
658 |
+
::s ު ::t u ::comment thaana ubufili
|
659 |
+
::s ޫ ::t oo ::comment thaana ooboofili
|
660 |
+
::s ެ ::t e ::comment thaana ebefili
|
661 |
+
::s ޭ ::t ey ::comment thaana eybeyfili
|
662 |
+
::s ޮ ::t o ::comment thaana obofili
|
663 |
+
::s ޯ ::t oa ::comment thaana oaboafili
|
664 |
+
|
665 |
+
# Canadian syllabics (Inuktitut)
|
666 |
+
::s ᑊ ::t p ::comment syllable final
|
667 |
+
::s ᐟ ::t t ::comment syllable final
|
668 |
+
::s ᐠ ::t k ::comment syllable final
|
669 |
+
::s ᐨ ::t c ::comment syllable final
|
670 |
+
::s ᒼ ::t m ::comment syllable final
|
671 |
+
::s ᐣ ::t n ::comment syllable final
|
672 |
+
::s ᐢ ::t s ::comment syllable final
|
673 |
+
::s ᐧ ::t y ::comment syllable final
|
674 |
+
::s ᐤ ::t w ::comment syllable final
|
675 |
+
::s ᐦ ::t h ::comment syllable final
|
676 |
+
::s ᕽ ::t hk ::comment syllable final
|
677 |
+
::s ᓫ ::t l ::comment syllable final
|
678 |
+
::s ᕑ ::t r ::comment syllable final
|
679 |
+
|
680 |
+
## Punctuation
|
681 |
+
# delete
|
682 |
+
::s ¿ ::t "" ::comment inverted question mark
|
683 |
+
::s ¡ ::t "" ::comment inverted exclamation mark
|
684 |
+
# preserve
|
685 |
+
::s ′ ::t ′
|
686 |
+
# Cyrillic
|
687 |
+
::s ⁙ ::t . ::comment five dot punctuation
|
688 |
+
# Amharic/Ethiopian
|
689 |
+
::s ። ::t .
|
690 |
+
::s ፣ ::t ,
|
691 |
+
::s ፤ ::t ;
|
692 |
+
::s ፥ ::t :
|
693 |
+
::s ፡ ::t " " ::comment Ethiopic wordspace
|
694 |
+
::s ፦ ::t : ::comment Ethiopic preface colon
|
695 |
+
::s ቸ ::t cha ::comment Ethiopic syllable ca
|
696 |
+
::s ቹ ::t chu ::comment Ethiopic syllable cu
|
697 |
+
::s ቺ ::t chi ::comment Ethiopic syllable ci
|
698 |
+
::s ቻ ::t chaa ::comment Ethiopic syllable caa
|
699 |
+
::s ቼ ::t chee ::comment Ethiopic syllable cee
|
700 |
+
::s ች ::t che ::comment Ethiopic syllable ce
|
701 |
+
::s ቾ ::t cho ::comment Ethiopic syllable co
|
702 |
+
::s ሠ ::t sa ::comment Ethiopic syllable sza
|
703 |
+
::s ሡ ::t su ::comment Ethiopic syllable szu
|
704 |
+
::s ሢ ::t si ::comment Ethiopic syllable szi
|
705 |
+
::s ሣ ::t saa ::comment Ethiopic syllable szaa
|
706 |
+
::s ሤ ::t see ::comment Ethiopic syllable szee
|
707 |
+
::s ሥ ::t se ::comment Ethiopic syllable sze
|
708 |
+
::s ሦ ::t so ::comment Ethiopic syllable szo
|
709 |
+
::s ጠ ::t te ::comment Ethiopic syllable tha with ejective 't'
|
710 |
+
::s ጡ ::t tu ::comment Ethiopic syllable thu with ejective 't'
|
711 |
+
::s ጢ ::t ti ::comment Ethiopic syllable thi with ejective 't'
|
712 |
+
::s ጣ ::t taa ::comment Ethiopic syllable thaa with ejective 't'
|
713 |
+
::s ጤ ::t tee ::comment Ethiopic syllable thee with ejective 't'
|
714 |
+
::s ጥ ::t te ::comment Ethiopic syllable the with ejective 't'
|
715 |
+
::s ጦ ::t to ::comment Ethiopic syllable tho with ejective 't'
|
716 |
+
|
717 |
+
# Devanagari (Hindi etc.)
|
718 |
+
::s । ::t . ::comment danda
|
719 |
+
::s ॥ ::t . ::comment double danda
|
720 |
+
::s ৷ ::t . ::comment Bengali currency numerator four; used as danda
|
721 |
+
::s ॰ ::t . ::comment Devanagari abbreviation sign
|
722 |
+
# Oriya/Odia (India)
|
723 |
+
::s ::t . ::comment danda (deprecated, should use Devanagari danda ।)
|
724 |
+
::s ::t . ::comment double danda (deprecated, should use Devanagari double danda ॥)
|
725 |
+
# Tibetan
|
726 |
+
::s ། ::t ,
|
727 |
+
::s །: ::t :
|
728 |
+
::s ༏ ::t ;
|
729 |
+
::s ༎ ::t .
|
730 |
+
::s ༑ ::t , ::comment Tibetan mark run chen spungs shad
|
731 |
+
::s ༼ ::t ( ::comment Tibetan open roof punctuation
|
732 |
+
::s ༽ ::t ) ::comment Tibetan close roof punctuation
|
733 |
+
::s ༈ ::t "" ::comment Tibetan mark srbul shad
|
734 |
+
::s 【 ::t [ ::comment left black lenticular bracket
|
735 |
+
::s 】 ::t ] ::comment right black lenticular bracket
|
736 |
+
::s ༄ ::t "" ::comment Tibetan head mark
|
737 |
+
::s ༄༅ ::t "" ::comment Tibetan head mark
|
738 |
+
::s ༆ ::t "" ::comment Tibetan head mark
|
739 |
+
# Myanmar/Burmese
|
740 |
+
::s ၊ ::t ,
|
741 |
+
::s ။ ::t .
|
742 |
+
# Khmer
|
743 |
+
::s ៖ ::t ; ::comment Khmer sign camnuc pii kuuh
|
744 |
+
::s ។ ::t . ::comment Khmer sign khan
|
745 |
+
# Arabic
|
746 |
+
::s ، ::t ,
|
747 |
+
::s ؛ ::t ;
|
748 |
+
::s ٬ ::t ,
|
749 |
+
::s ۔ ::t .
|
750 |
+
::s ؟ ::t ?
|
751 |
+
::s ٪ ::t %
|
752 |
+
::s ٫ ::t , ::comment Arabic decimal separator
|
753 |
+
::s ۽ ::t & ::comment Arabic sign Sindhi ampersand
|
754 |
+
# Aramaic
|
755 |
+
::s ܀ ::t .
|
756 |
+
::s ܂ ::t .
|
757 |
+
# Hebrew
|
758 |
+
::s ־ ::t - ::comment maqaf
|
759 |
+
# Armenian
|
760 |
+
::s ։ ::t .
|
761 |
+
::s ՝ ::t , ::comment Armenian comma
|
762 |
+
# Chinese
|
763 |
+
::s , ::t ", "
|
764 |
+
::s 、 ::t ", "
|
765 |
+
::s 。 ::t ". "
|
766 |
+
::s ! ::t "! "
|
767 |
+
::s ? ::t "? "
|
768 |
+
::s 「 ::t ' "'
|
769 |
+
::s 」 ::t '" '
|
770 |
+
::s 《 ::t ' "'
|
771 |
+
::s 》 ::t '" '
|
772 |
+
::s ( ::t " ("
|
773 |
+
::s ) ::t ") "
|
774 |
+
::s ; ::t ;
|
775 |
+
::s : ::t ": "
|
776 |
+
::s ︰ ::t ": "
|
777 |
+
::s - ::t -
|
778 |
+
::s / ::t /
|
779 |
+
::s = ::t =
|
780 |
+
::s ~ ::t ~
|
781 |
+
::s & ::t &
|
782 |
+
::s < ::t <
|
783 |
+
::s > ::t >
|
784 |
+
::s % ::t %
|
785 |
+
::s ::t " " ::comment ideographic space
|
786 |
+
# Japanese
|
787 |
+
::s 『 ::t ' "'
|
788 |
+
::s 』 ::t '" '
|
789 |
+
::s ・ ::t " " ::comment Katakana middle dot; separates name elements such as first and last name
|
790 |
+
|
791 |
+
# Symbols
|
792 |
+
::s ∞ ::t ∞ ::comment infinity
|
793 |
+
::s ::t ::comment soft hyphen; used to indicate preferred line breaks; remove
|
794 |
+
::s ֊ ::t - ::comment Armenian hyphen; map to regular hyphen-minus
|
795 |
+
::s ᐩ ::t + ::comment Canadian syllabics final plus; map to regular plus
|
796 |
+
::s ﹐ ::t , ::comment small comma; map to regular comma
|
797 |
+
::s ˚ ::t ° ::comment ring above; map to degree sign
|
798 |
+
::s ⇒ ::t ⇒ ::comment rightwards double arrow
|
799 |
+
::s † ::t † ::comment dagger
|
800 |
+
::s • ::t • ::comment bullet
|
801 |
+
::s ℃ ::t °C ::comment degree Celsius; split into 2 characters
|
802 |
+
::s ℉ ::t °F ::comment degree Fahrenheit; split into 2 characters
|
803 |
+
::s ― ::t ― ::comment horizontal bar
|
804 |
+
::s ˇ ::t ˇ ::comment caron (sometimes apparently used for "Arabic vowel sign small v above" U+065A, e.g. in Gilaki language (glk))
|
805 |
+
::s ″ ::t ″ ::comment double prime
|
806 |
+
::s ﴾ ::t ( ::comment ornate left parenthesis
|
807 |
+
::s ﴿ ::t ) ::comment ornate right parenthesis
|
808 |
+
::s 〔 ::t [ ::comment left tortoise shell bracket
|
809 |
+
::s 〕 ::t ] ::comment right tortoise shell bracket
|
810 |
+
::s ﹝ ::t ( ::comment small left tortoise shell bracket
|
811 |
+
::s ﹞ ::t ) ::comment small right tortoise shell bracket
|
812 |
+
::s ♄ ::t ♄ ::comment Saturn
|
813 |
+
::s ♆ ::t ♆ ::comment Neptune
|
814 |
+
::s ♋ ::t ♋ ::comment Cancer
|
uroman/data/string-distance-cost-rules.txt
ADDED
@@ -0,0 +1,896 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# String distance
|
2 |
+
|
3 |
+
::s1 a ::s2 ::cost 0.1
|
4 |
+
::s1 b ::s2 ::cost 1
|
5 |
+
::s1 b ::s2 ::cost 0.2 ::left1 /[aou]m$/ ::right1 [e] ::lc1 eng ::lc2 zho ::example Balcombe
|
6 |
+
::s1 c ::s2 ::cost 1
|
7 |
+
::s1 c ::s2 ::cost 0.2 ::left1 /[aeou]$/ ::right1 [cgkq] ::lc2 zho
|
8 |
+
::s1 c ::s2 ::cost 0.5 ::left1 /[aeou][lnr]?$/ ::right1 [h] ::lc2 zho
|
9 |
+
::s1 d ::s2 ::cost 1
|
10 |
+
::s1 d ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]$/ ::right1 [-,$ ]
|
11 |
+
::s1 d ::s2 ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [bcfgklmnpqrstvwxz]
|
12 |
+
::s1 e ::s2 ::cost 0.1
|
13 |
+
::s1 é ::s2 ::cost 0.1
|
14 |
+
::s1 e ::s2 ::cost 0.02 ::lc2 fas
|
15 |
+
::s1 e ::s2 ::cost 0.02 ::lc1 amh ::lc2 eng
|
16 |
+
::s1 f ::s2 ::cost 1
|
17 |
+
::s1 g ::s2 ::cost 1
|
18 |
+
::s1 g ::s2 ::cost 0.4 ::right1 [bcdfghklmnpqrstvwxz] ::lc2 zho
|
19 |
+
::s1 g ::s2 ::cost 0.2 ::right1 [k] ::lc2 zho
|
20 |
+
::s1 h ::s2 ::cost 0.5
|
21 |
+
::s1 h ::s2 ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
22 |
+
::s1 h ::s2 ::cost 0.2 ::left1 /[bdlnr]$/ ::right1 [-,$ aeiouy] ::example Delhi, Minh, Riyadh
|
23 |
+
::s1 i ::s2 ::cost 0.1
|
24 |
+
::s1 j ::s2 ::cost 0.5
|
25 |
+
::s1 k ::s2 ::cost 1
|
26 |
+
::s1 l ::s2 ::cost 1
|
27 |
+
::s1 l ::s2 ::cost 0.3 ::left1 /eui$/ ::right1 [-,$ ] ::example Argenteuil
|
28 |
+
::s1 l ::s2 ::cost 0.3 ::left1 /a$/ ::right1 [km] ::comment walk, palm
|
29 |
+
::s1 l ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [bdfgkmpstvwz] ::lc2 zho
|
30 |
+
::s1 m ::s2 ::cost 1
|
31 |
+
::s1 n ::s2 ::cost 1
|
32 |
+
::s1 n ::s2 ::cost 0.7 ::right1 [-,$ ]
|
33 |
+
::s1 o ::s2 ::cost 0.1
|
34 |
+
::s1 p ::s2 ::cost 1
|
35 |
+
::s1 q ::s2 ::cost 1
|
36 |
+
::s1 r ::s2 ::cost 1
|
37 |
+
::s1 r ::s2 ::cost 0.5 ::left1 /[aou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ]
|
38 |
+
::s1 r ::s2 ::cost 0.3 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
|
39 |
+
::s1 re ::s2 ::cost 0.4 ::left1 /[ou]$/ ::right1 [-,$ ] ::lc2 zho
|
40 |
+
::s1 re ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
|
41 |
+
::s1 rr ::s2 ::cost 0.5 ::left1 /[aeiou]$/ ::right1 [-,bcdfghjklmnpqrstvwxz$ ] ::lc2 zho
|
42 |
+
::s1 s ::s2 ::cost 1
|
43 |
+
::s1 s ::s2 ::cost 0.6 ::right1 [-,$ ]
|
44 |
+
::s1 t ::s2 ::cost 1
|
45 |
+
::s1 t ::s2 ::cost 0.5 ::left1 /[aeiou][lnr]?$/ ::right1 [-,$ ]
|
46 |
+
::s1 t ::s2 ::cost 0.6 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz]
|
47 |
+
::s1 u ::s2 ::cost 0.1
|
48 |
+
::s1 v ::s2 ::cost 1
|
49 |
+
::s1 w ::s2 ::cost 1
|
50 |
+
::s1 w ::s2 ::cost 0.4 ::lc1 eng ::right1 [i][c][hk][-,$ ] ::example Greenwich, Alnwick
|
51 |
+
::s1 x ::s2 ::cost 1
|
52 |
+
::s1 y ::s2 ::cost 0.3
|
53 |
+
::s1 z ::s2 ::cost 1
|
54 |
+
::s1 ı ::s2 ::cost 0.3
|
55 |
+
::s1 0 ::s2 ::cost 1
|
56 |
+
::s1 1 ::s2 ::cost 1
|
57 |
+
::s1 2 ::s2 ::cost 1
|
58 |
+
::s1 3 ::s2 ::cost 1
|
59 |
+
::s1 4 ::s2 ::cost 1
|
60 |
+
::s1 5 ::s2 ::cost 1
|
61 |
+
::s1 6 ::s2 ::cost 1
|
62 |
+
::s1 7 ::s2 ::cost 1
|
63 |
+
::s1 8 ::s2 ::cost 1
|
64 |
+
::s1 9 ::s2 ::cost 1
|
65 |
+
::s1 ' ::s2 ::cost 0.1
|
66 |
+
::s1 ` ::s2 ::cost 0.1
|
67 |
+
::s1 ( ::s2 ::cost 0.1
|
68 |
+
::s1 ) ::s2 ::cost 0.1
|
69 |
+
::s1 , ::s2 ::cost 0.1
|
70 |
+
::s1 ; ::s2 ::cost 0.1
|
71 |
+
::s1 - ::s2 ::cost 0.1
|
72 |
+
::s1 . ::s2 ::cost 0.1
|
73 |
+
::s1 .. ::s2 ::cost 0.12
|
74 |
+
::s1 ... ::s2 ::cost 0.14
|
75 |
+
::s1 ? ::s2 ::cost 0.2
|
76 |
+
::s1 ! ::s2 ::cost 0.2
|
77 |
+
::s1 ‼ ::s2 ::cost 0.2
|
78 |
+
::s1 ‼ ::s2 !! ::cost 0.02
|
79 |
+
::s1 ‼ ::s2 ! ::cost 0.1
|
80 |
+
::s1 / ::s2 ::cost 0.1
|
81 |
+
::s1 : ::s2 ::cost 0.1
|
82 |
+
::s1 ː ::s2 ::cost 0.1
|
83 |
+
::s1 ː ::s2 : ::cost 0.1
|
84 |
+
::s1 « ::s2 ::cost 0.1
|
85 |
+
::s1 » ::s2 ::cost 0.1
|
86 |
+
::s1 – ::s2 ::cost 0.1
|
87 |
+
::s1 – ::s2 - ::cost 0.05
|
88 |
+
::s1 — ::s2 ::cost 0.15
|
89 |
+
::s1 — ::s2 - ::cost 0.1
|
90 |
+
::s1 — ::s2 – ::cost 0.05
|
91 |
+
::s1 ─ ::s2 ::cost 0.2
|
92 |
+
::s1 ─ ::s2 - ::cost 0.15
|
93 |
+
::s1 ─ ::s2 – ::cost 0.1
|
94 |
+
::s1 ─ ::s2 — ::cost 0.05
|
95 |
+
::s1 ’ ::s2 ::cost 0.1
|
96 |
+
::s1 ʼ ::s2 ::cost 0.1
|
97 |
+
::s1 " " ::s2 ::cost 0.1
|
98 |
+
::s1 “ ::s2 ::cost 0.1
|
99 |
+
::s1 ” ::s2 ::cost 0.1
|
100 |
+
::s1 ″ ::s2 ::cost 0.1
|
101 |
+
::s1 # ::s2 ::cost 0.3
|
102 |
+
::s1 + ::s2 ::cost 0.3
|
103 |
+
::s1 * ::s2 ::cost 0.3
|
104 |
+
::s1 = ::s2 ::cost 0.3
|
105 |
+
::s1 < ::s2 ::cost 0.3
|
106 |
+
::s1 > ::s2 ::cost 0.3
|
107 |
+
::s1 [ ::s2 ::cost 0.3
|
108 |
+
::s1 ] ::s2 ::cost 0.3
|
109 |
+
::s1 { ::s2 ::cost 0.3
|
110 |
+
::s1 } ::s2 ::cost 0.3
|
111 |
+
::s1 | ::s2 ::cost 0.3
|
112 |
+
::s1 & ::s2 ::cost 0.3
|
113 |
+
::s1 _ ::s2 ::cost 0.3
|
114 |
+
::s1 • ::s2 ::cost 0.1
|
115 |
+
::s1 · ::s2 ::cost 0.1
|
116 |
+
::s1 ◦ ::s2 ::cost 0.1
|
117 |
+
::s1 ° ::s2 ::cost 0.1
|
118 |
+
::s1 … ::s2 ::cost 0.1
|
119 |
+
::s1 … ::s2 ... ::cost 0
|
120 |
+
::s1 @ ::s2 ::cost 0.3
|
121 |
+
::s1 © ::s2 ::cost 0.3
|
122 |
+
::s1 © ::s2 (c) ::cost 0.1
|
123 |
+
|
124 |
+
|
125 |
+
::s1 a ::s2 aa ::cost 0.02
|
126 |
+
::s1 a ::s2 aaa ::cost 0.03
|
127 |
+
::s1 a ::s2 aaaa ::cost 0.03
|
128 |
+
::s1 a ::s2 aaaaa ::cost 0.03
|
129 |
+
::s1 a ::s2 aaaaaa ::cost 0.04
|
130 |
+
::s1 a ::s2 aaaaaaa ::cost 0.04
|
131 |
+
::s1 a ::s2 aaaaaaaa ::cost 0.04
|
132 |
+
::s1 a ::s2 aaaaaaaaa ::cost 0.04
|
133 |
+
::s1 a ::s2 aaaaaaaaaa ::cost 0.04
|
134 |
+
::s1 a ::s2 aaaaaaaaaaa ::cost 0.04
|
135 |
+
::s1 a ::s2 aaaaaaaaaaaa ::cost 0.04
|
136 |
+
::s1 a ::s2 aaaaaaaaaaaaa ::cost 0.04
|
137 |
+
::s1 a ::s2 aaaaaaaaaaaaaa ::cost 0.04
|
138 |
+
::s1 a ::s2 aaaaaaaaaaaaaaa ::cost 0.04
|
139 |
+
::s1 a ::s2 aaaaaaaaaaaaaaaa ::cost 0.04
|
140 |
+
::s1 b ::s2 bb ::cost 0.02
|
141 |
+
::s1 b ::s2 bbb ::cost 0.03
|
142 |
+
::s1 b ::s2 bbbb ::cost 0.03
|
143 |
+
::s1 b ::s2 bbbbb ::cost 0.03
|
144 |
+
::s1 c ::s2 cc ::cost 0.02
|
145 |
+
::s1 c ::s2 ccc ::cost 0.03
|
146 |
+
::s1 c ::s2 cccc ::cost 0.03
|
147 |
+
::s1 c ::s2 ccccc ::cost 0.03
|
148 |
+
::s1 d ::s2 dd ::cost 0.02
|
149 |
+
::s1 d ::s2 ddd ::cost 0.03
|
150 |
+
::s1 d ::s2 dddd ::cost 0.03
|
151 |
+
::s1 d ::s2 ddddd ::cost 0.03
|
152 |
+
::s1 e ::s2 ee ::cost 0.02
|
153 |
+
::s1 e ::s2 eee ::cost 0.03
|
154 |
+
::s1 e ::s2 eeee ::cost 0.03
|
155 |
+
::s1 e ::s2 eeeee ::cost 0.03
|
156 |
+
::s1 e ::s2 eeeeee ::cost 0.04
|
157 |
+
::s1 e ::s2 eeeeeee ::cost 0.04
|
158 |
+
::s1 e ::s2 eeeeeeee ::cost 0.04
|
159 |
+
::s1 e ::s2 eeeeeeeee ::cost 0.04
|
160 |
+
::s1 e ::s2 eeeeeeeeee ::cost 0.04
|
161 |
+
::s1 e ::s2 eeeeeeeeeee ::cost 0.04
|
162 |
+
::s1 e ::s2 eeeeeeeeeeee ::cost 0.04
|
163 |
+
::s1 e ::s2 eeeeeeeeeeeee ::cost 0.04
|
164 |
+
::s1 e ::s2 eeeeeeeeeeeeee ::cost 0.04
|
165 |
+
::s1 e ::s2 eeeeeeeeeeeeeee ::cost 0.04
|
166 |
+
::s1 e ::s2 eeeeeeeeeeeeeeee ::cost 0.04
|
167 |
+
::s1 f ::s2 ff ::cost 0.02
|
168 |
+
::s1 f ::s2 fff ::cost 0.03
|
169 |
+
::s1 f ::s2 ffff ::cost 0.03
|
170 |
+
::s1 f ::s2 fffff ::cost 0.03
|
171 |
+
::s1 g ::s2 gg ::cost 0.02
|
172 |
+
::s1 g ::s2 ggg ::cost 0.03
|
173 |
+
::s1 g ::s2 gggg ::cost 0.03
|
174 |
+
::s1 g ::s2 ggggg ::cost 0.03
|
175 |
+
::s1 h ::s2 hh ::cost 0.02
|
176 |
+
::s1 h ::s2 hhh ::cost 0.03
|
177 |
+
::s1 h ::s2 hhhh ::cost 0.03
|
178 |
+
::s1 h ::s2 hhhhh ::cost 0.03
|
179 |
+
::s1 i ::s2 ii ::cost 0.02
|
180 |
+
::s1 i ::s2 iii ::cost 0.03
|
181 |
+
::s1 i ::s2 iiii ::cost 0.03
|
182 |
+
::s1 i ::s2 iiiii ::cost 0.03
|
183 |
+
::s1 i ::s2 iiiiii ::cost 0.04
|
184 |
+
::s1 i ::s2 iiiiiii ::cost 0.04
|
185 |
+
::s1 i ::s2 iiiiiiii ::cost 0.04
|
186 |
+
::s1 i ::s2 iiiiiiiii ::cost 0.04
|
187 |
+
::s1 i ::s2 iiiiiiiiii ::cost 0.04
|
188 |
+
::s1 i ::s2 iiiiiiiiiii ::cost 0.04
|
189 |
+
::s1 i ::s2 iiiiiiiiiiii ::cost 0.04
|
190 |
+
::s1 i ::s2 iiiiiiiiiiiii ::cost 0.04
|
191 |
+
::s1 i ::s2 iiiiiiiiiiiiii ::cost 0.04
|
192 |
+
::s1 i ::s2 iiiiiiiiiiiiiii ::cost 0.04
|
193 |
+
::s1 i ::s2 iiiiiiiiiiiiiiii ::cost 0.04
|
194 |
+
::s1 j ::s2 jj ::cost 0.02
|
195 |
+
::s1 j ::s2 jjj ::cost 0.03
|
196 |
+
::s1 j ::s2 jjjj ::cost 0.03
|
197 |
+
::s1 j ::s2 jjjjj ::cost 0.03
|
198 |
+
::s1 k ::s2 kk ::cost 0.02
|
199 |
+
::s1 k ::s2 kkk ::cost 0.03
|
200 |
+
::s1 k ::s2 kkkk ::cost 0.03
|
201 |
+
::s1 k ::s2 kkkkk ::cost 0.03
|
202 |
+
::s1 l ::s2 ll ::cost 0.02
|
203 |
+
::s1 l ::s2 lll ::cost 0.03
|
204 |
+
::s1 l ::s2 llll ::cost 0.03
|
205 |
+
::s1 l ::s2 lllll ::cost 0.03
|
206 |
+
::s1 m ::s2 mm ::cost 0.02
|
207 |
+
::s1 m ::s2 mmm ::cost 0.03
|
208 |
+
::s1 m ::s2 mmmm ::cost 0.03
|
209 |
+
::s1 m ::s2 mmmmm ::cost 0.03
|
210 |
+
::s1 n ::s2 nn ::cost 0.02
|
211 |
+
::s1 n ::s2 nnn ::cost 0.03
|
212 |
+
::s1 n ::s2 nnnn ::cost 0.03
|
213 |
+
::s1 n ::s2 nnnnn ::cost 0.03
|
214 |
+
::s1 o ::s2 oo ::cost 0.02
|
215 |
+
::s1 o ::s2 ooo ::cost 0.03
|
216 |
+
::s1 o ::s2 oooo ::cost 0.03
|
217 |
+
::s1 o ::s2 ooooo ::cost 0.03
|
218 |
+
::s1 o ::s2 oooooo ::cost 0.04
|
219 |
+
::s1 o ::s2 ooooooo ::cost 0.04
|
220 |
+
::s1 o ::s2 oooooooo ::cost 0.04
|
221 |
+
::s1 o ::s2 ooooooooo ::cost 0.04
|
222 |
+
::s1 o ::s2 oooooooooo ::cost 0.04
|
223 |
+
::s1 o ::s2 ooooooooooo ::cost 0.04
|
224 |
+
::s1 o ::s2 oooooooooooo ::cost 0.04
|
225 |
+
::s1 o ::s2 ooooooooooooo ::cost 0.04
|
226 |
+
::s1 o ::s2 oooooooooooooo ::cost 0.04
|
227 |
+
::s1 o ::s2 ooooooooooooooo ::cost 0.04
|
228 |
+
::s1 o ::s2 oooooooooooooooo ::cost 0.04
|
229 |
+
::s1 p ::s2 pp ::cost 0.02
|
230 |
+
::s1 p ::s2 ppp ::cost 0.03
|
231 |
+
::s1 p ::s2 pppp ::cost 0.03
|
232 |
+
::s1 p ::s2 ppppp ::cost 0.03
|
233 |
+
::s1 q ::s2 qq ::cost 0.02
|
234 |
+
::s1 q ::s2 qqq ::cost 0.03
|
235 |
+
::s1 q ::s2 qqqq ::cost 0.03
|
236 |
+
::s1 q ::s2 qqqqq ::cost 0.03
|
237 |
+
::s1 r ::s2 rr ::cost 0.02
|
238 |
+
::s1 r ::s2 rrr ::cost 0.03
|
239 |
+
::s1 r ::s2 rrrr ::cost 0.03
|
240 |
+
::s1 r ::s2 rrrrr ::cost 0.03
|
241 |
+
::s1 s ::s2 ss ::cost 0.02
|
242 |
+
::s1 s ::s2 sss ::cost 0.03
|
243 |
+
::s1 s ::s2 ssss ::cost 0.03
|
244 |
+
::s1 s ::s2 sssss ::cost 0.03
|
245 |
+
::s1 t ::s2 tt ::cost 0.02
|
246 |
+
::s1 t ::s2 ttt ::cost 0.03
|
247 |
+
::s1 t ::s2 tttt ::cost 0.03
|
248 |
+
::s1 t ::s2 ttttt ::cost 0.03
|
249 |
+
::s1 u ::s2 uu ::cost 0.02
|
250 |
+
::s1 u ::s2 uuu ::cost 0.03
|
251 |
+
::s1 u ::s2 uuuu ::cost 0.03
|
252 |
+
::s1 u ::s2 uuuuu ::cost 0.03
|
253 |
+
::s1 u ::s2 uuuuuu ::cost 0.04
|
254 |
+
::s1 u ::s2 uuuuuuu ::cost 0.04
|
255 |
+
::s1 u ::s2 uuuuuuuu ::cost 0.04
|
256 |
+
::s1 u ::s2 uuuuuuuuu ::cost 0.04
|
257 |
+
::s1 u ::s2 uuuuuuuuuu ::cost 0.04
|
258 |
+
::s1 u ::s2 uuuuuuuuuuu ::cost 0.04
|
259 |
+
::s1 u ::s2 uuuuuuuuuuuu ::cost 0.04
|
260 |
+
::s1 u ::s2 uuuuuuuuuuuuu ::cost 0.04
|
261 |
+
::s1 u ::s2 uuuuuuuuuuuuuu ::cost 0.04
|
262 |
+
::s1 u ::s2 uuuuuuuuuuuuuuu ::cost 0.04
|
263 |
+
::s1 u ::s2 uuuuuuuuuuuuuuuu ::cost 0.04
|
264 |
+
::s1 v ::s2 vv ::cost 0.02
|
265 |
+
::s1 v ::s2 vvv ::cost 0.03
|
266 |
+
::s1 v ::s2 vvvv ::cost 0.03
|
267 |
+
::s1 v ::s2 vvvvv ::cost 0.03
|
268 |
+
::s1 w ::s2 ww ::cost 0.02
|
269 |
+
::s1 w ::s2 www ::cost 0.03
|
270 |
+
::s1 w ::s2 wwww ::cost 0.03
|
271 |
+
::s1 w ::s2 wwwww ::cost 0.03
|
272 |
+
::s1 x ::s2 xx ::cost 0.02
|
273 |
+
::s1 x ::s2 xxx ::cost 0.03
|
274 |
+
::s1 x ::s2 xxxx ::cost 0.03
|
275 |
+
::s1 x ::s2 xxxxx ::cost 0.03
|
276 |
+
::s1 y ::s2 yy ::cost 0.02
|
277 |
+
::s1 y ::s2 yyy ::cost 0.03
|
278 |
+
::s1 y ::s2 yyyy ::cost 0.03
|
279 |
+
::s1 y ::s2 yyyyy ::cost 0.03
|
280 |
+
::s1 z ::s2 zz ::cost 0.02
|
281 |
+
::s1 z ::s2 zzz ::cost 0.03
|
282 |
+
::s1 z ::s2 zzzz ::cost 0.03
|
283 |
+
::s1 z ::s2 zzzzz ::cost 0.03
|
284 |
+
::s1 " " ::s2 " " ::cost 0
|
285 |
+
::s1 . ::s2 ::left1 /\./ ::left2 /\./ ::cost 0.02
|
286 |
+
::s1 … ::s2 ::left1 /…/ ::left2 /…/ ::cost 0.01
|
287 |
+
::s1 _ ::s2 ::left1 /_/ ::left2 /_/ ::cost 0.01
|
288 |
+
::s1 = ::s2 ::left1 /=/ ::left2 /=/ ::cost 0.01
|
289 |
+
::s1 ! ::s2 ::left1 /!/ ::left2 /!/ ::cost 0.02
|
290 |
+
::s1 ? ::s2 ::left1 /\?/ ::left2 /\?/ ::cost 0.02
|
291 |
+
::s1 aa ::s2 aː ::cost 0.02
|
292 |
+
::s1 ee ::s2 eː ::cost 0.02
|
293 |
+
::s1 ii ::s2 iː ::cost 0.02
|
294 |
+
::s1 oo ::s2 oː ::cost 0.02
|
295 |
+
::s1 uu ::s2 uː ::cost 0.02
|
296 |
+
|
297 |
+
::s1 a ::s2 e ::cost 0.1
|
298 |
+
::s1 au ::s2 o ::cost 0.1 ::lc1 eng
|
299 |
+
::s1 aw ::s2 o ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
300 |
+
::s1 aw ::s2 o ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
301 |
+
::s1 aw ::s2 a ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
302 |
+
::s1 ay ::s2 i ::cost 0.02 ::lc1 fas ::lc2 eng
|
303 |
+
::s1 aye ::s2 ae ::cost 0.05 ::lc1 fas
|
304 |
+
::s1 é ::s2 e ::cost 0.05
|
305 |
+
::s1 e ::s2 i ::cost 0.15
|
306 |
+
::s1 e ::s2 i ::cost 0.1 ::lc1 uig ::lc2 uig
|
307 |
+
::s1 e ::s2 y ::cost 0.15
|
308 |
+
::s1 ew ::s2 u ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
309 |
+
::s1 ew ::s2 u ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
310 |
+
::s1 ew ::s2 u ::cost 0.3 ::right1 [aei][lgnrst] ::lc1 eng
|
311 |
+
::s1 ew ::s2 e ::cost 0.3 ::right1 [-,bcdfghklmnpqrstvwxz$ ] ::lc1 eng
|
312 |
+
::s1 i ::s2 a ::cost 0.1 ::right1 [-,$ ] ::lc1 fas
|
313 |
+
::s1 i ::s2 ea ::cost 0.03 ::lc2 eng
|
314 |
+
::s1 i ::s2 ee ::cost 0.03 ::lc2 eng
|
315 |
+
::s1 i ::s2 ei ::cost 0.05 ::lc2 eng
|
316 |
+
::s1 i ::s2 ie ::cost 0.03 ::lc2 eng
|
317 |
+
::s1 i ::s2 ı ::cost 0.05
|
318 |
+
::s1 i ::s2 e ::cost 0.1 ::lc2 eng
|
319 |
+
::s1 i ::s2 y ::cost 0.15
|
320 |
+
::s1 i ::s2 y ::cost 0.1 ::right2 [-,bcdfghklmnpqrstvwxz$ ]
|
321 |
+
::s1 ie ::s2 ei ::cost 0.15
|
322 |
+
::s1 ie ::s2 y ::cost 0.15
|
323 |
+
::s1 ij ::s2 ai ::cost 0.15
|
324 |
+
::s1 o ::s2 u ::cost 0.1
|
325 |
+
::s1 oo ::s2 u ::cost 0.1
|
326 |
+
::s1 ow ::s2 au ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
327 |
+
::s1 ow ::s2 o ::cost 0.2 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
328 |
+
::s1 ow ::s2 o ::cost 0.2 ::lc1 eng ::lc2 zho ::right1 [e]
|
329 |
+
::s1 ow ::s2 o ::cost 0.4 ::lc1 eng ::lc2 zho ::right1 [iy]
|
330 |
+
::s1 u ::s2 a ::cost 0.1 ::lc1 eng ::right1 [-,bcdfghklmnpqrstvwxz][bcdfghklmnpqrstvwxz$ ]
|
331 |
+
::s1 u ::s2 ou ::cost 0.05
|
332 |
+
::s1 u ::s2 yu ::cost 0.05 ::left1 /^(.*[- ])?$/
|
333 |
+
::s1 yeo ::s2 eo ::cost 0.1 ::lc1 fas
|
334 |
+
|
335 |
+
# Amharic
|
336 |
+
::s1 a ::s2 e ::cost 0.05 ::lc1 amh
|
337 |
+
::s1 aa ::s2 o ::cost 0.15 ::lc1 amh
|
338 |
+
::s1 aawe ::s2 au ::cost 0.05 ::lc1 amh
|
339 |
+
::s1 aawe ::s2 ao ::cost 0.1 ::lc1 amh
|
340 |
+
::s1 aawe ::s2 ou ::cost 0.1 ::lc1 amh
|
341 |
+
::s1 aawo ::s2 ao ::cost 0.05 ::lc1 amh
|
342 |
+
::s1 aaye ::s2 ai ::cost 0.05 ::lc1 amh
|
343 |
+
::s1 aaye ::s2 i ::cost 0.1 ::lc1 amh
|
344 |
+
::s1 aaye ::s2 ei ::cost 0.1 ::lc1 amh
|
345 |
+
::s1 awe ::s2 au ::cost 0.05 ::lc1 amh
|
346 |
+
::s1 awe ::s2 ao ::cost 0.1 ::lc1 amh
|
347 |
+
::s1 awe ::s2 ou ::cost 0.1 ::lc1 amh
|
348 |
+
::s1 ee ::s2 ai ::cost 0.1 ::lc1 amh
|
349 |
+
::s1 eewo ::s2 eo ::cost 0.05 ::lc1 amh
|
350 |
+
::s1 eeyaa ::s2 ea ::cost 0.1 ::lc1 amh
|
351 |
+
::s1 eeye ::s2 ai ::cost 0.1 ::lc1 amh
|
352 |
+
::s1 ewee ::s2 ue ::cost 0.1 ::lc1 amh
|
353 |
+
::s1 gwaa ::s2 gua ::cost 0.05 ::lc1 amh
|
354 |
+
::s1 iya ::s2 ie ::cost 0.05 ::lc1 amh
|
355 |
+
::s1 iyaa ::s2 ia ::cost 0.05 ::lc1 amh
|
356 |
+
::s1 iyo ::s2 io ::cost 0.05 ::lc1 amh
|
357 |
+
::s1 kxaa ::s2 kha ::cost 0.05 ::lc1 amh
|
358 |
+
::s1 liyaa ::s2 llia ::cost 0.05 ::lc1 amh
|
359 |
+
::s1 qaa ::s2 cca ::cost 0.05 ::lc1 amh
|
360 |
+
::s1 uwaa ::s2 ua ::cost 0.05 ::lc1 amh
|
361 |
+
::s1 uwee ::s2 ue ::cost 0.05 ::lc1 amh
|
362 |
+
::s1 uwi ::s2 oui ::cost 0.05 ::lc1 amh
|
363 |
+
::s1 uwi ::s2 ui ::cost 0.05 ::lc1 amh
|
364 |
+
::s1 xaaye ::s2 hai ::cost 0.1 ::lc1 amh
|
365 |
+
::s1 xwaa ::s2 jua ::cost 0.1 ::lc1 amh
|
366 |
+
::s1 ziyaa ::s2 sia ::cost 0.05 ::lc1 amh
|
367 |
+
::s1 w ::s2 ::cost 0.3 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
|
368 |
+
::s1 y ::s2 ::cost 0.1 ::lc1 amh ::left1 /[aeiou]$/ ::right1 [aeiou]
|
369 |
+
# abbreviations
|
370 |
+
::s1 ee. ::s2 a ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
371 |
+
::s1 si. ::s2 c ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
372 |
+
::s1 di. ::s2 d ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
373 |
+
::s1 eefe. ::s2 f ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
374 |
+
::s1 are. ::s2 r ::cost 0.02 ::lc1 amh ::left1 /^(.*[- ])?$/
|
375 |
+
|
376 |
+
# Arabic
|
377 |
+
::s1 ::s2 a ::cost 0.02 ::lc1 ara
|
378 |
+
::s1 ::s2 e ::cost 0.02 ::lc1 ara
|
379 |
+
::s1 ::s2 i ::cost 0.05 ::lc1 ara
|
380 |
+
::s1 ::s2 o ::cost 0.05 ::lc1 ara
|
381 |
+
::s1 ::s2 p ::cost 0.15 ::lc1 ara ::left2 /m$/ ::right2 [dfgklmnpqrstvwz]
|
382 |
+
::s1 ::s2 u ::cost 0.05 ::lc1 ara
|
383 |
+
::s1 y ::s2 a ::cost 0.15 ::lc1 ara
|
384 |
+
::s1 y ::s2 e ::cost 0.05 ::lc1 ara
|
385 |
+
::s1 y ::s2 ea ::cost 0.02 ::lc1 ara
|
386 |
+
::s1 y ::s2 ee ::cost 0.02 ::lc1 ara
|
387 |
+
::s1 y ::s2 i ::cost 0.02 ::lc1 ara
|
388 |
+
::s1 y ::s2 ie ::cost 0.02 ::lc1 ara
|
389 |
+
::s1 b ::s2 p ::cost 0.02 ::lc1 ara
|
390 |
+
::s1 b ::s2 pp ::cost 0.03 ::lc1 ara
|
391 |
+
::s1 f ::s2 v ::cost 0.02 ::lc1 ara
|
392 |
+
::s1 fyl ::s2 ville ::right2 [-,$ ] ::cost 0.05 ::lc1 ara
|
393 |
+
::s1 gh ::s2 g ::right2 [abcdfgklmnopqrstuvwz] ::cost 0.05 ::lc1 ara
|
394 |
+
::s1 ghz ::s2 gs ::cost 0.05 ::lc1 ara
|
395 |
+
::s1 j ::s2 g ::cost 0.2 ::lc1 ara
|
396 |
+
::s1 kh ::s2 g ::cost 0.3 ::lc1 ara ::right2 [eiy]
|
397 |
+
::s1 q ::s2 g ::cost 0.2 ::lc1 ara ::right2 [arouz]
|
398 |
+
::s1 q ::s2 gg ::cost 0.2 ::lc1 ara ::right2 [arouz]
|
399 |
+
::s1 th ::s2 z ::cost 0.4 ::lc1 ara ::right2 [aou] ::comment Spanish
|
400 |
+
::s1 " (" ::s2 ", " ::cost 0.02 ::lc1 ara
|
401 |
+
::s1 ) ::s2 ::right2 [-,$ ] ::cost 0.02 ::lc1 ara
|
402 |
+
|
403 |
+
# Bengali
|
404 |
+
::s1 aoyaa ::s2 wa ::cost 0.1 ::lc1 ben
|
405 |
+
::s1 aoye ::s2 way ::cost 0.1 ::lc1 ben
|
406 |
+
::s1 bhaa ::s2 ve ::cost 0.1 ::lc1 ben
|
407 |
+
::s1 bh ::s2 v ::cost 0.2 ::lc1 ben
|
408 |
+
::s1 bh ::s2 w ::cost 0.2 ::lc1 ben
|
409 |
+
::s1 b ::s2 v ::cost 0.3 ::lc1 ben
|
410 |
+
::s1 b ::s2 w ::cost 0.3 ::lc1 ben
|
411 |
+
::s1 dda ::s2 rh ::right2 [-,$ ] ::cost 0.2 ::lc1 ben
|
412 |
+
::s1 dd ::s2 r ::cost 0.4 ::lc1 ben
|
413 |
+
::s1 gk ::s2 k ::cost 0.05 ::lc1 ben
|
414 |
+
::s1 h ::s2 g ::right2 [eiy] ::cost 0.4 ::lc1 ben
|
415 |
+
::s1 h ::s2 j ::cost 0.4 ::lc1 ben
|
416 |
+
::s1 hoyaai ::s2 whi ::cost 0.05 ::lc1 ben
|
417 |
+
::s1 j ::s2 z ::cost 0.1 ::lc1 ben
|
418 |
+
::s1 j ::s2 s ::cost 0.3 ::lc1 ben
|
419 |
+
::s1 myaaka ::s2 mc ::cost 0.1 ::lc1 ben
|
420 |
+
::s1 myaaka ::s2 mac ::cost 0.1 ::lc1 ben
|
421 |
+
::s1 oyaa ::s2 wa ::cost 0.02 ::lc1 ben
|
422 |
+
::s1 oyaa ::s2 wo ::cost 0.1 ::lc1 ben
|
423 |
+
::s1 oyena ::s2 owen ::cost 0.1 ::lc1 ben
|
424 |
+
::s1 ph ::s2 v ::cost 0.1 ::lc1 ben
|
425 |
+
::s1 phana ::s2 von ::cost 0.1 ::lc1 ben
|
426 |
+
::s1 rhio ::s2 gio ::cost 0.2 ::lc1 ben
|
427 |
+
::s1 sh ::s2 s ::cost 0.4 ::lc1 ben
|
428 |
+
::s1 ss ::s2 sh ::left1 /[k]$/ ::cost 0.15 ::lc1 ben
|
429 |
+
::s1 ss ::s2 sh ::cost 0.3 ::lc1 ben
|
430 |
+
::s1 o ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ]?)$/
|
431 |
+
::s1 oye ::s2 we ::cost 0.2 ::lc1 ben
|
432 |
+
::s1 tta ::s2 tho ::cost 0.3 ::lc1 ben
|
433 |
+
::s1 tthaa ::s2 ta ::cost 0.3 ::lc1 ben
|
434 |
+
::s1 u ::s2 wo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ]?)$/
|
435 |
+
::s1 u ::s2 woo ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ]?)$/
|
436 |
+
::s1 u ::s2 wu ::cost 0.2 ::lc1 ben ::left1 /^(.*[-, ]?)$/
|
437 |
+
::s1 ui ::s2 wi ::cost 0.02 ::lc1 ben ::left1 /^(.*[-, ]?)$/
|
438 |
+
::s1 yaa ::s2 wa ::cost 0.3 ::lc1 ben
|
439 |
+
::s1 ye ::s2 we ::cost 0.3 ::lc1 ben
|
440 |
+
|
441 |
+
# Russian
|
442 |
+
::s1 ::s2 os ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
|
443 |
+
::s1 ::s2 us ::cost 0.4 ::left2 /[bcdfghilmnprstvx]$/ ::right2 [-,$ ] ::lc1 rus
|
444 |
+
::s1 av ::s2 au ::cost 0.05 ::lc1 rus
|
445 |
+
::s1 ch ::s2 cz ::cost 0.1 ::lc1 rus ::comment Polish
|
446 |
+
::s1 chch ::s2 cci ::right2 [aou] ::cost 0.1 ::lc1 rus
|
447 |
+
::s1 chch ::s2 cc ::right2 [eiy] ::cost 0.1 ::lc1 rus
|
448 |
+
::s1 chzh ::s2 zh ::cost 0.1 ::lc1 rus
|
449 |
+
::s1 dz ::s2 zz ::cost 0.1 ::lc1 rus ::right2 [aeiouy]
|
450 |
+
::s1 dz ::s2 j ::cost 0.3 ::lc1 rus ::right2 [aeiouy] ::comment Japanese
|
451 |
+
::s1 dzh ::s2 g ::cost 0.05 ::lc1 rus ::right2 [eiy]
|
452 |
+
::s1 dzh ::s2 gg ::cost 0.05 ::lc1 rus ::right2 [eiy]
|
453 |
+
::s1 dzh ::s2 j ::cost 0.05 ::lc1 rus
|
454 |
+
::s1 ev ::s2 eu ::cost 0.1 ::lc1 rus
|
455 |
+
::s1 f ::s2 th ::cost 0.6 ::lc1 rus
|
456 |
+
::s1 ievye ::s2 iaceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
|
457 |
+
::s1 ii ::s2 ius ::cost 0.2 ::right1 [-,$ ] ::lc1 rus
|
458 |
+
::s1 i ::s2 j ::cost 0.2 ::lc1 rus
|
459 |
+
::s1 naya ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
|
460 |
+
::s1 nyi ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
|
461 |
+
::s1 ovye ::s2 aceae ::cost 0.02 ::right1 [-,$ ] ::lc1 rus ::comment scientific names for families of species
|
462 |
+
::s1 shsh ::s2 sh ::cost 0 ::lc1 rus
|
463 |
+
::s1 skaya ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
|
464 |
+
::s1 skaya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
|
465 |
+
::s1 skii ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix possessive
|
466 |
+
::s1 skii ::s2 ian ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::suffix adjective
|
467 |
+
::s1 tsian ::s2 tian ::cost 0.05 ::lc1 rus
|
468 |
+
::s1 tsion ::s2 tion ::cost 0.05 ::lc1 rus
|
469 |
+
::s1 ts ::s2 c ::cost 0.3 ::lc1 rus
|
470 |
+
::s1 ts ::s2 c ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
471 |
+
::s1 tsz ::s2 z ::cost 0.1 ::lc1 rus
|
472 |
+
::s1 itsa ::s2 ica ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
473 |
+
::s1 etski ::s2 ecky ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
474 |
+
::s1 tsiya ::s2 tion ::cost 0.02 ::right1 [-,$ ] ::lc1 rus
|
475 |
+
::s1 tsi ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
|
476 |
+
::s1 tsy ::s2 qi ::cost 0.15 ::lc1 rus ::comment Chinese names
|
477 |
+
::s1 tszi ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
|
478 |
+
::s1 tszy ::s2 ji ::cost 0.15 ::lc1 rus ::comment Chinese names
|
479 |
+
::s1 u ::s2 w ::right2 [aeio] ::cost 0.05 ::lc1 rus
|
480 |
+
::s1 u ::s2 w ::cost 0.2 ::lc1 rus
|
481 |
+
::s1 uo ::s2 wa ::cost 0.2 ::lc1 rus ::right2 [lnrst]
|
482 |
+
::s1 v ::s2 u ::cost 0.05 ::lc1 rus ::left1 /[bcdfghjklmnpqrstvwxz]$/ ::right1 [aeiou]
|
483 |
+
::s1 gva ::s2 gua ::cost 0.02 ::lc1 rus
|
484 |
+
::s1 gvi ::s2 gui ::cost 0.02 ::lc1 rus
|
485 |
+
::s1 x ::s2 sh ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,aouct$-] ::lc1 rus
|
486 |
+
::s1 y ::s2 s ::cost 0.4 ::right2 [-,$-] ::lc1 rus
|
487 |
+
::s1 zh ::s2 rz ::cost 0.1 ::lc1 rus ::comment Polish rz
|
488 |
+
|
489 |
+
# Russian case endings
|
490 |
+
::s1 em ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
491 |
+
::s1 ey ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
492 |
+
::s1 om ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
493 |
+
::s1 oy ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
494 |
+
::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
495 |
+
::s1 y ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
496 |
+
::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
497 |
+
::s1 ye ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
498 |
+
::s1 yem ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
499 |
+
::s1 ym ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
500 |
+
::s1 ymi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
501 |
+
::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
502 |
+
::s1 ii ::s2 iya ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
|
503 |
+
::s1 ii ::s2 iye ::cost 0.1 ::right1 [-,$ ] ::right2 [-,$ ] ::lc1 rus ::lc2 rus ::comment Russian case endings
|
504 |
+
|
505 |
+
::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
506 |
+
::s1 ami ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
507 |
+
::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
508 |
+
::s1 ev ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
509 |
+
::s1 eri ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
510 |
+
::s1 eryu ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
511 |
+
::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
512 |
+
::s1 ov ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
513 |
+
::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
514 |
+
::s1 ykh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 rus ::comment Russian case ending
|
515 |
+
|
516 |
+
# Ukrainian case endings
|
517 |
+
::s1 eyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
518 |
+
::s1 oyu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
519 |
+
::s1 ya ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
520 |
+
::s1 yi ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
521 |
+
::s1 yu ::s2 ::cost 0.1 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
522 |
+
|
523 |
+
::s1 am ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
524 |
+
::s1 amy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
525 |
+
::s1 em ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
526 |
+
::s1 evy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
527 |
+
::s1 iv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
528 |
+
::s1 om ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
529 |
+
::s1 ovy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
530 |
+
::s1 yam ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
531 |
+
::s1 yamy ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
532 |
+
::s1 yiv ::s2 ::cost 0.2 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
533 |
+
::s1 akh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
534 |
+
::s1 yakh ::s2 ::cost 0.3 ::right1 [-,$ ] ::lc1 ukr ::comment Ukrainian case ending
|
535 |
+
|
536 |
+
# Uyghur
|
537 |
+
::s1 aw ::s2 ao ::cost 0.05 ::lc1 uig
|
538 |
+
::s1 aw ::s2 au ::cost 0.05 ::lc1 uig
|
539 |
+
::s1 gwi ::s2 gui ::cost 0.05 ::lc1 uig
|
540 |
+
::s1 iye ::s2 ia ::cost 0.05 ::lc1 uig
|
541 |
+
::s1 istan ::s2 ia ::cost 0.1 ::right1 [-,$ ] ::lc1 uig
|
542 |
+
::s1 j ::s2 c ::cost 0.4 ::lc1 uig
|
543 |
+
::s1 q ::s2 h ::cost 0.2 ::lc1 uig
|
544 |
+
::s1 sey ::s2 cai ::cost 0.2 ::lc1 uig
|
545 |
+
::s1 sh ::s2 x ::cost 0.2 ::lc1 uig
|
546 |
+
|
547 |
+
::s1 b ::s2 p ::cost 0.3
|
548 |
+
::s1 b ::s2 v ::cost 0.5 ::left2 /^(.*[- ])?$/
|
549 |
+
::s1 b ::s2 v ::cost 0.7
|
550 |
+
::s1 c ::s2 ch ::cost 0.25 ::right1 [eiy]
|
551 |
+
::s1 c ::s2 ck ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
|
552 |
+
::s1 c ::s2 k ::cost 0.4
|
553 |
+
::s1 c ::s2 k ::cost 0.05 ::left1 /^(.* )?ma?$/ ::comment MacIntyre
|
554 |
+
::s1 c ::s2 k ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
|
555 |
+
::s1 c ::s2 kk ::cost 0.02 ::right1 [-,abcdfghklmnpoqrstuvwxz$ ]
|
556 |
+
::s1 c ::s2 s ::cost 0.7
|
557 |
+
::s1 c ::s2 s ::cost 0.1 ::right1 [eiy]
|
558 |
+
::s1 c ::s2 ts ::cost 0.15 ::right1 [eiy]
|
559 |
+
::s1 c ::s2 z ::cost 0.3
|
560 |
+
::s1 ch ::s2 ck ::cost 0.2
|
561 |
+
::s1 ch ::s2 g ::cost 0.3 ::right1 [eiy] ::right2 [eiy]
|
562 |
+
::s1 ch ::s2 k ::cost 0.2
|
563 |
+
::s1 ch ::s2 kk ::cost 0.2
|
564 |
+
::s1 ch ::s2 sh ::cost 0.3
|
565 |
+
::s1 ch ::s2 sh ::cost 0.2 ::left1 /[eiy]$/ ::right1 [$ ]
|
566 |
+
::s1 ch ::s2 tch ::cost 0.1
|
567 |
+
::s1 ch ::s2 tsh ::cost 0.1
|
568 |
+
::s1 ch ::s2 z ::cost 0.5
|
569 |
+
::s1 ck ::s2 kk ::cost 0.02
|
570 |
+
::s1 cz ::s2 ch ::cost 0.2 ::left1 /i$/
|
571 |
+
::s1 d ::s2 t ::cost 0.3
|
572 |
+
::s1 de ::s2 dre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
|
573 |
+
::s1 dg ::s2 j ::cost 0.6 ::lc1 eng ::comment Cambridge
|
574 |
+
::s1 dg ::s2 j ::cost 0.3 ::right1 [eiy] ::lc1 eng
|
575 |
+
::s1 dg ::s2 j ::cost 0.1 ::right1 [eiy] ::lc1 eng ::lc2 fas, jpn
|
576 |
+
::s1 dt ::s2 d ::cost 0.3
|
577 |
+
::s1 dt ::s2 t ::cost 0.03
|
578 |
+
::s1 dt ::s2 tt ::cost 0.03
|
579 |
+
::s1 f ::s2 p ::cost 0.8
|
580 |
+
::s1 f ::s2 ph ::cost 0.01
|
581 |
+
::s1 ff ::s2 ph ::cost 0.02
|
582 |
+
::s1 f ::s2 pf ::cost 0.1
|
583 |
+
::s1 f ::s2 v ::cost 0.3
|
584 |
+
::s1 f ::s2 v ::cost 0.1 ::right1 [-,$ ]
|
585 |
+
::s1 ef ::s2 ev ::cost 0.1 ::right1 [-,bcdfghklmnpqrstvwxz$ ]
|
586 |
+
::s1 f ::s2 w ::cost 0.3
|
587 |
+
::s1 g ::s2 j ::cost 0.6
|
588 |
+
::s1 g ::s2 j ::cost 0.3 ::right1 [eiy]
|
589 |
+
::s1 g ::s2 j ::cost 0.1 ::right1 [eiy] ::lc2 amh, ara, fas, jpn, som
|
590 |
+
::s1 g ::s2 k ::cost 0.3
|
591 |
+
::s1 g ::s2 gh ::cost 0.3
|
592 |
+
::s1 g ::s2 ch ::cost 0.4 ::left1 /[eiy]$/ ::right1 [-,$ ] ::comment German: Ludwig, Braunschweig
|
593 |
+
::s1 gh ::s2 f ::cost 0.2 ::lc1 eng ::comment laughter
|
594 |
+
::s1 gh ::s2 "" ::cost 0.2 ::lc1 eng ::comment daughter
|
595 |
+
::s1 gh ::s2 g ::cost 0.2 ::lc1 eng ::comment Afghanistan
|
596 |
+
::s1 gl ::s2 l ::cost 0.2 ::lc1 eng ::right1 [i]
|
597 |
+
::s1 gn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
598 |
+
::s1 gn ::s2 n ::cost 0.2 ::lc1 eng
|
599 |
+
::s1 gz ::s2 ks ::cost 0.2
|
600 |
+
::s1 h ::s2 e ::cost 0.4 ::lc1 fas
|
601 |
+
::s1 ise ::s2 ize ::cost 0.1
|
602 |
+
::s1 j ::s2 y ::cost 0.2
|
603 |
+
::s1 j ::s2 dj ::cost 0.2
|
604 |
+
::s1 j ::s2 h ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Jose
|
605 |
+
::s1 j ::s2 hh ::cost 0.4 ::right2 [aeiou] ::lc2 amh ::example Tardajos
|
606 |
+
::s1 j ::s2 zh ::cost 0.2
|
607 |
+
::s1 k ::s2 cc ::cost 0.02 ::right2 [aour]
|
608 |
+
::s1 k ::s2 cc ::cost 0.3
|
609 |
+
::s1 k ::s2 cch ::cost 0.15
|
610 |
+
::s1 k ::s2 ck ::cost 0.02
|
611 |
+
::s1 k ::s2 cq ::cost 0.05
|
612 |
+
::s1 k ::s2 cqu ::cost 0.05
|
613 |
+
::s1 k ::s2 cque ::cost 0.1
|
614 |
+
::s1 k ::s2 cque ::cost 0.05 ::right2 [-,$ ]
|
615 |
+
::s1 k ::s2 cques ::cost 0.05 ::right2 [-,$ ]
|
616 |
+
::s1 k ::s2 q ::cost 0.05
|
617 |
+
::s1 k ::s2 qu ::cost 0.05
|
618 |
+
::s1 k ::s2 que ::cost 0.1
|
619 |
+
::s1 k ::s2 que ::cost 0.05 ::right2 [-,$ ]
|
620 |
+
::s1 k ::s2 ques ::cost 0.1 ::right2 [-,$ ]
|
621 |
+
::s1 kh ::s2 j ::cost 0.2
|
622 |
+
::s1 kh ::s2 q ::cost 0.2
|
623 |
+
::s1 kh ::s2 k ::cost 0.25 ::right1 [aeiouy]
|
624 |
+
::s1 kh ::s2 k ::cost 0.1 ::right1 [aeiouys] ::lc2 amh
|
625 |
+
::s1 kn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
626 |
+
::s1 kj ::s2 sh ::cost 0.2 ::comment Swedish
|
627 |
+
::s1 l ::s2 r ::cost 0.1 ::lc1 zho
|
628 |
+
::s1 aib ::s2 alb ::cost 0.1 ::lc1 zho
|
629 |
+
::s1 al ::s2 ::cost 0.5 ::left1 /^(.* )?$/
|
630 |
+
::s1 al- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
|
631 |
+
::s1 el ::s2 ::cost 0.5 ::left1 /^(.* )?$/
|
632 |
+
::s1 el- ::s2 ::cost 0.3 ::left1 /^(.* )?$/
|
633 |
+
::s1 ll ::s2 y ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::comment Guillermo, Guillaume
|
634 |
+
::s1 mb ::s2 m ::cost 0.2 ::right1 [-,bcdfghklmnpqstvwxz$ ] ::lc1 eng ::comment bomb
|
635 |
+
::s1 n ::s2 m ::cost 0.5 ::left1 /[aeiou]$/ ::left2 /[aeiou]$/ ::right1 [bcdfghklmnpqrstvwxz$ ] ::right2 [-,bcdfghklmnpqrstvwxz$ ]
|
636 |
+
::s1 ng ::s2 n ::cost 0.1 ::left1 /[aeiou]$/ ::lc1 zho
|
637 |
+
::s1 ng ::s2 m ::cost 0.25 ::left1 /[aeiou]$/ ::lc1 zho
|
638 |
+
::s1 ng ::s2 n ::cost 0.1 ::left2 /[aeiou]$/ ::lc2 ara, ben, rus, zho
|
639 |
+
::s1 nm ::s2 m ::cost 0.25 ::lc1 zho
|
640 |
+
::s1 pn ::s2 n ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
641 |
+
::s1 ph ::s2 p ::cost 0.3 ::lc1 amh
|
642 |
+
::s1 q ::s2 c ::cost 0.15
|
643 |
+
::s1 q ::s2 ch ::cost 0.2 ::right2 [eiy]
|
644 |
+
::s1 q ::s2 ck ::cost 0.2
|
645 |
+
::s1 q ::s2 kk ::cost 0.2
|
646 |
+
::s1 q ::s2 gh ::cost 0.2 ::lc1 fas ::right2 [aeiouy]
|
647 |
+
::s1 qi ::s2 ch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
648 |
+
::s1 qi ::s2 cci ::cost 0.1 ::lc1 zho
|
649 |
+
::s1 qi ::s2 chi ::cost 0.1 ::lc1 zho
|
650 |
+
::s1 qi ::s2 tch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
651 |
+
::s1 qi ::s2 ts ::cost 0.4 ::lc1 zho ::right1 [aeou]
|
652 |
+
::s1 qi ::s2 tsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
653 |
+
::s1 qi ::s2 tzsch ::cost 0.2 ::lc1 zho ::right1 [aeou]
|
654 |
+
::s1 qi ::s2 czy ::cost 0.2 ::lc1 zho
|
655 |
+
::s1 qu ::s2 kw ::cost 0.15
|
656 |
+
::s1 qu ::s2 kv ::cost 0.15
|
657 |
+
::s1 e ::s2 er ::cost 0.25 ::left1 /[bcdfghklmnpqrstvwxz]$/ ::lc1 zho
|
658 |
+
::s1 re ::s2 er ::cost 0.1
|
659 |
+
::s1 rh ::s2 r ::cost 0.05 ::left1 /^(.*[- ])?$/ ::example Rhine
|
660 |
+
::s1 s ::s2 sh ::cost 0.03 ::right2 [aeiou] ::lc2 amh
|
661 |
+
::s1 s ::s2 sz ::cost 0.3 ::lc2 eng ::example Liszt (Hungarian)
|
662 |
+
::s1 s ::s2 ts ::cost 0.4 ::lc1 amh, zho
|
663 |
+
::s1 s ::s2 z ::cost 0.4
|
664 |
+
::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy]$/ ::right1 [aeiouy] ::lc1 eng
|
665 |
+
::s1 s ::s2 z ::cost 0.1 ::left1 /[aeiouy][bdglmnrvw]?$/ ::right1 [-,$ ] ::lc1 eng
|
666 |
+
::s1 s ::s2 z ::cost 0.2 ::lc2 fas
|
667 |
+
::s1 sc ::s2 s ::cost 0.2 ::right1 [i] ::example Nascimento
|
668 |
+
::s1 sci ::s2 sh ::cost 0.2 ::example Brescia
|
669 |
+
::s1 sch ::s2 sh ::cost 0.1
|
670 |
+
::s1 sh ::s2 sz ::cost 0.2 ::example Mariusz (Polish) ::lc2 eng
|
671 |
+
::s1 si ::s2 j ::cost 0.1 ::right2 [a] ::lc1 eng
|
672 |
+
::s1 ss ::s2 z ::cost 0.5
|
673 |
+
# ::s1 smith ::s2 mith ::cost 0.75 ::lc2 zho ::comment weird, but several different Xinhua examples
|
674 |
+
::s1 tch ::s2 c ::cost 0.2 ::left2 /[aeiou]$/ ::right2 [-,e$ ]
|
675 |
+
::s1 te ::s2 tre ::cost 0.3 ::lc1 zho ::right2 [-,$ ]
|
676 |
+
::s1 th ::s2 t ::cost 0.2 ::lc2 amh, fas, uig
|
677 |
+
::s1 th ::s2 s ::cost 0.4 ::lc2 zho
|
678 |
+
::s1 th ::s2 sth ::cost 0.4 ::lc1 zho
|
679 |
+
::s1 th ::s2 ths ::cost 0.4 ::lc1 zho
|
680 |
+
::s1 th ::s2 z ::cost 0.3 ::lc2 amh ::right2 [-,$ aeot]
|
681 |
+
::s1 v ::s2 w ::cost 0.02
|
682 |
+
::s1 v ::s2 wh ::cost 0.02 ::left1 /^(.* )?$/
|
683 |
+
::s1 vv ::s2 w ::cost 0.02
|
684 |
+
::s1 w ::s2 u ::cost 0.1 ::lc2 uig
|
685 |
+
::s1 wa ::s2 ua ::cost 0.05
|
686 |
+
::s1 wh ::s2 w ::cost 0.05 ::left1 /^(.* )?$/
|
687 |
+
::s1 wr ::s2 r ::cost 0.05 ::left1 /^(.* )?$/ ::lc1 eng
|
688 |
+
::s1 x ::s2 ks ::cost 0.05
|
689 |
+
::s1 x ::s2 s ::cost 0.2 ::left1 /^(.* )?$/
|
690 |
+
::s1 x ::s2 sh ::cost 0.2 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
|
691 |
+
::s1 x ::s2 z ::cost 0.2 ::left1 /^(.* )?$/ ::right1 [aeiouy]
|
692 |
+
::s1 x ::s2 h ::cost 0.3 ::lc1 uig
|
693 |
+
::s1 x ::s2 h ::cost 0.05 ::lc1 uig ::left1 /^(.* )?$/ ::right1 [aeiou]
|
694 |
+
::s1 x ::s2 kh ::cost 0.1 ::lc1 uig
|
695 |
+
::s1 xi ::s2 sch ::cost 0.2 ::right1 [aeou] ::lc1 zho
|
696 |
+
::s1 xi ::s2 sh ::cost 0.2 ::right1 [aeou] ::lc1 zho
|
697 |
+
::s1 xi ::s2 ch ::cost 0.4 ::right1 [aeou] ::lc1 zho
|
698 |
+
::s1 xi ::s2 sci ::cost 0.4 ::right1 [aeou] ::lc1 zho
|
699 |
+
::s1 xi ::s2 s ::cost 0.6 ::right1 [aeou] ::lc1 zho
|
700 |
+
::s1 z ::s2 dz ::cost 0.1 ::left1 /^(.*[ aeiouy])?[lnr]?$/
|
701 |
+
::s1 z ::s2 ts ::cost 0.15
|
702 |
+
::s1 z ::s2 tz ::cost 0.15
|
703 |
+
::s1 zh ::s2 g ::cost 0.2 ::right2 [eiy]
|
704 |
+
::s1 zh ::s2 g ::cost 0.1 ::right2 [eiy] ::lc2 amh
|
705 |
+
::s1 zz ::s2 ts ::cost 0.15
|
706 |
+
::s1 zz ::s2 tz ::cost 0.1
|
707 |
+
|
708 |
+
# Oromo
|
709 |
+
::s1 nb ::s2 mb ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
|
710 |
+
::s1 np ::s2 mp ::cost 0.4 ::lc1 orm ::lc2 orm ::left1 /[aeiou]$/ ::left2 /[aeiou]$/
|
711 |
+
::s1 ph ::s2 p ::cost 0.3 ::lc1 orm ::lc2 orm
|
712 |
+
|
713 |
+
# Tigrinya
|
714 |
+
::s1 aaye ::s2 a ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
|
715 |
+
::s1 aaye ::s2 i ::cost 0.4 ::lc1 tir ::lc2 tir ::left1 /[bcdfghklmnpqrstvwxz]$/ ::right1 [bcdfghklmnpqrstvwxz] ::comment internal plural
|
716 |
+
|
717 |
+
# Somali
|
718 |
+
::s1 ay ::s2 ey ::cost 0.1 ::lc1 som ::lc2 som
|
719 |
+
::s1 ay ::s2 eey ::cost 0.15 ::lc1 som ::lc2 som
|
720 |
+
::s1 aha ::s2 ihii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
721 |
+
::s1 aha ::s2 ihi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
722 |
+
::s1 aha ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
723 |
+
::s1 ihii ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
724 |
+
::s1 ihi ::s2 uhu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
725 |
+
::s1 ha ::s2 hii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
726 |
+
::s1 ha ::s2 hi ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
727 |
+
::s1 ha ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
728 |
+
::s1 hii ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
729 |
+
::s1 hi ::s2 hu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
730 |
+
::s1 aka ::s2 ikii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
731 |
+
::s1 aka ::s2 iki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
732 |
+
::s1 aka ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
733 |
+
::s1 ikii ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
734 |
+
::s1 iki ::s2 uku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
735 |
+
::s1 ka ::s2 kii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
736 |
+
::s1 ka ::s2 ki ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
737 |
+
::s1 ka ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
738 |
+
::s1 kii ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
739 |
+
::s1 ki ::s2 ku ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
740 |
+
::s1 aga ::s2 ugu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
741 |
+
::s1 ga ::s2 gu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
742 |
+
::s1 ata ::s2 itii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
743 |
+
::s1 ata ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
744 |
+
::s1 ata ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
745 |
+
::s1 itii ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
746 |
+
::s1 iti ::s2 utu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
747 |
+
::s1 ta ::s2 tii ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
748 |
+
::s1 ta ::s2 ti ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
749 |
+
::s1 ta ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
750 |
+
::s1 tii ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
751 |
+
::s1 ti ::s2 tu ::cost 0.15 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [-,$ ]
|
752 |
+
::s1 ata ::s2 ete ::cost 0.15 ::lc1 som ::lc2 som
|
753 |
+
::s1 ata ::s2 iti ::cost 0.2 ::lc1 som ::lc2 som
|
754 |
+
::s1 ete ::s2 iti ::cost 0.15 ::lc1 som ::lc2 som
|
755 |
+
::s1 g ::s2 k ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
|
756 |
+
::s1 g ::s2 k ::cost 0.25 ::lc1 som ::lc2 som
|
757 |
+
::s1 g ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
|
758 |
+
::s1 gh ::s2 kh ::cost 0.1 ::lc1 som ::lc2 som
|
759 |
+
::s1 gh ::s2 k ::cost 0.2 ::lc1 som ::lc2 som
|
760 |
+
::s1 g ::s2 q ::cost 0.25 ::lc1 som ::lc2 som
|
761 |
+
::s1 g ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::right1 [aou] ::right2 [aou]
|
762 |
+
::s1 ga ::s2 q ::cost 0.2 ::lc1 som ::lc2 som ::left1 /^(.*[aeiou])?$/ ::left2 /^(.*[aeiou])?$/ ::right1 [bcdfghklmnpqrstvwxz] ::right2 [bcdfghklmnpqrstvwxz]
|
763 |
+
::s1 g ::s2 j ::cost 0.25 ::lc1 som ::lc2 som
|
764 |
+
::s1 g ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right1 [ei] ::right2 [ei]
|
765 |
+
::s1 gi ::s2 j ::cost 0.15 ::lc1 som ::lc2 som ::right2 [ei]
|
766 |
+
::s1 n ::s2 m ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
|
767 |
+
::s1 n ::s2 mm ::cost 0.2 ::lc1 som ::lc2 som ::right1 [-,$ ] ::right2 [aeiou]
|
768 |
+
::s1 n ::s2 m ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
|
769 |
+
::s1 n ::s2 mm ::cost 0.25 ::lc1 som ::lc2 som ::right2 [aeiko]
|
770 |
+
::s1 ii ::s2 a ::cost 0.15 ::lc1 som ::lc2 som
|
771 |
+
::s1 y ::s2 dj ::cost 0.2 ::lc2 som
|
772 |
+
::s1 ca ::s2 a ::cost 0.15 ::left1 /^(.*[-, ])?$/ ::lc1 som
|
773 |
+
::s1 c ::s2 ::cost 0.25 ::left1 /^(.*[-, ])?$/ ::lc1 som
|
774 |
+
::s1 x ::s2 h ::cost 0.25 ::lc1 som
|
775 |
+
::s1 x ::s2 h ::cost 0.05 ::lc1 som ::left1 /^(.* )?$/ ::right1 [aeiou]
|
776 |
+
::s1 x ::s2 h ::cost 0.1 ::lc1 som ::left1 /[aeiou]$/
|
777 |
+
::s1 b ::s2 p ::cost 0.1 ::lc1 som
|
778 |
+
::s1 majm ::s2 mahm ::cost 0.1 ::lc1 som
|
779 |
+
::s1 chalim ::s2 halim ::cost 0.1 ::lc1 som ::lc2 som
|
780 |
+
::s1 chalim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
|
781 |
+
::s1 chalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
|
782 |
+
::s1 halim ::s2 jalim ::cost 0.1 ::lc1 som ::lc2 som
|
783 |
+
::s1 halim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
|
784 |
+
::s1 jalim ::s2 kalim ::cost 0.1 ::lc1 som ::lc2 som
|
785 |
+
::s1 dh ::s2 r ::cost 0.25 ::lc1 som ::lc2 som ::left1 /[aeiou]$/
|
786 |
+
::s1 j ::s2 ch ::cost 0.25 ::lc1 som ::lc2 som
|
787 |
+
::s1 j ::s2 kh ::cost 0.25 ::lc1 som ::lc2 som
|
788 |
+
::s1 ch ::s2 sh ::cost 0.2 ::lc1 som ::lc2 som
|
789 |
+
|
790 |
+
# French
|
791 |
+
::s1 aud ::s2 o ::cost 0.3 ::right1 [-,$ ] ::lc1 eng, fra
|
792 |
+
::s1 aux ::s2 o ::cost 0.05 ::right1 [-,$ ]
|
793 |
+
::s1 eaux ::s2 o ::cost 0.05 ::right1 [-,$ ]
|
794 |
+
::s1 eux ::s2 o ::cost 0.05 ::right1 [-,$ ]
|
795 |
+
::s1 eux ::s2 e ::cost 0.15 ::right1 [-,$ ]
|
796 |
+
|
797 |
+
::s1 - ::s2 " " ::cost 0.1
|
798 |
+
::s1 : ::s2 , ::cost 0.1 ::lc1 amh
|
799 |
+
|
800 |
+
# mini dictionary Amharic-English
|
801 |
+
::s1 dabube ::s2 south ::cost 0 ::lc1 amh ::lc2 eng
|
802 |
+
::s1 daseete ::s2 island ::cost 0 ::lc1 amh ::lc2 eng
|
803 |
+
::s1 daseetoche ::s2 islands ::cost 0 ::lc1 amh ::lc2 eng
|
804 |
+
::s1 kaaweneti ::s2 county ::cost 0 ::lc1 amh ::lc2 eng
|
805 |
+
::s1 katamaa ::s2 city ::cost 0 ::lc1 amh ::lc2 eng
|
806 |
+
::s1 kelele ::s2 region ::cost 0 ::lc1 amh ::lc2 eng
|
807 |
+
::s1 meseraaqe ::s2 east ::cost 0 ::lc1 amh ::lc2 eng
|
808 |
+
::s1 sameene ::s2 north ::cost 0 ::lc1 amh ::lc2 eng
|
809 |
+
::s1 setaadiyame ::s2 stadium ::cost 0 ::lc1 amh ::lc2 eng
|
810 |
+
::s1 waneze ::s2 river ::cost 0 ::lc1 amh ::lc2 eng
|
811 |
+
|
812 |
+
# mini dictionary Arabic-English
|
813 |
+
::s1 " " ::s2 " of " ::cost 0 ::lc1 ara ::lc2 eng
|
814 |
+
::s1 " alawl" ::s2 " i" ::cost 0 ::lc1 ara ::lc2 eng ::right2 [-,$ ]
|
815 |
+
|
816 |
+
# mini dictionary Bengali-English
|
817 |
+
::s1 anychala ::s2 zone ::cost 0 ::lc1 ben ::lc2 eng
|
818 |
+
::s1 pradesha ::s2 province ::cost 0 ::lc1 ben ::lc2 eng
|
819 |
+
::s1 saamraajya ::s2 empire ::cost 0 ::lc1 ben ::lc2 eng
|
820 |
+
::s1 upajelaa ::s2 upazila ::cost 0 ::lc1 ben ::lc2 eng
|
821 |
+
::s1 uttara ::s2 north ::cost 0 ::lc1 ben ::lc2 eng
|
822 |
+
::s1 "dya " ::s2 "the " ::left1 /^(.*[-, ])?$/ ::cost 0.2 ::lc1 ben ::lc2 eng
|
823 |
+
::s1 " aba " ::s2 " of " ::cost 0 ::lc1 ben ::lc2 eng
|
824 |
+
|
825 |
+
# mini dictionary Russian-English
|
826 |
+
::s1 akademiya ::s2 academy ::cost 0 ::lc1 rus ::lc2 eng
|
827 |
+
::s1 eparkhiya ::s2 diocese ::cost 0 ::lc1 rus ::lc2 eng
|
828 |
+
::s1 gorod ::s2 city ::cost 0 ::lc1 rus ::lc2 eng
|
829 |
+
::s1 gosudarstvennyi ::s2 state ::cost 0 ::lc1 rus ::lc2 eng
|
830 |
+
::s1 gubernator ::s2 governor ::cost 0 ::lc1 rus ::lc2 eng
|
831 |
+
::s1 guberniya ::s2 governate ::cost 0 ::lc1 rus ::lc2 eng
|
832 |
+
::s1 imperator ::s2 emperor ::cost 0 ::lc1 rus ::lc2 eng
|
833 |
+
::s1 komitet ::s2 committee ::cost 0 ::lc1 rus ::lc2 eng
|
834 |
+
::s1 korolevstvo ::s2 kingdom ::cost 0 ::lc1 rus ::lc2 eng
|
835 |
+
::s1 koroli ::s2 king ::cost 0 ::lc1 rus ::lc2 eng
|
836 |
+
::s1 mezhdunarodnaya ::s2 international ::cost 0 ::lc1 rus ::lc2 eng
|
837 |
+
::s1 natsionalnyi ::s2 national ::cost 0 ::lc1 rus ::lc2 eng
|
838 |
+
::s1 novyi ::s2 new ::cost 0 ::lc1 rus ::lc2 eng
|
839 |
+
::s1 oblast ::s2 province ::cost 0 ::lc1 rus ::lc2 eng
|
840 |
+
::s1 oblast ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
|
841 |
+
::s1 obshchestvo ::s2 society ::cost 0 ::lc1 rus ::lc2 eng
|
842 |
+
::s1 okrug ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
|
843 |
+
::s1 okrug ::s2 region ::cost 0 ::lc1 rus ::lc2 eng
|
844 |
+
::s1 ostrova ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
|
845 |
+
::s1 partiya ::s2 party ::cost 0 ::lc1 rus ::lc2 eng
|
846 |
+
::s1 raion ::s2 district ::cost 0 ::lc1 rus ::lc2 eng
|
847 |
+
::s1 respublika ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
|
848 |
+
::s1 respublik ::s2 republic ::cost 0 ::lc1 rus ::lc2 eng
|
849 |
+
::s1 sbornaya ::s2 team ::cost 0 ::lc1 rus ::lc2 eng
|
850 |
+
::s1 severnaya ::s2 north ::cost 0 ::lc1 rus ::lc2 eng
|
851 |
+
::s1 sovet council ::cost 0 ::lc1 rus ::lc2 eng
|
852 |
+
::s1 soyuz ::s2 alliance ::cost 0 ::lc1 rus ::lc2 eng
|
853 |
+
::s1 soyuz ::s2 association ::cost 0 ::lc1 rus ::lc2 eng
|
854 |
+
::s1 soyuz ::s2 league ::cost 0 ::lc1 rus ::lc2 eng
|
855 |
+
::s1 soyuz ::s2 union ::cost 0 ::lc1 rus ::lc2 eng
|
856 |
+
::s1 svyataya ::s2 saint ::cost 0 ::lc1 rus ::lc2 eng
|
857 |
+
::s1 svobodnyi ::s2 free ::cost 0 ::lc1 rus ::lc2 eng
|
858 |
+
::s1 tserkov ::s2 church ::cost 0 ::lc1 rus ::lc2 eng
|
859 |
+
::s1 uezd ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
|
860 |
+
::s1 universitet ::s2 university ::cost 0 ::lc1 rus ::lc2 eng
|
861 |
+
::s1 vostochnaya ::s2 east ::cost 0 ::lc1 rus ::lc2 eng
|
862 |
+
::s1 vostochnaya ::s2 eastern ::cost 0 ::lc1 rus ::lc2 eng
|
863 |
+
::s1 yuzhnaya ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
|
864 |
+
::s1 yuzhnaya ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
|
865 |
+
::s1 yuzhnoi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
|
866 |
+
::s1 yuzhnoi ::s2 southern ::cost 0 ::lc1 rus ::lc2 eng
|
867 |
+
::s1 yuzhnyi ::s2 south ::cost 0 ::lc1 rus ::lc2 eng
|
868 |
+
# often dropped in Russian name
|
869 |
+
::s1 ::s2 county ::cost 0 ::lc1 rus ::lc2 eng
|
870 |
+
::s1 ::s2 island ::cost 0 ::lc1 rus ::lc2 eng
|
871 |
+
::s1 ::s2 pope ::cost 0 ::lc1 rus ::lc2 eng
|
872 |
+
::s1 ::s2 river ::cost 0 ::lc1 rus ::lc2 eng
|
873 |
+
::s1 ::s2 "the " ::cost 0 ::lc1 rus ::lc2 eng ::left2 /^(.*[- ])?$/
|
874 |
+
::s1 " " ::s2 " of " ::cost 0 ::lc1 rus ::lc2 eng
|
875 |
+
|
876 |
+
|
877 |
+
# mini dictionary Uyghur-English
|
878 |
+
::s1 aptonom ::s2 automomous ::cost 0 ::lc1 uig ::lc2 eng
|
879 |
+
::s1 aralliri ::s2 islands ::cost 0 ::lc1 uig ::lc2 eng
|
880 |
+
::s1 aralliri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
881 |
+
::s1 arili ::s2 island ::cost 0 ::lc1 uig ::lc2 eng
|
882 |
+
::s1 arili ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
883 |
+
::s1 nahiyisi ::s2 county ::cost 0 ::lc1 uig ::lc2 eng
|
884 |
+
::s1 oelkisi ::s2 province ::cost 0 ::lc1 uig ::lc2 eng
|
885 |
+
::s1 oelkisi ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
886 |
+
::s1 ottura ::s2 central ::cost 0 ::lc1 uig ::lc2 eng
|
887 |
+
::s1 rayoni ::s2 region ::cost 0 ::lc1 uig ::lc2 eng
|
888 |
+
::s1 shehiri ::s2 city ::cost 0 ::lc1 uig ::lc2 eng
|
889 |
+
::s1 shehiri ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
890 |
+
::s1 shitati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
|
891 |
+
::s1 shitati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
892 |
+
::s1 shtati ::s2 state ::cost 0 ::lc1 uig ::lc2 eng
|
893 |
+
::s1 shtati ::s2 ::cost 0 ::lc1 uig ::lc2 eng
|
894 |
+
::s1 uniwersiteti ::s2 university ::cost 0 ::lc1 uig ::lc2 eng
|
895 |
+
::s1 yengi ::s2 new ::cost 0 ::lc1 uig ::lc2 eng
|
896 |
+
|
uroman/lib/JSON.pm
ADDED
@@ -0,0 +1,2317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package JSON;
|
2 |
+
|
3 |
+
|
4 |
+
use strict;
|
5 |
+
use Carp ();
|
6 |
+
use base qw(Exporter);
|
7 |
+
@JSON::EXPORT = qw(from_json to_json jsonToObj objToJson encode_json decode_json);
|
8 |
+
|
9 |
+
BEGIN {
|
10 |
+
$JSON::VERSION = '2.90';
|
11 |
+
$JSON::DEBUG = 0 unless (defined $JSON::DEBUG);
|
12 |
+
$JSON::DEBUG = $ENV{ PERL_JSON_DEBUG } if exists $ENV{ PERL_JSON_DEBUG };
|
13 |
+
}
|
14 |
+
|
15 |
+
my $Module_XS = 'JSON::XS';
|
16 |
+
my $Module_PP = 'JSON::PP';
|
17 |
+
my $Module_bp = 'JSON::backportPP'; # included in JSON distribution
|
18 |
+
my $PP_Version = '2.27203';
|
19 |
+
my $XS_Version = '2.34';
|
20 |
+
|
21 |
+
|
22 |
+
# XS and PP common methods
|
23 |
+
|
24 |
+
my @PublicMethods = qw/
|
25 |
+
ascii latin1 utf8 pretty indent space_before space_after relaxed canonical allow_nonref
|
26 |
+
allow_blessed convert_blessed filter_json_object filter_json_single_key_object
|
27 |
+
shrink max_depth max_size encode decode decode_prefix allow_unknown
|
28 |
+
/;
|
29 |
+
|
30 |
+
my @Properties = qw/
|
31 |
+
ascii latin1 utf8 indent space_before space_after relaxed canonical allow_nonref
|
32 |
+
allow_blessed convert_blessed shrink max_depth max_size allow_unknown
|
33 |
+
/;
|
34 |
+
|
35 |
+
my @XSOnlyMethods = qw/allow_tags/; # Currently nothing
|
36 |
+
|
37 |
+
my @PPOnlyMethods = qw/
|
38 |
+
indent_length sort_by
|
39 |
+
allow_singlequote allow_bignum loose allow_barekey escape_slash as_nonblessed
|
40 |
+
/; # JSON::PP specific
|
41 |
+
|
42 |
+
|
43 |
+
# used in _load_xs and _load_pp ($INSTALL_ONLY is not used currently)
|
44 |
+
my $_INSTALL_DONT_DIE = 1; # When _load_xs fails to load XS, don't die.
|
45 |
+
my $_INSTALL_ONLY = 2; # Don't call _set_methods()
|
46 |
+
my $_ALLOW_UNSUPPORTED = 0;
|
47 |
+
my $_UNIV_CONV_BLESSED = 0;
|
48 |
+
my $_USSING_bpPP = 0;
|
49 |
+
|
50 |
+
|
51 |
+
# Check the environment variable to decide worker module.
|
52 |
+
|
53 |
+
unless ($JSON::Backend) {
|
54 |
+
$JSON::DEBUG and Carp::carp("Check used worker module...");
|
55 |
+
|
56 |
+
my $backend = exists $ENV{PERL_JSON_BACKEND} ? $ENV{PERL_JSON_BACKEND} : 1;
|
57 |
+
|
58 |
+
if ($backend eq '1' or $backend =~ /JSON::XS\s*,\s*JSON::PP/) {
|
59 |
+
_load_xs($_INSTALL_DONT_DIE) or _load_pp();
|
60 |
+
}
|
61 |
+
elsif ($backend eq '0' or $backend eq 'JSON::PP') {
|
62 |
+
_load_pp();
|
63 |
+
}
|
64 |
+
elsif ($backend eq '2' or $backend eq 'JSON::XS') {
|
65 |
+
_load_xs();
|
66 |
+
}
|
67 |
+
elsif ($backend eq 'JSON::backportPP') {
|
68 |
+
$_USSING_bpPP = 1;
|
69 |
+
_load_pp();
|
70 |
+
}
|
71 |
+
else {
|
72 |
+
Carp::croak "The value of environmental variable 'PERL_JSON_BACKEND' is invalid.";
|
73 |
+
}
|
74 |
+
}
|
75 |
+
|
76 |
+
|
77 |
+
sub import {
|
78 |
+
my $pkg = shift;
|
79 |
+
my @what_to_export;
|
80 |
+
my $no_export;
|
81 |
+
|
82 |
+
for my $tag (@_) {
|
83 |
+
if ($tag eq '-support_by_pp') {
|
84 |
+
if (!$_ALLOW_UNSUPPORTED++) {
|
85 |
+
JSON::Backend::XS
|
86 |
+
->support_by_pp(@PPOnlyMethods) if ($JSON::Backend eq $Module_XS);
|
87 |
+
}
|
88 |
+
next;
|
89 |
+
}
|
90 |
+
elsif ($tag eq '-no_export') {
|
91 |
+
$no_export++, next;
|
92 |
+
}
|
93 |
+
elsif ( $tag eq '-convert_blessed_universally' ) {
|
94 |
+
eval q|
|
95 |
+
require B;
|
96 |
+
*UNIVERSAL::TO_JSON = sub {
|
97 |
+
my $b_obj = B::svref_2object( $_[0] );
|
98 |
+
return $b_obj->isa('B::HV') ? { %{ $_[0] } }
|
99 |
+
: $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
|
100 |
+
: undef
|
101 |
+
;
|
102 |
+
}
|
103 |
+
| if ( !$_UNIV_CONV_BLESSED++ );
|
104 |
+
next;
|
105 |
+
}
|
106 |
+
push @what_to_export, $tag;
|
107 |
+
}
|
108 |
+
|
109 |
+
return if ($no_export);
|
110 |
+
|
111 |
+
__PACKAGE__->export_to_level(1, $pkg, @what_to_export);
|
112 |
+
}
|
113 |
+
|
114 |
+
|
115 |
+
# OBSOLETED
|
116 |
+
|
117 |
+
sub jsonToObj {
|
118 |
+
my $alternative = 'from_json';
|
119 |
+
if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) {
|
120 |
+
shift @_; $alternative = 'decode';
|
121 |
+
}
|
122 |
+
Carp::carp "'jsonToObj' will be obsoleted. Please use '$alternative' instead.";
|
123 |
+
return JSON::from_json(@_);
|
124 |
+
};
|
125 |
+
|
126 |
+
sub objToJson {
|
127 |
+
my $alternative = 'to_json';
|
128 |
+
if (defined $_[0] and UNIVERSAL::isa($_[0], 'JSON')) {
|
129 |
+
shift @_; $alternative = 'encode';
|
130 |
+
}
|
131 |
+
Carp::carp "'objToJson' will be obsoleted. Please use '$alternative' instead.";
|
132 |
+
JSON::to_json(@_);
|
133 |
+
};
|
134 |
+
|
135 |
+
|
136 |
+
# INTERFACES
|
137 |
+
|
138 |
+
sub to_json ($@) {
|
139 |
+
if (
|
140 |
+
ref($_[0]) eq 'JSON'
|
141 |
+
or (@_ > 2 and $_[0] eq 'JSON')
|
142 |
+
) {
|
143 |
+
Carp::croak "to_json should not be called as a method.";
|
144 |
+
}
|
145 |
+
my $json = JSON->new;
|
146 |
+
|
147 |
+
if (@_ == 2 and ref $_[1] eq 'HASH') {
|
148 |
+
my $opt = $_[1];
|
149 |
+
for my $method (keys %$opt) {
|
150 |
+
$json->$method( $opt->{$method} );
|
151 |
+
}
|
152 |
+
}
|
153 |
+
|
154 |
+
$json->encode($_[0]);
|
155 |
+
}
|
156 |
+
|
157 |
+
|
158 |
+
sub from_json ($@) {
|
159 |
+
if ( ref($_[0]) eq 'JSON' or $_[0] eq 'JSON' ) {
|
160 |
+
Carp::croak "from_json should not be called as a method.";
|
161 |
+
}
|
162 |
+
my $json = JSON->new;
|
163 |
+
|
164 |
+
if (@_ == 2 and ref $_[1] eq 'HASH') {
|
165 |
+
my $opt = $_[1];
|
166 |
+
for my $method (keys %$opt) {
|
167 |
+
$json->$method( $opt->{$method} );
|
168 |
+
}
|
169 |
+
}
|
170 |
+
|
171 |
+
return $json->decode( $_[0] );
|
172 |
+
}
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
sub true { $JSON::true }
|
177 |
+
|
178 |
+
sub false { $JSON::false }
|
179 |
+
|
180 |
+
sub null { undef; }
|
181 |
+
|
182 |
+
|
183 |
+
sub require_xs_version { $XS_Version; }
|
184 |
+
|
185 |
+
sub backend {
|
186 |
+
my $proto = shift;
|
187 |
+
$JSON::Backend;
|
188 |
+
}
|
189 |
+
|
190 |
+
#*module = *backend;
|
191 |
+
|
192 |
+
|
193 |
+
sub is_xs {
|
194 |
+
return $_[0]->backend eq $Module_XS;
|
195 |
+
}
|
196 |
+
|
197 |
+
|
198 |
+
sub is_pp {
|
199 |
+
return not $_[0]->is_xs;
|
200 |
+
}
|
201 |
+
|
202 |
+
|
203 |
+
sub pureperl_only_methods { @PPOnlyMethods; }
|
204 |
+
|
205 |
+
|
206 |
+
sub property {
|
207 |
+
my ($self, $name, $value) = @_;
|
208 |
+
|
209 |
+
if (@_ == 1) {
|
210 |
+
my %props;
|
211 |
+
for $name (@Properties) {
|
212 |
+
my $method = 'get_' . $name;
|
213 |
+
if ($name eq 'max_size') {
|
214 |
+
my $value = $self->$method();
|
215 |
+
$props{$name} = $value == 1 ? 0 : $value;
|
216 |
+
next;
|
217 |
+
}
|
218 |
+
$props{$name} = $self->$method();
|
219 |
+
}
|
220 |
+
return \%props;
|
221 |
+
}
|
222 |
+
elsif (@_ > 3) {
|
223 |
+
Carp::croak('property() can take only the option within 2 arguments.');
|
224 |
+
}
|
225 |
+
elsif (@_ == 2) {
|
226 |
+
if ( my $method = $self->can('get_' . $name) ) {
|
227 |
+
if ($name eq 'max_size') {
|
228 |
+
my $value = $self->$method();
|
229 |
+
return $value == 1 ? 0 : $value;
|
230 |
+
}
|
231 |
+
$self->$method();
|
232 |
+
}
|
233 |
+
}
|
234 |
+
else {
|
235 |
+
$self->$name($value);
|
236 |
+
}
|
237 |
+
|
238 |
+
}
|
239 |
+
|
240 |
+
|
241 |
+
|
242 |
+
# INTERNAL
|
243 |
+
|
244 |
+
sub _load_xs {
|
245 |
+
my $opt = shift;
|
246 |
+
|
247 |
+
$JSON::DEBUG and Carp::carp "Load $Module_XS.";
|
248 |
+
|
249 |
+
# if called after install module, overload is disable.... why?
|
250 |
+
JSON::Boolean::_overrride_overload($Module_XS);
|
251 |
+
JSON::Boolean::_overrride_overload($Module_PP);
|
252 |
+
|
253 |
+
eval qq|
|
254 |
+
use $Module_XS $XS_Version ();
|
255 |
+
|;
|
256 |
+
|
257 |
+
if ($@) {
|
258 |
+
if (defined $opt and $opt & $_INSTALL_DONT_DIE) {
|
259 |
+
$JSON::DEBUG and Carp::carp "Can't load $Module_XS...($@)";
|
260 |
+
return 0;
|
261 |
+
}
|
262 |
+
Carp::croak $@;
|
263 |
+
}
|
264 |
+
|
265 |
+
unless (defined $opt and $opt & $_INSTALL_ONLY) {
|
266 |
+
_set_module( $JSON::Backend = $Module_XS );
|
267 |
+
my $data = join("", <DATA>); # this code is from Jcode 2.xx.
|
268 |
+
close(DATA);
|
269 |
+
eval $data;
|
270 |
+
JSON::Backend::XS->init;
|
271 |
+
}
|
272 |
+
|
273 |
+
return 1;
|
274 |
+
};
|
275 |
+
|
276 |
+
|
277 |
+
sub _load_pp {
|
278 |
+
my $opt = shift;
|
279 |
+
my $backend = $_USSING_bpPP ? $Module_bp : $Module_PP;
|
280 |
+
|
281 |
+
$JSON::DEBUG and Carp::carp "Load $backend.";
|
282 |
+
|
283 |
+
# if called after install module, overload is disable.... why?
|
284 |
+
JSON::Boolean::_overrride_overload($Module_XS);
|
285 |
+
JSON::Boolean::_overrride_overload($backend);
|
286 |
+
|
287 |
+
if ( $_USSING_bpPP ) {
|
288 |
+
eval qq| require $backend |;
|
289 |
+
}
|
290 |
+
else {
|
291 |
+
eval qq| use $backend $PP_Version () |;
|
292 |
+
}
|
293 |
+
|
294 |
+
if ($@) {
|
295 |
+
if ( $backend eq $Module_PP ) {
|
296 |
+
$JSON::DEBUG and Carp::carp "Can't load $Module_PP ($@), so try to load $Module_bp";
|
297 |
+
$_USSING_bpPP++;
|
298 |
+
$backend = $Module_bp;
|
299 |
+
JSON::Boolean::_overrride_overload($backend);
|
300 |
+
local $^W; # if PP installed but invalid version, backportPP redefines methods.
|
301 |
+
eval qq| require $Module_bp |;
|
302 |
+
}
|
303 |
+
Carp::croak $@ if $@;
|
304 |
+
}
|
305 |
+
|
306 |
+
unless (defined $opt and $opt & $_INSTALL_ONLY) {
|
307 |
+
_set_module( $JSON::Backend = $Module_PP ); # even if backportPP, set $Backend with 'JSON::PP'
|
308 |
+
JSON::Backend::PP->init;
|
309 |
+
}
|
310 |
+
};
|
311 |
+
|
312 |
+
|
313 |
+
sub _set_module {
|
314 |
+
return if defined $JSON::true;
|
315 |
+
|
316 |
+
my $module = shift;
|
317 |
+
|
318 |
+
local $^W;
|
319 |
+
no strict qw(refs);
|
320 |
+
|
321 |
+
$JSON::true = ${"$module\::true"};
|
322 |
+
$JSON::false = ${"$module\::false"};
|
323 |
+
|
324 |
+
push @JSON::ISA, $module;
|
325 |
+
if ( JSON->is_xs and JSON->backend->VERSION < 3 ) {
|
326 |
+
eval 'package JSON::PP::Boolean';
|
327 |
+
push @{"$module\::Boolean::ISA"}, qw(JSON::PP::Boolean);
|
328 |
+
}
|
329 |
+
|
330 |
+
*{"JSON::is_bool"} = \&{"$module\::is_bool"};
|
331 |
+
|
332 |
+
for my $method ($module eq $Module_XS ? @PPOnlyMethods : @XSOnlyMethods) {
|
333 |
+
*{"JSON::$method"} = sub {
|
334 |
+
Carp::carp("$method is not supported in $module.");
|
335 |
+
$_[0];
|
336 |
+
};
|
337 |
+
}
|
338 |
+
|
339 |
+
return 1;
|
340 |
+
}
|
341 |
+
|
342 |
+
|
343 |
+
|
344 |
+
#
|
345 |
+
# JSON Boolean
|
346 |
+
#
|
347 |
+
|
348 |
+
package JSON::Boolean;
|
349 |
+
|
350 |
+
my %Installed;
|
351 |
+
|
352 |
+
sub _overrride_overload {
|
353 |
+
return; # this function is currently disable.
|
354 |
+
return if ($Installed{ $_[0] }++);
|
355 |
+
|
356 |
+
my $boolean = $_[0] . '::Boolean';
|
357 |
+
|
358 |
+
eval sprintf(q|
|
359 |
+
package %s;
|
360 |
+
use overload (
|
361 |
+
'""' => sub { ${$_[0]} == 1 ? 'true' : 'false' },
|
362 |
+
'eq' => sub {
|
363 |
+
my ($obj, $op) = ref ($_[0]) ? ($_[0], $_[1]) : ($_[1], $_[0]);
|
364 |
+
if ($op eq 'true' or $op eq 'false') {
|
365 |
+
return "$obj" eq 'true' ? 'true' eq $op : 'false' eq $op;
|
366 |
+
}
|
367 |
+
else {
|
368 |
+
return $obj ? 1 == $op : 0 == $op;
|
369 |
+
}
|
370 |
+
},
|
371 |
+
);
|
372 |
+
|, $boolean);
|
373 |
+
|
374 |
+
if ($@) { Carp::croak $@; }
|
375 |
+
|
376 |
+
if ( exists $INC{'JSON/XS.pm'} and $boolean eq 'JSON::XS::Boolean' ) {
|
377 |
+
local $^W;
|
378 |
+
my $true = do { bless \(my $dummy = 1), $boolean };
|
379 |
+
my $false = do { bless \(my $dummy = 0), $boolean };
|
380 |
+
*JSON::XS::true = sub () { $true };
|
381 |
+
*JSON::XS::false = sub () { $false };
|
382 |
+
}
|
383 |
+
elsif ( exists $INC{'JSON/PP.pm'} and $boolean eq 'JSON::PP::Boolean' ) {
|
384 |
+
local $^W;
|
385 |
+
my $true = do { bless \(my $dummy = 1), $boolean };
|
386 |
+
my $false = do { bless \(my $dummy = 0), $boolean };
|
387 |
+
*JSON::PP::true = sub { $true };
|
388 |
+
*JSON::PP::false = sub { $false };
|
389 |
+
}
|
390 |
+
|
391 |
+
return 1;
|
392 |
+
}
|
393 |
+
|
394 |
+
|
395 |
+
#
|
396 |
+
# Helper classes for Backend Module (PP)
|
397 |
+
#
|
398 |
+
|
399 |
+
package JSON::Backend::PP;
|
400 |
+
|
401 |
+
sub init {
|
402 |
+
local $^W;
|
403 |
+
no strict qw(refs); # this routine may be called after JSON::Backend::XS init was called.
|
404 |
+
*{"JSON::decode_json"} = \&{"JSON::PP::decode_json"};
|
405 |
+
*{"JSON::encode_json"} = \&{"JSON::PP::encode_json"};
|
406 |
+
*{"JSON::PP::is_xs"} = sub { 0 };
|
407 |
+
*{"JSON::PP::is_pp"} = sub { 1 };
|
408 |
+
return 1;
|
409 |
+
}
|
410 |
+
|
411 |
+
#
|
412 |
+
# To save memory, the below lines are read only when XS backend is used.
|
413 |
+
#
|
414 |
+
|
415 |
+
package JSON;
|
416 |
+
|
417 |
+
1;
|
418 |
+
__DATA__
|
419 |
+
|
420 |
+
|
421 |
+
#
|
422 |
+
# Helper classes for Backend Module (XS)
|
423 |
+
#
|
424 |
+
|
425 |
+
package JSON::Backend::XS;
|
426 |
+
|
427 |
+
use constant INDENT_LENGTH_FLAG => 15 << 12;
|
428 |
+
|
429 |
+
use constant UNSUPPORTED_ENCODE_FLAG => {
|
430 |
+
ESCAPE_SLASH => 0x00000010,
|
431 |
+
ALLOW_BIGNUM => 0x00000020,
|
432 |
+
AS_NONBLESSED => 0x00000040,
|
433 |
+
EXPANDED => 0x10000000, # for developer's
|
434 |
+
};
|
435 |
+
|
436 |
+
use constant UNSUPPORTED_DECODE_FLAG => {
|
437 |
+
LOOSE => 0x00000001,
|
438 |
+
ALLOW_BIGNUM => 0x00000002,
|
439 |
+
ALLOW_BAREKEY => 0x00000004,
|
440 |
+
ALLOW_SINGLEQUOTE => 0x00000008,
|
441 |
+
EXPANDED => 0x20000000, # for developer's
|
442 |
+
};
|
443 |
+
|
444 |
+
|
445 |
+
sub init {
|
446 |
+
local $^W;
|
447 |
+
no strict qw(refs);
|
448 |
+
*{"JSON::decode_json"} = \&{"JSON::XS::decode_json"};
|
449 |
+
*{"JSON::encode_json"} = \&{"JSON::XS::encode_json"};
|
450 |
+
*{"JSON::XS::is_xs"} = sub { 1 };
|
451 |
+
*{"JSON::XS::is_pp"} = sub { 0 };
|
452 |
+
return 1;
|
453 |
+
}
|
454 |
+
|
455 |
+
|
456 |
+
sub support_by_pp {
|
457 |
+
my ($class, @methods) = @_;
|
458 |
+
|
459 |
+
local $^W;
|
460 |
+
no strict qw(refs);
|
461 |
+
|
462 |
+
my $JSON_XS_encode_orignal = \&JSON::XS::encode;
|
463 |
+
my $JSON_XS_decode_orignal = \&JSON::XS::decode;
|
464 |
+
my $JSON_XS_incr_parse_orignal = \&JSON::XS::incr_parse;
|
465 |
+
|
466 |
+
*JSON::XS::decode = \&JSON::Backend::XS::Supportable::_decode;
|
467 |
+
*JSON::XS::encode = \&JSON::Backend::XS::Supportable::_encode;
|
468 |
+
*JSON::XS::incr_parse = \&JSON::Backend::XS::Supportable::_incr_parse;
|
469 |
+
|
470 |
+
*{JSON::XS::_original_decode} = $JSON_XS_decode_orignal;
|
471 |
+
*{JSON::XS::_original_encode} = $JSON_XS_encode_orignal;
|
472 |
+
*{JSON::XS::_original_incr_parse} = $JSON_XS_incr_parse_orignal;
|
473 |
+
|
474 |
+
push @JSON::Backend::XS::Supportable::ISA, 'JSON';
|
475 |
+
|
476 |
+
my $pkg = 'JSON::Backend::XS::Supportable';
|
477 |
+
|
478 |
+
*{JSON::new} = sub {
|
479 |
+
my $proto = JSON::XS->new; $$proto = 0;
|
480 |
+
bless $proto, $pkg;
|
481 |
+
};
|
482 |
+
|
483 |
+
|
484 |
+
for my $method (@methods) {
|
485 |
+
my $flag = uc($method);
|
486 |
+
my $type |= (UNSUPPORTED_ENCODE_FLAG->{$flag} || 0);
|
487 |
+
$type |= (UNSUPPORTED_DECODE_FLAG->{$flag} || 0);
|
488 |
+
|
489 |
+
next unless($type);
|
490 |
+
|
491 |
+
$pkg->_make_unsupported_method($method => $type);
|
492 |
+
}
|
493 |
+
|
494 |
+
# push @{"JSON::XS::Boolean::ISA"}, qw(JSON::PP::Boolean);
|
495 |
+
# push @{"JSON::PP::Boolean::ISA"}, qw(JSON::Boolean);
|
496 |
+
|
497 |
+
$JSON::DEBUG and Carp::carp("set -support_by_pp mode.");
|
498 |
+
|
499 |
+
return 1;
|
500 |
+
}
|
501 |
+
|
502 |
+
|
503 |
+
|
504 |
+
|
505 |
+
#
|
506 |
+
# Helper classes for XS
|
507 |
+
#
|
508 |
+
|
509 |
+
package JSON::Backend::XS::Supportable;
|
510 |
+
|
511 |
+
$Carp::Internal{'JSON::Backend::XS::Supportable'} = 1;
|
512 |
+
|
513 |
+
# Generate a flag accessor pair in package $pkg.
#
#   $obj->$method([$enable])  sets the bits of $type in the scalar behind $obj
#                             when $enable is true or omitted, clears them
#                             otherwise, and returns $obj for chaining.
#   $obj->get_$method         returns 1 if any bit of $type is set, '' if not.
#
# Arguments: ($pkg, $method, $type) where $type is the bit mask to manage.
sub _make_unsupported_method {
    my ($pkg, $method, $type) = @_;

    local $^W;
    no strict qw(refs);

    my $setter = sub {
        local $^W;
        my $self = $_[0];

        # A missing argument means "enable".
        my $enable = defined $_[1] ? $_[1] : 1;

        if ($enable) {
            $$self |= $type;
        }
        else {
            $$self &= ~$type;
        }

        return $self;
    };

    my $getter = sub {
        my $is_set = ${$_[0]} & $type;
        return $is_set ? 1 : '';
    };

    *{"$pkg\::$method"}     = $setter;
    *{"$pkg\::get_$method"} = $getter;
}
|
535 |
+
|
536 |
+
|
537 |
+
# Build a JSON::PP object configured like the given XS-backed object.
#
# Arguments: ($type, $self, ...) where $type is 'encode' or 'decode' and
# selects which table of PP-only flags is applied.
# Returns the configured JSON::PP object.
sub _set_for_pp {
    JSON::_load_pp( $_INSTALL_ONLY );

    my ($type, $self) = @_;

    my $pp       = JSON::PP->new;
    my $settings = $self->property;

    # Copy each property reported by the object, mapping false values to 0.
    for my $key (keys %$settings) {
        $pp->$key( $settings->{$key} || 0 );
    }

    my $unsupported;
    if ($type eq 'encode') {
        $unsupported = JSON::Backend::XS::UNSUPPORTED_ENCODE_FLAG();
    }
    else {
        $unsupported = JSON::Backend::XS::UNSUPPORTED_DECODE_FLAG();
    }

    my $flags = $$self || 0;

    # Replay the PP-only flags recorded in the object's flag word.
    for my $name (keys %$unsupported) {
        next if $name eq 'EXPANDED'; # for developer's

        my $method = lc $name;
        $pp->$method( ($flags & $unsupported->{$name}) ? 1 : 0 );
    }

    $pp->indent_length( $self->get_indent_length );

    return $pp;
}
|
563 |
+
|
564 |
+
# encode() wrapper: with an empty flag word, call the original XS encoder;
# otherwise encode through a JSON::PP object built by _set_for_pp.
sub _encode {
    return $_[0]->_original_encode( $_[1] ) unless ${$_[0]};

    my $pp = _set_for_pp('encode' => @_);
    return $pp->encode($_[1]);
}
|
572 |
+
|
573 |
+
|
574 |
+
# decode() wrapper: with an empty flag word, call the original XS decoder;
# otherwise decode through a JSON::PP object built by _set_for_pp.
sub _decode {
    return $_[0]->_original_decode( $_[1] ) unless ${$_[0]};

    my $pp = _set_for_pp('decode' => @_);
    return $pp->decode($_[1]);
}
|
582 |
+
|
583 |
+
|
584 |
+
# decode_prefix() always goes through a JSON::PP object built by _set_for_pp;
# unlike _decode, it does not look at the flag word first.
sub decode_prefix {
    my $pp = _set_for_pp('decode' => @_);
    return $pp->decode_prefix($_[1]);
}
|
587 |
+
|
588 |
+
|
589 |
+
# incr_parse() wrapper: with an empty flag word, call the original XS
# incr_parse; otherwise call incr_parse on a JSON::PP object built by
# _set_for_pp for this call.
sub _incr_parse {
    return $_[0]->_original_incr_parse( $_[1] ) unless ${$_[0]};

    my $pp = _set_for_pp('decode' => @_);
    return $pp->incr_parse($_[1]);
}
|
597 |
+
|
598 |
+
|
599 |
+
# Return the indent length (0..15) stored in the object's flag word.
#
# indent_length() stores the value shifted left by 12 bits, so it is read back
# by shifting right by 12 and masking to four bits.  The previous
# "<< 4 >> 16" form applied no mask: its result depended on the platform's
# integer width and picked up any flag bit set above bit 15.
sub get_indent_length {
    return ( ( ${$_[0]} || 0 ) >> 12 ) & 0xF;
}
|
602 |
+
|
603 |
+
|
604 |
+
# Set the indent length kept in the object's flag word.
#
# Accepts an integer from 0 to 15.  An undefined or out-of-range value only
# triggers a warning and leaves the object untouched.  Returns the invocant
# in either case so calls can be chained.
sub indent_length {
    my $length = $_[1];

    if (!defined $length or $length > 15 or $length < 0) {
        Carp::carp("The acceptable range of indent_length() is 0 to 15.");
        return $_[0];
    }

    {
        local $^W;

        # The length lives in the four bits starting at bit 12.
        my $field = $length << 12;
        ${$_[0]} = ( ${$_[0]} & ~ JSON::Backend::XS::INDENT_LENGTH_FLAG() ) | $field;

        *JSON::XS::encode = \&JSON::Backend::XS::Supportable::_encode;
    }

    return $_[0];
}
|
620 |
+
|
621 |
+
|
622 |
+
1;
|
623 |
+
__END__
|
624 |
+
|
625 |
+
=head1 NAME
|
626 |
+
|
627 |
+
JSON - JSON (JavaScript Object Notation) encoder/decoder
|
628 |
+
|
629 |
+
=head1 SYNOPSIS
|
630 |
+
|
631 |
+
use JSON; # imports encode_json, decode_json, to_json and from_json.
|
632 |
+
|
633 |
+
# simple and fast interfaces (expect/generate UTF-8)
|
634 |
+
|
635 |
+
$utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
|
636 |
+
$perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
|
637 |
+
|
638 |
+
# OO-interface
|
639 |
+
|
640 |
+
$json = JSON->new->allow_nonref;
|
641 |
+
|
642 |
+
$json_text = $json->encode( $perl_scalar );
|
643 |
+
$perl_scalar = $json->decode( $json_text );
|
644 |
+
|
645 |
+
$pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
|
646 |
+
|
647 |
+
# If you want to use features supported only by PP, load JSON with '-support_by_pp'.
|
648 |
+
# When a feature unsupported by XS is enabled, PP's (de|en)code is used instead of the XS one.
|
649 |
+
|
650 |
+
use JSON -support_by_pp;
|
651 |
+
|
652 |
+
# option-acceptable interfaces (expect/generate UNICODE by default)
|
653 |
+
|
654 |
+
$json_text = to_json( $perl_scalar, { ascii => 1, pretty => 1 } );
|
655 |
+
$perl_scalar = from_json( $json_text, { utf8 => 1 } );
|
656 |
+
|
657 |
+
# Between (en|de)code_json and (to|from)_json, if you want to write
|
658 |
+
# a code which communicates to an outer world (encoded in UTF-8),
|
659 |
+
# it is recommended to use (en|de)code_json.
|
660 |
+
|
661 |
+
=head1 VERSION
|
662 |
+
|
663 |
+
2.90
|
664 |
+
|
665 |
+
This version is compatible with JSON::XS B<2.34> and later.
|
666 |
+
(Not yet compatible with JSON::XS B<3.0x>.)
|
667 |
+
|
668 |
+
|
669 |
+
=head1 NOTE
|
670 |
+
|
671 |
+
JSON::PP was earlier included in the C<JSON> distribution, but
|
672 |
+
has since Perl 5.14 been a core module. For this reason,
|
673 |
+
L<JSON::PP> was removed from the JSON distribution and can now
|
674 |
+
be found also in the Perl5 repository at
|
675 |
+
|
676 |
+
=over
|
677 |
+
|
678 |
+
=item * L<http://perl5.git.perl.org/perl.git>
|
679 |
+
|
680 |
+
=back
|
681 |
+
|
682 |
+
(The newest JSON::PP version still exists in CPAN.)
|
683 |
+
|
684 |
+
Instead, the C<JSON> distribution will include JSON::backportPP
|
685 |
+
for backwards compatibility. JSON.pm should thus work as it did
|
686 |
+
before.
|
687 |
+
|
688 |
+
=head1 DESCRIPTION
|
689 |
+
|
690 |
+
*************************** CAUTION **************************************
|
691 |
+
* *
|
692 |
+
* INCOMPATIBLE CHANGE (JSON.pm version 2.90) *
|
693 |
+
* *
|
694 |
+
* JSON.pm had patched JSON::XS::Boolean and JSON::PP::Boolean internally *
|
695 |
+
* on loading time for making these modules inherit JSON::Boolean. *
|
696 |
+
* But since JSON::XS v3.0 it uses Types::Serialiser as boolean class. *
|
697 |
+
* So JSON.pm now breaks boolean class overload features and *
|
698 |
+
* -support_by_pp if JSON::XS v3.0 or later is installed. *
|
699 |
+
* *
|
700 |
+
* JSON::true and JSON::false returned JSON::Boolean objects. *
|
701 |
+
* For workaround, they return JSON::PP::Boolean objects in this version. *
|
702 |
+
* *
|
703 |
+
* isa_ok(JSON::true, 'JSON::PP::Boolean'); *
|
704 |
+
* *
|
705 |
+
* And it discards a feature: *
|
706 |
+
* *
|
707 |
+
* ok(JSON::true eq 'true'); *
|
708 |
+
* *
|
709 |
+
* In other words, JSON::PP::Boolean overloads numeric operations only. *
|
710 |
+
* *
|
711 |
+
* ok( JSON::true == 1 ); *
|
712 |
+
* *
|
713 |
+
**************************************************************************
|
714 |
+
|
715 |
+
************************** CAUTION ********************************
|
716 |
+
* This is 'JSON module version 2' and there are many differences *
|
717 |
+
* to version 1.xx *
|
718 |
+
* Please check your applications using old version. *
|
719 |
+
* See 'INCOMPATIBLE CHANGES TO OLD VERSION' *
|
720 |
+
*******************************************************************
|
721 |
+
|
722 |
+
JSON (JavaScript Object Notation) is a simple data format.
|
723 |
+
See L<http://www.json.org/> and C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>).
|
724 |
+
|
725 |
+
This module converts Perl data structures to JSON and vice versa using either
|
726 |
+
L<JSON::XS> or L<JSON::PP>.
|
727 |
+
|
728 |
+
JSON::XS is the fastest and most proper JSON module on CPAN which must be
|
729 |
+
compiled and installed in your environment.
|
730 |
+
JSON::PP is a pure-Perl module which is bundled in this distribution and
|
731 |
+
has a strong compatibility to JSON::XS.
|
732 |
+
|
733 |
+
This module tries to use JSON::XS by default and, failing that, uses JSON::PP instead.
|
734 |
+
So its features completely depend on JSON::XS or JSON::PP.
|
735 |
+
|
736 |
+
See L<BACKEND MODULE DECISION>.
|
737 |
+
|
738 |
+
To distinguish the module name 'JSON' and the format type JSON,
|
739 |
+
the former is quoted by CE<lt>E<gt> (how it is rendered varies with the medium you are using),
|
740 |
+
and the latter is left just as it is.
|
741 |
+
|
742 |
+
Module name : C<JSON>
|
743 |
+
|
744 |
+
Format type : JSON
|
745 |
+
|
746 |
+
=head2 FEATURES
|
747 |
+
|
748 |
+
=over
|
749 |
+
|
750 |
+
=item * correct unicode handling
|
751 |
+
|
752 |
+
This module (i.e. backend modules) knows how to handle Unicode, documents
|
753 |
+
how and when it does so, and even documents what "correct" means.
|
754 |
+
|
755 |
+
Even though there are limitations, this feature is available since Perl version 5.6.
|
756 |
+
|
757 |
+
JSON::XS requires Perl 5.8.2 (but works correctly in 5.8.8 or later), so in older versions
|
758 |
+
C<JSON> should call JSON::PP as the backend which can be used since Perl 5.005.
|
759 |
+
|
760 |
+
With Perl 5.8.x JSON::PP works, but from 5.8.0 to 5.8.2, because of a Perl side problem,
|
761 |
+
JSON::PP runs slower in those versions. And in 5.005, Unicode handling is not available.
|
762 |
+
See L<JSON::PP/UNICODE HANDLING ON PERLS> for more information.
|
763 |
+
|
764 |
+
See also L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>
|
765 |
+
and L<JSON::XS/ENCODING/CODESET_FLAG_NOTES>.
|
766 |
+
|
767 |
+
|
768 |
+
=item * round-trip integrity
|
769 |
+
|
770 |
+
When you serialise a perl data structure using only data types supported
|
771 |
+
by JSON and Perl, the deserialised data structure is identical on the Perl
|
772 |
+
level. (e.g. the string "2.0" doesn't suddenly become "2" just because
|
773 |
+
it looks like a number). There I<are> minor exceptions to this, read the
|
774 |
+
L</MAPPING> section below to learn about those.
|
775 |
+
|
776 |
+
|
777 |
+
=item * strict checking of JSON correctness
|
778 |
+
|
779 |
+
There is no guessing, no generating of illegal JSON texts by default,
|
780 |
+
and only JSON is accepted as input by default (the latter is a security
|
781 |
+
feature).
|
782 |
+
|
783 |
+
See L<JSON::XS/FEATURES> and L<JSON::PP/FEATURES>.
|
784 |
+
|
785 |
+
=item * fast
|
786 |
+
|
787 |
+
This module returns a JSON::XS object itself if available.
|
788 |
+
Compared to other JSON modules and other serialisers such as Storable,
|
789 |
+
JSON::XS usually compares favorably in terms of speed, too.
|
790 |
+
|
791 |
+
If not available, C<JSON> returns a JSON::PP object instead of JSON::XS and
|
792 |
+
it is very slow as pure-Perl.
|
793 |
+
|
794 |
+
=item * simple to use
|
795 |
+
|
796 |
+
This module has both a simple functional interface as well as an
|
797 |
+
object-oriented interface.
|
798 |
+
|
799 |
+
=item * reasonably versatile output formats
|
800 |
+
|
801 |
+
You can choose between the most compact guaranteed-single-line format possible
|
802 |
+
(nice for simple line-based protocols), a pure-ASCII format (for when your transport
|
803 |
+
is not 8-bit clean, still supports the whole Unicode range), or a pretty-printed
|
804 |
+
format (for when you want to read that stuff). Or you can combine those features
|
805 |
+
in whatever way you like.
|
806 |
+
|
807 |
+
=back
|
808 |
+
|
809 |
+
=head1 FUNCTIONAL INTERFACE
|
810 |
+
|
811 |
+
Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
|
812 |
+
C<to_json> and C<from_json> are additional functions.
|
813 |
+
|
814 |
+
=head2 encode_json
|
815 |
+
|
816 |
+
$json_text = encode_json $perl_scalar
|
817 |
+
|
818 |
+
Converts the given Perl data structure to a UTF-8 encoded, binary string.
|
819 |
+
|
820 |
+
This function call is functionally identical to:
|
821 |
+
|
822 |
+
$json_text = JSON->new->utf8->encode($perl_scalar)
|
823 |
+
|
824 |
+
=head2 decode_json
|
825 |
+
|
826 |
+
$perl_scalar = decode_json $json_text
|
827 |
+
|
828 |
+
The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
|
829 |
+
to parse that as an UTF-8 encoded JSON text, returning the resulting
|
830 |
+
reference.
|
831 |
+
|
832 |
+
This function call is functionally identical to:
|
833 |
+
|
834 |
+
$perl_scalar = JSON->new->utf8->decode($json_text)
|
835 |
+
|
836 |
+
|
837 |
+
=head2 to_json
|
838 |
+
|
839 |
+
$json_text = to_json($perl_scalar)
|
840 |
+
|
841 |
+
Converts the given Perl data structure to a json string.
|
842 |
+
|
843 |
+
This function call is functionally identical to:
|
844 |
+
|
845 |
+
$json_text = JSON->new->encode($perl_scalar)
|
846 |
+
|
847 |
+
Takes a hash reference as the second argument.
|
848 |
+
|
849 |
+
$json_text = to_json($perl_scalar, $flag_hashref)
|
850 |
+
|
851 |
+
So,
|
852 |
+
|
853 |
+
$json_text = to_json($perl_scalar, {utf8 => 1, pretty => 1})
|
854 |
+
|
855 |
+
is equivalent to:
|
856 |
+
|
857 |
+
$json_text = JSON->new->utf8(1)->pretty(1)->encode($perl_scalar)
|
858 |
+
|
859 |
+
If you want to write modern Perl code which communicates with the outside world,
|
860 |
+
you should use C<encode_json> (assuming that the JSON data is encoded in UTF-8).
|
861 |
+
|
862 |
+
=head2 from_json
|
863 |
+
|
864 |
+
$perl_scalar = from_json($json_text)
|
865 |
+
|
866 |
+
The opposite of C<to_json>: expects a json string and tries
|
867 |
+
to parse it, returning the resulting reference.
|
868 |
+
|
869 |
+
This function call is functionally identical to:
|
870 |
+
|
871 |
+
$perl_scalar = JSON->new->decode($json_text)
|
872 |
+
|
873 |
+
Takes a hash reference as the second argument.
|
874 |
+
|
875 |
+
$perl_scalar = from_json($json_text, $flag_hashref)
|
876 |
+
|
877 |
+
So,
|
878 |
+
|
879 |
+
$perl_scalar = from_json($json_text, {utf8 => 1})
|
880 |
+
|
881 |
+
is equivalent to:
|
882 |
+
|
883 |
+
$perl_scalar = JSON->new->utf8(1)->decode($json_text)
|
884 |
+
|
885 |
+
If you want to write modern Perl code which communicates with the outside world,
|
886 |
+
you should use C<decode_json> (assuming that the JSON data is encoded in UTF-8).
|
887 |
+
|
888 |
+
=head2 JSON::is_bool
|
889 |
+
|
890 |
+
$is_boolean = JSON::is_bool($scalar)
|
891 |
+
|
892 |
+
Returns true if the passed scalar represents either JSON::true or
|
893 |
+
JSON::false, two constants that act like C<1> and C<0> respectively
|
894 |
+
and are also used to represent JSON C<true> and C<false> in Perl strings.
|
895 |
+
|
896 |
+
=head2 JSON::true
|
897 |
+
|
898 |
+
Returns JSON true value which is blessed object.
|
899 |
+
It C<isa> JSON::PP::Boolean object.
|
900 |
+
|
901 |
+
=head2 JSON::false
|
902 |
+
|
903 |
+
Returns JSON false value which is blessed object.
|
904 |
+
It C<isa> JSON::PP::Boolean object.
|
905 |
+
|
906 |
+
=head2 JSON::null
|
907 |
+
|
908 |
+
Returns C<undef>.
|
909 |
+
|
910 |
+
See L<MAPPING>, below, for more information on how JSON values are mapped to
|
911 |
+
Perl.
|
912 |
+
|
913 |
+
=head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
|
914 |
+
|
915 |
+
This section supposes that your perl version is 5.8 or later.
|
916 |
+
|
917 |
+
If you know a JSON text from an outer world - a network, a file content, and so on,
|
918 |
+
is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
|
919 |
+
with C<utf8> enabled. The decoded result will then contain UNICODE characters.
|
920 |
+
|
921 |
+
# from network
|
922 |
+
my $json = JSON->new->utf8;
|
923 |
+
my $json_text = CGI->new->param( 'json_data' );
|
924 |
+
my $perl_scalar = $json->decode( $json_text );
|
925 |
+
|
926 |
+
# from file content
|
927 |
+
local $/;
|
928 |
+
open( my $fh, '<', 'json.data' );
|
929 |
+
$json_text = <$fh>;
|
930 |
+
$perl_scalar = decode_json( $json_text );
|
931 |
+
|
932 |
+
If the external data is not encoded in UTF-8, you should first C<decode> it.
|
933 |
+
|
934 |
+
use Encode;
|
935 |
+
local $/;
|
936 |
+
open( my $fh, '<', 'json.data' );
|
937 |
+
my $encoding = 'cp932';
|
938 |
+
my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
|
939 |
+
|
940 |
+
# or you can write the below code.
|
941 |
+
#
|
942 |
+
# open( my $fh, "<:encoding($encoding)", 'json.data' );
|
943 |
+
# $unicode_json_text = <$fh>;
|
944 |
+
|
945 |
+
In this case, C<$unicode_json_text> is of course UNICODE string.
|
946 |
+
So you B<cannot> use C<decode_json> or a C<JSON> module object with C<utf8> enabled.
|
947 |
+
Instead, use a C<JSON> module object with C<utf8> disabled, or C<from_json>.
|
948 |
+
|
949 |
+
$perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
|
950 |
+
# or
|
951 |
+
$perl_scalar = from_json( $unicode_json_text );
|
952 |
+
|
953 |
+
Or C<encode 'utf8'> and C<decode_json>:
|
954 |
+
|
955 |
+
$perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
|
956 |
+
# this way is not efficient.
|
957 |
+
|
958 |
+
And now, you want to convert your C<$perl_scalar> into JSON data and
|
959 |
+
send it to an outer world - a network or a file content, and so on.
|
960 |
+
|
961 |
+
Your data usually contains UNICODE strings and you want the converted data to be encoded
|
962 |
+
in UTF-8, so you should use C<encode_json> or a C<JSON> module object with C<utf8> enabled.
|
963 |
+
|
964 |
+
print encode_json( $perl_scalar ); # to a network? file? or display?
|
965 |
+
# or
|
966 |
+
print $json->utf8->encode( $perl_scalar );
|
967 |
+
|
968 |
+
If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
|
969 |
+
for some reason, then its characters are regarded as B<latin1> for perl
|
970 |
+
(because Perl knows nothing about your $encoding).
|
971 |
+
You B<cannot> use C<encode_json> or a C<JSON> module object with C<utf8> enabled.
|
972 |
+
Instead, use a C<JSON> module object with C<utf8> disabled, or C<to_json>.
|
973 |
+
Note that the resulting text is a UNICODE string, but there is no problem printing it.
|
974 |
+
|
975 |
+
# $perl_scalar contains $encoding encoded string values
|
976 |
+
$unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
|
977 |
+
# or
|
978 |
+
$unicode_json_text = to_json( $perl_scalar );
|
979 |
+
# $unicode_json_text consists of characters less than 0x100
|
980 |
+
print $unicode_json_text;
|
981 |
+
|
982 |
+
Or C<decode $encoding> all string values and C<encode_json>:
|
983 |
+
|
984 |
+
$perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
|
985 |
+
# ... do it to each string values, then encode_json
|
986 |
+
$json_text = encode_json( $perl_scalar );
|
987 |
+
|
988 |
+
This method is the proper way, but it is probably not efficient.
|
989 |
+
|
990 |
+
See L<Encode>, L<perluniintro>.
|
991 |
+
|
992 |
+
|
993 |
+
=head1 COMMON OBJECT-ORIENTED INTERFACE
|
994 |
+
|
995 |
+
=head2 new
|
996 |
+
|
997 |
+
$json = JSON->new
|
998 |
+
|
999 |
+
Returns a new C<JSON> object inherited from either JSON::XS or JSON::PP
|
1000 |
+
that can be used to de/encode JSON strings.
|
1001 |
+
|
1002 |
+
All boolean flags described below are by default I<disabled>.
|
1003 |
+
|
1004 |
+
The mutators for flags all return the JSON object again and thus calls can
|
1005 |
+
be chained:
|
1006 |
+
|
1007 |
+
my $json = JSON->new->utf8->space_after->encode({a => [1,2]})
|
1008 |
+
=> {"a": [1, 2]}
|
1009 |
+
|
1010 |
+
=head2 ascii
|
1011 |
+
|
1012 |
+
$json = $json->ascii([$enable])
|
1013 |
+
|
1014 |
+
$enabled = $json->get_ascii
|
1015 |
+
|
1016 |
+
If $enable is true (or missing), then the encode method will not generate characters outside
|
1017 |
+
the code range 0..127. Any Unicode characters outside that range will be escaped using either
|
1018 |
+
a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
|
1019 |
+
|
1020 |
+
If $enable is false, then the encode method will not escape Unicode characters unless
|
1021 |
+
required by the JSON syntax or other flags. This results in a faster and more compact format.
|
1022 |
+
|
1023 |
+
This feature depends on the used Perl version and environment.
|
1024 |
+
|
1025 |
+
See L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
|
1026 |
+
|
1027 |
+
JSON->new->ascii(1)->encode([chr 0x10401])
|
1028 |
+
=> ["\ud801\udc01"]
|
1029 |
+
|
1030 |
+
=head2 latin1
|
1031 |
+
|
1032 |
+
$json = $json->latin1([$enable])
|
1033 |
+
|
1034 |
+
$enabled = $json->get_latin1
|
1035 |
+
|
1036 |
+
If $enable is true (or missing), then the encode method will encode the resulting JSON
|
1037 |
+
text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
|
1038 |
+
|
1039 |
+
If $enable is false, then the encode method will not escape Unicode characters
|
1040 |
+
unless required by the JSON syntax or other flags.
|
1041 |
+
|
1042 |
+
JSON->new->latin1->encode (["\x{89}\x{abc}"])
|
1043 |
+
=> ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
|
1044 |
+
|
1045 |
+
=head2 utf8
|
1046 |
+
|
1047 |
+
$json = $json->utf8([$enable])
|
1048 |
+
|
1049 |
+
$enabled = $json->get_utf8
|
1050 |
+
|
1051 |
+
If $enable is true (or missing), then the encode method will encode the JSON result
|
1052 |
+
into UTF-8, as required by many protocols, while the decode method expects to be handed
|
1053 |
+
an UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
|
1054 |
+
characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
|
1055 |
+
|
1056 |
+
In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
|
1057 |
+
encoding families, as described in RFC4627.
|
1058 |
+
|
1059 |
+
If $enable is false, then the encode method will return the JSON string as a (non-encoded)
|
1060 |
+
Unicode string, while decode expects thus a Unicode string. Any decoding or encoding
|
1061 |
+
(e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
|
1062 |
+
|
1063 |
+
|
1064 |
+
Example, output UTF-16BE-encoded JSON:
|
1065 |
+
|
1066 |
+
use Encode;
|
1067 |
+
$jsontext = encode "UTF-16BE", JSON::XS->new->encode ($object);
|
1068 |
+
|
1069 |
+
Example, decode UTF-32LE-encoded JSON:
|
1070 |
+
|
1071 |
+
use Encode;
|
1072 |
+
$object = JSON::XS->new->decode (decode "UTF-32LE", $jsontext);
|
1073 |
+
|
1074 |
+
See L<JSON::PP/UNICODE HANDLING ON PERLS> if the backend is PP.
|
1075 |
+
|
1076 |
+
|
1077 |
+
=head2 pretty
|
1078 |
+
|
1079 |
+
$json = $json->pretty([$enable])
|
1080 |
+
|
1081 |
+
This enables (or disables) all of the C<indent>, C<space_before> and
|
1082 |
+
C<space_after> (and in the future possibly more) flags in one call to
|
1083 |
+
generate the most readable (or most compact) form possible.
|
1084 |
+
|
1085 |
+
Equivalent to:
|
1086 |
+
|
1087 |
+
$json->indent->space_before->space_after
|
1088 |
+
|
1089 |
+
The indent space length is three and JSON::XS cannot change the indent
|
1090 |
+
space length.
|
1091 |
+
|
1092 |
+
=head2 indent
|
1093 |
+
|
1094 |
+
$json = $json->indent([$enable])
|
1095 |
+
|
1096 |
+
$enabled = $json->get_indent
|
1097 |
+
|
1098 |
+
If C<$enable> is true (or missing), then the C<encode> method will use a multiline
|
1099 |
+
format as output, putting every array member or object/hash key-value pair
|
1100 |
+
into its own line, indenting them properly.
|
1101 |
+
|
1102 |
+
If C<$enable> is false, no newlines or indenting will be produced, and the
|
1103 |
+
resulting JSON text is guaranteed not to contain any C<newlines>.
|
1104 |
+
|
1105 |
+
This setting has no effect when decoding JSON texts.
|
1106 |
+
|
1107 |
+
The indent space length is three.
|
1108 |
+
With JSON::PP, you can also access C<indent_length> to change indent space length.
|
1109 |
+
|
1110 |
+
|
1111 |
+
=head2 space_before
|
1112 |
+
|
1113 |
+
$json = $json->space_before([$enable])
|
1114 |
+
|
1115 |
+
$enabled = $json->get_space_before
|
1116 |
+
|
1117 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1118 |
+
optional space before the C<:> separating keys from values in JSON objects.
|
1119 |
+
|
1120 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1121 |
+
space at those places.
|
1122 |
+
|
1123 |
+
This setting has no effect when decoding JSON texts.
|
1124 |
+
|
1125 |
+
Example, space_before enabled, space_after and indent disabled:
|
1126 |
+
|
1127 |
+
{"key" :"value"}
|
1128 |
+
|
1129 |
+
|
1130 |
+
=head2 space_after
|
1131 |
+
|
1132 |
+
$json = $json->space_after([$enable])
|
1133 |
+
|
1134 |
+
$enabled = $json->get_space_after
|
1135 |
+
|
1136 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1137 |
+
optional space after the C<:> separating keys from values in JSON objects
|
1138 |
+
and extra whitespace after the C<,> separating key-value pairs and array
|
1139 |
+
members.
|
1140 |
+
|
1141 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1142 |
+
space at those places.
|
1143 |
+
|
1144 |
+
This setting has no effect when decoding JSON texts.
|
1145 |
+
|
1146 |
+
Example, space_before and indent disabled, space_after enabled:
|
1147 |
+
|
1148 |
+
{"key": "value"}
|
1149 |
+
|
1150 |
+
|
1151 |
+
=head2 relaxed
|
1152 |
+
|
1153 |
+
$json = $json->relaxed([$enable])
|
1154 |
+
|
1155 |
+
$enabled = $json->get_relaxed
|
1156 |
+
|
1157 |
+
If C<$enable> is true (or missing), then C<decode> will accept some
|
1158 |
+
extensions to normal JSON syntax (see below). C<encode> will not be
|
1159 |
+
affected in any way. I<Be aware that this option makes you accept invalid
|
1160 |
+
JSON texts as if they were valid!>. I suggest only to use this option to
|
1161 |
+
parse application-specific files written by humans (configuration files,
|
1162 |
+
resource files etc.)
|
1163 |
+
|
1164 |
+
If C<$enable> is false (the default), then C<decode> will only accept
|
1165 |
+
valid JSON texts.
|
1166 |
+
|
1167 |
+
Currently accepted extensions are:
|
1168 |
+
|
1169 |
+
=over 4
|
1170 |
+
|
1171 |
+
=item * list items can have an end-comma
|
1172 |
+
|
1173 |
+
JSON I<separates> array elements and key-value pairs with commas. This
|
1174 |
+
can be annoying if you write JSON texts manually and want to be able to
|
1175 |
+
quickly append elements, so this extension accepts comma at the end of
|
1176 |
+
such items not just between them:
|
1177 |
+
|
1178 |
+
[
|
1179 |
+
1,
|
1180 |
+
2, <- this comma not normally allowed
|
1181 |
+
]
|
1182 |
+
{
|
1183 |
+
"k1": "v1",
|
1184 |
+
"k2": "v2", <- this comma not normally allowed
|
1185 |
+
}
|
1186 |
+
|
1187 |
+
=item * shell-style '#'-comments
|
1188 |
+
|
1189 |
+
Whenever JSON allows whitespace, shell-style comments are additionally
|
1190 |
+
allowed. They are terminated by the first carriage-return or line-feed
|
1191 |
+
character, after which more white-space and comments are allowed.
|
1192 |
+
|
1193 |
+
[
|
1194 |
+
1, # this comment not allowed in JSON
|
1195 |
+
# neither this one...
|
1196 |
+
]
|
1197 |
+
|
1198 |
+
=back
|
1199 |
+
|
1200 |
+
|
1201 |
+
=head2 canonical
|
1202 |
+
|
1203 |
+
$json = $json->canonical([$enable])
|
1204 |
+
|
1205 |
+
$enabled = $json->get_canonical
|
1206 |
+
|
1207 |
+
If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
|
1208 |
+
by sorting their keys. This is adding a comparatively high overhead.
|
1209 |
+
|
1210 |
+
If C<$enable> is false, then the C<encode> method will output key-value
|
1211 |
+
pairs in the order Perl stores them (which will likely change between runs
|
1212 |
+
of the same script).
|
1213 |
+
|
1214 |
+
This option is useful if you want the same data structure to be encoded as
|
1215 |
+
the same JSON text (given the same overall settings). If it is disabled,
|
1216 |
+
the same hash might be encoded differently even if contains the same data,
|
1217 |
+
as key-value pairs have no inherent ordering in Perl.
|
1218 |
+
|
1219 |
+
This setting has no effect when decoding JSON texts.
|
1220 |
+
|
1221 |
+
=head2 allow_nonref
|
1222 |
+
|
1223 |
+
$json = $json->allow_nonref([$enable])
|
1224 |
+
|
1225 |
+
$enabled = $json->get_allow_nonref
|
1226 |
+
|
1227 |
+
If C<$enable> is true (or missing), then the C<encode> method can convert a
|
1228 |
+
non-reference into its corresponding string, number or null JSON value,
|
1229 |
+
which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
|
1230 |
+
values instead of croaking.
|
1231 |
+
|
1232 |
+
If C<$enable> is false, then the C<encode> method will croak if it isn't
|
1233 |
+
passed an arrayref or hashref, as JSON texts must either be an object
|
1234 |
+
or array. Likewise, C<decode> will croak if given something that is not a
|
1235 |
+
JSON object or array.
|
1236 |
+
|
1237 |
+
JSON->new->allow_nonref->encode ("Hello, World!")
|
1238 |
+
=> "Hello, World!"
|
1239 |
+
|
1240 |
+
=head2 allow_unknown
|
1241 |
+
|
1242 |
+
$json = $json->allow_unknown ([$enable])
|
1243 |
+
|
1244 |
+
$enabled = $json->get_allow_unknown
|
1245 |
+
|
1246 |
+
If $enable is true (or missing), then "encode" will *not* throw an
|
1247 |
+
exception when it encounters values it cannot represent in JSON (for
|
1248 |
+
example, filehandles) but instead will encode a JSON "null" value.
|
1249 |
+
Note that blessed objects are not included here and are handled
|
1250 |
+
separately by C<allow_blessed>.
|
1251 |
+
|
1252 |
+
If $enable is false (the default), then "encode" will throw an
|
1253 |
+
exception when it encounters anything it cannot encode as JSON.
|
1254 |
+
|
1255 |
+
This option does not affect "decode" in any way, and it is
|
1256 |
+
recommended to leave it off unless you know your communications
|
1257 |
+
partner.
|
1258 |
+
|
1259 |
+
=head2 allow_blessed
|
1260 |
+
|
1261 |
+
$json = $json->allow_blessed([$enable])
|
1262 |
+
|
1263 |
+
$enabled = $json->get_allow_blessed
|
1264 |
+
|
1265 |
+
If C<$enable> is true (or missing), then the C<encode> method will not
|
1266 |
+
barf when it encounters a blessed reference. Instead, the value of the
|
1267 |
+
B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
|
1268 |
+
disabled or no C<TO_JSON> method found) or a representation of the
|
1269 |
+
object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
|
1270 |
+
encoded. Has no effect on C<decode>.
|
1271 |
+
|
1272 |
+
If C<$enable> is false (the default), then C<encode> will throw an
|
1273 |
+
exception when it encounters a blessed object.
|
1274 |
+
|
1275 |
+
|
1276 |
+
=head2 convert_blessed
|
1277 |
+
|
1278 |
+
$json = $json->convert_blessed([$enable])
|
1279 |
+
|
1280 |
+
$enabled = $json->get_convert_blessed
|
1281 |
+
|
1282 |
+
If C<$enable> is true (or missing), then C<encode>, upon encountering a
|
1283 |
+
blessed object, will check for the availability of the C<TO_JSON> method
|
1284 |
+
on the object's class. If found, it will be called in scalar context
|
1285 |
+
and the resulting scalar will be encoded instead of the object. If no
|
1286 |
+
C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
|
1287 |
+
to do.
|
1288 |
+
|
1289 |
+
The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
|
1290 |
+
returns other blessed objects, those will be handled in the same
|
1291 |
+
way. C<TO_JSON> must take care of not causing an endless recursion cycle
|
1292 |
+
(== crash) in this case. The name of C<TO_JSON> was chosen because other
|
1293 |
+
methods called by the Perl core (== not by the user of the object) are
|
1294 |
+
usually in upper case letters and to avoid collisions with the C<to_json>
|
1295 |
+
function or method.
|
1296 |
+
|
1297 |
+
This setting does not yet influence C<decode> in any way.
|
1298 |
+
|
1299 |
+
If C<$enable> is false, then the C<allow_blessed> setting will decide what
|
1300 |
+
to do when a blessed object is found.
|
1301 |
+
|
1302 |
+
=over
|
1303 |
+
|
1304 |
+
=item convert_blessed_universally mode
|
1305 |
+
|
1306 |
+
If you use C<JSON> with C<-convert_blessed_universally>, the C<UNIVERSAL::TO_JSON>
|
1307 |
+
subroutine is defined as in the code below:
|
1308 |
+
|
1309 |
+
*UNIVERSAL::TO_JSON = sub {
|
1310 |
+
my $b_obj = B::svref_2object( $_[0] );
|
1311 |
+
return $b_obj->isa('B::HV') ? { %{ $_[0] } }
|
1312 |
+
: $b_obj->isa('B::AV') ? [ @{ $_[0] } ]
|
1313 |
+
: undef
|
1314 |
+
;
|
1315 |
+
}
|
1316 |
+
|
1317 |
+
This causes the C<encode> method to convert simple blessed objects into
|
1318 |
+
JSON objects as non-blessed object.
|
1319 |
+
|
1320 |
+
use JSON -convert_blessed_universally;
|
1321 |
+
$json->allow_blessed->convert_blessed->encode( $blessed_object )
|
1322 |
+
|
1323 |
+
This feature is experimental and may be removed in the future.
|
1324 |
+
|
1325 |
+
=back
|
1326 |
+
|
1327 |
+
=head2 filter_json_object
|
1328 |
+
|
1329 |
+
$json = $json->filter_json_object([$coderef])
|
1330 |
+
|
1331 |
+
When C<$coderef> is specified, it will be called from C<decode> each
|
1332 |
+
time it decodes a JSON object. The only argument passed to the coderef
|
1333 |
+
is a reference to the newly-created hash. If the code reference returns
|
1334 |
+
a single scalar (which need not be a reference), this value
|
1335 |
+
(i.e. a copy of that scalar to avoid aliasing) is inserted into the
|
1336 |
+
deserialised data structure. If it returns an empty list
|
1337 |
+
(NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
|
1338 |
+
hash will be inserted. This setting can slow down decoding considerably.
|
1339 |
+
|
1340 |
+
When C<$coderef> is omitted or undefined, any existing callback will
|
1341 |
+
be removed and C<decode> will not change the deserialised hash in any
|
1342 |
+
way.
|
1343 |
+
|
1344 |
+
Example, convert all JSON objects into the integer 5:
|
1345 |
+
|
1346 |
+
my $js = JSON->new->filter_json_object (sub { 5 });
|
1347 |
+
# returns [5]
|
1348 |
+
$js->decode ('[{}]'); # the given subroutine takes a hash reference.
|
1349 |
+
# throw an exception because allow_nonref is not enabled
|
1350 |
+
# so a lone 5 is not allowed.
|
1351 |
+
$js->decode ('{"a":1, "b":2}');
|
1352 |
+
|
1353 |
+
|
1354 |
+
=head2 filter_json_single_key_object
|
1355 |
+
|
1356 |
+
$json = $json->filter_json_single_key_object($key [=> $coderef])
|
1357 |
+
|
1358 |
+
Works remotely similar to C<filter_json_object>, but is only called for
|
1359 |
+
JSON objects having a single key named C<$key>.
|
1360 |
+
|
1361 |
+
This C<$coderef> is called before the one specified via
|
1362 |
+
C<filter_json_object>, if any. It gets passed the single value in the JSON
|
1363 |
+
object. If it returns a single value, it will be inserted into the data
|
1364 |
+
structure. If it returns nothing (not even C<undef> but the empty list),
|
1365 |
+
the callback from C<filter_json_object> will be called next, as if no
|
1366 |
+
single-key callback were specified.
|
1367 |
+
|
1368 |
+
If C<$coderef> is omitted or undefined, the corresponding callback will be
|
1369 |
+
disabled. There can only ever be one callback for a given key.
|
1370 |
+
|
1371 |
+
As this callback gets called less often than the C<filter_json_object>
|
1372 |
+
one, decoding speed will not usually suffer as much. Therefore, single-key
|
1373 |
+
objects make excellent targets to serialise Perl objects into, especially
|
1374 |
+
as single-key JSON objects are as close to the type-tagged value concept
|
1375 |
+
as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
|
1376 |
+
support this in any way, so you need to make sure your data never looks
|
1377 |
+
like a serialised Perl hash.
|
1378 |
+
|
1379 |
+
Typical names for the single object key are C<__class_whatever__>, or
|
1380 |
+
C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
|
1381 |
+
things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
|
1382 |
+
with real hashes.
|
1383 |
+
|
1384 |
+
Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
|
1385 |
+
into the corresponding C<< $WIDGET{<id>} >> object:
|
1386 |
+
|
1387 |
+
# return whatever is in $WIDGET{5}:
|
1388 |
+
JSON
|
1389 |
+
->new
|
1390 |
+
->filter_json_single_key_object (__widget__ => sub {
|
1391 |
+
$WIDGET{ $_[0] }
|
1392 |
+
})
|
1393 |
+
->decode ('{"__widget__": 5}')
|
1394 |
+
|
1395 |
+
# this can be used with a TO_JSON method in some "widget" class
|
1396 |
+
# for serialisation to json:
|
1397 |
+
sub WidgetBase::TO_JSON {
|
1398 |
+
my ($self) = @_;
|
1399 |
+
|
1400 |
+
unless ($self->{id}) {
|
1401 |
+
$self->{id} = ..get..some..id..;
|
1402 |
+
$WIDGET{$self->{id}} = $self;
|
1403 |
+
}
|
1404 |
+
|
1405 |
+
{ __widget__ => $self->{id} }
|
1406 |
+
}
|
1407 |
+
|
1408 |
+
|
1409 |
+
=head2 shrink
|
1410 |
+
|
1411 |
+
$json = $json->shrink([$enable])
|
1412 |
+
|
1413 |
+
$enabled = $json->get_shrink
|
1414 |
+
|
1415 |
+
With JSON::XS, this flag resizes strings generated by either
|
1416 |
+
C<encode> or C<decode> to their minimum size possible. This can save
|
1417 |
+
memory when your JSON texts are either very very long or you have many
|
1418 |
+
short strings. It will also try to downgrade any strings to octet-form
|
1419 |
+
if possible: perl stores strings internally either in an encoding called
|
1420 |
+
UTF-X or in octet-form. The latter cannot store everything but uses less
|
1421 |
+
space in general (and some buggy Perl or C code might even rely on that
|
1422 |
+
internal representation being used).
|
1423 |
+
|
1424 |
+
With JSON::PP, it is noop about resizing strings but tries
|
1425 |
+
C<utf8::downgrade> to the returned string by C<encode>. See L<utf8>.
|
1426 |
+
|
1427 |
+
See to L<JSON::XS/OBJECT-ORIENTED INTERFACE> and L<JSON::PP/METHODS>.
|
1428 |
+
|
1429 |
+
=head2 max_depth
|
1430 |
+
|
1431 |
+
$json = $json->max_depth([$maximum_nesting_depth])
|
1432 |
+
|
1433 |
+
$max_depth = $json->get_max_depth
|
1434 |
+
|
1435 |
+
Sets the maximum nesting level (default C<512>) accepted while encoding
|
1436 |
+
or decoding. If a higher nesting level is detected in JSON text or a Perl
|
1437 |
+
data structure, then the encoder and decoder will stop and croak at that
|
1438 |
+
point.
|
1439 |
+
|
1440 |
+
Nesting level is defined by number of hash- or arrayrefs that the encoder
|
1441 |
+
needs to traverse to reach a given point or the number of C<{> or C<[>
|
1442 |
+
characters without their matching closing parenthesis crossed to reach a
|
1443 |
+
given character in a string.
|
1444 |
+
|
1445 |
+
If no argument is given, the highest possible setting will be used, which
|
1446 |
+
is rarely useful.
|
1447 |
+
|
1448 |
+
Note that nesting is implemented by recursion in C. The default value has
|
1449 |
+
been chosen to be as large as typical operating systems allow without
|
1450 |
+
crashing. (JSON::XS)
|
1451 |
+
|
1452 |
+
With JSON::PP as the backend, when a large value (100 or more) was set and
|
1453 |
+
it de/encodes a deep nested object/text, it may raise a warning
|
1454 |
+
'Deep recursion on subroutine' at the perl runtime phase.
|
1455 |
+
|
1456 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
|
1457 |
+
|
1458 |
+
=head2 max_size
|
1459 |
+
|
1460 |
+
$json = $json->max_size([$maximum_string_size])
|
1461 |
+
|
1462 |
+
$max_size = $json->get_max_size
|
1463 |
+
|
1464 |
+
Set the maximum length a JSON text may have (in bytes) where decoding is
|
1465 |
+
being attempted. The default is C<0>, meaning no limit. When C<decode>
|
1466 |
+
is called on a string that is longer than this many bytes, it will not
|
1467 |
+
attempt to decode the string but throw an exception. This setting has no
|
1468 |
+
effect on C<encode> (yet).
|
1469 |
+
|
1470 |
+
If no argument is given, the limit check will be deactivated (same as when
|
1471 |
+
C<0> is specified).
|
1472 |
+
|
1473 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS>, below, for more info on why this is useful.
|
1474 |
+
|
1475 |
+
=head2 encode
|
1476 |
+
|
1477 |
+
$json_text = $json->encode($perl_scalar)
|
1478 |
+
|
1479 |
+
Converts the given Perl data structure (a simple scalar or a reference
|
1480 |
+
to a hash or array) to its JSON representation. Simple scalars will be
|
1481 |
+
converted into JSON string or number sequences, while references to arrays
|
1482 |
+
become JSON arrays and references to hashes become JSON objects. Undefined
|
1483 |
+
Perl values (e.g. C<undef>) become JSON C<null> values.
|
1484 |
+
References to the integers C<0> and C<1> are converted into C<false> and C<true>.
|
1485 |
+
|
1486 |
+
=head2 decode
|
1487 |
+
|
1488 |
+
$perl_scalar = $json->decode($json_text)
|
1489 |
+
|
1490 |
+
The opposite of C<encode>: expects a JSON text and tries to parse it,
|
1491 |
+
returning the resulting simple scalar or reference. Croaks on error.
|
1492 |
+
|
1493 |
+
JSON numbers and strings become simple Perl scalars. JSON arrays become
|
1494 |
+
Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
|
1495 |
+
C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
|
1496 |
+
C<null> becomes C<undef>.
|
1497 |
+
|
1498 |
+
=head2 decode_prefix
|
1499 |
+
|
1500 |
+
($perl_scalar, $characters) = $json->decode_prefix($json_text)
|
1501 |
+
|
1502 |
+
This works like the C<decode> method, but instead of raising an exception
|
1503 |
+
when there is trailing garbage after the first JSON object, it will
|
1504 |
+
silently stop parsing there and return the number of characters consumed
|
1505 |
+
so far.
|
1506 |
+
|
1507 |
+
JSON->new->decode_prefix ("[1] the tail")
|
1508 |
+
=> ([1], 3)
|
1509 |
+
|
1510 |
+
See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
|
1511 |
+
|
1512 |
+
=head2 property
|
1513 |
+
|
1514 |
+
$boolean = $json->property($property_name)
|
1515 |
+
|
1516 |
+
Returns the boolean value of the given property.
|
1517 |
+
|
1518 |
+
The available properties are C<ascii>, C<latin1>, C<utf8>,
|
1519 |
+
C<indent>, C<space_before>, C<space_after>, C<relaxed>, C<canonical>,
|
1520 |
+
C<allow_nonref>, C<allow_unknown>, C<allow_blessed>, C<convert_blessed>,
|
1521 |
+
C<shrink>, C<max_depth> and C<max_size>.
|
1522 |
+
|
1523 |
+
$boolean = $json->property('utf8');
|
1524 |
+
=> 0
|
1525 |
+
$json->utf8;
|
1526 |
+
$boolean = $json->property('utf8');
|
1527 |
+
=> 1
|
1528 |
+
|
1529 |
+
Sets the property with a given boolean value.
|
1530 |
+
|
1531 |
+
$json = $json->property($property_name => $boolean);
|
1532 |
+
|
1533 |
+
With no argument, it returns all the above properties as a hash reference.
|
1534 |
+
|
1535 |
+
$flag_hashref = $json->property();
|
1536 |
+
|
1537 |
+
=head1 INCREMENTAL PARSING
|
1538 |
+
|
1539 |
+
Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
|
1540 |
+
|
1541 |
+
In some cases, there is the need for incremental parsing of JSON texts.
|
1542 |
+
This module does allow you to parse a JSON stream incrementally.
|
1543 |
+
It does so by accumulating text until it has a full JSON object, which
|
1544 |
+
it then can decode. This process is similar to using C<decode_prefix>
|
1545 |
+
to see if a full JSON object is available, but is much more efficient
|
1546 |
+
(and can be implemented with a minimum of method calls).
|
1547 |
+
|
1548 |
+
The backend module will only attempt to parse the JSON text once it is sure it
|
1549 |
+
has enough text to get a decisive result, using a very simple but
|
1550 |
+
truly incremental parser. This means that it sometimes won't stop as
|
1551 |
+
early as the full parser, for example, it doesn't detect parenthesis
|
1552 |
+
mismatches. The only thing it guarantees is that it starts decoding as
|
1553 |
+
soon as a syntactically valid JSON text has been seen. This means you need
|
1554 |
+
to set resource limits (e.g. C<max_size>) to ensure the parser will stop
|
1555 |
+
parsing in the presence of syntax errors.
|
1556 |
+
|
1557 |
+
The following methods implement this incremental parser.
|
1558 |
+
|
1559 |
+
=head2 incr_parse
|
1560 |
+
|
1561 |
+
$json->incr_parse( [$string] ) # void context
|
1562 |
+
|
1563 |
+
$obj_or_undef = $json->incr_parse( [$string] ) # scalar context
|
1564 |
+
|
1565 |
+
@obj_or_empty = $json->incr_parse( [$string] ) # list context
|
1566 |
+
|
1567 |
+
This is the central parsing function. It can both append new text and
|
1568 |
+
extract objects from the stream accumulated so far (both of these
|
1569 |
+
functions are optional).
|
1570 |
+
|
1571 |
+
If C<$string> is given, then this string is appended to the already
|
1572 |
+
existing JSON fragment stored in the C<$json> object.
|
1573 |
+
|
1574 |
+
After that, if the function is called in void context, it will simply
|
1575 |
+
return without doing anything further. This can be used to add more text
|
1576 |
+
in as many chunks as you want.
|
1577 |
+
|
1578 |
+
If the method is called in scalar context, then it will try to extract
|
1579 |
+
exactly I<one> JSON object. If that is successful, it will return this
|
1580 |
+
object, otherwise it will return C<undef>. If there is a parse error,
|
1581 |
+
this method will croak just as C<decode> would do (one can then use
|
1582 |
+
C<incr_skip> to skip the erroneous part). This is the most common way of
|
1583 |
+
using the method.
|
1584 |
+
|
1585 |
+
And finally, in list context, it will try to extract as many objects
|
1586 |
+
from the stream as it can find and return them, or the empty list
|
1587 |
+
otherwise. For this to work, there must be no separators between the JSON
|
1588 |
+
objects or arrays, instead they must be concatenated back-to-back. If
|
1589 |
+
an error occurs, an exception will be raised as in the scalar context
|
1590 |
+
case. Note that in this case, any previously-parsed JSON texts will be
|
1591 |
+
lost.
|
1592 |
+
|
1593 |
+
Example: Parse some JSON arrays/objects in a given string and return them.
|
1594 |
+
|
1595 |
+
my @objs = JSON->new->incr_parse ("[5][7][1,2]");
|
1596 |
+
|
1597 |
+
=head2 incr_text
|
1598 |
+
|
1599 |
+
$lvalue_string = $json->incr_text
|
1600 |
+
|
1601 |
+
This method returns the currently stored JSON fragment as an lvalue, that
|
1602 |
+
is, you can manipulate it. This I<only> works when a preceding call to
|
1603 |
+
C<incr_parse> in I<scalar context> successfully returned an object. Under
|
1604 |
+
all other circumstances you must not call this function (I mean it.
|
1605 |
+
although in simple tests it might actually work, it I<will> fail under
|
1606 |
+
real world conditions). As a special exception, you can also call this
|
1607 |
+
method before having parsed anything.
|
1608 |
+
|
1609 |
+
This function is useful in two cases: a) finding the trailing text after a
|
1610 |
+
JSON object or b) parsing multiple JSON objects separated by non-JSON text
|
1611 |
+
(such as commas).
|
1612 |
+
|
1613 |
+
$json->incr_text =~ s/\s*,\s*//;
|
1614 |
+
|
1615 |
+
In Perl 5.005, C<lvalue> attribute is not available.
|
1616 |
+
You must write code like the following:
|
1617 |
+
|
1618 |
+
$string = $json->incr_text;
|
1619 |
+
$string =~ s/\s*,\s*//;
|
1620 |
+
$json->incr_text( $string );
|
1621 |
+
|
1622 |
+
=head2 incr_skip
|
1623 |
+
|
1624 |
+
$json->incr_skip
|
1625 |
+
|
1626 |
+
This will reset the state of the incremental parser and will remove the
|
1627 |
+
parsed text from the input buffer. This is useful after C<incr_parse>
|
1628 |
+
died, in which case the input buffer and incremental parser state is left
|
1629 |
+
unchanged, to skip the text parsed so far and to reset the parse state.
|
1630 |
+
|
1631 |
+
=head2 incr_reset
|
1632 |
+
|
1633 |
+
$json->incr_reset
|
1634 |
+
|
1635 |
+
This completely resets the incremental parser, that is, after this call,
|
1636 |
+
it will be as if the parser had never parsed anything.
|
1637 |
+
|
1638 |
+
This is useful if you want to repeatedly parse JSON objects and want to
|
1639 |
+
ignore any trailing data, which means you have to reset the parser after
|
1640 |
+
each successful decode.
|
1641 |
+
|
1642 |
+
See to L<JSON::XS/INCREMENTAL PARSING> for examples.
|
1643 |
+
|
1644 |
+
|
1645 |
+
=head1 JSON::PP SUPPORT METHODS
|
1646 |
+
|
1647 |
+
The methods below are JSON::PP's own methods, so when C<JSON> works
|
1648 |
+
with JSON::PP (i.e. the created object is a JSON::PP object), they are available.
|
1649 |
+
See to L<JSON::PP/JSON::PP OWN METHODS> in detail.
|
1650 |
+
|
1651 |
+
If you use C<JSON> with additional C<-support_by_pp>, some methods
|
1652 |
+
are available even with JSON::XS. See to L<USE PP FEATURES EVEN THOUGH XS BACKEND>.
|
1653 |
+
|
1654 |
+
BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::XS' }
|
1655 |
+
|
1656 |
+
use JSON -support_by_pp;
|
1657 |
+
|
1658 |
+
my $json = JSON->new;
|
1659 |
+
$json->allow_nonref->escape_slash->encode("/");
|
1660 |
+
|
1661 |
+
# functional interfaces too.
|
1662 |
+
print to_json(["/"], {escape_slash => 1});
|
1663 |
+
print from_json('["foo"]', {utf8 => 1});
|
1664 |
+
|
1665 |
+
If you want C<-support_by_pp> but do not want any functions to be exported,
|
1666 |
+
use C<-no_export>.
|
1667 |
+
|
1668 |
+
use JSON -support_by_pp, -no_export;
|
1669 |
+
# functional interfaces are not exported.
|
1670 |
+
|
1671 |
+
=head2 allow_singlequote
|
1672 |
+
|
1673 |
+
$json = $json->allow_singlequote([$enable])
|
1674 |
+
|
1675 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
1676 |
+
any JSON strings quoted by single quotations that are invalid JSON
|
1677 |
+
format.
|
1678 |
+
|
1679 |
+
$json->allow_singlequote->decode(q|{"foo":'bar'}|);
|
1680 |
+
$json->allow_singlequote->decode(q|{'foo':"bar"}|);
|
1681 |
+
$json->allow_singlequote->decode(q|{'foo':'bar'}|);
|
1682 |
+
|
1683 |
+
As with the C<relaxed> option, this option may be used to parse
|
1684 |
+
application-specific files written by humans.
|
1685 |
+
|
1686 |
+
=head2 allow_barekey
|
1687 |
+
|
1688 |
+
$json = $json->allow_barekey([$enable])
|
1689 |
+
|
1690 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
1691 |
+
bare keys of JSON object that are invalid JSON format.
|
1692 |
+
|
1693 |
+
As with the C<relaxed> option, this option may be used to parse
|
1694 |
+
application-specific files written by humans.
|
1695 |
+
|
1696 |
+
$json->allow_barekey->decode('{foo:"bar"}');
|
1697 |
+
|
1698 |
+
=head2 allow_bignum
|
1699 |
+
|
1700 |
+
$json = $json->allow_bignum([$enable])
|
1701 |
+
|
1702 |
+
If C<$enable> is true (or missing), then C<decode> will convert
|
1703 |
+
the big integer Perl cannot handle as integer into a L<Math::BigInt>
|
1704 |
+
object and convert a floating number (any) into a L<Math::BigFloat>.
|
1705 |
+
|
1706 |
+
On the contrary, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
1707 |
+
objects into JSON numbers when C<allow_blessed> is enabled.
|
1708 |
+
|
1709 |
+
$json->allow_nonref->allow_blessed->allow_bignum;
|
1710 |
+
$bigfloat = $json->decode('2.000000000000000000000000001');
|
1711 |
+
print $json->encode($bigfloat);
|
1712 |
+
# => 2.000000000000000000000000001
|
1713 |
+
|
1714 |
+
See to L<MAPPING> about the conversion of JSON number.
|
1715 |
+
|
1716 |
+
=head2 loose
|
1717 |
+
|
1718 |
+
$json = $json->loose([$enable])
|
1719 |
+
|
1720 |
+
The unescaped [\x00-\x1f\x22\x2f\x5c] strings are invalid in JSON strings
|
1721 |
+
and the module does not allow C<decode> to accept them (except for \x2f).
|
1722 |
+
If C<$enable> is true (or missing), then C<decode> will accept these
|
1723 |
+
unescaped strings.
|
1724 |
+
|
1725 |
+
$json->loose->decode(qq|["abc
|
1726 |
+
def"]|);
|
1727 |
+
|
1728 |
+
See to L<JSON::PP/JSON::PP OWN METHODS>.
|
1729 |
+
|
1730 |
+
=head2 escape_slash
|
1731 |
+
|
1732 |
+
$json = $json->escape_slash([$enable])
|
1733 |
+
|
1734 |
+
According to JSON Grammar, I<slash> (U+002F) is escaped. But by default
|
1735 |
+
JSON backend modules encode strings without escaping slash.
|
1736 |
+
|
1737 |
+
If C<$enable> is true (or missing), then C<encode> will escape slashes.
|
1738 |
+
|
1739 |
+
=head2 indent_length
|
1740 |
+
|
1741 |
+
$json = $json->indent_length($length)
|
1742 |
+
|
1743 |
+
With JSON::XS, The indent space length is 3 and cannot be changed.
|
1744 |
+
With JSON::PP, it sets the indent space length with the given $length.
|
1745 |
+
The default is 3. The acceptable range is 0 to 15.
|
1746 |
+
|
1747 |
+
=head2 sort_by
|
1748 |
+
|
1749 |
+
$json = $json->sort_by($function_name)
|
1750 |
+
$json = $json->sort_by($subroutine_ref)
|
1751 |
+
|
1752 |
+
If $function_name or $subroutine_ref is set, that sort routine is used.
|
1753 |
+
|
1754 |
+
$js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
|
1755 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
1756 |
+
|
1757 |
+
$js = $pc->sort_by('own_sort')->encode($obj);
|
1758 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
1759 |
+
|
1760 |
+
sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
|
1761 |
+
|
1762 |
+
As the sorting routine runs in the JSON::PP scope, the given
|
1763 |
+
subroutine name and the special variables C<$a>, C<$b> will begin
|
1764 |
+
with 'JSON::PP::'.
|
1765 |
+
|
1766 |
+
If $integer is set, then the effect is the same as turning C<canonical> on.
|
1767 |
+
|
1768 |
+
See to L<JSON::PP/JSON::PP OWN METHODS>.
|
1769 |
+
|
1770 |
+
=head1 MAPPING
|
1771 |
+
|
1772 |
+
This section is copied from JSON::XS and modified to C<JSON>.
|
1773 |
+
JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
|
1774 |
+
|
1775 |
+
See to L<JSON::XS/MAPPING>.
|
1776 |
+
|
1777 |
+
=head2 JSON -> PERL
|
1778 |
+
|
1779 |
+
=over 4
|
1780 |
+
|
1781 |
+
=item object
|
1782 |
+
|
1783 |
+
A JSON object becomes a reference to a hash in Perl. No ordering of object
|
1784 |
+
keys is preserved (JSON does not preserve object key ordering itself).
|
1785 |
+
|
1786 |
+
=item array
|
1787 |
+
|
1788 |
+
A JSON array becomes a reference to an array in Perl.
|
1789 |
+
|
1790 |
+
=item string
|
1791 |
+
|
1792 |
+
A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
|
1793 |
+
are represented by the same codepoints in the Perl string, so no manual
|
1794 |
+
decoding is necessary.
|
1795 |
+
|
1796 |
+
=item number
|
1797 |
+
|
1798 |
+
A JSON number becomes either an integer, numeric (floating point) or
|
1799 |
+
string scalar in perl, depending on its range and any fractional parts. On
|
1800 |
+
the Perl level, there is no difference between those as Perl handles all
|
1801 |
+
the conversion details, but an integer may take slightly less memory and
|
1802 |
+
might represent more values exactly than floating point numbers.
|
1803 |
+
|
1804 |
+
If the number consists of digits only, C<JSON> will try to represent
|
1805 |
+
it as an integer value. If that fails, it will try to represent it as
|
1806 |
+
a numeric (floating point) value if that is possible without loss of
|
1807 |
+
precision. Otherwise it will preserve the number as a string value (in
|
1808 |
+
which case you lose roundtripping ability, as the JSON number will be
|
1809 |
+
re-encoded to a JSON string).
|
1810 |
+
|
1811 |
+
Numbers containing a fractional or exponential part will always be
|
1812 |
+
represented as numeric (floating point) values, possibly at a loss of
|
1813 |
+
precision (in which case you might lose perfect roundtripping ability, but
|
1814 |
+
the JSON number will still be re-encoded as a JSON number).
|
1815 |
+
|
1816 |
+
Note that precision is not accuracy - binary floating point values cannot
|
1817 |
+
represent most decimal fractions exactly, and when converting from and to
|
1818 |
+
floating point, C<JSON> only guarantees precision up to but not including
|
1819 |
+
the least significant bit.
|
1820 |
+
|
1821 |
+
If the backend is JSON::PP and C<allow_bignum> is enabled, the big integers
|
1822 |
+
and the numeric can be optionally converted into L<Math::BigInt> and
|
1823 |
+
L<Math::BigFloat> objects.
|
1824 |
+
|
1825 |
+
=item true, false
|
1826 |
+
|
1827 |
+
These JSON atoms become C<JSON::true> and C<JSON::false>,
|
1828 |
+
respectively. They are overloaded to act almost exactly like the numbers
|
1829 |
+
C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
|
1830 |
+
the C<JSON::is_bool> function.
|
1831 |
+
|
1832 |
+
print JSON::true + 1;
|
1833 |
+
=> 2
|
1834 |
+
|
1835 |
+
ok(JSON::true eq '1');
|
1836 |
+
ok(JSON::true == 1);
|
1837 |
+
|
1838 |
+
C<JSON> will install these missing overloading features to the backend modules.
|
1839 |
+
|
1840 |
+
|
1841 |
+
=item null
|
1842 |
+
|
1843 |
+
A JSON null atom becomes C<undef> in Perl.
|
1844 |
+
|
1845 |
+
C<JSON::null> returns C<undef>.
|
1846 |
+
|
1847 |
+
=back
|
1848 |
+
|
1849 |
+
|
1850 |
+
=head2 PERL -> JSON
|
1851 |
+
|
1852 |
+
The mapping from Perl to JSON is slightly more difficult, as Perl is a
|
1853 |
+
truly typeless language, so we can only guess which JSON type is meant by
|
1854 |
+
a Perl value.
|
1855 |
+
|
1856 |
+
=over 4
|
1857 |
+
|
1858 |
+
=item hash references
|
1859 |
+
|
1860 |
+
Perl hash references become JSON objects. As there is no inherent ordering
|
1861 |
+
in hash keys (or JSON objects), they will usually be encoded in a
|
1862 |
+
pseudo-random order that can change between runs of the same program but
|
1863 |
+
stays generally the same within a single run of a program. C<JSON>
|
1864 |
+
can optionally sort the hash keys (determined by the I<canonical> flag), so
|
1865 |
+
the same data structure will serialise to the same JSON text (given same
|
1866 |
+
settings and version of JSON::XS), but this incurs a runtime overhead
|
1867 |
+
and is only rarely useful, e.g. when you want to compare some JSON text
|
1868 |
+
against another for equality.
|
1869 |
+
|
1870 |
+
In future, the ordered object feature will be added to JSON::PP using C<tie> mechanism.
|
1871 |
+
|
1872 |
+
|
1873 |
+
=item array references
|
1874 |
+
|
1875 |
+
Perl array references become JSON arrays.
|
1876 |
+
|
1877 |
+
=item other references
|
1878 |
+
|
1879 |
+
Other unblessed references are generally not allowed and will cause an
|
1880 |
+
exception to be thrown, except for references to the integers C<0> and
|
1881 |
+
C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
|
1882 |
+
also use C<JSON::false> and C<JSON::true> to improve readability.
|
1883 |
+
|
1884 |
+
to_json [\0,JSON::true] # yields [false,true]
|
1885 |
+
|
1886 |
+
=item JSON::true, JSON::false, JSON::null
|
1887 |
+
|
1888 |
+
These special values become JSON true and JSON false values,
|
1889 |
+
respectively. You can also use C<\1> and C<\0> directly if you want.
|
1890 |
+
|
1891 |
+
JSON::null returns C<undef>.
|
1892 |
+
|
1893 |
+
=item blessed objects
|
1894 |
+
|
1895 |
+
Blessed objects are not directly representable in JSON. See the
|
1896 |
+
C<allow_blessed> and C<convert_blessed> methods on various options on
|
1897 |
+
how to deal with this: basically, you can choose between throwing an
|
1898 |
+
exception, encoding the reference as if it weren't blessed, or provide
|
1899 |
+
your own serialiser method.
|
1900 |
+
|
1901 |
+
With C<convert_blessed_universally> mode, C<encode> converts blessed
|
1902 |
+
hash references or blessed array references (contains other blessed references)
|
1903 |
+
into JSON members and arrays.
|
1904 |
+
|
1905 |
+
use JSON -convert_blessed_universally;
|
1906 |
+
JSON->new->allow_blessed->convert_blessed->encode( $blessed_object );
|
1907 |
+
|
1908 |
+
See to L<convert_blessed>.
|
1909 |
+
|
1910 |
+
=item simple scalars
|
1911 |
+
|
1912 |
+
Simple Perl scalars (any scalar that is not a reference) are the most
|
1913 |
+
difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
|
1914 |
+
JSON C<null> values, scalars that have last been used in a string context
|
1915 |
+
before encoding as JSON strings, and anything else as number value:
|
1916 |
+
|
1917 |
+
# dump as number
|
1918 |
+
encode_json [2] # yields [2]
|
1919 |
+
encode_json [-3.0e17] # yields [-3e+17]
|
1920 |
+
my $value = 5; encode_json [$value] # yields [5]
|
1921 |
+
|
1922 |
+
# used as string, so dump as string
|
1923 |
+
print $value;
|
1924 |
+
encode_json [$value] # yields ["5"]
|
1925 |
+
|
1926 |
+
# undef becomes null
|
1927 |
+
encode_json [undef] # yields [null]
|
1928 |
+
|
1929 |
+
You can force the type to be a string by stringifying it:
|
1930 |
+
|
1931 |
+
my $x = 3.1; # some variable containing a number
|
1932 |
+
"$x"; # stringified
|
1933 |
+
$x .= ""; # another, more awkward way to stringify
|
1934 |
+
print $x; # perl does it for you, too, quite often
|
1935 |
+
|
1936 |
+
You can force the type to be a number by numifying it:
|
1937 |
+
|
1938 |
+
my $x = "3"; # some variable containing a string
|
1939 |
+
$x += 0; # numify it, ensuring it will be dumped as a number
|
1940 |
+
$x *= 1; # same thing, the choice is yours.
|
1941 |
+
|
1942 |
+
You can not currently force the type in other, less obscure, ways.
|
1943 |
+
|
1944 |
+
Note that numerical precision has the same meaning as under Perl (so
|
1945 |
+
binary to decimal conversion follows the same rules as in Perl, which
|
1946 |
+
can differ to other languages). Also, your perl interpreter might expose
|
1947 |
+
extensions to the floating point numbers of your platform, such as
|
1948 |
+
infinities or NaN's - these cannot be represented in JSON, and it is an
|
1949 |
+
error to pass those in.
|
1950 |
+
|
1951 |
+
=item Big Number
|
1952 |
+
|
1953 |
+
If the backend is JSON::PP and C<allow_bignum> is enabled,
|
1954 |
+
C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
1955 |
+
objects into JSON numbers.
|
1956 |
+
|
1957 |
+
|
1958 |
+
=back
|
1959 |
+
|
1960 |
+
=head1 JSON and ECMAscript
|
1961 |
+
|
1962 |
+
See to L<JSON::XS/JSON and ECMAscript>.
|
1963 |
+
|
1964 |
+
=head1 JSON and YAML
|
1965 |
+
|
1966 |
+
JSON is not a subset of YAML.
|
1967 |
+
See to L<JSON::XS/JSON and YAML>.
|
1968 |
+
|
1969 |
+
|
1970 |
+
=head1 BACKEND MODULE DECISION
|
1971 |
+
|
1972 |
+
When you use C<JSON>, C<JSON> tries to C<use> JSON::XS. If this call fails, it will
|
1973 |
+
C<use> JSON::PP. The required JSON::XS version is I<2.2> or later.
|
1974 |
+
|
1975 |
+
The C<JSON> constructor method returns an object inherited from the backend module,
|
1976 |
+
and JSON::XS object is a blessed scalar reference while JSON::PP is a blessed hash
|
1977 |
+
reference.
|
1978 |
+
|
1979 |
+
So, your program should not depend on the backend module, especially
|
1980 |
+
returned objects should not be modified.
|
1981 |
+
|
1982 |
+
my $json = JSON->new; # XS or PP?
|
1983 |
+
$json->{stash} = 'this is xs object'; # this code may raise an error!
|
1984 |
+
|
1985 |
+
To check the backend module, there are some methods - C<backend>, C<is_pp> and C<is_xs>.
|
1986 |
+
|
1987 |
+
JSON->backend; # 'JSON::XS' or 'JSON::PP'
|
1988 |
+
|
1989 |
+
JSON->backend->is_pp; # 0 or 1
|
1990 |
+
|
1991 |
+
JSON->backend->is_xs; # 1 or 0
|
1992 |
+
|
1993 |
+
$json->is_xs; # 1 or 0
|
1994 |
+
|
1995 |
+
$json->is_pp; # 0 or 1
|
1996 |
+
|
1997 |
+
|
1998 |
+
If you set an environment variable C<PERL_JSON_BACKEND>, the calling action will be changed.
|
1999 |
+
|
2000 |
+
=over
|
2001 |
+
|
2002 |
+
=item PERL_JSON_BACKEND = 0 or PERL_JSON_BACKEND = 'JSON::PP'
|
2003 |
+
|
2004 |
+
Always use JSON::PP
|
2005 |
+
|
2006 |
+
=item PERL_JSON_BACKEND == 1 or PERL_JSON_BACKEND = 'JSON::XS,JSON::PP'
|
2007 |
+
|
2008 |
+
(The default) Use compiled JSON::XS if it is properly compiled & installed,
|
2009 |
+
otherwise use JSON::PP.
|
2010 |
+
|
2011 |
+
=item PERL_JSON_BACKEND == 2 or PERL_JSON_BACKEND = 'JSON::XS'
|
2012 |
+
|
2013 |
+
Always use compiled JSON::XS, die if it isn't properly compiled & installed.
|
2014 |
+
|
2015 |
+
=item PERL_JSON_BACKEND = 'JSON::backportPP'
|
2016 |
+
|
2017 |
+
Always use JSON::backportPP.
|
2018 |
+
JSON::backportPP is JSON::PP back port module.
|
2019 |
+
C<JSON> includes JSON::backportPP instead of JSON::PP.
|
2020 |
+
|
2021 |
+
=back
|
2022 |
+
|
2023 |
+
These ideas come from L<DBI::PurePerl> mechanism.
|
2024 |
+
|
2025 |
+
example:
|
2026 |
+
|
2027 |
+
BEGIN { $ENV{PERL_JSON_BACKEND} = 'JSON::PP' }
|
2028 |
+
use JSON; # always uses JSON::PP
|
2029 |
+
|
2030 |
+
In the future, it may be possible to specify another module.
|
2031 |
+
|
2032 |
+
=head1 USE PP FEATURES EVEN THOUGH XS BACKEND
|
2033 |
+
|
2034 |
+
Many methods are available with either JSON::XS or JSON::PP and
|
2035 |
+
when the backend module is JSON::XS, if any JSON::PP specific (i.e. JSON::XS unsupported)
|
2036 |
+
method is called, it will C<warn> and be noop.
|
2037 |
+
|
2038 |
+
But if you C<use> C<JSON> passing the optional string C<-support_by_pp>,
|
2039 |
+
it makes a part of those unsupported methods available.
|
2040 |
+
This feature is achieved by using JSON::PP in C<de/encode>.
|
2041 |
+
|
2042 |
+
BEGIN { $ENV{PERL_JSON_BACKEND} = 2 } # with JSON::XS
|
2043 |
+
use JSON -support_by_pp;
|
2044 |
+
my $json = JSON->new;
|
2045 |
+
$json->allow_nonref->escape_slash->encode("/");
|
2046 |
+
|
2047 |
+
At this time, the returned object is a C<JSON::Backend::XS::Supportable>
|
2048 |
+
object (re-blessed XS object), and by checking JSON::XS unsupported flags
|
2049 |
+
in de/encoding, can support some unsupported methods - C<loose>, C<allow_bignum>,
|
2050 |
+
C<allow_barekey>, C<allow_singlequote>, C<escape_slash> and C<indent_length>.
|
2051 |
+
|
2052 |
+
When none of the unsupported methods is enabled, C<XS de/encode> will be
|
2053 |
+
used as is. The switch is achieved by changing the symbolic tables.
|
2054 |
+
|
2055 |
+
C<-support_by_pp> is effective only when the backend module is JSON::XS
|
2056 |
+
and it makes the de/encoding speed down a bit.
|
2057 |
+
|
2058 |
+
See L<JSON::PP SUPPORT METHODS>.
|
2059 |
+
|
2060 |
+
=head1 INCOMPATIBLE CHANGES TO OLD VERSION
|
2061 |
+
|
2062 |
+
There are big incompatibilities between the new version (2.00) and the old one (1.xx).
|
2063 |
+
If you use old C<JSON> 1.xx in your code, please check it.
|
2064 |
+
|
2065 |
+
See L<Transition ways from 1.xx to 2.xx.>
|
2066 |
+
|
2067 |
+
=over
|
2068 |
+
|
2069 |
+
=item jsonToObj and objToJson are obsoleted.
|
2070 |
+
|
2071 |
+
Non Perl-style name C<jsonToObj> and C<objToJson> are obsoleted
|
2072 |
+
(but not yet deleted from the source).
|
2073 |
+
If you use these functions in your code, please replace them
|
2074 |
+
with C<from_json> and C<to_json>.
|
2075 |
+
|
2076 |
+
|
2077 |
+
=item Global variables are no longer available.
|
2078 |
+
|
2079 |
+
C<JSON> class variables - C<$JSON::AUTOCONVERT>, C<$JSON::BareKey>, etc...
|
2080 |
+
- are not available any longer.
|
2081 |
+
Instead, various features can be used through object methods.
|
2082 |
+
|
2083 |
+
|
2084 |
+
=item Package JSON::Converter and JSON::Parser are deleted.
|
2085 |
+
|
2086 |
+
Now C<JSON> bundles with JSON::PP which can handle JSON more properly than them.
|
2087 |
+
|
2088 |
+
=item Package JSON::NotString is deleted.
|
2089 |
+
|
2090 |
+
There was C<JSON::NotString> class which represents JSON value C<true>, C<false>, C<null>
|
2091 |
+
and numbers. It was deleted and replaced by C<JSON::Boolean>.
|
2092 |
+
|
2093 |
+
C<JSON::Boolean> represents C<true> and C<false>.
|
2094 |
+
|
2095 |
+
C<JSON::Boolean> does not represent C<null>.
|
2096 |
+
|
2097 |
+
C<JSON::null> returns C<undef>.
|
2098 |
+
|
2099 |
+
C<JSON> makes L<JSON::XS::Boolean> and L<JSON::PP::Boolean> is-a relation
|
2100 |
+
to L<JSON::Boolean>.
|
2101 |
+
|
2102 |
+
=item function JSON::Number is obsoleted.
|
2103 |
+
|
2104 |
+
C<JSON::Number> is now needless because JSON::XS and JSON::PP have
|
2105 |
+
round-trip integrity.
|
2106 |
+
|
2107 |
+
=item JSONRPC modules are deleted.
|
2108 |
+
|
2109 |
+
Perl implementation of JSON-RPC protocol - C<JSONRPC>, C<JSONRPC::Transport::HTTP>
|
2110 |
+
and C<Apache::JSONRPC> are deleted in this distribution.
|
2111 |
+
Instead of them, there is L<JSON::RPC> which supports JSON-RPC protocol version 1.1.
|
2112 |
+
|
2113 |
+
=back
|
2114 |
+
|
2115 |
+
=head2 Transition ways from 1.xx to 2.xx.
|
2116 |
+
|
2117 |
+
You should set C<support_by_pp> mode first, because
|
2118 |
+
it is always successful for the below codes even with JSON::XS.
|
2119 |
+
|
2120 |
+
use JSON -support_by_pp;
|
2121 |
+
|
2122 |
+
=over
|
2123 |
+
|
2124 |
+
=item Exported jsonToObj (simple)
|
2125 |
+
|
2126 |
+
from_json($json_text);
|
2127 |
+
|
2128 |
+
=item Exported objToJson (simple)
|
2129 |
+
|
2130 |
+
to_json($perl_scalar);
|
2131 |
+
|
2132 |
+
=item Exported jsonToObj (advanced)
|
2133 |
+
|
2134 |
+
$flags = {allow_barekey => 1, allow_singlequote => 1};
|
2135 |
+
from_json($json_text, $flags);
|
2136 |
+
|
2137 |
+
equivalent to:
|
2138 |
+
|
2139 |
+
$JSON::BareKey = 1;
|
2140 |
+
$JSON::QuotApos = 1;
|
2141 |
+
jsonToObj($json_text);
|
2142 |
+
|
2143 |
+
=item Exported objToJson (advanced)
|
2144 |
+
|
2145 |
+
$flags = {allow_blessed => 1, allow_barekey => 1};
|
2146 |
+
to_json($perl_scalar, $flags);
|
2147 |
+
|
2148 |
+
equivalent to:
|
2149 |
+
|
2150 |
+
$JSON::BareKey = 1;
|
2151 |
+
objToJson($perl_scalar);
|
2152 |
+
|
2153 |
+
=item jsonToObj as object method
|
2154 |
+
|
2155 |
+
$json->decode($json_text);
|
2156 |
+
|
2157 |
+
=item objToJson as object method
|
2158 |
+
|
2159 |
+
$json->encode($perl_scalar);
|
2160 |
+
|
2161 |
+
=item new method with parameters
|
2162 |
+
|
2163 |
+
The C<new> method in 2.x no longer takes any parameters.
|
2164 |
+
You can set parameters instead;
|
2165 |
+
|
2166 |
+
$json = JSON->new->pretty;
|
2167 |
+
|
2168 |
+
=item $JSON::Pretty, $JSON::Indent, $JSON::Delimiter
|
2169 |
+
|
2170 |
+
If C<indent> is enabled, that means the C<$JSON::Pretty> flag is set. And
|
2171 |
+
C<$JSON::Delimiter> was substituted by C<space_before> and C<space_after>.
|
2172 |
+
In conclusion:
|
2173 |
+
|
2174 |
+
$json->indent->space_before->space_after;
|
2175 |
+
|
2176 |
+
Equivalent to:
|
2177 |
+
|
2178 |
+
$json->pretty;
|
2179 |
+
|
2180 |
+
To change indent length, use C<indent_length>.
|
2181 |
+
|
2182 |
+
(Only with JSON::PP, if C<-support_by_pp> is not used.)
|
2183 |
+
|
2184 |
+
$json->pretty->indent_length(2)->encode($perl_scalar);
|
2185 |
+
|
2186 |
+
=item $JSON::BareKey
|
2187 |
+
|
2188 |
+
(Only with JSON::PP, if C<-support_by_pp> is not used.)
|
2189 |
+
|
2190 |
+
$json->allow_barekey->decode($json_text)
|
2191 |
+
|
2192 |
+
=item $JSON::ConvBlessed
|
2193 |
+
|
2194 |
+
Use C<-convert_blessed_universally>. See L<convert_blessed>.
|
2195 |
+
|
2196 |
+
=item $JSON::QuotApos
|
2197 |
+
|
2198 |
+
(Only with JSON::PP, if C<-support_by_pp> is not used.)
|
2199 |
+
|
2200 |
+
$json->allow_singlequote->decode($json_text)
|
2201 |
+
|
2202 |
+
=item $JSON::SingleQuote
|
2203 |
+
|
2204 |
+
Disabled. C<JSON> no longer produces such an invalid JSON string.
|
2205 |
+
|
2206 |
+
=item $JSON::KeySort
|
2207 |
+
|
2208 |
+
$json->canonical->encode($perl_scalar)
|
2209 |
+
|
2210 |
+
This is the ascii sort.
|
2211 |
+
|
2212 |
+
If you want to use with your own sort routine, check the C<sort_by> method.
|
2213 |
+
|
2214 |
+
(Only with JSON::PP, even if C<-support_by_pp> is used currently.)
|
2215 |
+
|
2216 |
+
$json->sort_by($sort_routine_ref)->encode($perl_scalar)
|
2217 |
+
|
2218 |
+
$json->sort_by(sub { $JSON::PP::a <=> $JSON::PP::b })->encode($perl_scalar)
|
2219 |
+
|
2220 |
+
The sort routine cannot access C<$a> and C<$b>; use C<$JSON::PP::a> and C<$JSON::PP::b> instead.
|
2221 |
+
|
2222 |
+
=item $JSON::SkipInvalid
|
2223 |
+
|
2224 |
+
$json->allow_unknown
|
2225 |
+
|
2226 |
+
=item $JSON::AUTOCONVERT
|
2227 |
+
|
2228 |
+
Needless. C<JSON> backend modules have the round-trip integrity.
|
2229 |
+
|
2230 |
+
=item $JSON::UTF8
|
2231 |
+
|
2232 |
+
Needless because C<JSON> (JSON::XS/JSON::PP) sets
|
2233 |
+
the UTF8 flag on properly.
|
2234 |
+
|
2235 |
+
# With UTF8-flagged strings
|
2236 |
+
|
2237 |
+
$json->allow_nonref;
|
2238 |
+
$str = chr(1000); # UTF8-flagged
|
2239 |
+
|
2240 |
+
$json_text = $json->utf8(0)->encode($str);
|
2241 |
+
utf8::is_utf8($json_text);
|
2242 |
+
# true
|
2243 |
+
$json_text = $json->utf8(1)->encode($str);
|
2244 |
+
utf8::is_utf8($json_text);
|
2245 |
+
# false
|
2246 |
+
|
2247 |
+
$str = '"' . chr(1000) . '"'; # UTF8-flagged
|
2248 |
+
|
2249 |
+
$perl_scalar = $json->utf8(0)->decode($str);
|
2250 |
+
utf8::is_utf8($perl_scalar);
|
2251 |
+
# true
|
2252 |
+
$perl_scalar = $json->utf8(1)->decode($str);
|
2253 |
+
# died because of 'Wide character in subroutine'
|
2254 |
+
|
2255 |
+
See L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
|
2256 |
+
|
2257 |
+
=item $JSON::UnMapping
|
2258 |
+
|
2259 |
+
Disabled. See L<MAPPING>.
|
2260 |
+
|
2261 |
+
=item $JSON::SelfConvert
|
2262 |
+
|
2263 |
+
This option was deleted.
|
2264 |
+
Instead of it, if a given blessed object has the C<TO_JSON> method,
|
2265 |
+
C<TO_JSON> will be executed with C<convert_blessed>.
|
2266 |
+
|
2267 |
+
$json->convert_blessed->encode($blessed_hashref_or_arrayref)
|
2268 |
+
# if need, call allow_blessed
|
2269 |
+
|
2270 |
+
Note that the method was named C<toJson> in the old version; it is now C<TO_JSON>.
|
2271 |
+
|
2272 |
+
=back
|
2273 |
+
|
2274 |
+
=head1 TODO
|
2275 |
+
|
2276 |
+
=over
|
2277 |
+
|
2278 |
+
=item example programs
|
2279 |
+
|
2280 |
+
=back
|
2281 |
+
|
2282 |
+
=head1 THREADS
|
2283 |
+
|
2284 |
+
Not tested with JSON::PP. With JSON::XS, see L<JSON::XS/THREADS>.
|
2285 |
+
|
2286 |
+
|
2287 |
+
=head1 BUGS
|
2288 |
+
|
2289 |
+
Please report bugs relevant to C<JSON> to E<lt>makamaka[at]cpan.orgE<gt>.
|
2290 |
+
|
2291 |
+
|
2292 |
+
=head1 SEE ALSO
|
2293 |
+
|
2294 |
+
Most of the document is copied and modified from JSON::XS doc.
|
2295 |
+
|
2296 |
+
L<JSON::XS>, L<JSON::PP>
|
2297 |
+
|
2298 |
+
C<RFC4627>(L<http://www.ietf.org/rfc/rfc4627.txt>)
|
2299 |
+
|
2300 |
+
=head1 AUTHOR
|
2301 |
+
|
2302 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
2303 |
+
|
2304 |
+
JSON::XS was written by Marc Lehmann <schmorp[at]schmorp.de>
|
2305 |
+
|
2306 |
+
The release of this new version owes to the courtesy of Marc Lehmann.
|
2307 |
+
|
2308 |
+
|
2309 |
+
=head1 COPYRIGHT AND LICENSE
|
2310 |
+
|
2311 |
+
Copyright 2005-2013 by Makamaka Hannyaharamitu
|
2312 |
+
|
2313 |
+
This library is free software; you can redistribute it and/or modify
|
2314 |
+
it under the same terms as Perl itself.
|
2315 |
+
|
2316 |
+
=cut
|
2317 |
+
|
uroman/lib/JSON/backportPP.pm
ADDED
@@ -0,0 +1,2806 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package # This is JSON::backportPP
|
2 |
+
JSON::PP;
|
3 |
+
|
4 |
+
# JSON-2.0
|
5 |
+
|
6 |
+
use 5.005;
|
7 |
+
use strict;
|
8 |
+
use base qw(Exporter);
|
9 |
+
use overload ();
|
10 |
+
|
11 |
+
use Carp ();
|
12 |
+
use B ();
|
13 |
+
#use Devel::Peek;
|
14 |
+
|
15 |
+
use vars qw($VERSION);
|
16 |
+
$VERSION = '2.27204';
|
17 |
+
|
18 |
+
@JSON::PP::EXPORT = qw(encode_json decode_json from_json to_json);
|
19 |
+
|
20 |
+
# instead of hash-access, i tried index-access for speed.
|
21 |
+
# but this method is not faster than what i expected. so it will be changed.
|
22 |
+
|
23 |
+
use constant P_ASCII => 0;
|
24 |
+
use constant P_LATIN1 => 1;
|
25 |
+
use constant P_UTF8 => 2;
|
26 |
+
use constant P_INDENT => 3;
|
27 |
+
use constant P_CANONICAL => 4;
|
28 |
+
use constant P_SPACE_BEFORE => 5;
|
29 |
+
use constant P_SPACE_AFTER => 6;
|
30 |
+
use constant P_ALLOW_NONREF => 7;
|
31 |
+
use constant P_SHRINK => 8;
|
32 |
+
use constant P_ALLOW_BLESSED => 9;
|
33 |
+
use constant P_CONVERT_BLESSED => 10;
|
34 |
+
use constant P_RELAXED => 11;
|
35 |
+
|
36 |
+
use constant P_LOOSE => 12;
|
37 |
+
use constant P_ALLOW_BIGNUM => 13;
|
38 |
+
use constant P_ALLOW_BAREKEY => 14;
|
39 |
+
use constant P_ALLOW_SINGLEQUOTE => 15;
|
40 |
+
use constant P_ESCAPE_SLASH => 16;
|
41 |
+
use constant P_AS_NONBLESSED => 17;
|
42 |
+
|
43 |
+
use constant P_ALLOW_UNKNOWN => 18;
|
44 |
+
|
45 |
+
use constant OLD_PERL => $] < 5.008 ? 1 : 0;
|
46 |
+
|
47 |
+
BEGIN {
|
48 |
+
my @xs_compati_bit_properties = qw(
|
49 |
+
latin1 ascii utf8 indent canonical space_before space_after allow_nonref shrink
|
50 |
+
allow_blessed convert_blessed relaxed allow_unknown
|
51 |
+
);
|
52 |
+
my @pp_bit_properties = qw(
|
53 |
+
allow_singlequote allow_bignum loose
|
54 |
+
allow_barekey escape_slash as_nonblessed
|
55 |
+
);
|
56 |
+
|
57 |
+
# Perl version check: is Unicode handling enabled?
|
58 |
+
# Helper module sets @JSON::PP::_properties.
|
59 |
+
if ($] < 5.008 ) {
|
60 |
+
my $helper = $] >= 5.006 ? 'JSON::backportPP::Compat5006' : 'JSON::backportPP::Compat5005';
|
61 |
+
eval qq| require $helper |;
|
62 |
+
if ($@) { Carp::croak $@; }
|
63 |
+
}
|
64 |
+
|
65 |
+
for my $name (@xs_compati_bit_properties, @pp_bit_properties) {
|
66 |
+
my $flag_name = 'P_' . uc($name);
|
67 |
+
|
68 |
+
eval qq/
|
69 |
+
sub $name {
|
70 |
+
my \$enable = defined \$_[1] ? \$_[1] : 1;
|
71 |
+
|
72 |
+
if (\$enable) {
|
73 |
+
\$_[0]->{PROPS}->[$flag_name] = 1;
|
74 |
+
}
|
75 |
+
else {
|
76 |
+
\$_[0]->{PROPS}->[$flag_name] = 0;
|
77 |
+
}
|
78 |
+
|
79 |
+
\$_[0];
|
80 |
+
}
|
81 |
+
|
82 |
+
sub get_$name {
|
83 |
+
\$_[0]->{PROPS}->[$flag_name] ? 1 : '';
|
84 |
+
}
|
85 |
+
/;
|
86 |
+
}
|
87 |
+
|
88 |
+
}
|
89 |
+
|
90 |
+
|
91 |
+
|
92 |
+
# Functions
|
93 |
+
|
94 |
+
my %encode_allow_method
|
95 |
+
= map {($_ => 1)} qw/utf8 pretty allow_nonref latin1 self_encode escape_slash
|
96 |
+
allow_blessed convert_blessed indent indent_length allow_bignum
|
97 |
+
as_nonblessed
|
98 |
+
/;
|
99 |
+
my %decode_allow_method
|
100 |
+
= map {($_ => 1)} qw/utf8 allow_nonref loose allow_singlequote allow_bignum
|
101 |
+
allow_barekey max_size relaxed/;
|
102 |
+
|
103 |
+
|
104 |
+
my $JSON; # cache
|
105 |
+
|
106 |
+
sub encode_json ($) { # encode
|
107 |
+
($JSON ||= __PACKAGE__->new->utf8)->encode(@_);
|
108 |
+
}
|
109 |
+
|
110 |
+
|
111 |
+
sub decode_json { # decode
|
112 |
+
($JSON ||= __PACKAGE__->new->utf8)->decode(@_);
|
113 |
+
}
|
114 |
+
|
115 |
+
# Obsoleted
|
116 |
+
|
117 |
+
sub to_json($) {
|
118 |
+
Carp::croak ("JSON::PP::to_json has been renamed to encode_json.");
|
119 |
+
}
|
120 |
+
|
121 |
+
|
122 |
+
sub from_json($) {
|
123 |
+
Carp::croak ("JSON::PP::from_json has been renamed to decode_json.");
|
124 |
+
}
|
125 |
+
|
126 |
+
|
127 |
+
# Methods
|
128 |
+
|
129 |
+
sub new {
|
130 |
+
my $class = shift;
|
131 |
+
my $self = {
|
132 |
+
max_depth => 512,
|
133 |
+
max_size => 0,
|
134 |
+
indent => 0,
|
135 |
+
FLAGS => 0,
|
136 |
+
fallback => sub { encode_error('Invalid value. JSON can only reference.') },
|
137 |
+
indent_length => 3,
|
138 |
+
};
|
139 |
+
|
140 |
+
bless $self, $class;
|
141 |
+
}
|
142 |
+
|
143 |
+
|
144 |
+
sub encode {
|
145 |
+
return $_[0]->PP_encode_json($_[1]);
|
146 |
+
}
|
147 |
+
|
148 |
+
|
149 |
+
sub decode {
|
150 |
+
return $_[0]->PP_decode_json($_[1], 0x00000000);
|
151 |
+
}
|
152 |
+
|
153 |
+
|
154 |
+
sub decode_prefix {
|
155 |
+
return $_[0]->PP_decode_json($_[1], 0x00000001);
|
156 |
+
}
|
157 |
+
|
158 |
+
|
159 |
+
# accessor
|
160 |
+
|
161 |
+
|
162 |
+
# pretty printing
|
163 |
+
|
164 |
+
sub pretty {
|
165 |
+
my ($self, $v) = @_;
|
166 |
+
my $enable = defined $v ? $v : 1;
|
167 |
+
|
168 |
+
if ($enable) { # indent_length(3) for JSON::XS compatibility
|
169 |
+
$self->indent(1)->indent_length(3)->space_before(1)->space_after(1);
|
170 |
+
}
|
171 |
+
else {
|
172 |
+
$self->indent(0)->space_before(0)->space_after(0);
|
173 |
+
}
|
174 |
+
|
175 |
+
$self;
|
176 |
+
}
|
177 |
+
|
178 |
+
# etc
|
179 |
+
|
180 |
+
sub max_depth {
|
181 |
+
my $max = defined $_[1] ? $_[1] : 0x80000000;
|
182 |
+
$_[0]->{max_depth} = $max;
|
183 |
+
$_[0];
|
184 |
+
}
|
185 |
+
|
186 |
+
|
187 |
+
sub get_max_depth { $_[0]->{max_depth}; }
|
188 |
+
|
189 |
+
|
190 |
+
sub max_size {
|
191 |
+
my $max = defined $_[1] ? $_[1] : 0;
|
192 |
+
$_[0]->{max_size} = $max;
|
193 |
+
$_[0];
|
194 |
+
}
|
195 |
+
|
196 |
+
|
197 |
+
sub get_max_size { $_[0]->{max_size}; }
|
198 |
+
|
199 |
+
|
200 |
+
sub filter_json_object {
|
201 |
+
$_[0]->{cb_object} = defined $_[1] ? $_[1] : 0;
|
202 |
+
$_[0]->{F_HOOK} = ($_[0]->{cb_object} or $_[0]->{cb_sk_object}) ? 1 : 0;
|
203 |
+
$_[0];
|
204 |
+
}
|
205 |
+
|
206 |
+
sub filter_json_single_key_object {
|
207 |
+
if (@_ > 1) {
|
208 |
+
$_[0]->{cb_sk_object}->{$_[1]} = $_[2];
|
209 |
+
}
|
210 |
+
$_[0]->{F_HOOK} = ($_[0]->{cb_object} or $_[0]->{cb_sk_object}) ? 1 : 0;
|
211 |
+
$_[0];
|
212 |
+
}
|
213 |
+
|
214 |
+
sub indent_length {
|
215 |
+
if (!defined $_[1] or $_[1] > 15 or $_[1] < 0) {
|
216 |
+
Carp::carp "The acceptable range of indent_length() is 0 to 15.";
|
217 |
+
}
|
218 |
+
else {
|
219 |
+
$_[0]->{indent_length} = $_[1];
|
220 |
+
}
|
221 |
+
$_[0];
|
222 |
+
}
|
223 |
+
|
224 |
+
sub get_indent_length {
|
225 |
+
$_[0]->{indent_length};
|
226 |
+
}
|
227 |
+
|
228 |
+
sub sort_by {
|
229 |
+
$_[0]->{sort_by} = defined $_[1] ? $_[1] : 1;
|
230 |
+
$_[0];
|
231 |
+
}
|
232 |
+
|
233 |
+
sub allow_bigint {
|
234 |
+
Carp::carp("allow_bigint() is obsoleted. use allow_bignum() insted.");
|
235 |
+
}
|
236 |
+
|
237 |
+
###############################
|
238 |
+
|
239 |
+
###
|
240 |
+
### Perl => JSON
|
241 |
+
###
|
242 |
+
|
243 |
+
|
244 |
+
{ # Convert
|
245 |
+
|
246 |
+
my $max_depth;
|
247 |
+
my $indent;
|
248 |
+
my $ascii;
|
249 |
+
my $latin1;
|
250 |
+
my $utf8;
|
251 |
+
my $space_before;
|
252 |
+
my $space_after;
|
253 |
+
my $canonical;
|
254 |
+
my $allow_blessed;
|
255 |
+
my $convert_blessed;
|
256 |
+
|
257 |
+
my $indent_length;
|
258 |
+
my $escape_slash;
|
259 |
+
my $bignum;
|
260 |
+
my $as_nonblessed;
|
261 |
+
|
262 |
+
my $depth;
|
263 |
+
my $indent_count;
|
264 |
+
my $keysort;
|
265 |
+
|
266 |
+
|
267 |
+
sub PP_encode_json {
|
268 |
+
my $self = shift;
|
269 |
+
my $obj = shift;
|
270 |
+
|
271 |
+
$indent_count = 0;
|
272 |
+
$depth = 0;
|
273 |
+
|
274 |
+
my $idx = $self->{PROPS};
|
275 |
+
|
276 |
+
($ascii, $latin1, $utf8, $indent, $canonical, $space_before, $space_after, $allow_blessed,
|
277 |
+
$convert_blessed, $escape_slash, $bignum, $as_nonblessed)
|
278 |
+
= @{$idx}[P_ASCII .. P_SPACE_AFTER, P_ALLOW_BLESSED, P_CONVERT_BLESSED,
|
279 |
+
P_ESCAPE_SLASH, P_ALLOW_BIGNUM, P_AS_NONBLESSED];
|
280 |
+
|
281 |
+
($max_depth, $indent_length) = @{$self}{qw/max_depth indent_length/};
|
282 |
+
|
283 |
+
$keysort = $canonical ? sub { $a cmp $b } : undef;
|
284 |
+
|
285 |
+
if ($self->{sort_by}) {
|
286 |
+
$keysort = ref($self->{sort_by}) eq 'CODE' ? $self->{sort_by}
|
287 |
+
: $self->{sort_by} =~ /\D+/ ? $self->{sort_by}
|
288 |
+
: sub { $a cmp $b };
|
289 |
+
}
|
290 |
+
|
291 |
+
encode_error("hash- or arrayref expected (not a simple scalar, use allow_nonref to allow this)")
|
292 |
+
if(!ref $obj and !$idx->[ P_ALLOW_NONREF ]);
|
293 |
+
|
294 |
+
my $str = $self->object_to_json($obj);
|
295 |
+
|
296 |
+
$str .= "\n" if ( $indent ); # JSON::XS 2.26 compatible
|
297 |
+
|
298 |
+
unless ($ascii or $latin1 or $utf8) {
|
299 |
+
utf8::upgrade($str);
|
300 |
+
}
|
301 |
+
|
302 |
+
if ($idx->[ P_SHRINK ]) {
|
303 |
+
utf8::downgrade($str, 1);
|
304 |
+
}
|
305 |
+
|
306 |
+
return $str;
|
307 |
+
}
|
308 |
+
|
309 |
+
|
310 |
+
sub object_to_json {
|
311 |
+
my ($self, $obj) = @_;
|
312 |
+
my $type = ref($obj);
|
313 |
+
|
314 |
+
if($type eq 'HASH'){
|
315 |
+
return $self->hash_to_json($obj);
|
316 |
+
}
|
317 |
+
elsif($type eq 'ARRAY'){
|
318 |
+
return $self->array_to_json($obj);
|
319 |
+
}
|
320 |
+
elsif ($type) { # blessed object?
|
321 |
+
if (blessed($obj)) {
|
322 |
+
|
323 |
+
return $self->value_to_json($obj) if ( $obj->isa('JSON::PP::Boolean') );
|
324 |
+
|
325 |
+
if ( $convert_blessed and $obj->can('TO_JSON') ) {
|
326 |
+
my $result = $obj->TO_JSON();
|
327 |
+
if ( defined $result and ref( $result ) ) {
|
328 |
+
if ( refaddr( $obj ) eq refaddr( $result ) ) {
|
329 |
+
encode_error( sprintf(
|
330 |
+
"%s::TO_JSON method returned same object as was passed instead of a new one",
|
331 |
+
ref $obj
|
332 |
+
) );
|
333 |
+
}
|
334 |
+
}
|
335 |
+
|
336 |
+
return $self->object_to_json( $result );
|
337 |
+
}
|
338 |
+
|
339 |
+
return "$obj" if ( $bignum and _is_bignum($obj) );
|
340 |
+
return $self->blessed_to_json($obj) if ($allow_blessed and $as_nonblessed); # will be removed.
|
341 |
+
|
342 |
+
encode_error( sprintf("encountered object '%s', but neither allow_blessed "
|
343 |
+
. "nor convert_blessed settings are enabled", $obj)
|
344 |
+
) unless ($allow_blessed);
|
345 |
+
|
346 |
+
return 'null';
|
347 |
+
}
|
348 |
+
else {
|
349 |
+
return $self->value_to_json($obj);
|
350 |
+
}
|
351 |
+
}
|
352 |
+
else{
|
353 |
+
return $self->value_to_json($obj);
|
354 |
+
}
|
355 |
+
}
|
356 |
+
|
357 |
+
|
358 |
+
# Encode a hash reference as a JSON object.
#
# Increments the shared nesting counter while the members are encoded and
# raises an encode error when it exceeds $max_depth. Keys are visited in
# the order produced by _sort().
sub hash_to_json {
    my ($self, $hashref) = @_;

    if ( ++$depth > $max_depth ) {
        encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)");
    }

    my ($pre, $post) = ('', '');
    ($pre, $post) = $self->_up_indent() if $indent;

    my $separator = join ':', ( $space_before ? ' ' : '' ), ( $space_after ? ' ' : '' );

    my @members;
    foreach my $key ( _sort($hashref) ) {
        utf8::decode($key) if OLD_PERL;    # key for Perl 5.6 / be optimized
        push @members, string_to_json( $self, $key )
                     . $separator
                     . ( $self->object_to_json( $hashref->{$key} ) || $self->value_to_json( $hashref->{$key} ) );
    }

    --$depth;
    $self->_down_indent() if $indent;

    return '{}' unless @members;
    return '{' . $pre . join( ",$pre", @members ) . $post . '}';
}
|
380 |
+
|
381 |
+
|
382 |
+
# Encode an array reference as a JSON array.
#
# Mirrors hash_to_json: bumps the shared nesting counter, enforces
# $max_depth, and applies the current indentation prefix/suffix.
sub array_to_json {
    my ($self, $list) = @_;

    if ( ++$depth > $max_depth ) {
        encode_error("json text or perl structure exceeds maximum nesting level (max_depth set too low?)");
    }

    my ($pre, $post) = ('', '');
    ($pre, $post) = $self->_up_indent() if $indent;

    my @items;
    foreach my $element (@$list) {
        push @items, $self->object_to_json($element) || $self->value_to_json($element);
    }

    --$depth;
    $self->_down_indent() if $indent;

    return '[]' unless @items;
    return '[' . $pre . join( ",$pre", @items ) . $post . ']';
}
|
400 |
+
|
401 |
+
|
402 |
+
# Encode a single non-container value.
#
#   undef                        -> null
#   number (IOK/NOK without POK) -> emitted as is
#   plain string                 -> quoted via string_to_json
#   JSON::PP::Boolean            -> true / false
#   other blessed reference      -> its stringification, re-encoded
#   \1 / \0                      -> true / false
#   any other reference          -> null when allow_unknown is set, else an error
#
# The original ended with an `else` branch (calling $self->{fallback}) that
# could never run, because the preceding `!$type` / `$type` tests are
# exhaustive; it is omitted here.
sub value_to_json {
    my ($self, $value) = @_;

    return 'null' unless defined $value;

    # Inspect the SV flags so that real numbers are not quoted (round trip problem).
    my $sv_flags = B::svref_2object(\$value)->FLAGS;
    return $value # as is
        if $sv_flags & ( B::SVp_IOK | B::SVp_NOK ) and !( $sv_flags & B::SVp_POK );

    my $type = ref($value);

    return string_to_json($self, $value) unless $type;

    if ( blessed($value) and $value->isa('JSON::PP::Boolean') ) {
        return $$value == 1 ? 'true' : 'false';
    }

    # "Class=TYPE(0x...)" means the reference is blessed: encode its string form.
    my ($blessed_marker) = overload::StrVal($value) =~ /=(\w+)/;
    return $self->value_to_json("$value") if $blessed_marker;

    if ( $type eq 'SCALAR' and defined $$value ) {
        return 'true'  if $$value eq '1';
        return 'false' if $$value eq '0';
        return 'null'  if $self->{PROPS}->[ P_ALLOW_UNKNOWN ];
        return encode_error("cannot encode reference to scalar");
    }

    return 'null' if $self->{PROPS}->[ P_ALLOW_UNKNOWN ];

    encode_error(
        ( $type eq 'SCALAR' or $type eq 'REF' )
            ? "cannot encode reference to scalar"
            : "encountered $value, but JSON can only represent references to arrays or hashes"
    );
}
|
453 |
+
|
454 |
+
|
455 |
+
# Characters that have a short JSON escape, keyed by the raw character.
# (The single-quote entry is kept although the substitution in
# string_to_json never selects it.)
my %esc = (
    "\x0a" => "\\n",
    "\x0d" => "\\r",
    "\x09" => "\\t",
    "\x0c" => "\\f",
    "\x08" => "\\b",
    "\x22" => "\\\"",
    "\x5c" => "\\\\",
    "\x27" => "\\'",
);


# Quote a Perl string as a JSON string literal.
#
# Escapes quote, backslash and the common control characters, optionally the
# slash, turns the remaining low control characters into \u00XX, and then
# applies the ascii / latin1 / utf8 output options in that order.
sub string_to_json {
    my ($self, $arg) = @_;

    $arg =~ s/([\x22\x5c\n\r\t\f\b])/$esc{$1}/g;
    $arg =~ s/\//\\\//g if ($escape_slash);
    $arg =~ s/([\x00-\x08\x0b\x0e-\x1f])/'\\u00' . unpack('H2', $1)/eg;

    $arg = JSON_PP_encode_ascii($arg)  if $ascii;
    $arg = JSON_PP_encode_latin1($arg) if $latin1;
    utf8::encode($arg)                 if $utf8;

    return join '', '"', $arg, '"';
}
|
488 |
+
|
489 |
+
|
490 |
+
# Encode a blessed reference by looking through the blessing: hash-based and
# array-based objects are encoded like plain containers, anything else
# becomes null.
sub blessed_to_json {
    my ($self, $obj) = @_;
    my $underlying = reftype($obj) || '';

    return $self->hash_to_json($obj)  if $underlying eq 'HASH';
    return $self->array_to_json($obj) if $underlying eq 'ARRAY';
    return 'null';
}
|
502 |
+
|
503 |
+
|
504 |
+
# Abort encoding: croak with the given message (stringified).
sub encode_error {
    my ($message) = @_;
    Carp::croak("$message");
}
|
508 |
+
|
509 |
+
|
510 |
+
# Return the keys of the given hash reference, ordered by the $keysort
# comparator when one is configured and in native hash order otherwise.
sub _sort {
    my $hash = $_[0];

    return keys %{$hash} unless defined $keysort;
    return sort $keysort (keys %{$hash});
}
|
513 |
+
|
514 |
+
|
515 |
+
# Enter one indentation level.
#
# Returns ($pre, $post): $pre is the newline plus indentation placed before
# each element at the new level, $post is the newline plus indentation placed
# before the closing bracket (the level in effect before the call).
# Increments the shared $indent_count.
sub _up_indent {
    my $unit = ' ' x $indent_length;

    my $post = "\n" . ( $unit x $indent_count );
    $indent_count++;
    my $pre  = "\n" . ( $unit x $indent_count );

    return ($pre, $post);
}
|
529 |
+
|
530 |
+
|
531 |
+
# Leave one indentation level (counterpart of _up_indent).
sub _down_indent {
    return $indent_count--;
}
|
532 |
+
|
533 |
+
|
534 |
+
# Expose the encoder's internal counters as a hash reference.
sub PP_encode_box {
    my %snapshot = (
        depth        => $depth,
        indent_count => $indent_count,
    );
    return \%snapshot;
}
|
540 |
+
|
541 |
+
} # Convert
|
542 |
+
|
543 |
+
|
544 |
+
# Re-encode a string so that it contains only ASCII: code points above 127
# become \uXXXX escapes, and code points above 0xFFFF become a surrogate
# pair of two \u escapes.
sub _encode_ascii {
    my $encoded = '';

    for my $code_point ( unpack('U*', $_[0]) ) {
        if ( $code_point <= 127 ) {
            $encoded .= chr($code_point);
        }
        elsif ( $code_point <= 65535 ) {
            $encoded .= sprintf('\u%04x', $code_point);
        }
        else {
            $encoded .= sprintf('\u%x\u%x', _encode_surrogates($code_point));
        }
    }

    return $encoded;
}
|
554 |
+
|
555 |
+
|
556 |
+
# Re-encode a string so that it contains only Latin-1: code points above 255
# become \uXXXX escapes, and code points above 0xFFFF become a surrogate
# pair of two \u escapes.
sub _encode_latin1 {
    my $encoded = '';

    for my $code_point ( unpack('U*', $_[0]) ) {
        if ( $code_point <= 255 ) {
            $encoded .= chr($code_point);
        }
        elsif ( $code_point <= 65535 ) {
            $encoded .= sprintf('\u%04x', $code_point);
        }
        else {
            $encoded .= sprintf('\u%x\u%x', _encode_surrogates($code_point));
        }
    }

    return $encoded;
}
|
566 |
+
|
567 |
+
|
568 |
+
# Split a code point above 0xFFFF into its UTF-16 surrogate pair
# (algorithm from perlunicode).
#
# Returns the list (high surrogate, low surrogate) as integers.
sub _encode_surrogates { # from perlunicode
    my $uni = $_[0] - 0x10000;

    # Perl's "/" is floating-point division, so the quotient must be truncated;
    # otherwise the high surrogate is fractional (e.g. 55357.5) and is only
    # correct when the caller happens to format it with an integer conversion.
    return ( int( $uni / 0x400 ) + 0xD800, $uni % 0x400 + 0xDC00 );
}
|
572 |
+
|
573 |
+
|
574 |
+
# True when the given object is a Math::BigInt or a Math::BigFloat.
sub _is_bignum {
    my ($candidate) = @_;
    return $candidate->isa('Math::BigInt') || $candidate->isa('Math::BigFloat');
}
|
577 |
+
|
578 |
+
|
579 |
+
|
580 |
+
#
|
581 |
+
# JSON => Perl
|
582 |
+
#
|
583 |
+
|
584 |
+
# Number of decimal digits after which an integer literal made of 1s starts
# to be stringified in exponent notation by this perl. Probed at compile time.
my $max_intsize;

BEGIN {
    my $digits = '1111';

    for my $width ( 5 .. 64 ) {
        $digits .= '1';
        my $evaluated = eval qq| $digits |;

        # Keep growing until the evaluated literal prints with an exponent.
        next unless $evaluated =~ /[eE]/;

        $max_intsize = $width - 1;
        last;
    }
}
|
597 |
+
|
598 |
+
{ # PARSE
|
599 |
+
|
600 |
+
# Single-character backslash escapes recognised inside JSON strings,
# by Jeremy Muhlich <jmuhlich [at] bitflood.org>
my %escapes = (
    'b'  => "\x08",
    't'  => "\x09",
    'n'  => "\x0A",
    'f'  => "\x0C",
    'r'  => "\x0D",
    '\\' => '\\',
    '"'  => '"',
    '/'  => '/',
);

# Parser state shared by the subs in this block.
my $text;           # json data
my $at;             # offset
my $ch;             # current character
my $len;            # text length (changed according to UTF8 or NON UTF8)

# INTERNAL
my $depth;          # nest counter
my $encoding;       # json text encoding
my $is_valid_utf8;  # temp variable
my $utf8_len;       # utf8 byte length

# FLAGS
my $utf8;           # must be utf8
my $max_depth;      # max nest number of objects and arrays
my $max_size;
my $relaxed;
my $cb_object;
my $cb_sk_object;

my $F_HOOK;

my $allow_bigint;   # using Math::BigInt
my $singlequote;    # loosely quoting
my $loose;          #
my $allow_barekey;  # bareKey
|
634 |
+
|
635 |
+
# $opt flag
|
636 |
+
# 0x00000001 .... decode_prefix
|
637 |
+
# 0x10000000 .... incr_parse
|
638 |
+
|
639 |
+
# Decode a JSON text into a Perl data structure.
#
# Arguments: ($self, $text, $opt). $opt is a bit mask that is effective for
# this call only:
#   0x00000001  decode_prefix: trailing data is allowed and the list
#               ($result, $consumed_length) is returned
#   0x10000000  incr_parse: return undef quietly when no value was parsed
#
# Croaks (through decode_error) on malformed input, on input larger than
# max_size, on a non-reference top-level value unless allow_nonref is set,
# and on trailing garbage unless decode_prefix is set.
sub PP_decode_json {
    my ($self, $opt); # $opt is an effective flag during this decode_json.

    ($self, $text, $opt) = @_;

    ($at, $ch, $depth) = (0, '', 0);

    if ( !defined $text or ref $text ) {
        decode_error("malformed JSON string, neither array, object, number, string or atom");
    }

    my $idx = $self->{PROPS};

    ($utf8, $relaxed, $loose, $allow_bigint, $allow_barekey, $singlequote)
        = @{$idx}[P_UTF8, P_RELAXED, P_LOOSE .. P_ALLOW_SINGLEQUOTE];

    if ( $utf8 ) {
        utf8::downgrade( $text, 1 ) or Carp::croak("Wide character in subroutine entry");
    }
    else {
        utf8::upgrade( $text );
    }

    $len = length $text;

    ($max_depth, $max_size, $cb_object, $cb_sk_object, $F_HOOK)
        = @{$self}{qw/max_depth max_size cb_object cb_sk_object F_HOOK/};

    if ($max_size > 1) {
        use bytes;
        my $bytes = length $text;
        decode_error(
            sprintf("attempted decode of JSON text of %s bytes size, but max_size is set to %s"
                , $bytes, $max_size), 1
        ) if ($bytes > $max_size);
    }

    # Currently no effect
    # should use regexp
    my @octets = unpack('C4', $text);
    $encoding =   ( $octets[0] and  $octets[1]) ? 'UTF-8'
                : (!$octets[0] and  $octets[1]) ? 'UTF-16BE'
                : (!$octets[0] and !$octets[1]) ? 'UTF-32BE'
                : ( $octets[2]                ) ? 'UTF-16LE'
                : (!$octets[2]                ) ? 'UTF-32LE'
                : 'unknown';

    white(); # remove head white space

    my $valid_start = defined $ch; # Is there a first character for JSON structure?

    my $result = value();

    return undef if ( !$result && ( $opt & 0x10000000 ) ); # for incr_parse

    decode_error("malformed JSON string, neither array, object, number, string or atom") unless $valid_start;

    if ( !$idx->[ P_ALLOW_NONREF ] and !ref $result ) {
        decode_error(
            'JSON text must be an object or array (but found number, string, true, false or null,'
            . ' use allow_nonref to allow this)', 1);
    }

    Carp::croak('something wrong.') if $len < $at; # we won't arrive here.

    my $consumed = defined $ch ? $at - 1 : $at; # consumed JSON text length

    white(); # remove tail white space

    # Test definedness, not truth: a trailing "0" is a false value in Perl but
    # is still garbage after the JSON text (e.g. '[1]0').
    if ( defined $ch ) {
        return ( $result, $consumed ) if ($opt & 0x00000001); # all right if decode_prefix
        decode_error("garbage after JSON object");
    }

    ( $opt & 0x00000001 ) ? ( $result, $consumed ) : $result;
}
|
715 |
+
|
716 |
+
|
717 |
+
# Advance the parser by one character.
#
# Stores the character at offset $at in $ch and increments $at; at end of
# input $ch becomes undef. Returns the new $ch.
sub next_chr {
    if ( $at >= $len ) {
        $ch = undef;
        return $ch;
    }

    $ch = substr($text, $at, 1);
    $at++;

    return $ch;
}
|
721 |
+
|
722 |
+
|
723 |
+
# Parse one JSON value starting at the current character and return it.
# Returns nothing when the input is exhausted after skipping whitespace.
sub value {
    white();

    return unless defined $ch;

    if    ( $ch eq '{' ) { return object(); }
    elsif ( $ch eq '[' ) { return array();  }
    elsif ( $ch eq '"' or ($singlequote and $ch eq "'") ) { return string(); }
    elsif ( $ch =~ /[0-9]/ or $ch eq '-' ) { return number(); }

    return word();
}
|
732 |
+
|
733 |
+
# Parse a JSON string literal; the current character must be the opening
# quote (a double quote, or a single quote when allow_singlequote is on).
#
# Returns the decoded string and leaves the parser just past the closing
# quote. Croaks (through decode_error) on unterminated strings, bad escapes,
# unpaired surrogates, malformed UTF-8 and raw control characters.
sub string {
    my $buf = '';       # basically UTF8 flag on
    my $pending_high;   # hex digits of a high surrogate waiting for its low half
    my $needs_decode;   # $buf holds UTF-8 bytes that must be decoded at the end

    ($is_valid_utf8, $utf8_len) = ('', 0);

    if ( $ch eq '"' or ($singlequote and $ch eq "'") ) {
        my $quote = $ch;

        OUTER: while ( defined(next_chr()) ) {

            # --- closing quote -------------------------------------------
            if ( $ch eq $quote ) {
                next_chr();

                decode_error("missing low surrogate character in surrogate pair")
                    if $pending_high;

                utf8::decode($buf) if $needs_decode;

                return $buf;
            }

            # --- ordinary (unescaped) character --------------------------
            unless ( $ch eq '\\' ) {
                if ( ord($ch) > 127 ) {
                    if ( $utf8 ) {
                        if ( $ch = is_valid_utf8($ch) ) {
                            $at += $utf8_len - 1;
                        }
                        else {
                            $at -= 1;
                            decode_error("malformed UTF-8 character in JSON string");
                        }
                    }
                    else {
                        utf8::encode( $ch );
                    }

                    $needs_decode = 1;
                }

                if ( !$loose and $ch =~ /[\x00-\x1f\x22\x5c]/ ) { # '/' ok
                    $at--;
                    decode_error('invalid character encountered while parsing JSON string');
                }

                $buf .= $ch;
                next OUTER;
            }

            # --- backslash escape ----------------------------------------
            next_chr();

            if ( exists $escapes{$ch} ) {
                $buf .= $escapes{$ch};
                next OUTER;
            }

            unless ( $ch eq 'u' ) {
                unless ($loose) {
                    $at -= 2;
                    decode_error('illegal backslash escape sequence in string');
                }
                $buf .= $ch;
                next OUTER;
            }

            # --- \uXXXX (UNICODE handling) -------------------------------
            my $hex4 = '';

            for ( 1 .. 4 ) {
                $ch = next_chr();
                last OUTER if ( $ch !~ /[0-9a-fA-F]/ );
                $hex4 .= $ch;
            }

            # U+D800 - U+DBFF
            if ( $hex4 =~ /^[dD][89abAB][0-9a-fA-F]{2}/ ) { # UTF-16 high surrogate?
                $pending_high = $hex4;
            }
            # U+DC00 - U+DFFF
            elsif ( $hex4 =~ /^[dD][c-fC-F][0-9a-fA-F]{2}/ ) { # UTF-16 low surrogate?
                unless ( defined $pending_high ) {
                    decode_error("missing high surrogate character in surrogate pair");
                }
                $needs_decode = 1;
                $buf .= JSON_PP_decode_surrogates($pending_high, $hex4) || next;
                $pending_high = undef;
            }
            else {
                if ( defined $pending_high ) {
                    decode_error("surrogate pair expected");
                }

                my $code = hex($hex4);
                if ( $code > 127 ) {
                    $needs_decode = 1;
                    $buf .= JSON_PP_decode_unicode($hex4) || next;
                }
                else {
                    $buf .= chr $code;
                }
            }
        }
    }

    decode_error("unexpected end of string while parsing JSON string");
}
|
841 |
+
|
842 |
+
|
843 |
+
# Skip whitespace and comments at the current position.
#
# Everything that compares `le ' '` counts as whitespace. `// ...` and
# `/* ... */` comments are always skipped; `# ...` comments are skipped only
# in relaxed mode. A lone `/` or an unterminated block comment is an error.
sub white {
    SCAN: while ( defined $ch ) {

        if ( $ch le ' ' ) {
            next_chr();
            next SCAN;
        }

        if ( $ch eq '/' ) {
            next_chr();

            if ( defined $ch and $ch eq '/' ) {
                # line comment: consume up to the end of the line
                while ( defined(next_chr()) ) {
                    last if $ch eq "\n" or $ch eq "\r";
                }
            }
            elsif ( defined $ch and $ch eq '*' ) {
                next_chr();

                BLOCK: while (1) {
                    decode_error("Unterminated comment") unless defined $ch;

                    unless ( $ch eq '*' ) {
                        next_chr();
                        next BLOCK;
                    }

                    # saw '*': the comment ends only if '/' follows immediately
                    if ( defined(next_chr()) and $ch eq '/' ) {
                        next_chr();
                        last BLOCK;
                    }
                }
            }
            else {
                $at--;
                decode_error("malformed JSON string, neither array, object, number, string or atom");
            }

            next SCAN;
        }

        if ( $relaxed and $ch eq '#' ) { # correctly?
            pos($text) = $at;
            $text =~ /\G([^\n]*(?:\r\n|\r|\n|$))/g;
            $at = pos($text);
            next_chr();
            next SCAN;
        }

        last SCAN;
    }
}
|
891 |
+
|
892 |
+
|
893 |
+
# Parse a JSON array; the current character must be '['.
#
# Returns an array reference (the one passed as the first argument, if any)
# and leaves the parser just past the closing ']'. In relaxed mode a
# trailing comma before ']' is accepted.
sub array {
    my $elements = $_[0] || []; # you can use this code to use another array ref object.

    if ( ++$depth > $max_depth ) {
        decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)');
    }

    next_chr();
    white();

    my $closed = ( defined $ch and $ch eq ']' );

    until ( $closed or !defined $ch ) {
        push @$elements, value();

        white();

        last unless defined $ch;

        if ( $ch eq ']' ) {
            $closed = 1;
            last;
        }

        last if $ch ne ',';

        next_chr();
        white();

        $closed = 1 if $relaxed and $ch eq ']';
    }

    if ($closed) {
        --$depth;
        next_chr();
        return $elements;
    }

    decode_error(", or ] expected while parsing array");
}
|
941 |
+
|
942 |
+
|
943 |
+
# Parse a JSON object; the current character must be '{'.
#
# Returns a hash reference (the one passed as the first argument, if any),
# or the result of _json_object_hook when a filter hook is installed, and
# leaves the parser just past the closing '}'. In relaxed mode a trailing
# comma before '}' is accepted; with allow_barekey, unquoted keys are read
# by bareKey().
sub object {
    my $members = $_[0] || {}; # you can use this code to use another hash ref object.
    my $key;

    if ( ++$depth > $max_depth ) {
        decode_error('json text or perl structure exceeds maximum nesting level (max_depth set too low?)');
    }

    next_chr();
    white();

    my $closed = ( defined $ch and $ch eq '}' );

    until ( $closed or !defined $ch ) {
        $key = ( $allow_barekey and $ch ne '"' and $ch ne "'" ) ? bareKey() : string();
        white();

        if ( !defined $ch or $ch ne ':' ) {
            $at--;
            decode_error("':' expected");
        }

        next_chr();
        $members->{$key} = value();
        white();

        last unless defined $ch;

        if ( $ch eq '}' ) {
            $closed = 1;
            last;
        }

        last if $ch ne ',';

        next_chr();
        white();

        $closed = 1 if $relaxed and $ch eq '}';
    }

    if ($closed) {
        --$depth;
        next_chr();
        return $F_HOOK ? _json_object_hook($members) : $members;
    }

    $at--;
    decode_error(", or } expected while parsing object/hash");
}
|
1008 |
+
|
1009 |
+
|
1010 |
+
# Read an unquoted object key (allow_barekey extension): consumes characters
# for as long as they are outside the excluded ASCII ranges below.
# Doesn't strictly follow Standard ECMA-262 3rd Edition.
sub bareKey {
    my $name;

    for (;;) {
        last unless $ch =~ /[^\x00-\x23\x25-\x2F\x3A-\x40\x5B-\x5E\x60\x7B-\x7F]/;
        $name .= $ch;
        next_chr();
    }

    return $name;
}
|
1018 |
+
|
1019 |
+
|
1020 |
+
# Parse one of the literals true, false or null at the current position.
# Returns $JSON::PP::true, $JSON::PP::false or undef; anything else is an
# error whose message depends on the first letter seen.
sub word {
    my $head = substr($text, $at - 1, 4);

    if ( $head eq 'true' or $head eq 'null' ) {
        $at += 3;
        next_chr();
        return $head eq 'true' ? $JSON::PP::true : undef;
    }

    if ( $head eq 'fals' ) {
        $at += 3;
        if ( substr($text, $at, 1) eq 'e' ) {
            $at++;
            next_chr();
            return $JSON::PP::false;
        }
    }

    $at--; # for decode_error report

    decode_error(
          $head =~ /^n/ ? "'null' expected"
        : $head =~ /^t/ ? "'true' expected"
        : $head =~ /^f/ ? "'false' expected"
        : "malformed JSON string, neither array, object, number, string or atom"
    );
}
|
1049 |
+
|
1050 |
+
|
1051 |
+
# Parse a JSON number starting at the current character (a digit or '-').
#
# Returns a Perl number; integers with more than $max_intsize characters are
# returned as a string, or as a Math::BigInt when allow_bigint is set. With
# allow_bigint, every other number is returned as a Math::BigFloat.
#
# Croaks (through decode_error) on a lone minus sign, on a leading zero that
# is followed by another digit or by x/X (octal and hex notation are not
# JSON), and on a missing digit after '.' or after the exponent marker.
#
# Changes from the previous implementation:
#   * the sign is consumed before the leading-zero check, so "-012" is now
#     rejected like "012";
#   * a leading zero followed by 8 or 9 is now rejected (only 0-7 were);
#   * digits are matched with [0-9], because \d also matches non-ASCII
#     digits on upgraded text, which are not JSON digits;
#   * the unreachable hex/octal conversion code is gone.
sub number {
    my $n = '';

    if ( $ch eq '-' ) {
        $n = '-';
        next_chr();
        if ( !defined $ch or $ch !~ /[0-9]/ ) {
            decode_error("malformed number (no digits after initial minus)");
        }
    }

    # According to RFC4627, hex or oct digits are invalid.
    if ( $ch eq '0' ) {
        my $peek = substr($text, $at, 1);   # '' when '0' is the last character
        if ( $peek =~ /[0-9xX]/ ) {
            decode_error("malformed number (leading zero must not be followed by another digit)");
        }
    }

    # integer part
    while ( defined $ch and $ch =~ /[0-9]/ ) {
        $n .= $ch;
        next_chr();
    }

    # fraction
    if ( defined $ch and $ch eq '.' ) {
        $n .= '.';

        next_chr();
        if ( !defined $ch or $ch !~ /[0-9]/ ) {
            decode_error("malformed number (no digits after decimal point)");
        }
        $n .= $ch;

        while ( defined(next_chr()) and $ch =~ /[0-9]/ ) {
            $n .= $ch;
        }
    }

    # exponent
    if ( defined $ch and ($ch eq 'e' or $ch eq 'E') ) {
        $n .= $ch;
        next_chr();

        if ( defined $ch and ($ch eq '+' or $ch eq '-') ) {
            $n .= $ch;
            next_chr();
        }

        if ( !defined $ch or $ch !~ /[0-9]/ ) {
            decode_error("malformed number (no digits after exp sign)");
        }
        $n .= $ch;

        while ( defined(next_chr()) and $ch =~ /[0-9]/ ) {
            $n .= $ch;
        }
    }

    if ( $n !~ /[.eE]/ and length($n) > $max_intsize ) {
        if ($allow_bigint) { # from Adam Sussman
            require Math::BigInt;
            return Math::BigInt->new($n);
        }
        return "$n";
    }
    elsif ($allow_bigint) {
        require Math::BigFloat;
        return Math::BigFloat->new($n);
    }

    return 0 + $n;
}
|
1153 |
+
|
1154 |
+
|
1155 |
+
# Validate the UTF-8 sequence that starts with the given lead byte at the
# current parser position.
#
# Sets $utf8_len to the sequence length implied by the lead byte (0 when the
# byte cannot start a sequence). Returns the complete byte sequence when it
# is well-formed, '' when it is not, and nothing when the lead byte is
# invalid.
sub is_valid_utf8 {
    my $lead = $_[0];

    if    ( $lead =~ /[\x00-\x7F]/ ) { $utf8_len = 1; }
    elsif ( $lead =~ /[\xC2-\xDF]/ ) { $utf8_len = 2; }
    elsif ( $lead =~ /[\xE0-\xEF]/ ) { $utf8_len = 3; }
    elsif ( $lead =~ /[\xF0-\xF4]/ ) { $utf8_len = 4; }
    else                             { $utf8_len = 0; }

    return unless $utf8_len;

    my $sequence = substr($text, $at - 1, $utf8_len);

    return '' unless $sequence =~ /^(?:
         [\x00-\x7F]
        |[\xC2-\xDF][\x80-\xBF]
        |[\xE0][\xA0-\xBF][\x80-\xBF]
        |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
        |[\xED][\x80-\x9F][\x80-\xBF]
        |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
        |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
        |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
    )$/x;

    return $sequence;
}
|
1180 |
+
|
1181 |
+
|
1182 |
+
# Abort decoding with a croak.
#
# Arguments: ($error, $no_rep). Unless $no_rep is true, the message is
# extended with the current character offset and an escaped preview (about
# 20 characters) of the input that follows it.
sub decode_error {
    my ($error, $no_rep) = @_;

    my $rest = defined $text ? substr($text, $at) : '';

    my $template = $] >= 5.008             ? 'U*'
                 : $] <  5.006             ? 'C*'
                 : utf8::is_utf8( $rest )  ? 'U*' # 5.6
                 : 'C*'
                 ;

    # Code points that are shown with a symbolic escape.
    my %symbolic = (
        0x07 => '\a',
        0x09 => '\t',
        0x0a => '\n',
        0x0d => '\r',
        0x0c => '\f',
        0x5c => '\\\\',
    );

    my $preview = '';

    for my $code ( unpack( $template, $rest ) ) { # emulate pv_uni_display() ?
        if ( exists $symbolic{$code} ) {
            $preview .= $symbolic{$code};
        }
        elsif ( $code < 0x20 or $code >= 0x80 ) {
            $preview .= sprintf('\x{%x}', $code);
        }
        else {
            $preview .= chr($code);
        }

        if ( length($preview) >= 20 ) {
            $preview .= '...';
            last;
        }
    }

    $preview = '(end of string)' unless length $preview;

    Carp::croak("$error") if $no_rep;
    Carp::croak("$error, at character offset $at (before \"$preview\")");
}
|
1219 |
+
|
1220 |
+
|
1221 |
+
# Apply the user's object filters to a freshly decoded hash.
#
# If the hash has exactly one key and a single-key callback is registered
# for that key, the callback is called with the key's value; when it returns
# exactly one value, that value replaces the hash. Otherwise the general
# object callback (if any) is called with the hash; again only a
# single-value result replaces it. In every other case the hash itself is
# returned.
sub _json_object_hook {
    my $o  = $_[0];
    my @ks = keys %{$o};

    if ( $cb_sk_object and @ks == 1 and exists $cb_sk_object->{ $ks[0] } and ref $cb_sk_object->{ $ks[0] } ) {
        my @val = $cb_sk_object->{ $ks[0] }->( $o->{$ks[0]} );
        if (@val == 1) {
            return $val[0];
        }
    }

    # Declare first, assign conditionally: "my @val = ... if COND" has
    # undefined behaviour (see perlsyn, Statement Modifiers).
    my @val;
    @val = $cb_object->($o) if $cb_object;

    return @val == 1 ? $val[0] : $o;
}
|
1240 |
+
|
1241 |
+
|
1242 |
+
# Expose the decoder's internal state as a hash reference.
sub PP_decode_box {
    my %snapshot = (
        text          => $text,
        at            => $at,
        ch            => $ch,
        len           => $len,
        depth         => $depth,
        encoding      => $encoding,
        is_valid_utf8 => $is_valid_utf8,
    );
    return \%snapshot;
}
|
1253 |
+
|
1254 |
+
} # PARSE
|
1255 |
+
|
1256 |
+
|
1257 |
+
# Combine a UTF-16 surrogate pair, given as two hex strings (high, low), into
# one character and return it as UTF-8 encoded bytes (from perlunicode).
sub _decode_surrogates {
    my ($high_hex, $low_hex) = @_;

    my $high_part  = ( hex($high_hex) - 0xD800 ) * 0x400;
    my $low_part   = hex($low_hex) - 0xDC00;

    my $character = pack( 'U*', 0x10000 + $high_part + $low_part );
    utf8::encode($character);

    return $character;
}
|
1263 |
+
|
1264 |
+
|
1265 |
+
# Turn the hex digits of a \uXXXX escape into the corresponding character,
# returned as UTF-8 encoded bytes.
sub _decode_unicode {
    my ($hex_digits) = @_;

    my $character = pack( 'U', hex($hex_digits) );
    utf8::encode($character);

    return $character;
}
|
1270 |
+
|
1271 |
+
#
|
1272 |
+
# Setup for various Perl versions (the code from JSON::PP58)
|
1273 |
+
#
|
1274 |
+
|
1275 |
+
BEGIN {

    # Perls without a built-in utf8::is_utf8 borrow Encode's implementation.
    if ( !defined &utf8::is_utf8 ) {
        require Encode;
        *utf8::is_utf8 = *Encode::is_utf8;
    }

    # On 5.8 and later the pure-Perl helpers defined in this file are used.
    if ( $] >= 5.008 ) {
        *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
        *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
        *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
        *JSON::PP::JSON_PP_decode_unicode    = \&_decode_unicode;
    }

    # join() in 5.8.0 - 5.8.2 is broken: install a replacement.
    if ( $] >= 5.008 and $] < 5.008003 ) {
        package # hide from PAUSE
          JSON::PP;
        require subs;
        subs->import('join');
        eval q|
            sub join {
                return '' if (@_ < 2);
                my $j   = shift;
                my $str = shift;
                for (@_) { $str .= $j . $_; }
                return $str;
            }
        |;
    }

    # The incremental-parsing methods forward to a lazily created
    # JSON::PP::IncrParser kept in $self->{_incr_parser}.

    sub JSON::PP::incr_parse {
        local $Carp::CarpLevel = 1;
        my $parser = $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;
        return $parser->incr_parse( @_ );
    }

    sub JSON::PP::incr_skip {
        my $parser = $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;
        return $parser->incr_skip;
    }

    sub JSON::PP::incr_reset {
        my $parser = $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;
        return $parser->incr_reset;
    }

    # incr_text is an lvalue sub; it is compiled from a string so that perls
    # older than 5.6 never see the attribute syntax.
    if ( $] >= 5.006 ) {
        eval q{
            sub JSON::PP::incr_text : lvalue {
                $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new;

                if ( $_[0]->{_incr_parser}->{incr_parsing} ) {
                    Carp::croak("incr_text can not be called when the incremental parser already started parsing");
                }
                $_[0]->{_incr_parser}->{incr_text};
            }
        };
    }

} # Setup for various Perl versions (the code from JSON::PP58)
|
1333 |
+
|
1334 |
+
|
1335 |
+
###############################
|
1336 |
+
# Utilities
|
1337 |
+
#
|
1338 |
+
|
1339 |
+
# Provide JSON::PP::blessed / reftype / refaddr: aliases to Scalar::Util when
# it can be loaded, pure-Perl stand-ins otherwise.
BEGIN {
    eval 'require Scalar::Util';

    if ($@) { # This code is from Scalar::Util.
        # warn $@;
        eval 'sub UNIVERSAL::a_sub_not_likely_to_be_here { ref($_[0]) }';

        *JSON::PP::blessed = sub {
            local($@, $SIG{__DIE__}, $SIG{__WARN__});
            ref($_[0]) ? eval { $_[0]->a_sub_not_likely_to_be_here } : undef;
        };

        # B object class => reference type name
        my %tmap = (
            'B::NULL'   => 'SCALAR',
            'B::HV'     => 'HASH',
            'B::AV'     => 'ARRAY',
            'B::CV'     => 'CODE',
            'B::IO'     => 'IO',
            'B::GV'     => 'GLOB',
            'B::REGEXP' => 'REGEXP',
        );

        *JSON::PP::reftype = sub {
            my $r = shift;

            return undef unless length(ref($r));

            my $t = ref(B::svref_2object($r));

            return $tmap{$t} if exists $tmap{$t};
            return length(ref($$r)) ? 'REF' : 'SCALAR';
        };

        *JSON::PP::refaddr = sub {
            return undef unless length(ref($_[0]));

            my $addr;
            if ( defined(my $pkg = blessed($_[0])) ) {
                # Rebless temporarily so that the stringification is the
                # plain "Class=TYPE(0x...)" form, then restore the class.
                $addr .= bless $_[0], 'Scalar::Util::Fake';
                bless $_[0], $pkg;
            }
            else {
                $addr .= $_[0]
            }

            $addr =~ /0x(\w+)/;
            local $^W;
            #no warnings 'portable';
            hex($1);
        }
    }
    else {
        *JSON::PP::blessed = \&Scalar::Util::blessed;
        *JSON::PP::reftype = \&Scalar::Util::reftype;
        *JSON::PP::refaddr = \&Scalar::Util::refaddr;
    }
}
|
1393 |
+
|
1394 |
+
|
1395 |
+
# shamelessly copied and modified from JSON::XS code.
|
1396 |
+
|
1397 |
+
# Define the numeric overloads of JSON::PP::Boolean here only when the real
# JSON::PP has not been loaded (its %INC entry is missing).
if ( !$INC{'JSON/PP.pm'} ) {
    eval q|
        package
          JSON::PP::Boolean;

        use overload (
            "0+"     => sub { ${$_[0]} },
            "++"     => sub { $_[0] = ${$_[0]} + 1 },
            "--"     => sub { $_[0] = ${$_[0]} - 1 },
            fallback => 1,
        );
    |;
}
|
1410 |
+
|
1411 |
+
# The two boolean singletons: references to 1 and 0 blessed into
# JSON::PP::Boolean.
{
    my ($one, $zero) = (1, 0);
    $JSON::PP::true  = bless \$one,  "JSON::PP::Boolean";
    $JSON::PP::false = bless \$zero, "JSON::PP::Boolean";
}
|
1413 |
+
|
1414 |
+
# True when the argument is defined and is a JSON::PP::Boolean.
sub is_bool {
    return defined($_[0]) && UNIVERSAL::isa($_[0], "JSON::PP::Boolean");
}
|
1415 |
+
|
1416 |
+
# Accessors for the JSON literals.
sub true {
    return $JSON::PP::true;
}

sub false {
    return $JSON::PP::false;
}

sub null {
    return undef;
}
|
1419 |
+
|
1420 |
+
###############################
|
1421 |
+
|
1422 |
+
###############################
|
1423 |
+
|
1424 |
+
# Helper package holding the state and scanning logic of the incremental
# parser.  Its methods receive the coder object as an explicit argument.
package # hide from PAUSE
|
1425 |
+
JSON::PP::IncrParser;
|
1426 |
+
|
1427 |
+
use strict;
|
1428 |
+
|
1429 |
+
# Scanner modes stored in $self->{incr_mode}.
use constant INCR_M_WS => 0; # initial whitespace skipping
|
1430 |
+
use constant INCR_M_STR => 1; # inside string
|
1431 |
+
use constant INCR_M_BS => 2; # inside backslash
|
1432 |
+
use constant INCR_M_JSON => 3; # outside anything, count nesting
|
1433 |
+
# NOTE(review): INCR_M_BS, INCR_M_C0 and INCR_M_C1 are not referenced by the
# subs of this package; C0/C1 are presumably comment-scanning states carried
# over from JSON::XS -- TODO confirm.
use constant INCR_M_C0 => 4;
|
1434 |
+
use constant INCR_M_C1 => 5;
|
1435 |
+
|
1436 |
+
use vars qw($VERSION);
|
1437 |
+
$VERSION = '1.01';
|
1438 |
+
|
1439 |
+
# unpack template used to obtain the numeric value of a single character:
# 'C*' on perls older than 5.006, 'U*' otherwise.
my $unpack_format = $] < 5.006 ? 'C*' : 'U*';
|
1440 |
+
|
1441 |
+
# Constructor: returns a fresh parser state object blessed into $class.
# The buffer (incr_text) starts undefined; the scan position, nesting depth
# and "parsing in progress" flag all start at 0.
sub new {
    my $class = shift;

    my %state = (
        incr_text    => undef,
        incr_p       => 0,
        incr_nest    => 0,
        incr_parsing => 0,
    );

    return bless \%state, $class;
}
|
1451 |
+
|
1452 |
+
|
1453 |
+
# Append $text (when defined) to the buffered JSON fragment and, unless called
# in void context, try to extract decoded value(s) from the buffer.
#
#   $self  - parser state object
#   $coder - object providing get_max_size (called here) and whatever
#            _incr_parse needs from it
#   $text  - optional chunk to append to the buffer
#
# Void context:   only buffers $text.
# Scalar context: makes one _incr_parse attempt and returns its result when
#                 that result is true, undef otherwise.
# List context:   returns everything the _incr_parse call(s) produced.
sub incr_parse {
    my ( $self, $coder, $text ) = @_;

    if ( !defined $self->{incr_text} ) {
        $self->{incr_text} = '';
    }

    if ( defined $text ) {
        # When the new chunk is a UTF-8 flagged string but the buffer is not,
        # upgrade and decode the buffer before appending.
        my $buffer_is_behind =
            utf8::is_utf8( $text ) && !utf8::is_utf8( $self->{incr_text} );

        if ( $buffer_is_behind ) {
            utf8::upgrade( $self->{incr_text} );
            utf8::decode( $self->{incr_text} );
        }

        $self->{incr_text} .= $text;
    }

    # Fetched in every context; the value itself is not used in this sub.
    my $max_size = $coder->get_max_size;

    # Void context: buffering was all that was asked for.
    return unless defined wantarray;

    if ( !defined $self->{incr_mode} ) {
        $self->{incr_mode} = INCR_M_WS;
    }

    if ( !wantarray ) {    # in scalar context
        $self->{incr_parsing} = 1;
        my $obj = $self->_incr_parse( $coder, $self->{incr_text} );

        # A defined result (even '') means _incr_parse ran to completion.
        $self->{incr_parsing} = 0 if defined $obj; # pointed by Martin J. Evans

        return $obj if $obj;
        return undef;    # $obj is an empty string, parsing was completed.
    }

    # List context.
    my @found;
    $self->{incr_parsing} = 1;

    # NOTE(review): incr_p never seems to exceed the buffer length after
    # _incr_parse returns, so this loop appears to run exactly once per call.
    # Confirm against the documented "extract as many objects as it can".
    do {
        push @found, $self->_incr_parse( $coder, $self->{incr_text} );

        # Unless we stopped cleanly between top-level values, go back to
        # whitespace skipping (but never out of an open string).
        if ( $self->{incr_nest} or $self->{incr_mode} != INCR_M_JSON ) {
            if ( $self->{incr_mode} != INCR_M_STR ) {
                $self->{incr_mode} = INCR_M_WS;
            }
        }

    } until ( length $self->{incr_text} >= $self->{incr_p} );

    $self->{incr_parsing} = 0;

    return @found;
}
|
1501 |
+
|
1502 |
+
|
1503 |
+
# Scan $text from the saved position to find the end of the next complete
# JSON value and, once found, hand that prefix of the buffer to the coder.
#
#   $self  - parser state object (reads/writes incr_p, incr_mode, incr_nest,
#            incr_c and incr_text)
#   $coder - object providing get_max_depth and PP_decode_json
#   $text  - text to scan (incr_parse passes $self->{incr_text})
#
# Returns nothing while the value is still incomplete (an unterminated
# top-level string, or an unclosed array/object), '' when the scanned prefix
# is empty, otherwise the decoded value (or '' when that value is false).
# Croaks when nesting exceeds $coder->get_max_depth; exceptions raised by
# PP_decode_json propagate to the caller.
sub _incr_parse {
    my ( $self, $coder, $text ) = @_;
    my $p       = $self->{incr_p};
    my $restore = $p;
    my $len     = length $text;

    if ( $self->{incr_mode} == INCR_M_WS ) {
        while ( $len > $p ) {
            my $s = substr( $text, $p, 1 );

            # Explicit block instead of "$p++ and next": the post-increment
            # yields 0 (false) when $p is 0, which skipped the "next" and
            # ended whitespace skipping after a single leading blank.
            if ( 0x20 >= unpack( $unpack_format, $s ) ) {
                $p++;
                next;
            }

            $self->{incr_mode} = INCR_M_JSON;
            last;
        }
    }

    while ( $len > $p ) {
        my $s = substr( $text, $p++, 1 );

        if ( $s eq '"' ) {
            # A quote is escaped only when an odd number of backslashes
            # directly precedes it ("a\\" ends the string, "a\" does not).
            # The $q >= 0 guard keeps a quote at index 0 from looking at the
            # end of the buffer through a negative substr offset.
            my $backslashes = 0;
            my $q           = $p - 2;    # index of the char before the quote
            while ( $q >= 0 and substr( $text, $q, 1 ) eq '\\' ) {
                $backslashes++;
                $q--;
            }
            next if $backslashes % 2;

            if ( $self->{incr_mode} != INCR_M_STR ) {
                $self->{incr_mode} = INCR_M_STR;
            }
            else {
                $self->{incr_mode} = INCR_M_JSON;
                # A closed string at nesting level 0 is a complete value.
                unless ( $self->{incr_nest} ) {
                    last;
                }
            }
        }

        if ( $self->{incr_mode} == INCR_M_JSON ) {

            if ( $s eq '[' or $s eq '{' ) {
                if ( ++$self->{incr_nest} > $coder->get_max_depth ) {
                    Carp::croak('json text or perl structure exceeds maximum nesting level (max_depth set too low?)');
                }
            }
            elsif ( $s eq ']' or $s eq '}' ) {
                # Back at (or below) level 0: the value is complete.
                last if ( --$self->{incr_nest} <= 0 );
            }
            elsif ( $s eq '#' ) {
                # Skip the rest of a '#' comment up to and including "\n".
                while ( $len > $p ) {
                    last if substr( $text, $p++, 1 ) eq "\n";
                }
            }

        }

    }

    $self->{incr_p} = $p;

    # Not enough text yet: open top-level string, or unclosed array/object.
    return if ( $self->{incr_mode} == INCR_M_STR and not $self->{incr_nest} );
    return if ( $self->{incr_mode} == INCR_M_JSON and $self->{incr_nest} > 0 );

    return '' unless ( length substr( $self->{incr_text}, 0, $p ) );

    local $Carp::CarpLevel = 2;

    # Rewind the scan position and remember where the scan stopped before
    # decoding, so that the state is usable by incr_skip if the decoder croaks.
    $self->{incr_p} = $restore;
    $self->{incr_c} = $p;

    # List-context call, as before; only the first returned element is used.
    my ( $obj ) = $coder->PP_decode_json( substr( $self->{incr_text}, 0, $p ), 0x10000001 );

    # Drop the decoded prefix and restart scanning at the buffer start.
    $self->{incr_text} = substr( $self->{incr_text}, $p );
    $self->{incr_p} = 0;

    return $obj || '';
}
|
1578 |
+
|
1579 |
+
|
1580 |
+
# Return the currently buffered JSON text.  Croaks while the incr_parsing
# flag of the state object is true.
sub incr_text {
    Carp::croak("incr_text can not be called when the incremental parser already started parsing")
        if $_[0]->{incr_parsing};

    return $_[0]->{incr_text};
}
|
1586 |
+
|
1587 |
+
|
1588 |
+
# Drop the first incr_c characters of the buffer (incr_c is recorded by
# _incr_parse just before it hands text to the decoder) and rewind the scan
# position to the start of what remains.
sub incr_skip {
    my ( $self ) = @_;

    my $remaining = substr( $self->{incr_text}, $self->{incr_c} );

    $self->{incr_text} = $remaining;
    $self->{incr_p}    = 0;
}
|
1593 |
+
|
1594 |
+
|
1595 |
+
# Forget the buffered text and put the scanner back into its initial state:
# buffer undefined; position, mode, nesting depth and the "parsing" flag 0.
# incr_c is left untouched.
sub incr_reset {
    my ( $self ) = @_;

    $self->{incr_text} = undef;
    $self->{$_} = 0 for qw( incr_p incr_mode incr_nest );
    $self->{incr_parsing} = 0;
}
|
1603 |
+
|
1604 |
+
###############################
|
1605 |
+
|
1606 |
+
|
1607 |
+
1;
|
1608 |
+
__END__
|
1609 |
+
=pod
|
1610 |
+
|
1611 |
+
=head1 NAME
|
1612 |
+
|
1613 |
+
JSON::PP - JSON::XS compatible pure-Perl module.
|
1614 |
+
|
1615 |
+
=head1 SYNOPSIS
|
1616 |
+
|
1617 |
+
use JSON::PP;
|
1618 |
+
|
1619 |
+
# exported functions, they croak on error
|
1620 |
+
# and expect/generate UTF-8
|
1621 |
+
|
1622 |
+
$utf8_encoded_json_text = encode_json $perl_hash_or_arrayref;
|
1623 |
+
$perl_hash_or_arrayref = decode_json $utf8_encoded_json_text;
|
1624 |
+
|
1625 |
+
# OO-interface
|
1626 |
+
|
1627 |
+
$json = JSON::PP->new->ascii->pretty->allow_nonref;
|
1628 |
+
|
1629 |
+
$json_text = $json->encode( $perl_scalar );
|
1630 |
+
$perl_scalar = $json->decode( $json_text );
|
1631 |
+
|
1632 |
+
$pretty_printed = $json->pretty->encode( $perl_scalar ); # pretty-printing
|
1633 |
+
|
1634 |
+
# Note that JSON version 2.0 and above will automatically use
|
1635 |
+
# JSON::XS or JSON::PP, so you should be able to just:
|
1636 |
+
|
1637 |
+
use JSON;
|
1638 |
+
|
1639 |
+
|
1640 |
+
=head1 VERSION
|
1641 |
+
|
1642 |
+
2.27200
|
1643 |
+
|
1644 |
+
L<JSON::XS> 2.27 (~2.30) compatible.
|
1645 |
+
|
1646 |
+
=head1 DESCRIPTION
|
1647 |
+
|
1648 |
+
This module is a L<JSON::XS> compatible pure-Perl module.
|
1649 |
+
(Perl 5.8 or later is recommended)
|
1650 |
+
|
1651 |
+
JSON::XS is the fastest and most proper JSON module on CPAN.
|
1652 |
+
It is written by Marc Lehmann in C, so must be compiled and
|
1653 |
+
installed in the used environment.
|
1654 |
+
|
1655 |
+
JSON::PP is a pure-Perl module and has compatibility to JSON::XS.
|
1656 |
+
|
1657 |
+
|
1658 |
+
=head2 FEATURES
|
1659 |
+
|
1660 |
+
=over
|
1661 |
+
|
1662 |
+
=item * correct unicode handling
|
1663 |
+
|
1664 |
+
This module knows how to handle Unicode (depending on Perl version).
|
1665 |
+
|
1666 |
+
See to L<JSON::XS/A FEW NOTES ON UNICODE AND PERL> and
|
1667 |
+
L<UNICODE HANDLING ON PERLS>.
|
1668 |
+
|
1669 |
+
|
1670 |
+
=item * round-trip integrity
|
1671 |
+
|
1672 |
+
When you serialise a perl data structure using only data types
|
1673 |
+
supported by JSON and Perl, the deserialised data structure is
|
1674 |
+
identical on the Perl level. (e.g. the string "2.0" doesn't suddenly
|
1675 |
+
become "2" just because it looks like a number). There I<are> minor
|
1676 |
+
exceptions to this, read the MAPPING section below to learn about
|
1677 |
+
those.
|
1678 |
+
|
1679 |
+
|
1680 |
+
=item * strict checking of JSON correctness
|
1681 |
+
|
1682 |
+
There is no guessing, no generating of illegal JSON texts by default,
|
1683 |
+
and only JSON is accepted as input by default (the latter is a
|
1684 |
+
security feature). But when some options are set, loose checking
|
1685 |
+
features are available.
|
1686 |
+
|
1687 |
+
=back
|
1688 |
+
|
1689 |
+
=head1 FUNCTIONAL INTERFACE
|
1690 |
+
|
1691 |
+
Some documents are copied and modified from L<JSON::XS/FUNCTIONAL INTERFACE>.
|
1692 |
+
|
1693 |
+
=head2 encode_json
|
1694 |
+
|
1695 |
+
$json_text = encode_json $perl_scalar
|
1696 |
+
|
1697 |
+
Converts the given Perl data structure to a UTF-8 encoded, binary string.
|
1698 |
+
|
1699 |
+
This function call is functionally identical to:
|
1700 |
+
|
1701 |
+
$json_text = JSON::PP->new->utf8->encode($perl_scalar)
|
1702 |
+
|
1703 |
+
=head2 decode_json
|
1704 |
+
|
1705 |
+
$perl_scalar = decode_json $json_text
|
1706 |
+
|
1707 |
+
The opposite of C<encode_json>: expects an UTF-8 (binary) string and tries
|
1708 |
+
to parse that as an UTF-8 encoded JSON text, returning the resulting
|
1709 |
+
reference.
|
1710 |
+
|
1711 |
+
This function call is functionally identical to:
|
1712 |
+
|
1713 |
+
$perl_scalar = JSON::PP->new->utf8->decode($json_text)
|
1714 |
+
|
1715 |
+
=head2 JSON::PP::is_bool
|
1716 |
+
|
1717 |
+
$is_boolean = JSON::PP::is_bool($scalar)
|
1718 |
+
|
1719 |
+
Returns true if the passed scalar represents either JSON::PP::true or
|
1720 |
+
JSON::PP::false, two constants that act like C<1> and C<0> respectively
|
1721 |
+
and are also used to represent JSON C<true> and C<false> in Perl strings.
|
1722 |
+
|
1723 |
+
=head2 JSON::PP::true
|
1724 |
+
|
1725 |
+
Returns the JSON true value, which is a blessed object.
|
1726 |
+
It C<isa> JSON::PP::Boolean object.
|
1727 |
+
|
1728 |
+
=head2 JSON::PP::false
|
1729 |
+
|
1730 |
+
Returns the JSON false value, which is a blessed object.
|
1731 |
+
It C<isa> JSON::PP::Boolean object.
|
1732 |
+
|
1733 |
+
=head2 JSON::PP::null
|
1734 |
+
|
1735 |
+
Returns C<undef>.
|
1736 |
+
|
1737 |
+
See L<MAPPING>, below, for more information on how JSON values are mapped to
|
1738 |
+
Perl.
|
1739 |
+
|
1740 |
+
|
1741 |
+
=head1 HOW DO I DECODE A DATA FROM OUTER AND ENCODE TO OUTER
|
1742 |
+
|
1743 |
+
This section supposes that your perl version is 5.8 or later.
|
1744 |
+
|
1745 |
+
If you know a JSON text from an outer world - a network, a file content, and so on,
|
1746 |
+
is encoded in UTF-8, you should use C<decode_json> or C<JSON> module object
|
1747 |
+
with C<utf8> enable. And the decoded result will contain UNICODE characters.
|
1748 |
+
|
1749 |
+
# from network
|
1750 |
+
my $json = JSON::PP->new->utf8;
|
1751 |
+
my $json_text = CGI->new->param( 'json_data' );
|
1752 |
+
my $perl_scalar = $json->decode( $json_text );
|
1753 |
+
|
1754 |
+
# from file content
|
1755 |
+
local $/;
|
1756 |
+
open( my $fh, '<', 'json.data' );
|
1757 |
+
$json_text = <$fh>;
|
1758 |
+
$perl_scalar = decode_json( $json_text );
|
1759 |
+
|
1760 |
+
If an outer data is not encoded in UTF-8, firstly you should C<decode> it.
|
1761 |
+
|
1762 |
+
use Encode;
|
1763 |
+
local $/;
|
1764 |
+
open( my $fh, '<', 'json.data' );
|
1765 |
+
my $encoding = 'cp932';
|
1766 |
+
my $unicode_json_text = decode( $encoding, <$fh> ); # UNICODE
|
1767 |
+
|
1768 |
+
# or you can write the below code.
|
1769 |
+
#
|
1770 |
+
# open( my $fh, "<:encoding($encoding)", 'json.data' );
|
1771 |
+
# $unicode_json_text = <$fh>;
|
1772 |
+
|
1773 |
+
In this case, C<$unicode_json_text> is of course UNICODE string.
|
1774 |
+
So you B<cannot> use C<decode_json> nor C<JSON> module object with C<utf8> enable.
|
1775 |
+
Instead of them, you use C<JSON> module object with C<utf8> disable.
|
1776 |
+
|
1777 |
+
$perl_scalar = $json->utf8(0)->decode( $unicode_json_text );
|
1778 |
+
|
1779 |
+
Or C<encode 'utf8'> and C<decode_json>:
|
1780 |
+
|
1781 |
+
$perl_scalar = decode_json( encode( 'utf8', $unicode_json_text ) );
|
1782 |
+
# this way is not efficient.
|
1783 |
+
|
1784 |
+
And now, you want to convert your C<$perl_scalar> into JSON data and
|
1785 |
+
send it to an outer world - a network or a file content, and so on.
|
1786 |
+
|
1787 |
+
Your data usually contains UNICODE strings and you want the converted data to be encoded
|
1788 |
+
in UTF-8, you should use C<encode_json> or C<JSON> module object with C<utf8> enable.
|
1789 |
+
|
1790 |
+
print encode_json( $perl_scalar ); # to a network? file? or display?
|
1791 |
+
# or
|
1792 |
+
print $json->utf8->encode( $perl_scalar );
|
1793 |
+
|
1794 |
+
If C<$perl_scalar> does not contain UNICODE but C<$encoding>-encoded strings
|
1795 |
+
for some reason, then its characters are regarded as B<latin1> for perl
|
1796 |
+
(because it does not concern with your $encoding).
|
1797 |
+
You B<cannot> use C<encode_json> nor C<JSON> module object with C<utf8> enable.
|
1798 |
+
Instead of them, you use C<JSON> module object with C<utf8> disable.
|
1799 |
+
Note that the resulted text is a UNICODE string but no problem to print it.
|
1800 |
+
|
1801 |
+
# $perl_scalar contains $encoding encoded string values
|
1802 |
+
$unicode_json_text = $json->utf8(0)->encode( $perl_scalar );
|
1803 |
+
# $unicode_json_text consists of characters less than 0x100
|
1804 |
+
print $unicode_json_text;
|
1805 |
+
|
1806 |
+
Or C<decode $encoding> all string values and C<encode_json>:
|
1807 |
+
|
1808 |
+
$perl_scalar->{ foo } = decode( $encoding, $perl_scalar->{ foo } );
|
1809 |
+
# ... do it to each string values, then encode_json
|
1810 |
+
$json_text = encode_json( $perl_scalar );
|
1811 |
+
|
1812 |
+
This method is a proper way but probably not efficient.
|
1813 |
+
|
1814 |
+
See to L<Encode>, L<perluniintro>.
|
1815 |
+
|
1816 |
+
|
1817 |
+
=head1 METHODS
|
1818 |
+
|
1819 |
+
Basically, check to L<JSON> or L<JSON::XS>.
|
1820 |
+
|
1821 |
+
=head2 new
|
1822 |
+
|
1823 |
+
$json = JSON::PP->new
|
1824 |
+
|
1825 |
+
Returns a new JSON::PP object that can be used to de/encode JSON
|
1826 |
+
strings.
|
1827 |
+
|
1828 |
+
All boolean flags described below are by default I<disabled>.
|
1829 |
+
|
1830 |
+
The mutators for flags all return the JSON object again and thus calls can
|
1831 |
+
be chained:
|
1832 |
+
|
1833 |
+
my $json = JSON::PP->new->utf8->space_after->encode({a => [1,2]})
|
1834 |
+
=> {"a": [1, 2]}
|
1835 |
+
|
1836 |
+
=head2 ascii
|
1837 |
+
|
1838 |
+
$json = $json->ascii([$enable])
|
1839 |
+
|
1840 |
+
$enabled = $json->get_ascii
|
1841 |
+
|
1842 |
+
If $enable is true (or missing), then the encode method will not generate characters outside
|
1843 |
+
the code range 0..127. Any Unicode characters outside that range will be escaped using either
|
1844 |
+
a single \uXXXX or a double \uHHHH\uLLLL escape sequence, as per RFC4627.
|
1845 |
+
(See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>).
|
1846 |
+
|
1847 |
+
In Perl 5.005, there is no character having high value (more than 255).
|
1848 |
+
See to L<UNICODE HANDLING ON PERLS>.
|
1849 |
+
|
1850 |
+
If $enable is false, then the encode method will not escape Unicode characters unless
|
1851 |
+
required by the JSON syntax or other flags. This results in a faster and more compact format.
|
1852 |
+
|
1853 |
+
JSON::PP->new->ascii(1)->encode([chr 0x10401])
|
1854 |
+
=> ["\ud801\udc01"]
|
1855 |
+
|
1856 |
+
=head2 latin1
|
1857 |
+
|
1858 |
+
$json = $json->latin1([$enable])
|
1859 |
+
|
1860 |
+
$enabled = $json->get_latin1
|
1861 |
+
|
1862 |
+
If $enable is true (or missing), then the encode method will encode the resulting JSON
|
1863 |
+
text as latin1 (or iso-8859-1), escaping any characters outside the code range 0..255.
|
1864 |
+
|
1865 |
+
If $enable is false, then the encode method will not escape Unicode characters
|
1866 |
+
unless required by the JSON syntax or other flags.
|
1867 |
+
|
1868 |
+
JSON::PP->new->latin1->encode (["\x{89}\x{abc}"])
|
1869 |
+
=> ["\x{89}\\u0abc"] # (perl syntax, U+abc escaped, U+89 not)
|
1870 |
+
|
1871 |
+
See to L<UNICODE HANDLING ON PERLS>.
|
1872 |
+
|
1873 |
+
=head2 utf8
|
1874 |
+
|
1875 |
+
$json = $json->utf8([$enable])
|
1876 |
+
|
1877 |
+
$enabled = $json->get_utf8
|
1878 |
+
|
1879 |
+
If $enable is true (or missing), then the encode method will encode the JSON result
|
1880 |
+
into UTF-8, as required by many protocols, while the decode method expects to be handed
|
1881 |
+
a UTF-8-encoded string. Please note that UTF-8-encoded strings do not contain any
|
1882 |
+
characters outside the range 0..255, they are thus useful for bytewise/binary I/O.
|
1883 |
+
|
1884 |
+
(In Perl 5.005, any character outside the range 0..255 does not exist.
|
1885 |
+
See to L<UNICODE HANDLING ON PERLS>.)
|
1886 |
+
|
1887 |
+
In future versions, enabling this option might enable autodetection of the UTF-16 and UTF-32
|
1888 |
+
encoding families, as described in RFC4627.
|
1889 |
+
|
1890 |
+
If $enable is false, then the encode method will return the JSON string as a (non-encoded)
|
1891 |
+
Unicode string, while decode thus expects a Unicode string. Any decoding or encoding
|
1892 |
+
(e.g. to UTF-8 or UTF-16) needs to be done yourself, e.g. using the Encode module.
|
1893 |
+
|
1894 |
+
Example, output UTF-16BE-encoded JSON:
|
1895 |
+
|
1896 |
+
use Encode;
|
1897 |
+
$jsontext = encode "UTF-16BE", JSON::PP->new->encode ($object);
|
1898 |
+
|
1899 |
+
Example, decode UTF-32LE-encoded JSON:
|
1900 |
+
|
1901 |
+
use Encode;
|
1902 |
+
$object = JSON::PP->new->decode (decode "UTF-32LE", $jsontext);
|
1903 |
+
|
1904 |
+
|
1905 |
+
=head2 pretty
|
1906 |
+
|
1907 |
+
$json = $json->pretty([$enable])
|
1908 |
+
|
1909 |
+
This enables (or disables) all of the C<indent>, C<space_before> and
|
1910 |
+
C<space_after> flags in one call to generate the most readable
|
1911 |
+
(or most compact) form possible.
|
1912 |
+
|
1913 |
+
Equivalent to:
|
1914 |
+
|
1915 |
+
$json->indent->space_before->space_after
|
1916 |
+
|
1917 |
+
=head2 indent
|
1918 |
+
|
1919 |
+
$json = $json->indent([$enable])
|
1920 |
+
|
1921 |
+
$enabled = $json->get_indent
|
1922 |
+
|
1923 |
+
The default indent space length is three.
|
1924 |
+
You can use C<indent_length> to change the length.
|
1925 |
+
|
1926 |
+
=head2 space_before
|
1927 |
+
|
1928 |
+
$json = $json->space_before([$enable])
|
1929 |
+
|
1930 |
+
$enabled = $json->get_space_before
|
1931 |
+
|
1932 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1933 |
+
optional space before the C<:> separating keys from values in JSON objects.
|
1934 |
+
|
1935 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1936 |
+
space at those places.
|
1937 |
+
|
1938 |
+
This setting has no effect when decoding JSON texts.
|
1939 |
+
|
1940 |
+
Example, space_before enabled, space_after and indent disabled:
|
1941 |
+
|
1942 |
+
{"key" :"value"}
|
1943 |
+
|
1944 |
+
=head2 space_after
|
1945 |
+
|
1946 |
+
$json = $json->space_after([$enable])
|
1947 |
+
|
1948 |
+
$enabled = $json->get_space_after
|
1949 |
+
|
1950 |
+
If C<$enable> is true (or missing), then the C<encode> method will add an extra
|
1951 |
+
optional space after the C<:> separating keys from values in JSON objects
|
1952 |
+
and extra whitespace after the C<,> separating key-value pairs and array
|
1953 |
+
members.
|
1954 |
+
|
1955 |
+
If C<$enable> is false, then the C<encode> method will not add any extra
|
1956 |
+
space at those places.
|
1957 |
+
|
1958 |
+
This setting has no effect when decoding JSON texts.
|
1959 |
+
|
1960 |
+
Example, space_before and indent disabled, space_after enabled:
|
1961 |
+
|
1962 |
+
{"key": "value"}
|
1963 |
+
|
1964 |
+
=head2 relaxed
|
1965 |
+
|
1966 |
+
$json = $json->relaxed([$enable])
|
1967 |
+
|
1968 |
+
$enabled = $json->get_relaxed
|
1969 |
+
|
1970 |
+
If C<$enable> is true (or missing), then C<decode> will accept some
|
1971 |
+
extensions to normal JSON syntax (see below). C<encode> will not be
|
1972 |
+
affected in any way. I<Be aware that this option makes you accept invalid
|
1973 |
+
JSON texts as if they were valid!>. I suggest only to use this option to
|
1974 |
+
parse application-specific files written by humans (configuration files,
|
1975 |
+
resource files etc.)
|
1976 |
+
|
1977 |
+
If C<$enable> is false (the default), then C<decode> will only accept
|
1978 |
+
valid JSON texts.
|
1979 |
+
|
1980 |
+
Currently accepted extensions are:
|
1981 |
+
|
1982 |
+
=over 4
|
1983 |
+
|
1984 |
+
=item * list items can have an end-comma
|
1985 |
+
|
1986 |
+
JSON I<separates> array elements and key-value pairs with commas. This
|
1987 |
+
can be annoying if you write JSON texts manually and want to be able to
|
1988 |
+
quickly append elements, so this extension accepts comma at the end of
|
1989 |
+
such items not just between them:
|
1990 |
+
|
1991 |
+
[
|
1992 |
+
1,
|
1993 |
+
2, <- this comma not normally allowed
|
1994 |
+
]
|
1995 |
+
{
|
1996 |
+
"k1": "v1",
|
1997 |
+
"k2": "v2", <- this comma not normally allowed
|
1998 |
+
}
|
1999 |
+
|
2000 |
+
=item * shell-style '#'-comments
|
2001 |
+
|
2002 |
+
Whenever JSON allows whitespace, shell-style comments are additionally
|
2003 |
+
allowed. They are terminated by the first carriage-return or line-feed
|
2004 |
+
character, after which more white-space and comments are allowed.
|
2005 |
+
|
2006 |
+
[
|
2007 |
+
1, # this comment not allowed in JSON
|
2008 |
+
# neither this one...
|
2009 |
+
]
|
2010 |
+
|
2011 |
+
=back
|
2012 |
+
|
2013 |
+
=head2 canonical
|
2014 |
+
|
2015 |
+
$json = $json->canonical([$enable])
|
2016 |
+
|
2017 |
+
$enabled = $json->get_canonical
|
2018 |
+
|
2019 |
+
If C<$enable> is true (or missing), then the C<encode> method will output JSON objects
|
2020 |
+
by sorting their keys. This is adding a comparatively high overhead.
|
2021 |
+
|
2022 |
+
If C<$enable> is false, then the C<encode> method will output key-value
|
2023 |
+
pairs in the order Perl stores them (which will likely change between runs
|
2024 |
+
of the same script).
|
2025 |
+
|
2026 |
+
This option is useful if you want the same data structure to be encoded as
|
2027 |
+
the same JSON text (given the same overall settings). If it is disabled,
|
2028 |
+
the same hash might be encoded differently even if it contains the same data,
|
2029 |
+
as key-value pairs have no inherent ordering in Perl.
|
2030 |
+
|
2031 |
+
This setting has no effect when decoding JSON texts.
|
2032 |
+
|
2033 |
+
If you want your own sorting routine, you can give a code reference
|
2034 |
+
or a subroutine name to C<sort_by>. See to C<JSON::PP OWN METHODS>.
|
2035 |
+
|
2036 |
+
=head2 allow_nonref
|
2037 |
+
|
2038 |
+
$json = $json->allow_nonref([$enable])
|
2039 |
+
|
2040 |
+
$enabled = $json->get_allow_nonref
|
2041 |
+
|
2042 |
+
If C<$enable> is true (or missing), then the C<encode> method can convert a
|
2043 |
+
non-reference into its corresponding string, number or null JSON value,
|
2044 |
+
which is an extension to RFC4627. Likewise, C<decode> will accept those JSON
|
2045 |
+
values instead of croaking.
|
2046 |
+
|
2047 |
+
If C<$enable> is false, then the C<encode> method will croak if it isn't
|
2048 |
+
passed an arrayref or hashref, as JSON texts must either be an object
|
2049 |
+
or array. Likewise, C<decode> will croak if given something that is not a
|
2050 |
+
JSON object or array.
|
2051 |
+
|
2052 |
+
JSON::PP->new->allow_nonref->encode ("Hello, World!")
|
2053 |
+
=> "Hello, World!"
|
2054 |
+
|
2055 |
+
=head2 allow_unknown
|
2056 |
+
|
2057 |
+
$json = $json->allow_unknown ([$enable])
|
2058 |
+
|
2059 |
+
$enabled = $json->get_allow_unknown
|
2060 |
+
|
2061 |
+
If $enable is true (or missing), then "encode" will *not* throw an
|
2062 |
+
exception when it encounters values it cannot represent in JSON (for
|
2063 |
+
example, filehandles) but instead will encode a JSON "null" value.
|
2064 |
+
Note that blessed objects are not included here and are handled
|
2065 |
+
separately by C<allow_blessed>.
|
2066 |
+
|
2067 |
+
If $enable is false (the default), then "encode" will throw an
|
2068 |
+
exception when it encounters anything it cannot encode as JSON.
|
2069 |
+
|
2070 |
+
This option does not affect "decode" in any way, and it is
|
2071 |
+
recommended to leave it off unless you know your communications
|
2072 |
+
partner.
|
2073 |
+
|
2074 |
+
=head2 allow_blessed
|
2075 |
+
|
2076 |
+
$json = $json->allow_blessed([$enable])
|
2077 |
+
|
2078 |
+
$enabled = $json->get_allow_blessed
|
2079 |
+
|
2080 |
+
If C<$enable> is true (or missing), then the C<encode> method will not
|
2081 |
+
barf when it encounters a blessed reference. Instead, the value of the
|
2082 |
+
B<convert_blessed> option will decide whether C<null> (C<convert_blessed>
|
2083 |
+
disabled or no C<TO_JSON> method found) or a representation of the
|
2084 |
+
object (C<convert_blessed> enabled and C<TO_JSON> method found) is being
|
2085 |
+
encoded. Has no effect on C<decode>.
|
2086 |
+
|
2087 |
+
If C<$enable> is false (the default), then C<encode> will throw an
|
2088 |
+
exception when it encounters a blessed object.
|
2089 |
+
|
2090 |
+
=head2 convert_blessed
|
2091 |
+
|
2092 |
+
$json = $json->convert_blessed([$enable])
|
2093 |
+
|
2094 |
+
$enabled = $json->get_convert_blessed
|
2095 |
+
|
2096 |
+
If C<$enable> is true (or missing), then C<encode>, upon encountering a
|
2097 |
+
blessed object, will check for the availability of the C<TO_JSON> method
|
2098 |
+
on the object's class. If found, it will be called in scalar context
|
2099 |
+
and the resulting scalar will be encoded instead of the object. If no
|
2100 |
+
C<TO_JSON> method is found, the value of C<allow_blessed> will decide what
|
2101 |
+
to do.
|
2102 |
+
|
2103 |
+
The C<TO_JSON> method may safely call die if it wants. If C<TO_JSON>
|
2104 |
+
returns other blessed objects, those will be handled in the same
|
2105 |
+
way. C<TO_JSON> must take care of not causing an endless recursion cycle
|
2106 |
+
(== crash) in this case. The name of C<TO_JSON> was chosen because other
|
2107 |
+
methods called by the Perl core (== not by the user of the object) are
|
2108 |
+
usually in upper case letters and to avoid collisions with the C<to_json>
|
2109 |
+
function or method.
|
2110 |
+
|
2111 |
+
This setting does not yet influence C<decode> in any way.
|
2112 |
+
|
2113 |
+
If C<$enable> is false, then the C<allow_blessed> setting will decide what
|
2114 |
+
to do when a blessed object is found.
|
2115 |
+
|
2116 |
+
=head2 filter_json_object
|
2117 |
+
|
2118 |
+
$json = $json->filter_json_object([$coderef])
|
2119 |
+
|
2120 |
+
When C<$coderef> is specified, it will be called from C<decode> each
|
2121 |
+
time it decodes a JSON object. The only argument passed to the coderef
|
2122 |
+
is a reference to the newly-created hash. If the code reference returns
|
2123 |
+
a single scalar (which need not be a reference), this value
|
2124 |
+
(i.e. a copy of that scalar to avoid aliasing) is inserted into the
|
2125 |
+
deserialised data structure. If it returns an empty list
|
2126 |
+
(NOTE: I<not> C<undef>, which is a valid scalar), the original deserialised
|
2127 |
+
hash will be inserted. This setting can slow down decoding considerably.
|
2128 |
+
|
2129 |
+
When C<$coderef> is omitted or undefined, any existing callback will
|
2130 |
+
be removed and C<decode> will not change the deserialised hash in any
|
2131 |
+
way.
|
2132 |
+
|
2133 |
+
Example, convert all JSON objects into the integer 5:
|
2134 |
+
|
2135 |
+
my $js = JSON::PP->new->filter_json_object (sub { 5 });
|
2136 |
+
# returns [5]
|
2137 |
+
$js->decode ('[{}]'); # the given subroutine takes a hash reference.
|
2138 |
+
# throw an exception because allow_nonref is not enabled
|
2139 |
+
# so a lone 5 is not allowed.
|
2140 |
+
$js->decode ('{"a":1, "b":2}');
|
2141 |
+
|
2142 |
+
=head2 filter_json_single_key_object
|
2143 |
+
|
2144 |
+
$json = $json->filter_json_single_key_object($key [=> $coderef])
|
2145 |
+
|
2146 |
+
Works remotely similar to C<filter_json_object>, but is only called for
|
2147 |
+
JSON objects having a single key named C<$key>.
|
2148 |
+
|
2149 |
+
This C<$coderef> is called before the one specified via
|
2150 |
+
C<filter_json_object>, if any. It gets passed the single value in the JSON
|
2151 |
+
object. If it returns a single value, it will be inserted into the data
|
2152 |
+
structure. If it returns nothing (not even C<undef> but the empty list),
|
2153 |
+
the callback from C<filter_json_object> will be called next, as if no
|
2154 |
+
single-key callback were specified.
|
2155 |
+
|
2156 |
+
If C<$coderef> is omitted or undefined, the corresponding callback will be
|
2157 |
+
disabled. There can only ever be one callback for a given key.
|
2158 |
+
|
2159 |
+
As this callback gets called less often than the C<filter_json_object>
|
2160 |
+
one, decoding speed will not usually suffer as much. Therefore, single-key
|
2161 |
+
objects make excellent targets to serialise Perl objects into, especially
|
2162 |
+
as single-key JSON objects are as close to the type-tagged value concept
|
2163 |
+
as JSON gets (it's basically an ID/VALUE tuple). Of course, JSON does not
|
2164 |
+
support this in any way, so you need to make sure your data never looks
|
2165 |
+
like a serialised Perl hash.
|
2166 |
+
|
2167 |
+
Typical names for the single object key are C<__class_whatever__>, or
|
2168 |
+
C<$__dollars_are_rarely_used__$> or C<}ugly_brace_placement>, or even
|
2169 |
+
things like C<__class_md5sum(classname)__>, to reduce the risk of clashing
|
2170 |
+
with real hashes.
|
2171 |
+
|
2172 |
+
Example, decode JSON objects of the form C<< { "__widget__" => <id> } >>
|
2173 |
+
into the corresponding C<< $WIDGET{<id>} >> object:
|
2174 |
+
|
2175 |
+
# return whatever is in $WIDGET{5}:
|
2176 |
+
JSON::PP
|
2177 |
+
->new
|
2178 |
+
->filter_json_single_key_object (__widget__ => sub {
|
2179 |
+
$WIDGET{ $_[0] }
|
2180 |
+
})
|
2181 |
+
->decode ('{"__widget__": 5}')
|
2182 |
+
|
2183 |
+
# this can be used with a TO_JSON method in some "widget" class
|
2184 |
+
# for serialisation to json:
|
2185 |
+
sub WidgetBase::TO_JSON {
|
2186 |
+
my ($self) = @_;
|
2187 |
+
|
2188 |
+
unless ($self->{id}) {
|
2189 |
+
$self->{id} = ..get..some..id..;
|
2190 |
+
$WIDGET{$self->{id}} = $self;
|
2191 |
+
}
|
2192 |
+
|
2193 |
+
{ __widget__ => $self->{id} }
|
2194 |
+
}
|
2195 |
+
|
2196 |
+
=head2 shrink
|
2197 |
+
|
2198 |
+
$json = $json->shrink([$enable])
|
2199 |
+
|
2200 |
+
$enabled = $json->get_shrink
|
2201 |
+
|
2202 |
+
In JSON::XS, this flag resizes strings generated by either
|
2203 |
+
C<encode> or C<decode> to their minimum size possible.
|
2204 |
+
It will also try to downgrade any strings to octet-form if possible.
|
2205 |
+
|
2206 |
+
In JSON::PP, it is noop about resizing strings but tries
|
2207 |
+
C<utf8::downgrade> to the returned string by C<encode>.
|
2208 |
+
See to L<utf8>.
|
2209 |
+
|
2210 |
+
See to L<JSON::XS/OBJECT-ORIENTED INTERFACE>
|
2211 |
+
|
2212 |
+
=head2 max_depth
|
2213 |
+
|
2214 |
+
$json = $json->max_depth([$maximum_nesting_depth])
|
2215 |
+
|
2216 |
+
$max_depth = $json->get_max_depth
|
2217 |
+
|
2218 |
+
Sets the maximum nesting level (default C<512>) accepted while encoding
|
2219 |
+
or decoding. If a higher nesting level is detected in JSON text or a Perl
|
2220 |
+
data structure, then the encoder and decoder will stop and croak at that
|
2221 |
+
point.
|
2222 |
+
|
2223 |
+
Nesting level is defined by number of hash- or arrayrefs that the encoder
|
2224 |
+
needs to traverse to reach a given point or the number of C<{> or C<[>
|
2225 |
+
characters without their matching closing parenthesis crossed to reach a
|
2226 |
+
given character in a string.
|
2227 |
+
|
2228 |
+
If no argument is given, the highest possible setting will be used, which
|
2229 |
+
is rarely useful.
|
2230 |
+
|
2231 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
|
2232 |
+
|
2233 |
+
When a large value (100 or more) was set and it de/encodes a deep nested object/text,
|
2234 |
+
it may raise a warning 'Deep recursion on subroutine' at the perl runtime phase.
|
2235 |
+
|
2236 |
+
=head2 max_size
|
2237 |
+
|
2238 |
+
$json = $json->max_size([$maximum_string_size])
|
2239 |
+
|
2240 |
+
$max_size = $json->get_max_size
|
2241 |
+
|
2242 |
+
Set the maximum length a JSON text may have (in bytes) where decoding is
|
2243 |
+
being attempted. The default is C<0>, meaning no limit. When C<decode>
|
2244 |
+
is called on a string that is longer than this many bytes, it will not
|
2245 |
+
attempt to decode the string but throw an exception. This setting has no
|
2246 |
+
effect on C<encode> (yet).
|
2247 |
+
|
2248 |
+
If no argument is given, the limit check will be deactivated (same as when
|
2249 |
+
C<0> is specified).
|
2250 |
+
|
2251 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS> for more info on why this is useful.
|
2252 |
+
|
2253 |
+
=head2 encode
|
2254 |
+
|
2255 |
+
$json_text = $json->encode($perl_scalar)
|
2256 |
+
|
2257 |
+
Converts the given Perl data structure (a simple scalar or a reference
|
2258 |
+
to a hash or array) to its JSON representation. Simple scalars will be
|
2259 |
+
converted into JSON string or number sequences, while references to arrays
|
2260 |
+
become JSON arrays and references to hashes become JSON objects. Undefined
|
2261 |
+
Perl values (e.g. C<undef>) become JSON C<null> values.
|
2262 |
+
References to the integers C<0> and C<1> are converted into C<false> and C<true>, respectively.
|
2263 |
+
|
2264 |
+
=head2 decode
|
2265 |
+
|
2266 |
+
$perl_scalar = $json->decode($json_text)
|
2267 |
+
|
2268 |
+
The opposite of C<encode>: expects a JSON text and tries to parse it,
|
2269 |
+
returning the resulting simple scalar or reference. Croaks on error.
|
2270 |
+
|
2271 |
+
JSON numbers and strings become simple Perl scalars. JSON arrays become
|
2272 |
+
Perl arrayrefs and JSON objects become Perl hashrefs. C<true> becomes
|
2273 |
+
C<1> (C<JSON::true>), C<false> becomes C<0> (C<JSON::false>) and
|
2274 |
+
C<null> becomes C<undef>.
|
2275 |
+
|
2276 |
+
=head2 decode_prefix
|
2277 |
+
|
2278 |
+
($perl_scalar, $characters) = $json->decode_prefix($json_text)
|
2279 |
+
|
2280 |
+
This works like the C<decode> method, but instead of raising an exception
|
2281 |
+
when there is trailing garbage after the first JSON object, it will
|
2282 |
+
silently stop parsing there and return the number of characters consumed
|
2283 |
+
so far.
|
2284 |
+
|
2285 |
+
JSON->new->decode_prefix ("[1] the tail")
|
2286 |
+
=> ([1], 3)
|
2287 |
+
|
2288 |
+
=head1 INCREMENTAL PARSING
|
2289 |
+
|
2290 |
+
Most of this section is copied and modified from L<JSON::XS/INCREMENTAL PARSING>.
|
2291 |
+
|
2292 |
+
In some cases, there is the need for incremental parsing of JSON texts.
|
2293 |
+
This module does allow you to parse a JSON stream incrementally.
|
2294 |
+
It does so by accumulating text until it has a full JSON object, which
|
2295 |
+
it then can decode. This process is similar to using C<decode_prefix>
|
2296 |
+
to see if a full JSON object is available, but is much more efficient
|
2297 |
+
(and can be implemented with a minimum of method calls).
|
2298 |
+
|
2299 |
+
This module will only attempt to parse the JSON text once it is sure it
|
2300 |
+
has enough text to get a decisive result, using a very simple but
|
2301 |
+
truly incremental parser. This means that it sometimes won't stop as
|
2302 |
+
early as the full parser, for example, it doesn't detect parenthesis
|
2303 |
+
mismatches. The only thing it guarantees is that it starts decoding as
|
2304 |
+
soon as a syntactically valid JSON text has been seen. This means you need
|
2305 |
+
to set resource limits (e.g. C<max_size>) to ensure the parser will stop
|
2306 |
+
parsing in the presence of syntax errors.
|
2307 |
+
|
2308 |
+
The following methods implement this incremental parser.
|
2309 |
+
|
2310 |
+
=head2 incr_parse
|
2311 |
+
|
2312 |
+
$json->incr_parse( [$string] ) # void context
|
2313 |
+
|
2314 |
+
$obj_or_undef = $json->incr_parse( [$string] ) # scalar context
|
2315 |
+
|
2316 |
+
@obj_or_empty = $json->incr_parse( [$string] ) # list context
|
2317 |
+
|
2318 |
+
This is the central parsing function. It can both append new text and
|
2319 |
+
extract objects from the stream accumulated so far (both of these
|
2320 |
+
functions are optional).
|
2321 |
+
|
2322 |
+
If C<$string> is given, then this string is appended to the already
|
2323 |
+
existing JSON fragment stored in the C<$json> object.
|
2324 |
+
|
2325 |
+
After that, if the function is called in void context, it will simply
|
2326 |
+
return without doing anything further. This can be used to add more text
|
2327 |
+
in as many chunks as you want.
|
2328 |
+
|
2329 |
+
If the method is called in scalar context, then it will try to extract
|
2330 |
+
exactly I<one> JSON object. If that is successful, it will return this
|
2331 |
+
object, otherwise it will return C<undef>. If there is a parse error,
|
2332 |
+
this method will croak just as C<decode> would do (one can then use
|
2333 |
+
C<incr_skip> to skip the erroneous part). This is the most common way of
|
2334 |
+
using the method.
|
2335 |
+
|
2336 |
+
And finally, in list context, it will try to extract as many objects
|
2337 |
+
from the stream as it can find and return them, or the empty list
|
2338 |
+
otherwise. For this to work, there must be no separators between the JSON
|
2339 |
+
objects or arrays, instead they must be concatenated back-to-back. If
|
2340 |
+
an error occurs, an exception will be raised as in the scalar context
|
2341 |
+
case. Note that in this case, any previously-parsed JSON texts will be
|
2342 |
+
lost.
|
2343 |
+
|
2344 |
+
Example: Parse some JSON arrays/objects in a given string and return them.
|
2345 |
+
|
2346 |
+
my @objs = JSON->new->incr_parse ("[5][7][1,2]");
|
2347 |
+
|
2348 |
+
=head2 incr_text
|
2349 |
+
|
2350 |
+
$lvalue_string = $json->incr_text
|
2351 |
+
|
2352 |
+
This method returns the currently stored JSON fragment as an lvalue, that
|
2353 |
+
is, you can manipulate it. This I<only> works when a preceding call to
|
2354 |
+
C<incr_parse> in I<scalar context> successfully returned an object. Under
|
2355 |
+
all other circumstances you must not call this function (I mean it.
|
2356 |
+
although in simple tests it might actually work, it I<will> fail under
|
2357 |
+
real world conditions). As a special exception, you can also call this
|
2358 |
+
method before having parsed anything.
|
2359 |
+
|
2360 |
+
This function is useful in two cases: a) finding the trailing text after a
|
2361 |
+
JSON object or b) parsing multiple JSON objects separated by non-JSON text
|
2362 |
+
(such as commas).
|
2363 |
+
|
2364 |
+
$json->incr_text =~ s/\s*,\s*//;
|
2365 |
+
|
2366 |
+
In Perl 5.005, C<lvalue> attribute is not available.
|
2367 |
+
You must write codes like the below:
|
2368 |
+
|
2369 |
+
$string = $json->incr_text;
|
2370 |
+
$string =~ s/\s*,\s*//;
|
2371 |
+
$json->incr_text( $string );
|
2372 |
+
|
2373 |
+
=head2 incr_skip
|
2374 |
+
|
2375 |
+
$json->incr_skip
|
2376 |
+
|
2377 |
+
This will reset the state of the incremental parser and will remove the
|
2378 |
+
parsed text from the input buffer. This is useful after C<incr_parse>
|
2379 |
+
died, in which case the input buffer and incremental parser state is left
|
2380 |
+
unchanged, to skip the text parsed so far and to reset the parse state.
|
2381 |
+
|
2382 |
+
=head2 incr_reset
|
2383 |
+
|
2384 |
+
$json->incr_reset
|
2385 |
+
|
2386 |
+
This completely resets the incremental parser, that is, after this call,
|
2387 |
+
it will be as if the parser had never parsed anything.
|
2388 |
+
|
2389 |
+
This is useful if you want to repeatedly parse JSON objects and want to
|
2390 |
+
ignore any trailing data, which means you have to reset the parser after
|
2391 |
+
each successful decode.
|
2392 |
+
|
2393 |
+
See L<JSON::XS/INCREMENTAL PARSING> for examples.
|
2394 |
+
|
2395 |
+
|
2396 |
+
=head1 JSON::PP OWN METHODS
|
2397 |
+
|
2398 |
+
=head2 allow_singlequote
|
2399 |
+
|
2400 |
+
$json = $json->allow_singlequote([$enable])
|
2401 |
+
|
2402 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
2403 |
+
JSON strings quoted by single quotations that are invalid JSON
|
2404 |
+
format.
|
2405 |
+
|
2406 |
+
$json->allow_singlequote->decode(q|{"foo":'bar'}|);
|
2407 |
+
$json->allow_singlequote->decode(q|{'foo':"bar"}|);
|
2408 |
+
$json->allow_singlequote->decode(q|{'foo':'bar'}|);
|
2409 |
+
|
2410 |
+
As with the C<relaxed> option, this option may be used to parse
|
2411 |
+
application-specific files written by humans.
|
2412 |
+
|
2413 |
+
|
2414 |
+
=head2 allow_barekey
|
2415 |
+
|
2416 |
+
$json = $json->allow_barekey([$enable])
|
2417 |
+
|
2418 |
+
If C<$enable> is true (or missing), then C<decode> will accept
|
2419 |
+
bare keys of JSON object that are invalid JSON format.
|
2420 |
+
|
2421 |
+
As with the C<relaxed> option, this option may be used to parse
|
2422 |
+
application-specific files written by humans.
|
2423 |
+
|
2424 |
+
$json->allow_barekey->decode('{foo:"bar"}');
|
2425 |
+
|
2426 |
+
=head2 allow_bignum
|
2427 |
+
|
2428 |
+
$json = $json->allow_bignum([$enable])
|
2429 |
+
|
2430 |
+
If C<$enable> is true (or missing), then C<decode> will convert
|
2431 |
+
the big integer Perl cannot handle as integer into a L<Math::BigInt>
|
2432 |
+
object and convert a floating number (any) into a L<Math::BigFloat>.
|
2433 |
+
|
2434 |
+
On the contrary, C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
2435 |
+
objects into JSON numbers with C<allow_blessed> enabled.
|
2436 |
+
|
2437 |
+
$json->allow_nonref->allow_blessed->allow_bignum;
|
2438 |
+
$bigfloat = $json->decode('2.000000000000000000000000001');
|
2439 |
+
print $json->encode($bigfloat);
|
2440 |
+
# => 2.000000000000000000000000001
|
2441 |
+
|
2442 |
+
See to L<JSON::XS/MAPPING> about the normal conversion of JSON number.
|
2443 |
+
|
2444 |
+
=head2 loose
|
2445 |
+
|
2446 |
+
$json = $json->loose([$enable])
|
2447 |
+
|
2448 |
+
The unescaped [\x00-\x1f\x22\x2f\x5c] strings are invalid in JSON strings
|
2449 |
+
and by default the module refuses to C<decode> them (except for \x2f).
|
2450 |
+
If C<$enable> is true (or missing), then C<decode> will accept these
|
2451 |
+
unescaped strings.
|
2452 |
+
|
2453 |
+
$json->loose->decode(qq|["abc
|
2454 |
+
def"]|);
|
2455 |
+
|
2456 |
+
See L<JSON::XS/SECURITY CONSIDERATIONS>.
|
2457 |
+
|
2458 |
+
=head2 escape_slash
|
2459 |
+
|
2460 |
+
$json = $json->escape_slash([$enable])
|
2461 |
+
|
2462 |
+
According to JSON Grammar, I<slash> (U+002F) is escaped. But default
|
2463 |
+
JSON::PP (as same as JSON::XS) encodes strings without escaping slash.
|
2464 |
+
|
2465 |
+
If C<$enable> is true (or missing), then C<encode> will escape slashes.
|
2466 |
+
|
2467 |
+
=head2 indent_length
|
2468 |
+
|
2469 |
+
$json = $json->indent_length($length)
|
2470 |
+
|
2471 |
+
JSON::XS indent space length is 3 and cannot be changed.
|
2472 |
+
JSON::PP set the indent space length with the given $length.
|
2473 |
+
The default is 3. The acceptable range is 0 to 15.
|
2474 |
+
|
2475 |
+
=head2 sort_by
|
2476 |
+
|
2477 |
+
$json = $json->sort_by($function_name)
|
2478 |
+
$json = $json->sort_by($subroutine_ref)
|
2479 |
+
|
2480 |
+
If $function_name or $subroutine_ref is set, that sort routine is used
|
2481 |
+
in encoding JSON objects.
|
2482 |
+
|
2483 |
+
$js = $pc->sort_by(sub { $JSON::PP::a cmp $JSON::PP::b })->encode($obj);
|
2484 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
2485 |
+
|
2486 |
+
$js = $pc->sort_by('own_sort')->encode($obj);
|
2487 |
+
# is($js, q|{"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9}|);
|
2488 |
+
|
2489 |
+
sub JSON::PP::own_sort { $JSON::PP::a cmp $JSON::PP::b }
|
2490 |
+
|
2491 |
+
As the sorting routine runs in the JSON::PP scope, the given
|
2492 |
+
subroutine name and the special variables C<$a>, C<$b> will begin
|
2493 |
+
'JSON::PP::'.
|
2494 |
+
|
2495 |
+
If $integer is set, then the effect is same as C<canonical> on.
|
2496 |
+
|
2497 |
+
=head1 INTERNAL
|
2498 |
+
|
2499 |
+
For developers.
|
2500 |
+
|
2501 |
+
=over
|
2502 |
+
|
2503 |
+
=item PP_encode_box
|
2504 |
+
|
2505 |
+
Returns
|
2506 |
+
|
2507 |
+
{
|
2508 |
+
depth => $depth,
|
2509 |
+
indent_count => $indent_count,
|
2510 |
+
}
|
2511 |
+
|
2512 |
+
|
2513 |
+
=item PP_decode_box
|
2514 |
+
|
2515 |
+
Returns
|
2516 |
+
|
2517 |
+
{
|
2518 |
+
text => $text,
|
2519 |
+
at => $at,
|
2520 |
+
ch => $ch,
|
2521 |
+
len => $len,
|
2522 |
+
depth => $depth,
|
2523 |
+
encoding => $encoding,
|
2524 |
+
is_valid_utf8 => $is_valid_utf8,
|
2525 |
+
};
|
2526 |
+
|
2527 |
+
=back
|
2528 |
+
|
2529 |
+
=head1 MAPPING
|
2530 |
+
|
2531 |
+
This section is copied from JSON::XS and modified to C<JSON::PP>.
|
2532 |
+
JSON::XS and JSON::PP mapping mechanisms are almost equivalent.
|
2533 |
+
|
2534 |
+
See to L<JSON::XS/MAPPING>.
|
2535 |
+
|
2536 |
+
=head2 JSON -> PERL
|
2537 |
+
|
2538 |
+
=over 4
|
2539 |
+
|
2540 |
+
=item object
|
2541 |
+
|
2542 |
+
A JSON object becomes a reference to a hash in Perl. No ordering of object
|
2543 |
+
keys is preserved (JSON does not preserve object key ordering itself).
|
2544 |
+
|
2545 |
+
=item array
|
2546 |
+
|
2547 |
+
A JSON array becomes a reference to an array in Perl.
|
2548 |
+
|
2549 |
+
=item string
|
2550 |
+
|
2551 |
+
A JSON string becomes a string scalar in Perl - Unicode codepoints in JSON
|
2552 |
+
are represented by the same codepoints in the Perl string, so no manual
|
2553 |
+
decoding is necessary.
|
2554 |
+
|
2555 |
+
=item number
|
2556 |
+
|
2557 |
+
A JSON number becomes either an integer, numeric (floating point) or
|
2558 |
+
string scalar in perl, depending on its range and any fractional parts. On
|
2559 |
+
the Perl level, there is no difference between those as Perl handles all
|
2560 |
+
the conversion details, but an integer may take slightly less memory and
|
2561 |
+
might represent more values exactly than floating point numbers.
|
2562 |
+
|
2563 |
+
If the number consists of digits only, C<JSON> will try to represent
|
2564 |
+
it as an integer value. If that fails, it will try to represent it as
|
2565 |
+
a numeric (floating point) value if that is possible without loss of
|
2566 |
+
precision. Otherwise it will preserve the number as a string value (in
|
2567 |
+
which case you lose roundtripping ability, as the JSON number will be
|
2568 |
+
re-encoded to a JSON string).
|
2569 |
+
|
2570 |
+
Numbers containing a fractional or exponential part will always be
|
2571 |
+
represented as numeric (floating point) values, possibly at a loss of
|
2572 |
+
precision (in which case you might lose perfect roundtripping ability, but
|
2573 |
+
the JSON number will still be re-encoded as a JSON number).
|
2574 |
+
|
2575 |
+
Note that precision is not accuracy - binary floating point values cannot
|
2576 |
+
represent most decimal fractions exactly, and when converting from and to
|
2577 |
+
floating point, C<JSON> only guarantees precision up to but not including
|
2578 |
+
the least significant bit.
|
2579 |
+
|
2580 |
+
When C<allow_bignum> is enabled, the big integers
|
2581 |
+
and the numeric can be optionally converted into L<Math::BigInt> and
|
2582 |
+
L<Math::BigFloat> objects.
|
2583 |
+
|
2584 |
+
=item true, false
|
2585 |
+
|
2586 |
+
These JSON atoms become C<JSON::PP::true> and C<JSON::PP::false>,
|
2587 |
+
respectively. They are overloaded to act almost exactly like the numbers
|
2588 |
+
C<1> and C<0>. You can check whether a scalar is a JSON boolean by using
|
2589 |
+
the C<JSON::is_bool> function.
|
2590 |
+
|
2591 |
+
print JSON::PP::true . "\n";
|
2592 |
+
=> true
|
2593 |
+
print JSON::PP::true + 1;
|
2594 |
+
=> 1
|
2595 |
+
|
2596 |
+
ok(JSON::true eq '1');
|
2597 |
+
ok(JSON::true == 1);
|
2598 |
+
|
2599 |
+
C<JSON> will install these missing overloading features to the backend modules.
|
2600 |
+
|
2601 |
+
|
2602 |
+
=item null
|
2603 |
+
|
2604 |
+
A JSON null atom becomes C<undef> in Perl.
|
2605 |
+
|
2606 |
+
C<JSON::PP::null> returns C<undef>.
|
2607 |
+
|
2608 |
+
=back
|
2609 |
+
|
2610 |
+
|
2611 |
+
=head2 PERL -> JSON
|
2612 |
+
|
2613 |
+
The mapping from Perl to JSON is slightly more difficult, as Perl is a
|
2614 |
+
truly typeless language, so we can only guess which JSON type is meant by
|
2615 |
+
a Perl value.
|
2616 |
+
|
2617 |
+
=over 4
|
2618 |
+
|
2619 |
+
=item hash references
|
2620 |
+
|
2621 |
+
Perl hash references become JSON objects. As there is no inherent ordering
|
2622 |
+
in hash keys (or JSON objects), they will usually be encoded in a
|
2623 |
+
pseudo-random order that can change between runs of the same program but
|
2624 |
+
stays generally the same within a single run of a program. C<JSON>
|
2625 |
+
optionally sort the hash keys (determined by the I<canonical> flag), so
|
2626 |
+
the same data structure will serialise to the same JSON text (given same
|
2627 |
+
settings and version of JSON::XS), but this incurs a runtime overhead
|
2628 |
+
and is only rarely useful, e.g. when you want to compare some JSON text
|
2629 |
+
against another for equality.
|
2630 |
+
|
2631 |
+
|
2632 |
+
=item array references
|
2633 |
+
|
2634 |
+
Perl array references become JSON arrays.
|
2635 |
+
|
2636 |
+
=item other references
|
2637 |
+
|
2638 |
+
Other unblessed references are generally not allowed and will cause an
|
2639 |
+
exception to be thrown, except for references to the integers C<0> and
|
2640 |
+
C<1>, which get turned into C<false> and C<true> atoms in JSON. You can
|
2641 |
+
also use C<JSON::false> and C<JSON::true> to improve readability.
|
2642 |
+
|
2643 |
+
to_json [\0,JSON::PP::true] # yields [false,true]
|
2644 |
+
|
2645 |
+
=item JSON::PP::true, JSON::PP::false, JSON::PP::null
|
2646 |
+
|
2647 |
+
These special values become JSON true and JSON false values,
|
2648 |
+
respectively. You can also use C<\1> and C<\0> directly if you want.
|
2649 |
+
|
2650 |
+
JSON::PP::null returns C<undef>.
|
2651 |
+
|
2652 |
+
=item blessed objects
|
2653 |
+
|
2654 |
+
Blessed objects are not directly representable in JSON. See the
|
2655 |
+
C<allow_blessed> and C<convert_blessed> methods on various options on
|
2656 |
+
how to deal with this: basically, you can choose between throwing an
|
2657 |
+
exception, encoding the reference as if it weren't blessed, or provide
|
2658 |
+
your own serialiser method.
|
2659 |
+
|
2660 |
+
See to L<convert_blessed>.
|
2661 |
+
|
2662 |
+
=item simple scalars
|
2663 |
+
|
2664 |
+
Simple Perl scalars (any scalar that is not a reference) are the most
|
2665 |
+
difficult objects to encode: JSON::XS and JSON::PP will encode undefined scalars as
|
2666 |
+
JSON C<null> values, scalars that have last been used in a string context
|
2667 |
+
before encoding as JSON strings, and anything else as number value:
|
2668 |
+
|
2669 |
+
# dump as number
|
2670 |
+
encode_json [2] # yields [2]
|
2671 |
+
encode_json [-3.0e17] # yields [-3e+17]
|
2672 |
+
my $value = 5; encode_json [$value] # yields [5]
|
2673 |
+
|
2674 |
+
# used as string, so dump as string
|
2675 |
+
print $value;
|
2676 |
+
encode_json [$value] # yields ["5"]
|
2677 |
+
|
2678 |
+
# undef becomes null
|
2679 |
+
encode_json [undef] # yields [null]
|
2680 |
+
|
2681 |
+
You can force the type to be a string by stringifying it:
|
2682 |
+
|
2683 |
+
my $x = 3.1; # some variable containing a number
|
2684 |
+
"$x"; # stringified
|
2685 |
+
$x .= ""; # another, more awkward way to stringify
|
2686 |
+
print $x; # perl does it for you, too, quite often
|
2687 |
+
|
2688 |
+
You can force the type to be a number by numifying it:
|
2689 |
+
|
2690 |
+
my $x = "3"; # some variable containing a string
|
2691 |
+
$x += 0; # numify it, ensuring it will be dumped as a number
|
2692 |
+
$x *= 1; # same thing, the choice is yours.
|
2693 |
+
|
2694 |
+
You can not currently force the type in other, less obscure, ways.
|
2695 |
+
|
2696 |
+
Note that numerical precision has the same meaning as under Perl (so
|
2697 |
+
binary to decimal conversion follows the same rules as in Perl, which
|
2698 |
+
can differ to other languages). Also, your perl interpreter might expose
|
2699 |
+
extensions to the floating point numbers of your platform, such as
|
2700 |
+
infinities or NaN's - these cannot be represented in JSON, and it is an
|
2701 |
+
error to pass those in.
|
2702 |
+
|
2703 |
+
=item Big Number
|
2704 |
+
|
2705 |
+
When C<allow_bignum> is enabled,
|
2706 |
+
C<encode> converts C<Math::BigInt> objects and C<Math::BigFloat>
|
2707 |
+
objects into JSON numbers.
|
2708 |
+
|
2709 |
+
|
2710 |
+
=back
|
2711 |
+
|
2712 |
+
=head1 UNICODE HANDLING ON PERLS
|
2713 |
+
|
2714 |
+
If you do not know about Unicode on Perl well,
|
2715 |
+
please check L<JSON::XS/A FEW NOTES ON UNICODE AND PERL>.
|
2716 |
+
|
2717 |
+
=head2 Perl 5.8 and later
|
2718 |
+
|
2719 |
+
Perl can handle Unicode and the JSON::PP de/encode methods also work properly.
|
2720 |
+
|
2721 |
+
$json->allow_nonref->encode(chr hex 3042);
|
2722 |
+
$json->allow_nonref->encode(chr hex 12345);
|
2723 |
+
|
2724 |
+
Returns C<"\u3042"> and C<"\ud808\udf45"> respectively.
|
2725 |
+
|
2726 |
+
$json->allow_nonref->decode('"\u3042"');
|
2727 |
+
$json->allow_nonref->decode('"\ud808\udf45"');
|
2728 |
+
|
2729 |
+
Returns UTF-8 encoded strings with UTF8 flag, regarded as C<U+3042> and C<U+12345>.
|
2730 |
+
|
2731 |
+
Note that in Perl versions 5.8.0 to 5.8.2, the built-in C<join> was broken,
|
2732 |
+
so JSON::PP wraps the C<join> with a subroutine. Thus JSON::PP runs slowly on those versions.
|
2733 |
+
|
2734 |
+
|
2735 |
+
=head2 Perl 5.6
|
2736 |
+
|
2737 |
+
Perl can handle Unicode and the JSON::PP de/encode methods also work.
|
2738 |
+
|
2739 |
+
=head2 Perl 5.005
|
2740 |
+
|
2741 |
+
Perl 5.005 is a byte semantics world -- all strings are sequences of bytes.
|
2742 |
+
That means the unicode handling is not available.
|
2743 |
+
|
2744 |
+
In encoding,
|
2745 |
+
|
2746 |
+
$json->allow_nonref->encode(chr hex 3042); # hex 3042 is 12354.
|
2747 |
+
$json->allow_nonref->encode(chr hex 12345); # hex 12345 is 74565.
|
2748 |
+
|
2749 |
+
Returns C<B> and C<E>, as C<chr> takes a value more than 255, it treats
|
2750 |
+
as C<$value % 256>, so the above codes are equivalent to :
|
2751 |
+
|
2752 |
+
$json->allow_nonref->encode(chr 66);
|
2753 |
+
$json->allow_nonref->encode(chr 69);
|
2754 |
+
|
2755 |
+
In decoding,
|
2756 |
+
|
2757 |
+
$json->decode('"\u00e3\u0081\u0082"');
|
2758 |
+
|
2759 |
+
The returned is a byte sequence C<0xE3 0x81 0x82> for UTF-8 encoded
|
2760 |
+
japanese character (C<HIRAGANA LETTER A>).
|
2761 |
+
And if it is represented in Unicode code point, C<U+3042>.
|
2762 |
+
|
2763 |
+
Next,
|
2764 |
+
|
2765 |
+
$json->decode('"\u3042"');
|
2766 |
+
|
2767 |
+
We would ordinarily expect the returned value to be the Unicode character C<U+3042>.
|
2768 |
+
But here is 5.005 world. This is C<0xE3 0x81 0x82>.
|
2769 |
+
|
2770 |
+
$json->decode('"\ud808\udf45"');
|
2771 |
+
|
2772 |
+
This is not a character C<U+12345> but bytes - C<0xf0 0x92 0x8d 0x85>.
|
2773 |
+
|
2774 |
+
|
2775 |
+
=head1 TODO
|
2776 |
+
|
2777 |
+
=over
|
2778 |
+
|
2779 |
+
=item speed
|
2780 |
+
|
2781 |
+
=item memory saving
|
2782 |
+
|
2783 |
+
=back
|
2784 |
+
|
2785 |
+
|
2786 |
+
=head1 SEE ALSO
|
2787 |
+
|
2788 |
+
Most of the document is copied and modified from JSON::XS doc.
|
2789 |
+
|
2790 |
+
L<JSON::XS>
|
2791 |
+
|
2792 |
+
RFC4627 (L<http://www.ietf.org/rfc/rfc4627.txt>)
|
2793 |
+
|
2794 |
+
=head1 AUTHOR
|
2795 |
+
|
2796 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
2797 |
+
|
2798 |
+
|
2799 |
+
=head1 COPYRIGHT AND LICENSE
|
2800 |
+
|
2801 |
+
Copyright 2007-2012 by Makamaka Hannyaharamitu
|
2802 |
+
|
2803 |
+
This library is free software; you can redistribute it and/or modify
|
2804 |
+
it under the same terms as Perl itself.
|
2805 |
+
|
2806 |
+
=cut
|
uroman/lib/JSON/backportPP/Boolean.pm
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
=head1 NAME
|
2 |
+
|
3 |
+
JSON::PP::Boolean - dummy module providing JSON::PP::Boolean
|
4 |
+
|
5 |
+
=head1 SYNOPSIS
|
6 |
+
|
7 |
+
# do not "use" yourself
|
8 |
+
|
9 |
+
=head1 DESCRIPTION
|
10 |
+
|
11 |
+
This module exists only to provide overload resolution for Storable
|
12 |
+
and similar modules. See L<JSON::PP> for more info about this class.
|
13 |
+
|
14 |
+
=cut
|
15 |
+
|
16 |
+
use JSON::backportPP ();
|
17 |
+
use strict;
|
18 |
+
|
19 |
+
1;
|
20 |
+
|
21 |
+
=head1 AUTHOR
|
22 |
+
|
23 |
+
This idea is from L<JSON::XS::Boolean> written by
|
24 |
+
Marc Lehmann <schmorp[at]schmorp.de>
|
25 |
+
|
26 |
+
=cut
|
27 |
+
|
uroman/lib/JSON/backportPP/Compat5005.pm
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package # This is JSON::backportPP
|
2 |
+
JSON::backportPP5005;
|
3 |
+
|
4 |
+
use 5.005;
|
5 |
+
use strict;
|
6 |
+
|
7 |
+
my @properties;
|
8 |
+
|
9 |
+
$JSON::PP5005::VERSION = '1.10';
|
10 |
+
|
11 |
+
BEGIN {

    # Mark bytes.pm as already loaded (dummy entry).
    $INC{'bytes.pm'} = 1;

    # Stand-ins for the utf8:: utility functions under Perl 5.005.
    # The UTF8 flag is considered to be always off there.
    sub utf8::is_utf8   { return 0; }
    sub utf8::upgrade   { return; }
    sub utf8::downgrade { return 1; }   # must always return true.
    sub utf8::encode    { return; }
    sub utf8::decode    { return; }

    # Constants missing in the B module.
    sub B::SVp_IOK () { 0x01000000; }
    sub B::SVp_NOK () { 0x02000000; }
    sub B::SVp_POK () { 0x04000000; }

    # Expose this package's encoders/decoders under the names JSON::PP uses.
    *JSON::PP::JSON_PP_decode_unicode    = \&_decode_unicode;
    *JSON::PP::JSON_PP_decode_surrogates = \&_decode_surrogates;
    *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
    *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;
}
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
# Return the argument with every byte above 127 replaced by a \uXXXX escape;
# bytes up to 127 are passed through unchanged.
sub _encode_ascii {
    my @pieces;
    for my $byte ( unpack('C*', $_[0]) ) {
        push @pieces, $byte > 127 ? sprintf('\u%04x', $byte) : chr($byte);
    }
    return join('', @pieces);
}
|
48 |
+
|
49 |
+
|
50 |
+
# Rebuild the argument byte by byte: every byte value is turned back into
# the corresponding single character.
sub _encode_latin1 {
    my $out = '';
    for my $byte ( unpack('C*', $_[0]) ) {
        $out .= chr($byte);
    }
    return $out;
}
|
53 |
+
|
54 |
+
|
55 |
+
# Combine a UTF-16 surrogate pair, given as two hex strings (high, low),
# into the 4-byte UTF-8 sequence of the code point it denotes.
# Croaks with "Invalid surrogate pair" when the result does not fit in 21 bits.
# from http://homepage1.nifty.com/nomenclator/unicode/ucs_utf.htm
sub _decode_surrogates {
    my ($high, $low) = ( hex($_[0]), hex($_[1]) );

    # Code point formula from perlunicode.
    my $code_point = 0x10000 + ($high - 0xD800) * 0x400 + ($low - 0xDC00);
    my $bits       = unpack('B32', pack('N', $code_point));

    # Slice the low 21 bits into 3 + 6 + 6 + 6 for the four UTF-8 bytes.
    my @groups = ( $bits =~ /^00000000000(...)(......)(......)(......)$/ )
        or Carp::croak("Invalid surrogate pair");

    return pack('B*', sprintf('11110%s10%s10%s10%s', @groups));
}
|
67 |
+
|
68 |
+
|
69 |
+
# Convert the hex digits of a \uXXXX escape into bytes.
# Values 0x80-0xff come back as a single byte; other inputs are packed as a
# 2-byte or 3-byte UTF-8 sequence, depending on the bit pattern.
# Croaks with "Invalid escaped unicode" when neither pattern matches.
sub _decode_unicode {
    my ($hex) = @_;

    # 0x80-0xff
    return pack('H2', $1) if $hex =~ /^00([89a-f][0-9a-f])$/i;

    my $bits = unpack("B*", pack("H*", $hex));

    my $utf8_bits =
          $bits =~ /^00000(.....)(......)$/    ? sprintf('110%s10%s', $1, $2)
        : $bits =~ /^(....)(......)(......)$/  ? sprintf('1110%s10%s10%s', $1, $2, $3)
        :                                        Carp::croak("Invalid escaped unicode");

    return pack('B*', $utf8_bits);
}
|
91 |
+
|
92 |
+
|
93 |
+
# Non-lvalue accessor for the incremental parser's buffered text.
# With an extra argument the stored text is replaced first; the stored text
# is returned in either case. The parser object is created on first use.
# Croaks when the incremental parser has already started parsing.
sub JSON::PP::incr_text {
    my $parser = ( $_[0]->{_incr_parser} ||= JSON::PP::IncrParser->new );

    Carp::croak("incr_text can not be called when the incremental parser already started parsing")
        if $parser->{incr_parsing};

    $parser->{incr_text} = $_[1] if @_ > 1;

    return $parser->{incr_text};
}
|
103 |
+
|
104 |
+
|
105 |
+
1;
|
106 |
+
__END__
|
107 |
+
|
108 |
+
=pod
|
109 |
+
|
110 |
+
=head1 NAME
|
111 |
+
|
112 |
+
JSON::PP5005 - Helper module in using JSON::PP in Perl 5.005
|
113 |
+
|
114 |
+
=head1 DESCRIPTION
|
115 |
+
|
116 |
+
This module is called internally by JSON::PP.
|
117 |
+
|
118 |
+
=head1 AUTHOR
|
119 |
+
|
120 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
121 |
+
|
122 |
+
|
123 |
+
=head1 COPYRIGHT AND LICENSE
|
124 |
+
|
125 |
+
Copyright 2007-2012 by Makamaka Hannyaharamitu
|
126 |
+
|
127 |
+
This library is free software; you can redistribute it and/or modify
|
128 |
+
it under the same terms as Perl itself.
|
129 |
+
|
130 |
+
=cut
|
131 |
+
|
uroman/lib/JSON/backportPP/Compat5006.pm
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
package # This is JSON::backportPP
|
2 |
+
JSON::backportPP56;
|
3 |
+
|
4 |
+
use 5.006;
|
5 |
+
use strict;
|
6 |
+
|
7 |
+
my @properties;
|
8 |
+
|
9 |
+
$JSON::PP56::VERSION = '1.08';
|
10 |
+
|
11 |
+
BEGIN {

    # Emulations of the utf8:: utility functions used by JSON::PP.

    # True when the character length and the byte length of the argument
    # differ, i.e. the string is UTF8-flagged.
    sub utf8::is_utf8 {
        my $char_count = length $_[0];
        my $byte_count = do { use bytes; length $_[0] };
        return $char_count != $byte_count;
    }


    # No-op.
    sub utf8::upgrade {
        return;
    }


    # Rewrite the argument in place from its code points and return 1.
    # Strings that are not UTF8-flagged are left alone (returns 1).
    # When the content is not valid UTF-8: croak unless the second argument
    # is true, otherwise return 0.
    sub utf8::downgrade ($;$) {
        return 1 if !utf8::is_utf8( $_[0] );

        if ( !_is_valid_utf8( $_[0] ) ) {
            Carp::croak("Wide character in subroutine entry") if !$_[1];
            return 0;
        }

        my $bytes;
        for my $code ( unpack( "U*", $_[0] ) ) {
            # Code points below 256 are packed as plain bytes.
            $bytes .= pack( $code < 256 ? "C" : "U", $code );
        }
        $_[0] = $bytes;
        return 1;
    }


    # UTF8 flag off: re-pack the argument as bytes, going through a
    # character string first when it is not flagged yet.
    sub utf8::encode ($) {
        $_[0] = pack( "U*", unpack( "C*", $_[0] ) ) unless utf8::is_utf8( $_[0] );
        $_[0] = pack( "C*", unpack( "C*", $_[0] ) );
    }


    # UTF8 flag on: only acts when the argument holds valid UTF-8.
    sub utf8::decode ($) {
        if ( _is_valid_utf8( $_[0] ) ) {
            utf8::downgrade( $_[0] );
            $_[0] = pack( "U*", unpack( "U*", $_[0] ) );
        }
    }


    # The encoders come from this package; the decoders are JSON::PP's own.
    *JSON::PP::JSON_PP_decode_unicode    = \&JSON::PP::_decode_unicode;
    *JSON::PP::JSON_PP_decode_surrogates = \&JSON::PP::_decode_surrogates;
    *JSON::PP::JSON_PP_encode_latin1     = \&_encode_latin1;
    *JSON::PP::JSON_PP_encode_ascii      = \&_encode_ascii;

    # Define the constant only when it is missing in the B module.
    eval q{ sub B::SVp_NOK () { 0x02000000; } } unless defined &B::SVp_NOK;

}
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
# Return the argument as pure ASCII: code points up to 127 are kept,
# code points up to 65535 become one \uXXXX escape, and anything larger
# becomes a pair of \u escapes built from JSON::PP::_encode_surrogates.
sub _encode_ascii {
    my @escaped;

    for my $code ( _unpack_emu($_[0]) ) {
        if ( $code <= 127 ) {
            push @escaped, chr($code);
        }
        elsif ( $code <= 65535 ) {
            push @escaped, sprintf('\u%04x', $code);
        }
        else {
            push @escaped, sprintf('\u%x\u%x', JSON::PP::_encode_surrogates($code));
        }
    }

    return join('', @escaped);
}
|
92 |
+
|
93 |
+
|
94 |
+
# Return the argument restricted to Latin-1: code points up to 255 are kept,
# code points up to 65535 become one \uXXXX escape, and anything larger
# becomes a pair of \u escapes built from JSON::PP::_encode_surrogates.
sub _encode_latin1 {
    my @escaped;

    for my $code ( _unpack_emu($_[0]) ) {
        if ( $code <= 255 ) {
            push @escaped, chr($code);
        }
        elsif ( $code <= 65535 ) {
            push @escaped, sprintf('\u%04x', $code);
        }
        else {
            push @escaped, sprintf('\u%x\u%x', JSON::PP::_encode_surrogates($code));
        }
    }

    return join('', @escaped);
}
|
104 |
+
|
105 |
+
|
106 |
+
# Unpack the argument into a list of numbers (for Perl 5.6 unpack warnings).
# 'U*' is used only when the string is both UTF8-flagged and valid UTF-8;
# in every other case the string is unpacked with 'C*'.
sub _unpack_emu {
    my $template =
        ( utf8::is_utf8($_[0]) && _is_valid_utf8($_[0]) ) ? 'U*' : 'C*';

    return unpack($template, $_[0]);
}
|
111 |
+
|
112 |
+
|
113 |
+
# Scan the argument one UTF-8 sequence (or stray character) at a time.
# Returns:
#   1            - every piece is a well-formed UTF-8 sequence
#   0            - the very first piece is not well-formed
#   undef/empty  - the string is empty, or a malformed piece follows a
#                  well-formed start
sub _is_valid_utf8 {
    my $str = $_[0];
    my $verdict;    # stays undef until the first piece has been classified

    while ($str =~ /(?:
          (
             [\x00-\x7F]
            |[\xC2-\xDF][\x80-\xBF]
            |[\xE0][\xA0-\xBF][\x80-\xBF]
            |[\xE1-\xEC][\x80-\xBF][\x80-\xBF]
            |[\xED][\x80-\x9F][\x80-\xBF]
            |[\xEE-\xEF][\x80-\xBF][\x80-\xBF]
            |[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]
            |[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]
          )
        | (.)
        )/xg)
    {
        my $well_formed = defined $1;

        # The first piece decides the initial verdict.
        $verdict = ( $well_formed ? 1 : 0 ) unless defined $verdict;

        # eventually, not utf8
        return if $verdict && !$well_formed;
    }

    return $verdict;
}
|
145 |
+
|
146 |
+
|
147 |
+
1;
|
148 |
+
__END__
|
149 |
+
|
150 |
+
=pod
|
151 |
+
|
152 |
+
=head1 NAME
|
153 |
+
|
154 |
+
JSON::PP56 - Helper module in using JSON::PP in Perl 5.6
|
155 |
+
|
156 |
+
=head1 DESCRIPTION
|
157 |
+
|
158 |
+
This module is called internally by JSON::PP.
|
159 |
+
|
160 |
+
=head1 AUTHOR
|
161 |
+
|
162 |
+
Makamaka Hannyaharamitu, E<lt>makamaka[at]cpan.orgE<gt>
|
163 |
+
|
164 |
+
|
165 |
+
=head1 COPYRIGHT AND LICENSE
|
166 |
+
|
167 |
+
Copyright 2007-2012 by Makamaka Hannyaharamitu
|
168 |
+
|
169 |
+
This library is free software; you can redistribute it and/or modify
|
170 |
+
it under the same terms as Perl itself.
|
171 |
+
|
172 |
+
=cut
|
173 |
+
|
uroman/lib/NLP/Chinese.pm
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# Chinese #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::Chinese;

# Class-name handles used for method-style calls on the helper modules,
# e.g. $utf8->split_into_utf8_characters(...) or $util->trim(...).
# Subs in this package call methods on $util, so it has to be assigned here;
# without the assignment those calls die with
# "Can't call method ... on an undefined value".
# NOTE(review): assumes NLP::utilities has been loaded by the time these subs
# run (NLP::Romanizer loads it with "use NLP::utilities") -- confirm for any
# other entry point.
$utf8 = NLP::UTF8;
$util = NLP::utilities;

# Empty hash; its glob (*empty_ht) is handed to helpers that expect a
# hash-table argument.
%empty_ht = ();
|
11 |
+
|
12 |
+
# Load Chinese pinyin data from the given files into the caller's hash table.
#
# Arguments:
#   $caller    - invocant; used to call number_to_accent_tone
#   *ht        - glob of the hash table to fill
#   @filenames - files to read; the format is chosen from the file name:
#       /unihan/i             tab-separated "code, type, value" lines; types
#                             kHanyuPinlu, kXHC1983, kHanyuPinyin and kMandarin
#                             are stored under $ht{<type>}->{<character>}
#       /cedict/i             lines whose second whitespace-separated field is
#                             the word, followed by "[...]" holding the reading;
#                             $ht{cedict} keeps the first reading seen for a
#                             word, $ht{cedicts} additionally appends (space
#                             separated) later readings that differ from it
#       /chinese_to_pinyin/i  "word<TAB>pinyin" lines, stored in
#                             $ht{simple_pinyin}
#
# Lines starting with "#" are skipped.  A file that cannot be opened, or whose
# name matches none of the patterns, is reported on STDERR and skipped.
sub read_chinese_tonal_pinyin_files {
    local($caller, *ht, @filenames) = @_;

    # Entry counters; they stay package variables as in the original code.
    $n_kHanyuPinlu = 0;
    $n_kXHC1983 = 0;
    $n_kHanyuPinyin = 0;
    $n_kMandarin = 0;
    $n_cedict = 0;
    $n_simple_pinyin = 0;

    foreach my $filename (@filenames) {
        if ($filename =~ /unihan/i) {
            # Three-argument open on a lexical handle: the file name is never
            # interpreted as a mode or pipe specification.
            if (open(my $in, "<", $filename)) {
                while (<$in>) {
                    next if /^#/;
                    s/\s*$//;
                    if (my ($u, $type, $value) = split(/\t/, $_)) {
                        if (defined($type) && ($type =~ /^(kHanyuPinlu|kXHC1983|kHanyuPinyin|kMandarin)$/)) {
                            # $util must hold the utilities class name for the trim calls.
                            $u = $util->trim($u);
                            $type = $util->trim($type);
                            $value = $util->trim($value);
                            my $f = $utf8->unicode_string2string($u);

                            if ($type eq "kHanyuPinlu") {
                                # Drop the parenthesised parts of the value, then
                                # turn tone digits into accents.
                                $value =~ s/\(.*?\)//g;
                                $value = $util->trim($value);
                                $ht{"kHanyuPinlu"}->{$f} = $caller->number_to_accent_tone($value);
                                $n_kHanyuPinlu++;
                            } elsif ($type eq "kXHC1983") {
                                # Keep only the text following each ":".
                                my @translits = ($value =~ /:(\S+)/g);
                                $ht{"kXHC1983"}->{$f} = join(" ", @translits);
                                $n_kXHC1983++;
                            } elsif ($type eq "kHanyuPinyin") {
                                # Strip everything up to the last ":" and
                                # separate the readings by spaces instead of commas.
                                $value =~ s/^.*://;
                                $value =~ s/,/ /g;
                                $ht{"kHanyuPinyin"}->{$f} = $value;
                                $n_kHanyuPinyin++;
                            } elsif ($type eq "kMandarin") {
                                $ht{"kMandarin"}->{$f} = $value;
                                $n_kMandarin++;
                            }
                        }
                    }
                }
                close($in);
                print "Read in $n_kHanyuPinlu kHanyuPinlu, $n_kXHC1983 n_kXHC1983, $n_kHanyuPinyin n_kHanyuPinyin $n_kMandarin n_kMandarin\n";
            } else {
                print STDERR "Can't open $filename\n";
            }
        } elsif ($filename =~ /cedict/i) {
            if (open(my $in, "<", $filename)) {
                while (<$in>) {
                    next if /^#/;
                    s/\s*$//;
                    if (my ($f, $translit) = ($_ =~ /^\S+\s+(\S+)\s+\[([^\[\]]+)\]/)) {
                        $translit = $utf8->extended_lower_case($translit);
                        $translit = $caller->number_to_accent_tone($translit);
                        $translit =~ s/\s//g;
                        if (my $old_translit = $ht{"cedict"}->{$f}) {
                            # $ht{CONFLICT}->{("DUPLICATE " . $f)} = "CEDICT($f): $old_translit\nCEDICT($f): $translit (duplicate)\n" unless $translit eq $old_translit;
                            $ht{"cedicts"}->{$f} = join(" ", $ht{"cedicts"}->{$f}, $translit) unless $old_translit eq $translit;
                        } else {
                            $ht{"cedict"}->{$f} = $translit;
                            $ht{"cedicts"}->{$f} = $translit;
                        }
                        $n_cedict++;
                    }
                }
                close($in);
                # print "Read in $n_cedict n_cedict\n";
            } else {
                # Newline added so the message does not run into later output.
                print STDERR "Can't open $filename\n";
            }
        } elsif ($filename =~ /chinese_to_pinyin/i) {
            if (open(my $in, "<", $filename)) {
                while (<$in>) {
                    next if /^#/;
                    if (my ($f, $translit) = ($_ =~ /^(\S+)\t(\S+)\s*$/)) {
                        $ht{"simple_pinyin"}->{$f} = $translit;
                        $n_simple_pinyin++;
                    }
                }
                close($in);
                # print "Read in $n_simple_pinyin n_simple_pinyin\n";
            } else {
                # Newline added so the message does not run into later output.
                print STDERR "Can't open $filename\n";
            }
        } else {
            print STDERR "Don't know what to do with file $filename (in read_chinese_tonal_pinyin_files)\n";
        }
    }
}
|
112 |
+
|
113 |
+
# Return the tonal pinyin for the UTF-8 string $s, using the tables in *ht
# (as filled by read_chinese_tonal_pinyin_files).
#
# Arguments:
#   $caller - invocant (unused here)
#   $s      - UTF-8 encoded string to transliterate
#   *ht     - glob of the hash table holding the pinyin tables
#   $gloss  - text that is only copied into the conflict log
#
# Result selection:
#   * for strings of more than one character, the $ht{cedict} reading of the
#     whole string wins if there is one;
#   * otherwise the character-by-character reading is used (unknown characters
#     become "?"; a reading consisting only of "?" counts as empty);
#   * otherwise the $ht{cedict} reading, possibly empty.
#
# Side effects on *ht:
#   $ht{COMBINED}->{$s}      the result (also serves as cache for later calls)
#   $ht{PINYIN_TITLE}->{$s}  a listing of all known readings and their sources,
#                            lines separated by the character reference "&#xA;"
#   $ht{CONFLICT}->{$s}      a log entry, written when the whole-string and the
#                            character-by-character readings disagree
sub tonal_pinyin {
    local($caller, $s, *ht, $gloss) = @_;

    # Result memoised by an earlier call.
    my $cached = $ht{COMBINED}->{$s};
    return $cached if defined($cached);

    my $cedict_pinyin = $ht{"cedict"}->{$s} || "";
    my $cedicts_pinyin = $ht{"cedicts"}->{$s} || "";
    my @characters = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);

    # Character-by-character reading: the first table that knows the character wins.
    my $unihan_pinyin = "";
    foreach my $c (@characters) {
        if (my $simple = $ht{"simple_pinyin"}->{$c}) {
            $unihan_pinyin .= $simple;
            next;
        }
        my $reading;
        foreach my $type ("kHanyuPinlu", "kXHC1983", "kHanyuPinyin", "cedicts") {
            last if ($reading = $ht{$type}->{$c});
        }
        if ($reading) {
            # These tables may list several readings; use the first one.
            $reading =~ s/^(\S+)\s.*$/$1/;
            $unihan_pinyin .= $reading;
        # middle dot, katakana middle dot, multiplication sign
        } elsif ($c =~ /^(\xC2\xB7|\xE3\x83\xBB|\xC3\x97)$/) {
            $unihan_pinyin .= $c;
        # ASCII
        } elsif ($c =~ /^([\x21-\x7E])$/) {
            $unihan_pinyin .= $c;
        } else {
            $unihan_pinyin .= "?";
            # print STDERR "Tonal pinyin: Unknown character $c -> ?\n";
        }
    }

    # Title text: whole-string CEDICT readings first, then for every character
    # all of its readings with the tables they come from.
    my $pinyin_title = "";
    if (($#characters >= 1) && $cedicts_pinyin) {
        foreach my $pinyin (split(/\s+/, $cedicts_pinyin)) {
            $pinyin_title .= "$s $pinyin (CEDICT)\n";
        }
        $pinyin_title .= "\n";
    }
    foreach my $c (@characters) {
        my %sources = ();    # reading -> "table1, table2, ..."
        my @pinyins = ();    # readings in order of first appearance
        foreach my $type ("kHanyuPinlu", "kXHC1983", "kHanyuPinyin", "cedicts") {
            my $pinyin_s = $ht{$type}->{$c};
            next unless $pinyin_s;
            my $type2 = ($type eq "cedicts") ? "CEDICT" : $type;
            foreach my $pinyin (split(/\s+/, $pinyin_s)) {
                # %sources doubles as the "already listed" check, so no
                # external membership helper is needed.
                if (exists $sources{$pinyin}) {
                    $sources{$pinyin} = join(", ", $sources{$pinyin}, $type2);
                } else {
                    push(@pinyins, $pinyin);
                    $sources{$pinyin} = $type2;
                }
            }
        }
        foreach my $pinyin (@pinyins) {
            $pinyin_title .= "$c $pinyin ($sources{$pinyin})\n";
        }
    }
    $pinyin_title =~ s/\n$//;
    # Encode the remaining line breaks as the character reference for a line
    # feed.  (With a raw line break as replacement this was a no-op.)
    $pinyin_title =~ s/\n/&#xA;/g;

    $unihan_pinyin = "" if $unihan_pinyin =~ /^\?+$/;
    if (($#characters >= 1) && $cedict_pinyin && $unihan_pinyin && ($unihan_pinyin ne $cedict_pinyin)) {
        # The two readings disagree: record both plus the per-table details.
        my $log = "Gloss($s): $gloss\nCEdict($s): $cedicts_pinyin\nUnihan($s): $unihan_pinyin\n";
        foreach my $type ("kHanyuPinlu", "kXHC1983", "kHanyuPinyin") {
            my $log_line = "$type($s): ";
            foreach my $c (@characters) {
                my $pinyin = $ht{$type}->{$c} || "";
                if ($pinyin =~ / /) {
                    $log_line .= "($pinyin)";
                } elsif ($pinyin) {
                    $log_line .= $pinyin;
                } else {
                    $log_line .= "?";
                }
            }
            $log .= "$log_line\n";
        }
        $ht{CONFLICT}->{$s} = $log;
    }

    my $result = $unihan_pinyin || $cedict_pinyin;
    $result = $cedict_pinyin if ($#characters > 0) && $cedict_pinyin;
    $ht{COMBINED}->{$s} = $result;
    $ht{PINYIN_TITLE}->{$s} = $pinyin_title;
    return $result;
}
|
201 |
+
|
202 |
+
# Maps "<vowel><tone number 1-4>" to the UTF-8 bytes of the accented vowel.
# The vowel u-umlaut can be written either as "u:" or as its UTF-8 bytes.
%number_to_accent_tone_ht = (
    "a1" => "\xC4\x81", "a2" => "\xC3\xA1", "a3" => "\xC7\x8E", "a4" => "\xC3\xA0",
    "e1" => "\xC4\x93", "e2" => "\xC3\xA9", "e3" => "\xC4\x9B", "e4" => "\xC3\xA8",
    "i1" => "\xC4\xAB", "i2" => "\xC3\xAD", "i3" => "\xC7\x90", "i4" => "\xC3\xAC",
    "o1" => "\xC5\x8D", "o2" => "\xC3\xB3", "o3" => "\xC7\x92", "o4" => "\xC3\xB2",
    "u1" => "\xC5\xAB", "u2" => "\xC3\xBA", "u3" => "\xC7\x94", "u4" => "\xC3\xB9",
    "u:1" => "\xC7\x96", "u:2" => "\xC7\x98", "u:3" => "\xC7\x9A", "u:4" => "\xC7\x9C",
    "\xC3\xBC1" => "\xC7\x96", "\xC3\xBC2" => "\xC7\x98", "\xC3\xBC3" => "\xC7\x9A", "\xC3\xBC4" => "\xC7\x9C",
);

# Convert pinyin with tone numbers (e.g. "hao3") into pinyin with tone accents.
#
# Arguments:
#   $caller - invocant (unused)
#   $s      - string containing syllables followed by a tone digit 1-5
# Returns the converted string (UTF-8 bytes).
#
# For every run of letters followed by a digit 1-5:
#   * tone 5 just drops the digit;
#   * otherwise the accent goes on the last "a" or "e" if there is one,
#     else on the "o" of "ou", else on the last i/o/u/u-umlaut;
#   * if no such letter is found (or the table has no entry for it), the
#     digit is kept as it is.
# Any "u:" left over afterwards is replaced by u-umlaut.
sub number_to_accent_tone {
    # Lexical variables throughout, so no temporaries leak into package globals.
    my ($caller, $s) = @_;

    my $result = "";
    while ($s =~ /^(.*?)((?:[a-z]|u:|\xC3\xBC)+)([1-5])(.*)$/i) {
        my ($pre, $alpha, $tone_number, $rest) = ($1, $2, $3, $4);
        if ($tone_number eq "5") {
            $result .= "$pre$alpha";
        } elsif (($alpha =~ /^(.*)([ae])(.*)$/)
              || ($alpha =~ /^(.*)(o)(u.*)$/)
              || ($alpha =~ /^(.*)(u:|[iou]|\xC3\xBC)([^aeiou]*)$/)) {
            # $1..$3 belong to whichever of the three patterns matched.
            my ($pre_acc, $acc_letter, $post_acc) = ($1, $2, $3);
            my $key = $acc_letter . $tone_number;
            $result .= "$pre$pre_acc" . ($number_to_accent_tone_ht{$key} || $key) . $post_acc;
        } else {
            $result .= "$pre$alpha$tone_number";
        }
        $s = $rest;
    }
    $result .= $s;
    $result =~ s/u:/\xC3\xBC/g;
    return $result;
}
|
232 |
+
|
233 |
+
# Predicate: does the UTF-8 byte string $s contain the start of a character
# in one of these ranges?
#   lead byte E4-E9             U+4000  .. U+9FFF
#   E3 followed by 90-BF        U+3400  .. U+3FFF
#   F0 followed by A0-AC        U+20000 .. U+2CFFF
# Only the first one or two bytes of a character are inspected.
# $caller is the invocant and is not used.
sub string_contains_utf8_cjk_unified_ideograph_p {
    my ($caller, $s) = @_;

    return ($s =~ /([\xE4-\xE9]|\xE3[\x90-\xBF]|\xF0[\xA0-\xAC])/);
}
|
238 |
+
|
239 |
+
1;
|
uroman/lib/NLP/English.pm
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/lib/NLP/Romanizer.pm
ADDED
@@ -0,0 +1,2020 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# Romanizer #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::Romanizer;
|
8 |
+
|
9 |
+
use NLP::Chinese;
|
10 |
+
use NLP::UTF8;
|
11 |
+
use NLP::utilities;
|
12 |
+
use JSON;
|
13 |
+
$utf8 = NLP::UTF8;
|
14 |
+
$util = NLP::utilities;
|
15 |
+
$chinesePM = NLP::Chinese;
|
16 |
+
|
17 |
+
my $verbosePM = 0;
|
18 |
+
%empty_ht = ();
|
19 |
+
|
20 |
+
my $braille_capital_letter_indicator = "\xE2\xA0\xA0";
|
21 |
+
my $braille_number_indicator = "\xE2\xA0\xBC";
|
22 |
+
my $braille_decimal_point = "\xE2\xA0\xA8";
|
23 |
+
my $braille_comma = "\xE2\xA0\x82";
|
24 |
+
my $braille_solidus = "\xE2\xA0\x8C";
|
25 |
+
my $braille_numeric_space = "\xE2\xA0\x90";
|
26 |
+
my $braille_letter_indicator = "\xE2\xA0\xB0";
|
27 |
+
my $braille_period = "\xE2\xA0\xB2";
|
28 |
+
|
29 |
+
# Constructor: returns a new, empty hash-based object.
# $caller may be a class name or an existing object; in the latter case the
# new object is blessed into that object's class.
sub new {
    my ($caller) = @_;

    return bless({}, ref($caller) || $caller);
}
|
37 |
+
|
38 |
+
# Load a semicolon-separated Unicode character data file into *ht.
#
# Arguments:
#   $this     - invocant (unused)
#   *ht       - glob of the hash table to fill
#   $filename - file to read, e.g. ../../data/UnicodeData.txt
#
# For every line the following entries are written, where <utf8> is the
# result of unicode_hex_string2string on the first field:
#   $ht{UTF_TO_CHAR_NAME}->{<utf8>}     = character name
#   $ht{UTF_NAME_TO_UNICODE}->{<name>}  = first field (code point as given)
#   $ht{UTF_NAME_TO_CODE}->{<name>}     = <utf8>
#   $ht{UTF_TO_CAT}->{<utf8>}           = general category
#   $ht{UTF_TO_NUMERIC}->{<utf8>}       = numeric value (only if non-empty)
# If the file cannot be opened, a message is printed to STDERR and *ht is
# left unchanged.
sub load_unicode_data {
    local($this, *ht, $filename) = @_;

    $n = 0;
    # Three-argument open on a lexical handle: the file name is never
    # interpreted as a mode or pipe specification, and no global handle is
    # shared with other readers.
    if (open(my $in, "<", $filename)) {
        while (<$in>) {
            # Lines are not chomped, so the last field keeps the line end.
            if (($unicode_value, $char_name, $general_category, $canon_comb_classes, $bidir_category, $char_decomp_mapping, $decimal_digit_value, $digit_value, $numeric_value, $mirrored, $unicode_1_0_name, $comment_field, $uc_mapping, $lc_mapping, $title_case_mapping) = split(";", $_)) {
                $utf8_code = $utf8->unicode_hex_string2string($unicode_value);
                $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name;
                $ht{UTF_NAME_TO_UNICODE}->{$char_name} = $unicode_value;
                $ht{UTF_NAME_TO_CODE}->{$char_name} = $utf8_code;
                $ht{UTF_TO_CAT}->{$utf8_code} = $general_category;
                $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric_value unless $numeric_value eq "";
                $n++;
            }
        }
        close($in);
        # print STDERR "Loaded $n entries from $filename\n";
    } else {
        print STDERR "Can't open $filename\n";
    }
}
|
61 |
+
|
62 |
+
# Load per-character overrides from a "::slot value" style file into *ht.
#
# Arguments:
#   $this     - invocant (unused)
#   *ht       - glob of the hash table to fill
#   $filename - file to read, e.g. ../../data/UnicodeDataOverwrite.txt
#
# Lines starting with "#" are skipped.  For a line with a "u" slot, each of
# the following is stored under the character obtained from that slot via
# unicode_hex_string2string, provided the slot has a value:
#   r             -> $ht{UTF_TO_CHAR_ROMANIZATION}
#   num           -> $ht{UTF_TO_NUMERIC}
#   pic           -> $ht{UTF_TO_PICTURE_DESCR}
#   syllable-info -> $ht{UTF_TO_SYLLABLE_INFO}
#   tone-mark     -> $ht{UTF_TO_TONE_MARK}
#   name          -> $ht{UTF_TO_CHAR_NAME}
# If the file cannot be opened, a message is printed to STDERR.
sub load_unicode_overwrite_romanization {
    local($this, *ht, $filename) = @_;

    $n = 0;
    # Three-argument open on a lexical handle: the file name is never
    # interpreted as a mode or pipe specification, and no global handle is
    # shared with other readers.
    if (open(my $in, "<", $filename)) {
        while (<$in>) {
            next if /^#/;
            $unicode_value = $util->slot_value_in_double_colon_del_list($_, "u");
            $romanization = $util->slot_value_in_double_colon_del_list($_, "r");
            $numeric = $util->slot_value_in_double_colon_del_list($_, "num");
            $picture = $util->slot_value_in_double_colon_del_list($_, "pic");
            $syllable_info = $util->slot_value_in_double_colon_del_list($_, "syllable-info");
            $tone_mark = $util->slot_value_in_double_colon_del_list($_, "tone-mark");
            $char_name = $util->slot_value_in_double_colon_del_list($_, "name");
            $entry_processed_p = 0;
            $utf8_code = $utf8->unicode_hex_string2string($unicode_value);
            if ($unicode_value) {
                $ht{UTF_TO_CHAR_ROMANIZATION}->{$utf8_code} = $romanization if $romanization;
                # A numeric value of 0 is legitimate, hence the explicit test.
                $ht{UTF_TO_NUMERIC}->{$utf8_code} = $numeric if defined($numeric) && ($numeric ne "");
                $ht{UTF_TO_PICTURE_DESCR}->{$utf8_code} = $picture if $picture;
                $ht{UTF_TO_SYLLABLE_INFO}->{$utf8_code} = $syllable_info if $syllable_info;
                $ht{UTF_TO_TONE_MARK}->{$utf8_code} = $tone_mark if $tone_mark;
                $ht{UTF_TO_CHAR_NAME}->{$utf8_code} = $char_name if $char_name;
                $entry_processed_p = 1 if $romanization || $numeric || $picture || $syllable_info || $tone_mark;
            }
            # $n only counts entries; nothing in this sub reads it.
            $n++ if $entry_processed_p;
        }
        close($in);
    } else {
        print STDERR "Can't open $filename\n";
    }
}
|
95 |
+
|
96 |
+
# Load script descriptions from a "::slot value" style file into *ht.
#
# Arguments:
#   $this     - invocant (unused)
#   *ht       - glob of the hash table to fill
#   $filename - file to read, e.g. ../../data/Scripts.txt
#
# Lines without a "script-name" slot are skipped.  For every other line:
#   $ht{SCRIPT_P}->{<script>}                      = 1
#   $ht{SCRIPT_NORM}->{<upper-cased script>}       = <script>
#   $ht{SCRIPT_NORM}->{<alt name>}, and its upper-cased form = <script>
#   $ht{DIRECTION}->{<script>}                     = direction slot, if any
#   $ht{SCRIPT_LANGUAGE}->{<script>}->{<language>} = 1
#   $ht{LANGUAGE_SCRIPT}->{<language>}->{<script>} = 1
#   $ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{<script>}->{<vowel>} = 1
#   $ht{SCRIPT_FONT}->{<script>}->{<font family>}  = 1
# Slots holding several values are split on commas.
# If the file cannot be opened, a message is printed to STDERR.
sub load_script_data {
    local($this, *ht, $filename) = @_;

    $n = 0;
    # Three-argument open on a lexical handle: the file name is never
    # interpreted as a mode or pipe specification, and no global handle is
    # shared with other readers.
    if (open(my $in, "<", $filename)) {
        while (<$in>) {
            next unless $script_name = $util->slot_value_in_double_colon_del_list($_, "script-name");
            $abugida_default_vowel_s = $util->slot_value_in_double_colon_del_list($_, "abugida-default-vowel");
            $alt_script_name_s = $util->slot_value_in_double_colon_del_list($_, "alt-script-name");
            $language_s = $util->slot_value_in_double_colon_del_list($_, "language");
            $direction = $util->slot_value_in_double_colon_del_list($_, "direction"); # right-to-left
            $font_family_s = $util->slot_value_in_double_colon_del_list($_, "font-family");
            $ht{SCRIPT_P}->{$script_name} = 1;
            $ht{SCRIPT_NORM}->{(uc $script_name)} = $script_name;
            $ht{DIRECTION}->{$script_name} = $direction if $direction;
            foreach $language (split(/,\s*/, $language_s)) {
                $ht{SCRIPT_LANGUAGE}->{$script_name}->{$language} = 1;
                $ht{LANGUAGE_SCRIPT}->{$language}->{$script_name} = 1;
            }
            foreach $alt_script_name (split(/,\s*/, $alt_script_name_s)) {
                $ht{SCRIPT_NORM}->{$alt_script_name} = $script_name;
                $ht{SCRIPT_NORM}->{(uc $alt_script_name)} = $script_name;
            }
            foreach $abugida_default_vowel (split(/,\s*/, $abugida_default_vowel_s)) {
                # The key spelling ("ABUDIGA") is kept exactly as it is: it is a
                # runtime key, presumably looked up under the same spelling
                # elsewhere -- verify before renaming.
                $ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$script_name}->{$abugida_default_vowel} = 1 if $abugida_default_vowel;
            }
            foreach $font_family (split(/,\s*/, $font_family_s)) {
                $ht{SCRIPT_FONT}->{$script_name}->{$font_family} = 1 if $font_family;
            }
            $n++;
        }
        close($in);
        # print STDERR "Loaded $n entries from $filename\n";
    } else {
        print STDERR "Can't open $filename\n";
    }
}
|
134 |
+
|
135 |
+
# Romanize the precomposed Hangul syllables (U+AC00 .. U+D7A3) in $s.
#
# Arguments:
#   $this           - invocant (unused)
#   $s              - UTF-8 encoded input string
#   $pass_through_p - if true, all other characters are copied to the result
#                     unchanged; if false or omitted, they are dropped
# Returns the romanized string.
#
# A syllable's offset from U+AC00 encodes lead consonant, vowel and tail
# consonant as (lead * 21 + vowel) * 28 + tail; the three tables below give
# the Latin letters for each index ("-" stands for "no letter").
sub unicode_hangul_romanization {
    local($this, $s, $pass_through_p) = @_;

    $pass_through_p = 0 unless defined($pass_through_p);
    @leads = qw(g gg n d dd r m b bb s ss - j jj c k t p h);
    # Earlier vowel table, kept for reference:
    # a ae ya yai e ei ye yei o oa oai oi yo u ue uei ui yu w wi i
    @vowels = qw(a ae ya yae eo e yeo ye o wa wai oe yo u weo we wi yu eu yi i);
    @tails = qw(- g gg gs n nj nh d l lg lm lb ls lt lp lh m b bs s ss ng j c k t p h);
    $result = "";
    @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    foreach $char (@chars) {
        $unicode = $utf8->utf8_to_unicode($char);
        if (($unicode < 0xAC00) || ($unicode > 0xD7A3)) {
            # Not a precomposed Hangul syllable.
            $result .= $char if $pass_through_p;
            next;
        }
        $code = $unicode - 0xAC00;
        $lead_index = int($code / (28*21));
        $vowel_index = int($code/28) % 21;
        $tail_index = $code % 28;
        $rom = join("", $leads[$lead_index], $vowels[$vowel_index], $tails[$tail_index]);
        $rom =~ s/-//g;
        $result .= $rom;
    }
    return $result;
}
|
161 |
+
|
162 |
+
# Split a comma-separated string into a list of elements.
#
# Arguments:
#   $this - invocant (unused)
#   $s    - string to split
# Returns the list of elements, each passed through dequote_string; the
# list is empty if $s contains no non-whitespace character.
#
# An element followed by a comma is either a double-quoted string (which may
# contain \"), a single-quoted string (which may contain \'), or a run of
# characters other than quotes, commas and spaces.  Whatever remains after
# the last such element becomes the final element.
sub listify_comma_sep_string {
    local($this, $s) = @_;

    @result_list = ();
    return @result_list if $s !~ /\S/;
    $s = $util->trim2($s);
    my $elem;

    # Peel off one "element," prefix per iteration.
    for (;;) {
        ($elem, $rest) = ($s =~ /^("(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^"', ]+),\s*(.*)$/);
        last unless defined($elem);
        push(@result_list, $util->dequote_string($elem));
        $s = $rest;
    }
    # The remainder, if any, is the last element.
    if ($s =~ /\S/) {
        push(@result_list, $util->dequote_string($s));
    }

    return @result_list;
}
|
178 |
+
|
179 |
+
# Predicate: does $s consist solely of one or more three-byte UTF-8
# sequences E2 A0-A3 80-BF, i.e. characters U+2800 .. U+28FF?
# The empty string does not qualify.  $this is the invocant and is not used.
sub braille_string_p {
    my ($this, $s) = @_;

    return ($s =~ /^(\xE2[\xA0-\xA3][\x80-\xBF])+$/);
}
|
184 |
+
|
185 |
+
# Record the word-position restrictions of one romanization rule in *ht.
#
# Arguments:
#   $this               - invocant (unused)
#   *ht                 - glob of the hash table to fill
#   $lang_code          - language code the rule is limited to, or false
#   $utf8_source_string - source side of the rule
#   $utf8_target_string - target side of the rule
#   five flags          - which restrictions apply to the rule
#
# For every flag that is true, the source/target pair is marked with 1,
# either in the language-specific table (key ending in _LANG_SPEC, indexed
# by language code first) or, without a language code, in the general table.
sub register_word_boundary_info {
    local($this, *ht, $lang_code, $utf8_source_string, $utf8_target_string, $use_only_for_whole_word_p,
          $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
          $dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p) = @_;

    # One row per restriction: flag, general table, language-specific table.
    my @restrictions = (
        [$use_only_for_whole_word_p,   "USE_ONLY_FOR_WHOLE_WORD",   "USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC"],
        [$use_only_at_start_of_word_p, "USE_ONLY_AT_START_OF_WORD", "USE_ONLY_AT_START_OF_WORD_LANG_SPEC"],
        [$use_only_at_end_of_word_p,   "USE_ONLY_AT_END_OF_WORD",   "USE_ONLY_AT_END_OF_WORD_LANG_SPEC"],
        [$dont_use_at_start_of_word_p, "DONT_USE_AT_START_OF_WORD", "DONT_USE_AT_START_OF_WORD_LANG_SPEC"],
        [$dont_use_at_end_of_word_p,   "DONT_USE_AT_END_OF_WORD",   "DONT_USE_AT_END_OF_WORD_LANG_SPEC"],
    );

    foreach my $restriction (@restrictions) {
        my ($applies_p, $general_key, $lang_spec_key) = @$restriction;
        next unless $applies_p;
        if ($lang_code) {
            $ht{$lang_spec_key}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
        } else {
            $ht{$general_key}->{$utf8_source_string}->{$utf8_target_string} = 1;
        }
    }
}
|
226 |
+
|
227 |
+
sub load_romanization_table {
|
228 |
+
local($this, *ht, $filename) = @_;
|
229 |
+
# ../../data/romanization-table.txt
|
230 |
+
|
231 |
+
$n = 0;
|
232 |
+
$line_number = 0;
|
233 |
+
if (open(IN, $filename)) {
|
234 |
+
while (<IN>) {
|
235 |
+
$line_number++;
|
236 |
+
next if /^#/;
|
237 |
+
if ($_ =~ /^::preserve\s/) {
|
238 |
+
$from_unicode = $util->slot_value_in_double_colon_del_list($_, "from");
|
239 |
+
$to_unicode = $util->slot_value_in_double_colon_del_list($_, "to");
|
240 |
+
if ($from_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
|
241 |
+
$from_unicode =~ s/^(?:U\+|\\u)//;
|
242 |
+
$from_code_point = hex($from_unicode);
|
243 |
+
} else {
|
244 |
+
$from_code_point = "";
|
245 |
+
}
|
246 |
+
if ($to_unicode =~ /^(?:U\+|\\u)[0-9A-F]{4,}$/i) {
|
247 |
+
$to_unicode =~ s/^(?:U\+|\\u)//;
|
248 |
+
$to_code_point = hex($to_unicode);
|
249 |
+
} else {
|
250 |
+
$to_code_point = $from_code_point;
|
251 |
+
}
|
252 |
+
if ($from_code_point ne "") {
|
253 |
+
# print STDERR "Preserve code-points $from_unicode--$to_unicode = $from_code_point--$to_code_point\n";
|
254 |
+
foreach $code_point (($from_code_point .. $to_code_point)) {
|
255 |
+
$utf8_string = $utf8->unicode2string($code_point);
|
256 |
+
$ht{UTF_CHAR_MAPPING}->{$utf8_string}->{$utf8_string} = 1;
|
257 |
+
}
|
258 |
+
$n++;
|
259 |
+
}
|
260 |
+
next;
|
261 |
+
}
|
262 |
+
$utf8_source_string = $util->slot_value_in_double_colon_del_list($_, "s");
|
263 |
+
$utf8_target_string = $util->slot_value_in_double_colon_del_list($_, "t");
|
264 |
+
$utf8_alt_target_string_s = $util->slot_value_in_double_colon_del_list($_, "t-alt");
|
265 |
+
$use_alt_in_pointed_p = ($_ =~ /::use-alt-in-pointed\b/);
|
266 |
+
$use_only_for_whole_word_p = ($_ =~ /::use-only-for-whole-word\b/);
|
267 |
+
$use_only_at_start_of_word_p = ($_ =~ /::use-only-at-start-of-word\b/);
|
268 |
+
$use_only_at_end_of_word_p = ($_ =~ /::use-only-at-end-of-word\b/);
|
269 |
+
$dont_use_at_start_of_word_p = ($_ =~ /::dont-use-at-start-of-word\b/);
|
270 |
+
$dont_use_at_end_of_word_p = ($_ =~ /::dont-use-at-end-of-word\b/);
|
271 |
+
$use_only_in_lower_case_enviroment_p = ($_ =~ /::use-only-in-lower-case-enviroment\b/);
|
272 |
+
$word_external_punctuation_p = ($_ =~ /::word-external-punctuation\b/);
|
273 |
+
$utf8_source_string =~ s/\s*$//;
|
274 |
+
$utf8_target_string =~ s/\s*$//;
|
275 |
+
$utf8_alt_target_string_s =~ s/\s*$//;
|
276 |
+
$utf8_target_string =~ s/^"(.*)"$/$1/;
|
277 |
+
$utf8_target_string =~ s/^'(.*)'$/$1/;
|
278 |
+
@utf8_alt_targets = $this->listify_comma_sep_string($utf8_alt_target_string_s);
|
279 |
+
$numeric = $util->slot_value_in_double_colon_del_list($_, "num");
|
280 |
+
$numeric =~ s/\s*$//;
|
281 |
+
$annotation = $util->slot_value_in_double_colon_del_list($_, "annotation");
|
282 |
+
$annotation =~ s/\s*$//;
|
283 |
+
$lang_code = $util->slot_value_in_double_colon_del_list($_, "lcode");
|
284 |
+
$prob = $util->slot_value_in_double_colon_del_list($_, "p") || 1;
|
285 |
+
unless (($utf8_target_string eq "") && ($numeric =~ /\d/)) {
|
286 |
+
if ($lang_code) {
|
287 |
+
$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
288 |
+
} else {
|
289 |
+
$ht{UTF_CHAR_MAPPING}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
290 |
+
}
|
291 |
+
if ($word_external_punctuation_p) {
|
292 |
+
if ($lang_code) {
|
293 |
+
$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
294 |
+
} else {
|
295 |
+
$ht{WORD_EXTERNAL_PUNCTUATION}->{$utf8_source_string}->{$utf8_target_string} = $prob;
|
296 |
+
}
|
297 |
+
}
|
298 |
+
if ($this->braille_string_p($utf8_source_string)) {
|
299 |
+
if (($utf8_target_string =~ /^[a-z]+$/)
|
300 |
+
&& (! ($utf8_source_string =~ /^$braille_capital_letter_indicator/))) {
|
301 |
+
my $uc_utf8_source_string = "$braille_capital_letter_indicator$utf8_source_string";
|
302 |
+
my $uc_utf8_target_string = ucfirst $utf8_target_string;
|
303 |
+
if ($lang_code) {
|
304 |
+
$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
|
305 |
+
} else {
|
306 |
+
$ht{UTF_CHAR_MAPPING}->{$uc_utf8_source_string}->{$uc_utf8_target_string} = $prob;
|
307 |
+
}
|
308 |
+
$this->register_word_boundary_info(*ht, $lang_code, $uc_utf8_source_string, $uc_utf8_target_string,
|
309 |
+
$use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
|
310 |
+
$dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);
|
311 |
+
}
|
312 |
+
if (($utf8_target_string =~ /^[0-9]$/)
|
313 |
+
&& ($utf8_source_string =~ /^$braille_number_indicator./)) {
|
314 |
+
my $core_number_char = $utf8_source_string;
|
315 |
+
$core_number_char =~ s/$braille_number_indicator//;
|
316 |
+
$ht{BRAILLE_TO_DIGIT}->{$core_number_char} = $utf8_target_string;
|
317 |
+
}
|
318 |
+
}
|
319 |
+
}
|
320 |
+
if ($use_only_in_lower_case_enviroment_p) {
|
321 |
+
if ($lang_code) {
|
322 |
+
$ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_target_string} = 1;
|
323 |
+
} else {
|
324 |
+
$ht{USE_ONLY_IN_LOWER_CASE_ENVIROMENT}->{$utf8_source_string}->{$utf8_target_string} = 1;
|
325 |
+
}
|
326 |
+
}
|
327 |
+
$this->register_word_boundary_info(*ht, $lang_code, $utf8_source_string, $utf8_target_string,
|
328 |
+
$use_only_for_whole_word_p, $use_only_at_start_of_word_p, $use_only_at_end_of_word_p,
|
329 |
+
$dont_use_at_start_of_word_p, $dont_use_at_end_of_word_p);
|
330 |
+
foreach $utf8_alt_target (@utf8_alt_targets) {
|
331 |
+
if ($lang_code) {
|
332 |
+
$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
|
333 |
+
$ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
|
334 |
+
} else {
|
335 |
+
$ht{UTF_CHAR_ALT_MAPPING}->{$utf8_source_string}->{$utf8_alt_target} = $prob;
|
336 |
+
$ht{USE_ALT_IN_POINTED}->{$utf8_source_string}->{$utf8_alt_target} = 1 if $use_alt_in_pointed_p;
|
337 |
+
}
|
338 |
+
if ($use_only_for_whole_word_p) {
|
339 |
+
if ($lang_code) {
|
340 |
+
$ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
341 |
+
} else {
|
342 |
+
$ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
343 |
+
}
|
344 |
+
}
|
345 |
+
if ($use_only_at_start_of_word_p) {
|
346 |
+
if ($lang_code) {
|
347 |
+
$ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
348 |
+
} else {
|
349 |
+
$ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
350 |
+
}
|
351 |
+
}
|
352 |
+
if ($use_only_at_end_of_word_p) {
|
353 |
+
if ($lang_code) {
|
354 |
+
$ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
355 |
+
} else {
|
356 |
+
$ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$utf8_source_string}->{$utf8_alt_target} = 1;
|
357 |
+
}
|
358 |
+
}
|
359 |
+
}
|
360 |
+
if ($numeric =~ /\d/) {
|
361 |
+
$ht{UTF_TO_NUMERIC}->{$utf8_source_string} = $numeric;
|
362 |
+
}
|
363 |
+
if ($annotation =~ /\S/) {
|
364 |
+
$ht{UTF_ANNOTATION}->{$utf8_source_string} = $annotation;
|
365 |
+
}
|
366 |
+
$n++;
|
367 |
+
}
|
368 |
+
close(IN);
|
369 |
+
# print STDERR "Loaded $n entries from $filename\n";
|
370 |
+
} else {
|
371 |
+
print STDERR "Can't open $filename\n";
|
372 |
+
}
|
373 |
+
}
|
374 |
+
|
375 |
+
sub char_name_to_script {
   # Returns the script name that $ht{SCRIPT_NORM} associates with a prefix of the
   # given Unicode character name, or "" if no prefix of the name is listed there.
   #   $char_name  character name to classify
   #   *ht         typeglob of the resource hash; reads $ht{SCRIPT_NORM},
   #               reads and writes the memo table $ht{CHAR_NAME_TO_SCRIPT}
   local($this, $char_name, *ht) = @_;

   # Memo lookup. Test definedness (not truth) so that cached "" results are served too.
   my $cached_script_name = $ht{CHAR_NAME_TO_SCRIPT}->{$char_name};
   return $cached_script_name if defined($cached_script_name);

   # $char_name is shortened destructively below; keep the full name so that the
   # result is memoized under the same key that the lookup above uses.
   my $orig_char_name = $char_name;
   # Cut off the character-class keyword and everything that follows it.
   $char_name =~ s/\s+(CONSONANT|LETTER|LIGATURE|SIGN|SYLLABLE|SYLLABICS|VOWEL)\b.*$//;
   my $script_name;
   # Try ever shorter prefixes, dropping the last word on each round.
   while ($char_name) {
      last if $script_name = $ht{SCRIPT_NORM}->{(uc $char_name)};
      $char_name =~ s/\s*\S+\s*$//;
   }
   $script_name = "" unless defined($script_name);
   $ht{CHAR_NAME_TO_SCRIPT}->{$orig_char_name} = $script_name;
   return $script_name;
}
|
390 |
+
|
391 |
+
sub letter_plus_char_p {
   # Returns 1 if the character name contains one of the keywords listed in the
   # pattern below (LETTER, VOWEL SIGN, HEBREW POINT, ...), otherwise 0.
   #   $char_name  character name to test
   # %ht is not a parameter: the memo table $ht{CHAR_NAME_LETTER_PLUS} lives in
   # whatever %ht is visible in this package at call time.
   local($this, $char_name) = @_;

   # Test definedness (not truth) so that memoized 0 results are served as well.
   my $cached_result = $ht{CHAR_NAME_LETTER_PLUS}->{$char_name};
   return $cached_result if defined($cached_result);

   my $letter_plus_p = ($char_name =~ /\b(?:LETTER|VOWEL SIGN|AU LENGTH MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN AL-LAKUNA|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN NUKTA|SIGN DOT BELOW|HEBREW POINT)\b/) ? 1 : 0;
   $ht{CHAR_NAME_LETTER_PLUS}->{$char_name} = $letter_plus_p;
   return $letter_plus_p;
}
|
399 |
+
|
400 |
+
sub subjoined_char_p {
   # Returns 1 if the character name contains one of the keywords listed in the
   # pattern below (SUBJOINED LETTER, VOWEL SIGN, the listed Hebrew and Arabic
   # marks, ...), otherwise 0.
   #   $char_name  character name to test
   # %ht is not a parameter: the memo table $ht{CHAR_NAME_SUBJOINED} lives in
   # whatever %ht is visible in this package at call time.
   local($this, $char_name) = @_;

   # Test definedness (not truth) so that memoized 0 results are served as well.
   my $cached_result = $ht{CHAR_NAME_SUBJOINED}->{$char_name};
   return $cached_result if defined($cached_result);

   my $subjoined_p = (($char_name =~ /\b(?:SUBJOINED LETTER|VOWEL SIGN|AU LENGTH MARK|EMPHASIS MARK|CONSONANT SIGN|SIGN VIRAMA|SIGN PAMAAEH|SIGN COENG|SIGN ASAT|SIGN ANUSVARA|SIGN ANUSVARAYA|SIGN BINDI|TIPPI|SIGN NIKAHIT|SIGN CANDRABINDU|SIGN VISARGA|SIGN REAHMUK|SIGN DOT BELOW|HEBREW (POINT|PUNCTUATION GERESH)|ARABIC (?:DAMMA|DAMMATAN|FATHA|FATHATAN|HAMZA|KASRA|KASRATAN|MADDAH|SHADDA|SUKUN))\b/)) ? 1 : 0;
   $ht{CHAR_NAME_SUBJOINED}->{$char_name} = $subjoined_p;
   return $subjoined_p;
}
|
408 |
+
|
409 |
+
sub new_node_id {
   # Increments the node counter $chart_ht{N_NODES} of the given chart and
   # returns the new count, which serves as the id of the next node.
   #   *chart_ht  typeglob of the chart hash
   local($this, *chart_ht) = @_;

   return ++$chart_ht{N_NODES};
}
|
417 |
+
|
418 |
+
sub add_node {
   # Allocates a new node id via new_node_id and records the node in the chart.
   #   $s         romanization string of the node (stored under NODE_ROMAN)
   #   $start     start position of the node's span
   #   $end       end position of the node's span
   #   *chart_ht  typeglob of the chart hash
   #   $type      node type (stored under NODE_TYPE)
   #   $comment   free-text comment (stored under NODE_COMMENT)
   # Returns the id of the new node.
   local($this, $s, $start, $end, *chart_ht, $type, $comment) = @_;

   my $node_id = $this->new_node_id(*chart_ht);

   # Position indexes: by start, by end, and by exact span.
   $chart_ht{NODES_STARTING_AT}->{$start}->{$node_id} = 1;
   $chart_ht{NODES_ENDING_AT}->{$end}->{$node_id} = 1;
   $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}->{$node_id} = 1;

   # Per-node attributes, stored as ATTRIBUTE -> node id -> value.
   my %attribute_value = (NODE_START   => $start,
                          NODE_END     => $end,
                          NODE_TYPE    => $type,
                          NODE_COMMENT => $comment,
                          NODE_ROMAN   => $s);
   foreach my $attribute (sort keys %attribute_value) {
      $chart_ht{$attribute}->{$node_id} = $attribute_value{$attribute};
   }
   return $node_id;
}
|
434 |
+
|
435 |
+
sub get_node_for_span {
   # Returns the numerically lowest node id registered in
   # $chart_ht{NODES_STARTING_AND_ENDING_AT} for exactly the span $start-$end,
   # or "" if no node is registered for that span.
   #   *chart_ht  typeglob of the chart hash
   local($this, $start, $end, *chart_ht) = @_;

   my $span_nodes_ref = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
   return "" unless defined($span_nodes_ref);

   # The first element of the ascending sort is the lowest id (undef for an empty set).
   my ($lowest_node_id) = sort { $a <=> $b } keys %{$span_nodes_ref};
   return defined($lowest_node_id) ? $lowest_node_id : "";
}
|
443 |
+
|
444 |
+
sub get_node_for_span_and_type {
   # Returns the numerically lowest node id for exactly the span $start-$end
   # whose $chart_ht{NODE_TYPE} entry equals $type, or "" if there is none.
   #   *chart_ht  typeglob of the chart hash
   local($this, $start, $end, *chart_ht, $type) = @_;

   my $span_nodes_ref = $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end};
   return "" unless defined($span_nodes_ref);

   # Walk the candidates in ascending id order; the first type match wins.
   foreach my $candidate_id (sort { $a <=> $b } keys %{$span_nodes_ref}) {
      next unless $chart_ht{NODE_TYPE}->{$candidate_id} eq $type;
      return $candidate_id;
   }
   return "";
}
|
455 |
+
|
456 |
+
sub get_node_roman {
   # Returns the romanization stored for $node_id under $chart_ht{NODE_ROMAN},
   # or $default ("" when no default is given) if none is stored.
   # NOTE(review): the chart typeglob is bound to *chart_id, but the body reads
   # %chart_ht, i.e. whatever %chart_ht is visible in this package at call time.
   # Looks like a parameter-name slip; callers in this file pass both *chart_ht
   # and *chart_id, so renaming the parameter here alone would change which hash
   # is read.
   local($this, $node_id, *chart_id, $default) = @_;

   my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
   return $roman if defined($roman);
   return defined($default) ? $default : "";
}
|
463 |
+
|
464 |
+
sub set_node_id_slot_value {
   # Stores $value as slot $slot of node $node_id under $chart_ht{NODE_SLOT}.
   # NOTE(review): the chart typeglob is bound to *chart_id, but the body writes
   # to %chart_ht, i.e. whatever %chart_ht is visible in this package at call
   # time; callers in this file pass both *chart_ht and *chart_id.
   local($this, $node_id, $slot, $value, *chart_id) = @_;

   my $node_slots_ref = ($chart_ht{NODE_SLOT}->{$node_id} ||= {});
   $node_slots_ref->{$slot} = $value;
}
|
469 |
+
|
470 |
+
sub copy_slot_values {
   # Copies defined slot values of node $old_node_id to node $new_node_id.
   #   @slots  names of the slots to copy; if the first element is "all", every
   #           slot of the old node is copied; with an empty list nothing is copied
   # NOTE(review): the chart typeglob is bound to *chart_id, but the body works
   # on %chart_ht, i.e. whatever %chart_ht is visible in this package at call
   # time; callers in this file pass both *chart_ht and *chart_id.
   local($this, $old_node_id, $new_node_id, *chart_id, @slots) = @_;

   if (@slots) {
      my $copy_all_p = ($slots[0] eq "all");
      foreach my $slot_name (keys %{$chart_ht{NODE_SLOT}->{$old_node_id}}) {
         # Membership is only consulted when not copying everything.
         next unless $copy_all_p || $util->member($slot_name, @slots);
         my $slot_value = $chart_ht{NODE_SLOT}->{$old_node_id}->{$slot_name};
         next unless defined($slot_value);
         $chart_ht{NODE_SLOT}->{$new_node_id}->{$slot_name} = $slot_value;
      }
   }
}
|
482 |
+
|
483 |
+
sub get_node_id_slot_value {
   # Returns the value of slot $slot of node $node_id from $chart_ht{NODE_SLOT},
   # or $default ("" when no default is given) if the slot has no defined value.
   # NOTE(review): the chart typeglob is bound to *chart_id, but the body reads
   # %chart_ht, i.e. whatever %chart_ht is visible in this package at call time;
   # callers in this file pass both *chart_ht and *chart_id.
   local($this, $node_id, $slot, *chart_id, $default) = @_;

   my $slot_value = $chart_ht{NODE_SLOT}->{$node_id}->{$slot};
   return $slot_value if defined($slot_value);
   return defined($default) ? $default : "";
}
|
490 |
+
|
491 |
+
sub get_node_for_span_with_slot_value {
   # Looks at the nodes registered for exactly the span $start-$end, in ascending
   # id order, and returns the first defined value of slot $slot found among them.
   # Returns $default ("" when no default is given) if there is no such value.
   # NOTE(review): the chart typeglob is bound to *chart_id, but the body reads
   # %chart_ht, i.e. whatever %chart_ht is visible in this package at call time;
   # callers in this file pass both *chart_ht and *chart_id.
   local($this, $start, $end, $slot, *chart_id, $default) = @_;

   $default = "" unless defined($default);
   if (defined($chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end})) {
      my @candidate_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}};
      foreach my $candidate_id (@candidate_ids) {
         my $slot_value = $chart_ht{NODE_SLOT}->{$candidate_id}->{$slot};
         return $slot_value if defined($slot_value);
      }
   }
   return $default;
}
|
503 |
+
|
504 |
+
sub get_node_for_span_with_slot {
   # Looks at the nodes registered for exactly the span $start-$end, in ascending
   # id order, and returns the id of the first node that has a defined value for
   # slot $slot. Returns $default ("" when no default is given) if there is none.
   # NOTE(review): the chart typeglob is bound to *chart_id, but the body reads
   # %chart_ht, i.e. whatever %chart_ht is visible in this package at call time;
   # callers in this file pass both *chart_ht and *chart_id.
   local($this, $start, $end, $slot, *chart_id, $default) = @_;

   $default = "" unless defined($default);
   if (defined($chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end})) {
      my @candidate_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}};
      foreach my $candidate_id (@candidate_ids) {
         my $slot_value = $chart_ht{NODE_SLOT}->{$candidate_id}->{$slot};
         return $candidate_id if defined($slot_value);
      }
   }
   return $default;
}
|
516 |
+
|
517 |
+
sub register_new_complex_number_span_segment {
   # Registers the complex-number segment $start-$mid-$end,
   # e.g. 4 10 (= 40); 20 5 (= 25).
   # If a previously registered span ends at $mid, that span is extended to $end,
   # so that larger complex numbers (e.g. 4 1000 3 100 20 1) grow segment by
   # segment; otherwise a new span $start-$end is recorded.
   # Both directions are kept: COMPLEX_NUMERIC_START_END and COMPLEX_NUMERIC_END_START.
   # $line_number is used for debugging only.
   # NOTE(review): the chart typeglob is bound to *chart_id, but the body works
   # on %chart_ht, i.e. whatever %chart_ht is visible in this package at call time.
   local($this, $start, $mid, $end, *chart_id, $line_number) = @_;

   # $old_start is not declared lexically in this sub; left that way on purpose.
   $old_start = $chart_ht{COMPLEX_NUMERIC_END_START}->{$mid};
   my $span_start = $start;
   if (defined($old_start)) {
      # The old span no longer ends at $mid (the key stays, its value becomes undef).
      undef($chart_ht{COMPLEX_NUMERIC_END_START}->{$mid});
      $span_start = $old_start;
   }
   $chart_ht{COMPLEX_NUMERIC_START_END}->{$span_start} = $end;
   $chart_ht{COMPLEX_NUMERIC_END_START}->{$end} = $span_start;
}
|
532 |
+
|
533 |
+
sub romanize_by_token_with_caching {
   # Romanizes $s token by token (tokens are maximal runs of non-whitespace),
   # keeping the whitespace between tokens unchanged.
   # All-ASCII tokens are copied through as they are. Any other token is looked
   # up in $ht{CACHED_ROMANIZATION}->{$lang_code} and, on a miss, romanized with
   # $this->romanize and then added to that cache.
   # If $control asks for "return chart" or "return offset mappings", the whole
   # string is passed straight to $this->romanize and its result is returned.
   # Returns the romanized string.
   local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number) = @_;

   $control = "" unless defined($control);
   if (($control =~ /return chart/i) || ($control =~ /return offset mappings/i)) {
      return $this->romanize($s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
   }

   # Phase 1: split into separators and tokens; $separators[$k] precedes $tokens[$k].
   my @separators = ();
   my @tokens = ();
   $s =~ s/\n$//; # Added May 2, 2019 as bug-fix (duplicate empty lines)
   # $sep, $token and $rest are not declared lexically in this sub.
   while (($sep, $token, $rest) = ($s =~ /^(\s*)(\S+)(.*)$/)) {
      push(@separators, $sep);
      push(@tokens, $token);
      $s = $rest;
   }
   # Whatever could not be tokenized any further trails the last token.
   push(@separators, $s);

   # Phase 2: assemble the result.
   my $result = "";
   foreach my $token_index (0 .. $#tokens) {
      my $current_token = $tokens[$token_index];
      $result .= $separators[$token_index];
      if ($current_token =~ /^[\x00-\x7F]*$/) { # all ASCII
         $result .= $current_token;
         next;
      }
      my $romanized_token = $ht{CACHED_ROMANIZATION}->{$lang_code}->{$current_token};
      unless (defined($romanized_token)) {
         $romanized_token = $this->romanize($current_token, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number);
         $ht{CACHED_ROMANIZATION}->{$lang_code}->{$current_token} = $romanized_token if defined($romanized_token);
      }
      $result .= $romanized_token;
   }
   my $final_separator = $separators[$#tokens + 1];
   $result .= $final_separator if defined($final_separator);

   return $result;
}
|
571 |
+
|
572 |
+
sub romanize {
|
573 |
+
local($this, $s, $lang_code, $output_style, *ht, *pinyin_ht, $initial_char_offset, $control, $line_number, $initial_rom_char_offset) = @_;
|
574 |
+
|
575 |
+
my $orig_lang_code = $lang_code;
|
576 |
+
# Check whether the text (to be romanized) starts with a language code directive.
|
577 |
+
if (($line_lang_code) = ($s =~ /^::lcode\s+([a-z][a-z][a-z])\s/)) {
|
578 |
+
$lang_code = $line_lang_code;
|
579 |
+
}
|
580 |
+
$initial_char_offset = 0 unless defined($initial_char_offset);
|
581 |
+
$initial_rom_char_offset = 0 unless defined($initial_rom_char_offset);
|
582 |
+
$control = "" unless defined($control);
|
583 |
+
my $return_chart_p = ($control =~ /return chart/i);
|
584 |
+
my $return_offset_mappings_p = ($control =~ /return offset mappings/i);
|
585 |
+
$line_number = "" unless defined($line_number);
|
586 |
+
my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
|
587 |
+
my $n_characters = $#chars + 1;
|
588 |
+
%chart_ht = ();
|
589 |
+
$chart_ht{N_CHARS} = $n_characters;
|
590 |
+
$chart_ht{N_NODES} = 0;
|
591 |
+
my $char = "";
|
592 |
+
my $char_name = "";
|
593 |
+
my $prev_script = "";
|
594 |
+
my $current_script = "";
|
595 |
+
my $script_start = 0;
|
596 |
+
my $script_end = 0;
|
597 |
+
my $prev_letter_plus_script = "";
|
598 |
+
my $current_letter_plus_script = "";
|
599 |
+
my $letter_plus_script_start = 0;
|
600 |
+
my $letter_plus_script_end = 0;
|
601 |
+
my $log ="";
|
602 |
+
my $n_right_to_left_chars = 0;
|
603 |
+
my $n_left_to_right_chars = 0;
|
604 |
+
my $hebrew_word_start = ""; # used to identify Hebrew words with points
|
605 |
+
my $hebrew_word_contains_point = 0;
|
606 |
+
my $current_word_start = "";
|
607 |
+
my $current_word_script = "";
|
608 |
+
my $braille_all_caps_p = 0;
|
609 |
+
|
610 |
+
# prep
|
611 |
+
foreach $i ((0 .. ($#chars + 1))) {
|
612 |
+
if ($i <= $#chars) {
|
613 |
+
$char = $chars[$i];
|
614 |
+
$chart_ht{ORIG_CHAR}->{$i} = $char;
|
615 |
+
$char_name = $ht{UTF_TO_CHAR_NAME}->{$char} || "";
|
616 |
+
$chart_ht{CHAR_NAME}->{$i} = $char_name;
|
617 |
+
$current_script = $this->char_name_to_script($char_name, *ht);
|
618 |
+
$current_script_direction = $ht{DIRECTION}->{$current_script} || '';
|
619 |
+
if ($current_script_direction eq 'right-to-left') {
|
620 |
+
$n_right_to_left_chars++;
|
621 |
+
} elsif (($char =~ /^[a-z]$/i) || ! ($char =~ /^[\x00-\x7F]$/)) {
|
622 |
+
$n_left_to_right_chars++;
|
623 |
+
}
|
624 |
+
$chart_ht{CHAR_SCRIPT}->{$i} = $current_script;
|
625 |
+
$chart_ht{SCRIPT_SEGMENT_START}->{$i} = ""; # default value, to be updated later
|
626 |
+
$chart_ht{SCRIPT_SEGMENT_END}->{$i} = ""; # default value, to be updated later
|
627 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = ""; # default value, to be updated later
|
628 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = ""; # default value, to be updated later
|
629 |
+
$subjoined_char_p = $this->subjoined_char_p($char_name);
|
630 |
+
$chart_ht{CHAR_SUBJOINED}->{$i} = $subjoined_char_p;
|
631 |
+
$letter_plus_char_p = $this->letter_plus_char_p($char_name);
|
632 |
+
$chart_ht{CHAR_LETTER_PLUS}->{$i} = $letter_plus_char_p;
|
633 |
+
$current_letter_plus_script = ($letter_plus_char_p) ? $current_script : "";
|
634 |
+
$numeric_value = $ht{UTF_TO_NUMERIC}->{$char};
|
635 |
+
$numeric_value = "" unless defined($numeric_value);
|
636 |
+
$annotation = $ht{UTF_ANNOTATION}->{$char};
|
637 |
+
$annotation = "" unless defined($annotation);
|
638 |
+
$chart_ht{CHAR_NUMERIC_VALUE}->{$i} = $numeric_value;
|
639 |
+
$chart_ht{CHAR_ANNOTATION}->{$i} = $annotation;
|
640 |
+
$syllable_info = $ht{UTF_TO_SYLLABLE_INFO}->{$char} || "";
|
641 |
+
$chart_ht{CHAR_SYLLABLE_INFO}->{$i} = $syllable_info;
|
642 |
+
$tone_mark = $ht{UTF_TO_TONE_MARK}->{$char} || "";
|
643 |
+
$chart_ht{CHAR_TONE_MARK}->{$i} = $tone_mark;
|
644 |
+
} else {
|
645 |
+
$char = "";
|
646 |
+
$char_name = "";
|
647 |
+
$current_script = "";
|
648 |
+
$current_letter_plus_script = "";
|
649 |
+
}
|
650 |
+
if ($char_name =~ /^HEBREW (LETTER|POINT|PUNCTUATION GERESH) /) {
|
651 |
+
$hebrew_word_start = $i if $hebrew_word_start eq "";
|
652 |
+
$hebrew_word_contains_point = 1 if $char_name =~ /^HEBREW POINT /;
|
653 |
+
} elsif ($hebrew_word_start ne "") {
|
654 |
+
if ($hebrew_word_contains_point) {
|
655 |
+
foreach $j (($hebrew_word_start .. ($i-1))) {
|
656 |
+
$chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$j} = 1;
|
657 |
+
}
|
658 |
+
$chart_ht{CHAR_START_OF_WORD}->{$hebrew_word_start} = 1;
|
659 |
+
$chart_ht{CHAR_END_OF_WORD}->{($i-1)} = 1;
|
660 |
+
}
|
661 |
+
$hebrew_word_start = "";
|
662 |
+
$hebrew_word_contains_point = 0;
|
663 |
+
}
|
664 |
+
my $part_of_word_p = $current_script
|
665 |
+
&& ($this->letter_plus_char_p($char_name)
|
666 |
+
|| $this->subjoined_char_p($char_name)
|
667 |
+
|| ($char_name =~ /\b(LETTER|SYLLABLE|SYLLABICS|LIGATURE)\b/));
|
668 |
+
|
669 |
+
# Braille punctuation
|
670 |
+
my $end_offset = 0;
|
671 |
+
if ($char_name =~ /^Braille\b/i) {
|
672 |
+
if (($char =~ /^\s*$/) || ($char_name =~ /BLANK/)) {
|
673 |
+
$part_of_word_p = 0;
|
674 |
+
$braille_all_caps_p = 0;
|
675 |
+
} elsif ($chart_ht{NOT_PART_OF_WORD_P}->{$i}) {
|
676 |
+
$part_of_word_p = 0;
|
677 |
+
$braille_all_caps_p = 0;
|
678 |
+
} elsif ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$char}})
|
679 |
+
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$char}})) {
|
680 |
+
$part_of_word_p = 0;
|
681 |
+
$braille_all_caps_p = 0;
|
682 |
+
} elsif (($i+1 <= $#chars)
|
683 |
+
&& ($s1 = $char . $chars[$i+1])
|
684 |
+
&& ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s1}})
|
685 |
+
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s1}}))) {
|
686 |
+
$part_of_word_p = 0;
|
687 |
+
$braille_all_caps_p = 0;
|
688 |
+
$chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
|
689 |
+
} elsif (($i+2 <= $#chars)
|
690 |
+
&& ($s2 = $char . $chars[$i+1] . $chars[$i+2])
|
691 |
+
&& ((keys %{$ht{WORD_EXTERNAL_PUNCTUATION_LANG_SPEC}->{$lang_code}->{$s2}})
|
692 |
+
|| (keys %{$ht{WORD_EXTERNAL_PUNCTUATION}->{$s2}}))) {
|
693 |
+
$part_of_word_p = 0;
|
694 |
+
$braille_all_caps_p = 0;
|
695 |
+
$chart_ht{NOT_PART_OF_WORD_P}->{($i+1)} = 1;
|
696 |
+
$chart_ht{NOT_PART_OF_WORD_P}->{($i+2)} = 1;
|
697 |
+
} elsif (($i+1 <= $#chars)
|
698 |
+
&& ($char eq $braille_capital_letter_indicator)
|
699 |
+
&& ($chars[$i+1] eq $braille_capital_letter_indicator)) {
|
700 |
+
$braille_all_caps_p = 1;
|
701 |
+
} else {
|
702 |
+
$part_of_word_p = 1;
|
703 |
+
}
|
704 |
+
# last period in Braille text is also not part_of_word_p
|
705 |
+
if (($char eq $braille_period)
|
706 |
+
&& (($i == $#chars)
|
707 |
+
|| (($i < $#chars)
|
708 |
+
&& (! $this->braille_string_p($chars[$i+1]))))) {
|
709 |
+
$part_of_word_p = 0;
|
710 |
+
}
|
711 |
+
# period before other word-external punctuation is also not part_of_word_p
|
712 |
+
if (($i > 0)
|
713 |
+
&& ($chars[$i-1] eq $braille_period)
|
714 |
+
&& (! $part_of_word_p)
|
715 |
+
&& ($current_word_start ne "")) {
|
716 |
+
$end_offset = -1;
|
717 |
+
}
|
718 |
+
} else {
|
719 |
+
$braille_all_caps_p = 0;
|
720 |
+
}
|
721 |
+
$chart_ht{BRAILLE_ALL_CAPS_P}->{$i} = $braille_all_caps_p;
|
722 |
+
|
723 |
+
if (($current_word_start ne "")
|
724 |
+
&& ((! $part_of_word_p)
|
725 |
+
|| ($current_script ne $current_word_script))) {
|
726 |
+
# END OF WORD
|
727 |
+
$chart_ht{CHAR_START_OF_WORD}->{$current_word_start} = 1;
|
728 |
+
$chart_ht{CHAR_END_OF_WORD}->{($i-1+$end_offset)} = 1;
|
729 |
+
my $word = join("", @chars[$current_word_start .. ($i-1+$end_offset)]);
|
730 |
+
$chart_ht{WORD_START_END}->{$current_word_start}->{$i} = $word;
|
731 |
+
$chart_ht{WORD_END_START}->{$i+$end_offset}->{$current_word_start} = $word;
|
732 |
+
# print STDERR "Word ($current_word_start-$i+$end_offset): $word ($current_word_script)\n";
|
733 |
+
$current_word_start = "";
|
734 |
+
$current_word_script = "";
|
735 |
+
}
|
736 |
+
if ($part_of_word_p && ($current_word_start eq "")) {
|
737 |
+
# START OF WORD
|
738 |
+
$current_word_start = $i;
|
739 |
+
$current_word_script = $current_script;
|
740 |
+
}
|
741 |
+
# print STDERR "$i char: $char ($current_script)\n";
|
742 |
+
unless ($current_script eq $prev_script) {
|
743 |
+
if ($prev_script && ($i-1 >= $script_start)) {
|
744 |
+
my $script_end = $i;
|
745 |
+
$chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$script_start} = $script_end;
|
746 |
+
$chart_ht{SCRIPT_SEGMENT_END_TO_START}->{$script_end} = $script_start;
|
747 |
+
foreach $i (($script_start .. $script_end)) {
|
748 |
+
$chart_ht{SCRIPT_SEGMENT_START}->{$i} = $script_start;
|
749 |
+
$chart_ht{SCRIPT_SEGMENT_END}->{$i} = $script_end;
|
750 |
+
}
|
751 |
+
# print STDERR "Script segment $script_start-$script_end: $prev_script\n";
|
752 |
+
}
|
753 |
+
$script_start = $i;
|
754 |
+
}
|
755 |
+
unless ($current_letter_plus_script eq $prev_letter_plus_script) {
|
756 |
+
if ($prev_letter_plus_script && ($i-1 >= $letter_plus_script_start)) {
|
757 |
+
my $letter_plus_script_end = $i;
|
758 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$letter_plus_script_start} = $letter_plus_script_end;
|
759 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_END_TO_START}->{$letter_plus_script_end} = $letter_plus_script_start;
|
760 |
+
foreach $i (($letter_plus_script_start .. $letter_plus_script_end)) {
|
761 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_START}->{$i} = $letter_plus_script_start;
|
762 |
+
$chart_ht{LETTER_TOKEN_SEGMENT_END}->{$i} = $letter_plus_script_end;
|
763 |
+
}
|
764 |
+
# print STDERR "Script token segment $letter_plus_script_start-$letter_plus_script_end: $prev_letter_plus_script\n";
|
765 |
+
}
|
766 |
+
$letter_plus_script_start = $i;
|
767 |
+
}
|
768 |
+
$prev_script = $current_script;
|
769 |
+
$prev_letter_plus_script = $current_letter_plus_script;
|
770 |
+
}
|
771 |
+
$ht{STRING_IS_DOMINANTLY_RIGHT_TO_LEFT}->{$s} = 1 if $n_right_to_left_chars > $n_left_to_right_chars;
|
772 |
+
|
773 |
+
# main
|
774 |
+
my $i = 0;
|
775 |
+
while ($i <= $#chars) {
|
776 |
+
my $char = $chart_ht{ORIG_CHAR}->{$i};
|
777 |
+
my $current_script = $chart_ht{CHAR_SCRIPT}->{$i};
|
778 |
+
$chart_ht{CHART_CONTAINS_SCRIPT}->{$current_script} = 1;
|
779 |
+
my $script_segment_start = $chart_ht{SCRIPT_SEGMENT_START}->{$i};
|
780 |
+
my $script_segment_end = $chart_ht{SCRIPT_SEGMENT_END}->{$i};
|
781 |
+
my $char_name = $chart_ht{CHAR_NAME}->{$i};
|
782 |
+
my $subjoined_char_p = $chart_ht{CHAR_SUBJOINED}->{$i};
|
783 |
+
my $letter_plus_char_p = $chart_ht{CHAR_LETTER_PLUS}->{$i};
|
784 |
+
my $numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{$i};
|
785 |
+
my $annotation = $chart_ht{CHAR_ANNOTATION}->{$i};
|
786 |
+
# print STDERR " $char_name annotation: $annotation\n" if $annotation;
|
787 |
+
my $tone_mark = $chart_ht{CHAR_TONE_MARK}->{$i};
|
788 |
+
my $found_char_mapping_p = 0;
|
789 |
+
my $prev_char_name = ($i >= 1) ? $chart_ht{CHAR_NAME}->{($i-1)} : "";
|
790 |
+
my $prev2_script = ($i >= 2) ? $chart_ht{CHAR_SCRIPT}->{($i-2)} : "";
|
791 |
+
my $prev_script = ($i >= 1) ? $chart_ht{CHAR_SCRIPT}->{($i-1)} : "";
|
792 |
+
my $next_script = ($i < $#chars) ? $chart_ht{CHAR_SCRIPT}->{($i+1)} : "";
|
793 |
+
my $next_char = ($i < $#chars) ? $chart_ht{ORIG_CHAR}->{($i+1)} : "";
|
794 |
+
my $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char} || "";
|
795 |
+
my $prev2_letter_plus_char_p = ($i >= 2) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-2)} : 0;
|
796 |
+
my $prev_letter_plus_char_p = ($i >= 1) ? $chart_ht{CHAR_LETTER_PLUS}->{($i-1)} : 0;
|
797 |
+
my $next_letter_plus_char_p = ($i < $#chars) ? $chart_ht{CHAR_LETTER_PLUS}->{($i+1)} : 0;
|
798 |
+
my $next_index = $i + 1;
|
799 |
+
|
800 |
+
# Braille numeric mode
|
801 |
+
if ($char eq $braille_number_indicator) {
|
802 |
+
my $offset = 0;
|
803 |
+
my $numeric_value = "";
|
804 |
+
my $digit;
|
805 |
+
while ($i+$offset < $#chars) {
|
806 |
+
$offset++;
|
807 |
+
my $offset_char = $chart_ht{ORIG_CHAR}->{$i+$offset};
|
808 |
+
if (defined($digit = $ht{BRAILLE_TO_DIGIT}->{$offset_char})) {
|
809 |
+
$numeric_value .= $digit;
|
810 |
+
} elsif (($offset_char eq $braille_decimal_point)
|
811 |
+
|| ($ht{UTF_CHAR_MAPPING}->{$offset_char}->{"."})) {
|
812 |
+
$numeric_value .= ".";
|
813 |
+
} elsif ($offset_char eq $braille_comma) {
|
814 |
+
$numeric_value .= ",";
|
815 |
+
} elsif ($offset_char eq $braille_numeric_space) {
|
816 |
+
$numeric_value .= " ";
|
817 |
+
} elsif ($offset_char eq $braille_solidus) {
|
818 |
+
$numeric_value .= "/";
|
819 |
+
} elsif ($offset_char eq $braille_number_indicator) {
|
820 |
+
# stay in Braille numeric mode
|
821 |
+
} elsif ($offset_char eq $braille_letter_indicator) {
|
822 |
+
# consider as part of number, but without contributing to numeric_value
|
823 |
+
last;
|
824 |
+
} else {
|
825 |
+
$offset--;
|
826 |
+
last;
|
827 |
+
}
|
828 |
+
}
|
829 |
+
if ($offset) {
|
830 |
+
$next_index = $i + $offset + 1;
|
831 |
+
$node_id = $this->add_node($numeric_value, $i, $next_index, *chart_ht, "", "braille number");
|
832 |
+
$found_char_mapping_p = 1;
|
833 |
+
}
|
834 |
+
}
|
835 |
+
|
836 |
+
unless ($found_char_mapping_p) {
|
837 |
+
foreach $string_length (reverse(1 .. 6)) {
|
838 |
+
next if ($i + $string_length-1) > $#chars;
|
839 |
+
my $start_of_word_p = $chart_ht{CHAR_START_OF_WORD}->{$i} || 0;
|
840 |
+
my $end_of_word_p = $chart_ht{CHAR_END_OF_WORD}->{($i+$string_length-1)} || 0;
|
841 |
+
my $multi_char_substring = join("", @chars[$i..($i+$string_length-1)]);
|
842 |
+
my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
|
843 |
+
@mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings;
|
844 |
+
my @mappings_whole = ();
|
845 |
+
my @mappings_start_or_end = ();
|
846 |
+
my @mappings_other = ();
|
847 |
+
foreach $mapping (@mappings) {
|
848 |
+
next if $mapping =~ /\(__.*__\)/;
|
849 |
+
if ($ht{USE_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
850 |
+
|| $ht{USE_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$mapping}) {
|
851 |
+
push(@mappings_whole, $mapping) if $start_of_word_p && $end_of_word_p;
|
852 |
+
} elsif ($ht{USE_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
853 |
+
|| $ht{USE_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
854 |
+
push(@mappings_start_or_end, $mapping) if $start_of_word_p;
|
855 |
+
} elsif ($ht{USE_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
856 |
+
|| $ht{USE_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
857 |
+
push(@mappings_start_or_end, $mapping) if $end_of_word_p;
|
858 |
+
} else {
|
859 |
+
push(@mappings_other, $mapping);
|
860 |
+
}
|
861 |
+
}
|
862 |
+
@mappings = @mappings_whole;
|
863 |
+
@mappings = @mappings_start_or_end unless @mappings;
|
864 |
+
@mappings = @mappings_other unless @mappings;
|
865 |
+
foreach $mapping (@mappings) {
|
866 |
+
next if $mapping =~ /\(__.*__\)/;
|
867 |
+
if ($ht{DONT_USE_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
868 |
+
|| $ht{DONT_USE_AT_START_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
869 |
+
next if $start_of_word_p;
|
870 |
+
}
|
871 |
+
if ($ht{DONT_USE_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$mapping}
|
872 |
+
|| $ht{DONT_USE_AT_END_OF_WORD}->{$multi_char_substring}->{$mapping}) {
|
873 |
+
next if $end_of_word_p;
|
874 |
+
}
|
875 |
+
my $mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $mapping) : $mapping;
|
876 |
+
$node_id = $this->add_node($mapping2, $i, $i+$string_length, *chart_ht, "", "multi-char-mapping");
|
877 |
+
$next_index = $i + $string_length;
|
878 |
+
$found_char_mapping_p = 1;
|
879 |
+
if ($annotation) {
|
880 |
+
@annotation_elems = split(/,\s*/, $annotation);
|
881 |
+
foreach $annotation_elem (@annotation_elems) {
|
882 |
+
if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
|
883 |
+
$this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
|
884 |
+
} else {
|
885 |
+
$this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
|
886 |
+
}
|
887 |
+
}
|
888 |
+
}
|
889 |
+
}
|
890 |
+
my @alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
|
891 |
+
@alt_mappings = keys %{$ht{UTF_CHAR_ALT_MAPPING}->{$multi_char_substring}} unless @alt_mappings;
|
892 |
+
@alt_mappings = () if ($#alt_mappings == 0) && ($alt_mappings[0] eq "_NONE_");
|
893 |
+
foreach $alt_mapping (@alt_mappings) {
|
894 |
+
if ($chart_ht{CHAR_PART_OF_POINTED_HEBREW_WORD}->{$i}) {
|
895 |
+
next unless
|
896 |
+
$ht{USE_ALT_IN_POINTED_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
897 |
+
|| $ht{USE_ALT_IN_POINTED}->{$multi_char_substring}->{$alt_mapping};
|
898 |
+
}
|
899 |
+
if ($ht{USE_ALT_ONLY_FOR_WHOLE_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
900 |
+
|| $ht{USE_ALT_ONLY_FOR_WHOLE_WORD}->{$multi_char_substring}->{$alt_mapping}) {
|
901 |
+
next unless $start_of_word_p && $end_of_word_p;
|
902 |
+
}
|
903 |
+
if ($ht{USE_ALT_ONLY_AT_START_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
904 |
+
|| $ht{USE_ALT_ONLY_AT_START_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
|
905 |
+
next unless $start_of_word_p;
|
906 |
+
}
|
907 |
+
if ($ht{USE_ALT_ONLY_AT_END_OF_WORD_LANG_SPEC}->{$lang_code}->{$multi_char_substring}->{$alt_mapping}
|
908 |
+
|| $ht{USE_ALT_ONLY_AT_END_OF_WORD}->{$multi_char_substring}->{$alt_mapping}) {
|
909 |
+
next unless $end_of_word_p;
|
910 |
+
}
|
911 |
+
my $alt_mapping2 = ($chart_ht{BRAILLE_ALL_CAPS_P}->{$i}) ? (uc $alt_mapping) : $alt_mapping;
|
912 |
+
$node_id = $this->add_node($alt_mapping2, $i, $i+$string_length, *chart_ht, "alt", "multi-char-mapping");
|
913 |
+
if ($annotation) {
|
914 |
+
@annotation_elems = split(/,\s*/, $annotation);
|
915 |
+
foreach $annotation_elem (@annotation_elems) {
|
916 |
+
if (($a_slot, $a_value) = ($annotation_elem =~ /^(\S+?):(\S+)\s*$/)) {
|
917 |
+
$this->set_node_id_slot_value($node_id, $a_slot, $a_value, *chart_ht);
|
918 |
+
} else {
|
919 |
+
$this->set_node_id_slot_value($node_id, $annotation_elem, 1, *chart_ht);
|
920 |
+
}
|
921 |
+
}
|
922 |
+
}
|
923 |
+
}
|
924 |
+
}
|
925 |
+
}
|
926 |
+
unless ($found_char_mapping_p) {
|
927 |
+
my $prev_node_id = $this->get_node_for_span($i-4, $i, *chart_ht)
|
928 |
+
|| $this->get_node_for_span($i-3, $i, *chart_ht)
|
929 |
+
|| $this->get_node_for_span($i-2, $i, *chart_ht)
|
930 |
+
|| $this->get_node_for_span($i-1, $i, *chart_ht);
|
931 |
+
my $prev_char_roman = ($prev_node_id) ? $this->get_node_roman($prev_node_id, *chart_id) : "";
|
932 |
+
my $prev_node_start = ($prev_node_id) ? $chart_ht{NODE_START}->{$prev_node_id} : "";
|
933 |
+
|
934 |
+
# Number
|
935 |
+
if (($numeric_value =~ /\d/)
|
936 |
+
&& (! ($char_name =~ /SUPERSCRIPT/))) {
|
937 |
+
my $prev_numeric_value = $this->get_node_for_span_with_slot_value($i-1, $i, "numeric-value", *chart_id);
|
938 |
+
my $sep = "";
|
939 |
+
$sep = " " if ($char_name =~ /^vulgar fraction /i) && ($prev_numeric_value =~ /\d/);
|
940 |
+
$node_id = $this->add_node("$sep$numeric_value", $i, $i+1, *chart_ht, "", "number");
|
941 |
+
$this->set_node_id_slot_value($node_id, "numeric-value", $numeric_value, *chart_ht);
|
942 |
+
if ((($prev_numeric_value =~ /\d/) && ($numeric_value =~ /\d\d/))
|
943 |
+
|| (($prev_numeric_value =~ /\d\d/) && ($numeric_value =~ /\d/))) {
|
944 |
+
# pull in any other parts of single digits
|
945 |
+
my $j = 1;
|
946 |
+
# pull in any single digits adjoining on left
|
947 |
+
if ($prev_numeric_value =~ /^\d$/) {
|
948 |
+
while (1) {
|
949 |
+
if (($i-$j-1 >= 0)
|
950 |
+
&& defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-1, $i-$j, "numeric-value", *chart_id))
|
951 |
+
&& ($digit_value =~ /^\d$/)) {
|
952 |
+
$j++;
|
953 |
+
} elsif (($i-$j-2 >= 0)
|
954 |
+
&& ($chart_ht{ORIG_CHAR}->{($i-$j-1)} =~ /^[.,]$/)
|
955 |
+
&& defined($digit_value = $this->get_node_for_span_with_slot_value($i-$j-2, $i-$j-1, "numeric-value", *chart_id))
|
956 |
+
&& ($digit_value =~ /^\d$/)) {
|
957 |
+
$j += 2;
|
958 |
+
} else {
|
959 |
+
last;
|
960 |
+
}
|
961 |
+
}
|
962 |
+
}
|
963 |
+
# pull in any single digits adjoining on right
|
964 |
+
my $k = 0;
|
965 |
+
if ($numeric_value =~ /^\d$/) {
|
966 |
+
while (1) {
|
967 |
+
if (defined($next_numeric_value = $chart_ht{CHAR_NUMERIC_VALUE}->{($i+$k+1)})
|
968 |
+
&& ($next_numeric_value =~ /^\d$/)) {
|
969 |
+
$k++;
|
970 |
+
} else {
|
971 |
+
last;
|
972 |
+
}
|
973 |
+
}
|
974 |
+
}
|
975 |
+
$this->register_new_complex_number_span_segment($i-$j, $i, $i+$k+1, *chart_ht, $line_number);
|
976 |
+
}
|
977 |
+
if ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
|
978 |
+
&& ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
|
979 |
+
$de_accented_translit = $util->de_accent_string($tonal_translit);
|
980 |
+
if ($numeric_value =~ /^(10000|1000000000000|10000000000000000)$/) {
|
981 |
+
$chart_ht{NODE_TYPE}->{$node_id} = "alt"; # keep, but demote
|
982 |
+
$alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
|
983 |
+
} else {
|
984 |
+
$alt_node_id = $this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "alt", "CJK");
|
985 |
+
}
|
986 |
+
}
|
987 |
+
|
988 |
+
# ASCII
|
989 |
+
} elsif ($char =~ /^[\x00-\x7F]$/) {
|
990 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "ASCII"); # ASCII character, incl. control characters
|
991 |
+
|
992 |
+
# Emoji, dingbats, pictographs
|
993 |
+
} elsif ($char =~ /^(\xE2[\x98-\x9E]|\xF0\x9F[\x8C-\xA7])/) {
|
994 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "pictograph");
|
995 |
+
|
996 |
+
# Hangul (Korean)
|
997 |
+
} elsif (($char =~ /^[\xEA-\xED]/)
|
998 |
+
&& ($romanized_char = $this->unicode_hangul_romanization($char))) {
|
999 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "Hangul");
|
1000 |
+
|
1001 |
+
# CJK (Chinese, Japanese, Korean)
|
1002 |
+
} elsif ($chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)
|
1003 |
+
&& ($tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, ""))) {
|
1004 |
+
$de_accented_translit = $util->de_accent_string($tonal_translit);
|
1005 |
+
$this->add_node($de_accented_translit, $i, $i+1, *chart_ht, "", "CJK");
|
1006 |
+
|
1007 |
+
# Virama (cancel preceding vowel in Abudiga scripts)
|
1008 |
+
} elsif ($char_name =~ /\bSIGN (?:VIRAMA|AL-LAKUNA|ASAT|COENG|PAMAAEH)\b/) {
|
1009 |
+
# VIRAMA: cancel preceding default vowel (in Abudiga scripts)
|
1010 |
+
if (($prev_script eq $current_script)
|
1011 |
+
&& (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
|
1012 |
+
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
|
1013 |
+
$this->add_node($prev_char_roman_consonant, $prev_node_start, $i+1, *chart_ht, "", "virama");
|
1014 |
+
} else {
|
1015 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-virama");
|
1016 |
+
}
|
1017 |
+
|
1018 |
+
# Nukta (special (typically foreign) variant)
|
1019 |
+
} elsif ($char_name =~ /\bSIGN (?:NUKTA)\b/) {
|
1020 |
+
# NUKTA (dot): indicates special (typically foreign) variant; normally covered by multi-mappings
|
1021 |
+
if ($prev_script eq $current_script) {
|
1022 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "nukta");
|
1023 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1024 |
+
$this->set_node_id_slot_value($node_id, "nukta", 1, *chart_ht);
|
1025 |
+
} else {
|
1026 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-nukta");
|
1027 |
+
}
|
1028 |
+
|
1029 |
+
# Zero-width character, incl. zero width space/non-joiner/joiner, left-to-right/right-to-left mark
|
1030 |
+
} elsif ($char =~ /^\xE2\x80[\x8B-\x8F\xAA-\xAE]$/) {
|
1031 |
+
if ($prev_node_id) {
|
1032 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
|
1033 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1034 |
+
} else {
|
1035 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "zero-width-char");
|
1036 |
+
}
|
1037 |
+
} elsif (($char =~ /^\xEF\xBB\xBF$/) && $prev_node_id) { # OK to leave byte-order-mark at beginning of line
|
1038 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "zero-width-char");
|
1039 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1040 |
+
|
1041 |
+
# Tone mark
|
1042 |
+
} elsif ($tone_mark) {
|
1043 |
+
if ($prev_script eq $current_script) {
|
1044 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "tone-mark");
|
1045 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1046 |
+
$this->set_node_id_slot_value($node_id, "tone-mark", $tone_mark, *chart_ht);
|
1047 |
+
} else {
|
1048 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-tone-mark");
|
1049 |
+
}
|
1050 |
+
|
1051 |
+
# Diacritic
|
1052 |
+
} elsif (($char_name =~ /\b(ACCENT|TONE|COMBINING DIAERESIS|COMBINING DIAERESIS BELOW|COMBINING MACRON|COMBINING VERTICAL LINE ABOVE|COMBINING DOT ABOVE RIGHT|COMBINING TILDE|COMBINING CYRILLIC|MUUSIKATOAN|TRIISAP)\b/) && ($ht{UTF_TO_CAT}->{$char} =~ /^Mn/)) {
|
1053 |
+
if ($prev_script eq $current_script) {
|
1054 |
+
my $node_id = $this->add_node($prev_char_roman, $prev_node_start, $i+1, *chart_ht, "", "diacritic");
|
1055 |
+
$this->copy_slot_values($prev_node_id, $node_id, *chart_id, "all");
|
1056 |
+
$diacritic = lc $char_name;
|
1057 |
+
$diacritic =~ s/^.*(?:COMBINING CYRILLIC|COMBINING|SIGN)\s+//i;
|
1058 |
+
$diacritic =~ s/^.*(ACCENT|TONE)/$1/i;
|
1059 |
+
$diacritic =~ s/^\s*//;
|
1060 |
+
$this->set_node_id_slot_value($node_id, "diacritic", $diacritic, *chart_ht);
|
1061 |
+
# print STDERR "diacritic: $diacritic\n";
|
1062 |
+
} else {
|
1063 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "unexpected-diacritic");
|
1064 |
+
}
|
1065 |
+
|
1066 |
+
# Romanize to find out more
|
1067 |
+
} elsif ($char_name) {
|
1068 |
+
if (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))) {
|
1069 |
+
# print STDERR "ROM l.$line_number/$i: $romanized_char\n" if $line_number =~ /^[12]$/;
|
1070 |
+
print STDOUT "ROM l.$line_number/$i: $romanized_char\n" if $verbosePM;
|
1071 |
+
|
1072 |
+
# Empty string mapping
|
1073 |
+
if ($romanized_char eq "\"\"") {
|
1074 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "empty-string-mapping");
|
1075 |
+
# consider adding something for implausible romanizations of length 6+
|
1076 |
+
|
1077 |
+
# keep original character (instead of romanized_char lengthener, character-18b00 etc.)
|
1078 |
+
} elsif (($romanized_char =~ /^(character|lengthener|modifier)/)) {
|
1079 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "nevermind-keep-original");
|
1080 |
+
|
1081 |
+
# Syllabic suffix in Abudiga languages, e.g. -m, -ng
|
1082 |
+
} elsif (($romanized_char =~ /^\+(H|M|N|NG)$/i)
|
1083 |
+
&& ($prev_script eq $current_script)
|
1084 |
+
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{"a"})) {
|
1085 |
+
my $core_suffix = $romanized_char;
|
1086 |
+
$core_suffix =~ s/^\+//;
|
1087 |
+
if ($prev_char_roman =~ /[aeiou]$/i) {
|
1088 |
+
$this->add_node($core_suffix, $i, $i+1, *chart_ht, "", "syllable-end-consonant");
|
1089 |
+
} else {
|
1090 |
+
$this->add_node(join("", $prev_char_roman, "a", $core_suffix), $prev_node_start, $i+1, *chart_ht, "", "syllable-end-consonant-with-added-a");
|
1091 |
+
$this->add_node(join("", "a", $core_suffix), $i, $i+1, *chart_ht, "backup", "syllable-end-consonant");
|
1092 |
+
}
|
1093 |
+
|
1094 |
+
# Japanese special cases
|
1095 |
+
} elsif ($char_name =~ /(?:HIRAGANA|KATAKANA) LETTER SMALL Y/) {
|
1096 |
+
if (($prev_script eq $current_script)
|
1097 |
+
&& (($prev_char_roman_consonant) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])i$/i))) {
|
1098 |
+
unless ($this->get_node_for_span_and_type($prev_node_start, $i+1, *chart_ht, "")) {
|
1099 |
+
$this->add_node("$prev_char_roman_consonant$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "japanese-contraction");
|
1100 |
+
}
|
1101 |
+
} else {
|
1102 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "unexpected-japanese-contraction-character");
|
1103 |
+
}
|
1104 |
+
} elsif (($prev_script =~ /^(HIRAGANA|KATAKANA)$/i)
|
1105 |
+
&& ($char_name eq "KATAKANA-HIRAGANA PROLONGED SOUND MARK") # Choonpu
|
1106 |
+
&& (($prev_char_roman_vowel) = ($prev_char_roman =~ /([aeiou])$/i))) {
|
1107 |
+
$this->add_node("$prev_char_roman$prev_char_roman_vowel", $prev_node_start, $i+1, *chart_ht, "", "japanese-vowel-lengthening");
|
1108 |
+
} elsif (($current_script =~ /^(Hiragana|Katakana)$/i)
|
1109 |
+
&& ($char_name =~ /^(HIRAGANA|KATAKANA) LETTER SMALL TU$/i) # Sokuon/Sukun
|
1110 |
+
&& ($next_script eq $current_script)
|
1111 |
+
&& ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
|
1112 |
+
&& (($doubled_consonant) = ($romanized_next_char =~ /^(ch|[bcdfghjklmnpqrstwz])/i))) {
|
1113 |
+
# Note: $romanized_next_char could be part of a multi-character mapping
|
1114 |
+
# print STDERR "current_script: $current_script char_name: $char_name next_script: $next_script romanized_next_char: $romanized_next_char doubled_consonant: $doubled_consonant\n";
|
1115 |
+
$doubled_consonant = "t" if $doubled_consonant eq "ch";
|
1116 |
+
$this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "japanese-consonant-doubling");
|
1117 |
+
|
1118 |
+
# Greek small letter mu to micro-sign (instead of to "m") as used in abbreviations for microgram/micrometer/microliter/microsecond/micromolar/microfarad etc.
|
1119 |
+
} elsif (($char_name eq "GREEK SMALL LETTER MU")
|
1120 |
+
&& (! ($prev_script =~ /^GREEK$/))
|
1121 |
+
&& ($i < $#chars)
|
1122 |
+
&& ($chart_ht{ORIG_CHAR}->{($i+1)} =~ /^[cfgjlmstv]$/i)) {
|
1123 |
+
$this->add_node("\xC2\xB5", $i, $i+1, *chart_ht, "", "greek-mu-to-micro-sign");
|
1124 |
+
|
1125 |
+
# Gurmukhi addak (doubles following consonant)
|
1126 |
+
} elsif (($current_script eq "Gurmukhi")
|
1127 |
+
&& ($char_name eq "GURMUKHI ADDAK")) {
|
1128 |
+
if (($next_script eq $current_script)
|
1129 |
+
&& ($romanized_next_char = $this->romanize_char_at_position_incl_multi($i+1, $lang_code, $output_style, *ht, *chart_ht))
|
1130 |
+
&& (($doubled_consonant) = ($romanized_next_char =~ /^([bcdfghjklmnpqrstvwxz])/i))) {
|
1131 |
+
$this->add_node($doubled_consonant, $i, $i+1, *chart_ht, "", "gurmukhi-consonant-doubling");
|
1132 |
+
} else {
|
1133 |
+
$this->add_node("'", $i, $i+1, *chart_ht, "", "gurmukhi-unexpected-addak");
|
1134 |
+
}
|
1135 |
+
|
1136 |
+
# Subjoined character
|
1137 |
+
} elsif ($subjoined_char_p
|
1138 |
+
&& ($prev_script eq $current_script)
|
1139 |
+
&& (($prev_char_roman_consonant, $prev_char_roman_vowel) = ($prev_char_roman =~ /^(.*[bcdfghjklmnpqrstvwxyz])([aeiou]+)$/i))
|
1140 |
+
&& ($ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}->{(lc $prev_char_roman_vowel)})) {
|
1141 |
+
my $new_roman = "$prev_char_roman_consonant$romanized_char";
|
1142 |
+
$this->add_node($new_roman, $prev_node_start, $i+1, *chart_ht, "", "subjoined-character");
|
1143 |
+
# print STDERR " Subjoin l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
|
1144 |
+
|
1145 |
+
# Thai special case: written-pre-consonant-spoken-post-consonant
|
1146 |
+
} elsif (($char_name =~ /THAI CHARACTER/)
|
1147 |
+
&& ($prev_script eq $current_script)
|
1148 |
+
&& ($chart_ht{CHAR_SYLLABLE_INFO}->{($i-1)} =~ /written-pre-consonant-spoken-post-consonant/i)
|
1149 |
+
&& ($prev_char_roman =~ /^[aeiou]+$/i)
|
1150 |
+
&& ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]/)) {
|
1151 |
+
$this->add_node("$romanized_char$prev_char_roman", $prev_node_start, $i+1, *chart_ht, "", "thai-vowel-consonant-swap");
|
1152 |
+
|
1153 |
+
# Thai special case: THAI CHARACTER O ANG (U+0E2D "\xE0\xB8\xAD")
|
1154 |
+
} elsif ($char_name eq "THAI CHARACTER O ANG") {
|
1155 |
+
if ($prev_script ne $current_script) {
|
1156 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-initial-o-ang-drop");
|
1157 |
+
} elsif ($next_script ne $current_script) {
|
1158 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-final-o-ang-drop");
|
1159 |
+
} else {
|
1160 |
+
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
|
1161 |
+
my $romanized_prev2_char = $this->romanize_char_at_position($i-2, $lang_code, $output_style, *ht, *chart_ht);
|
1162 |
+
if (($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
|
1163 |
+
&& ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
|
1164 |
+
$this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonants
|
1165 |
+
} elsif (($prev2_script eq $current_script)
|
1166 |
+
&& 0
|
1167 |
+
&& ($prev_char_name =~ /^THAI CHARACTER MAI [A-Z]+$/) # Thai tone
|
1168 |
+
&& ($romanized_prev2_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)
|
1169 |
+
&& ($romanized_next_char =~ /^[bcdfghjklmnpqrstvwxz]+$/i)) {
|
1170 |
+
$this->add_node("o", $i, $i+1, *chart_ht, "", "thai-middle-o-ang"); # keep between consonant+tone-mark and consonant
|
1171 |
+
} else {
|
1172 |
+
$this->add_node("", $i, $i+1, *chart_ht, "", "thai-middle-o-ang-drop"); # drop next to vowel
|
1173 |
+
}
|
1174 |
+
}
|
1175 |
+
|
1176 |
+
# Romanization with space
|
1177 |
+
} elsif ($romanized_char =~ /\s/) {
|
1178 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "space");
|
1179 |
+
|
1180 |
+
# Tibetan special cases
|
1181 |
+
} elsif ($current_script eq "Tibetan") {
|
1182 |
+
|
1183 |
+
if ($subjoined_char_p
|
1184 |
+
&& ($prev_script eq $current_script)
|
1185 |
+
&& $prev_letter_plus_char_p
|
1186 |
+
&& ($prev_char_roman =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
|
1187 |
+
$this->add_node("$prev_char_roman$romanized_char", $prev_node_start, $i+1, *chart_ht, "", "subjoined-tibetan-character");
|
1188 |
+
} elsif ($romanized_char =~ /^-A$/i) {
|
1189 |
+
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
|
1190 |
+
if (! $prev_letter_plus_char_p) {
|
1191 |
+
$this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-frontal-dash-a");
|
1192 |
+
} elsif (($prev_script eq $current_script)
|
1193 |
+
&& ($next_script eq $current_script)
|
1194 |
+
&& ($prev_char_roman =~ /[bcdfghjklmnpqrstvwxyz]$/)
|
1195 |
+
&& ($romanized_next_char =~ /^[aeiou]/)) {
|
1196 |
+
$this->add_node("a'", $i, $i+1, *chart_ht, "", "tibetan-medial-dash-a");
|
1197 |
+
} elsif (($prev_script eq $current_script)
|
1198 |
+
&& ($next_script eq $current_script)
|
1199 |
+
&& ($prev_char_roman =~ /[aeiou]$/)
|
1200 |
+
&& ($romanized_next_char =~ /[aeiou]/)) {
|
1201 |
+
$this->add_node("'", $i, $i+1, *chart_ht, "", "tibetan-reduced-medial-dash-a");
|
1202 |
+
} elsif (($prev_script eq $current_script)
|
1203 |
+
&& (! ($prev_char_roman =~ /[aeiou]/))
|
1204 |
+
&& (! $next_letter_plus_char_p)) {
|
1205 |
+
$this->add_node("a", $i, $i+1, *chart_ht, "", "tibetan-final-dash-a");
|
1206 |
+
} else {
|
1207 |
+
$this->add_node("a", $i, $i+1, *chart_ht, "", "unexpected-tibetan-dash-a");
|
1208 |
+
}
|
1209 |
+
} elsif (($romanized_char =~ /^[AEIOU]/i)
|
1210 |
+
&& ($prev_script eq $current_script)
|
1211 |
+
&& ($prev_char_roman =~ /^A$/i)
|
1212 |
+
&& (! $prev2_letter_plus_char_p)) {
|
1213 |
+
$this->add_node($romanized_char, $prev_node_start, $i+1, *chart_ht, "", "tibetan-dropped-word-initial-a");
|
1214 |
+
} else {
|
1215 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
|
1216 |
+
}
|
1217 |
+
|
1218 |
+
# Khmer (for MUUSIKATOAN etc. see under "Diacritic" above)
|
1219 |
+
} elsif (($current_script eq "Khmer")
|
1220 |
+
&& (($char_roman_consonant, $char_roman_vowel) = ($romanized_char =~ /^(.*[bcdfghjklmnpqrstvwxyz])([ao]+)-$/i))) {
|
1221 |
+
my $romanized_next_char = $this->romanize_char_at_position($i+1, $lang_code, $output_style, *ht, *chart_ht);
|
1222 |
+
if (($next_script eq $current_script)
|
1223 |
+
&& ($romanized_next_char =~ /^[aeiouy]/i)) {
|
1224 |
+
$this->add_node($char_roman_consonant, $i, $i+1, *chart_ht, "", "khmer-vowel-drop");
|
1225 |
+
} else {
|
1226 |
+
$this->add_node("$char_roman_consonant$char_roman_vowel", $i, $i+1, *chart_ht, "", "khmer-standard-unicode-based-romanization");
|
1227 |
+
}
|
1228 |
+
|
1229 |
+
# Abudiga add default vowel
|
1230 |
+
} elsif ((@abudiga_default_vowels = sort keys %{$ht{SCRIPT_ABUDIGA_DEFAULT_VOWEL}->{$current_script}})
|
1231 |
+
&& ($abudiga_default_vowel = $abudiga_default_vowels[0])
|
1232 |
+
&& ($romanized_char =~ /^[bcdfghjklmnpqrstvwxyz]+$/i)) {
|
1233 |
+
my $new_roman = join("", $romanized_char, $abudiga_default_vowel);
|
1234 |
+
$this->add_node($new_roman, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization-plus-abudiga-default-vowel");
|
1235 |
+
# print STDERR " Abudiga add default vowel l.$line_number/$i: $new_roman\n" if $line_number =~ /^[12]$/;
|
1236 |
+
|
1237 |
+
# Standard romanization
|
1238 |
+
} else {
|
1239 |
+
$node_id = $this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "standard-unicode-based-romanization");
|
1240 |
+
}
|
1241 |
+
} else {
|
1242 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original");
|
1243 |
+
}
|
1244 |
+
} elsif (defined($romanized_char = $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht))
|
1245 |
+
&& ((length($romanized_char) <= 2)
|
1246 |
+
|| ($ht{UTF_TO_CHAR_ROMANIZATION}->{$char}))) { # or from unicode_overwrite_romanization table
|
1247 |
+
$romanized_char =~ s/^""$//;
|
1248 |
+
$this->add_node($romanized_char, $i, $i+1, *chart_ht, "", "romanized-without-character-name");
|
1249 |
+
} else {
|
1250 |
+
$this->add_node($char, $i, $i+1, *chart_ht, "", "unexpected-original-without-character-name");
|
1251 |
+
}
|
1252 |
+
}
|
1253 |
+
$i = $next_index;
|
1254 |
+
}
|
1255 |
+
|
1256 |
+
$this->schwa_deletion(0, $n_characters, *chart_ht, $lang_code);
|
1257 |
+
$this->default_vowelize_tibetan(0, $n_characters, *chart_ht, $lang_code, $line_number) if $chart_ht{CHART_CONTAINS_SCRIPT}->{"Tibetan"};
|
1258 |
+
$this->assemble_numbers_in_chart(*chart_ht, $line_number);
|
1259 |
+
|
1260 |
+
if ($return_chart_p) {
|
1261 |
+
} elsif ($return_offset_mappings_p) {
|
1262 |
+
($result, $offset_mappings, $new_char_offset, $new_rom_char_offset) = $this->best_romanized_string(0, $n_characters, *chart_ht, $control, $initial_char_offset, $initial_rom_char_offset);
|
1263 |
+
} else {
|
1264 |
+
$result = $this->best_romanized_string(0, $n_characters, *chart_ht) unless $return_chart_p;
|
1265 |
+
}
|
1266 |
+
|
1267 |
+
if ($verbosePM) {
|
1268 |
+
my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-log.txt";
|
1269 |
+
$util->append_to_file($logfile, $log) if $log && (-r $logfile);
|
1270 |
+
}
|
1271 |
+
|
1272 |
+
return ($result, $offset_mappings) if $return_offset_mappings_p;
|
1273 |
+
return *chart_ht if $return_chart_p;
|
1274 |
+
return $result;
|
1275 |
+
}
|
1276 |
+
|
1277 |
+
# Return $s rendered as a single JSON string literal, including the
# surrounding double quotes and all escaping done by the JSON encoder.
#   $s - string to encode; it is first decoded in place from UTF-8 octets
#        by utf8::decode.
# The JSON encoder is asked to encode a one-element array, and the enclosing
# square brackets are then stripped to leave just the string literal.
sub string_to_json_string {
   local($this, $s) = @_;

   utf8::decode($s);
   my $encoder = JSON->new->utf8;
   my $json_array = $encoder->encode([$s]);
   # Drop the "[" ... "]" that wrap the single array element.
   (my $json_string = $json_array) =~ s/^\[(.*)\]$/$1/;
   return $json_string;
}
|
1285 |
+
|
1286 |
+
# Render the chart span [$chart_start, $chart_end) as a sequence of JSON
# objects, one per romanization segment, and return them as one string.
#   $chart_start, $chart_end - chart positions delimiting the span
#   *chart_ht                - chart hash (passed as a typeglob)
#   $line_number             - emitted verbatim as the "line" field
# Each element has the form
#    { "line": N, "start": S, "end": E, "orig": "...", "roms": [ { "rom": "..." }, ... ] },
# and is terminated by ",\n" (also after the last element; the caller is
# presumably responsible for the trailing comma - TODO confirm).
# "end" is the index of the last chart position covered by the element.
#
# Where no romanization segment (or no romanization) is found at a position,
# the original character at that position is emitted as its own romanization
# and the walk advances by one position.
sub chart_to_json_romanization_elements {
   local($this, $chart_start, $chart_end, *chart_ht, $line_number) = @_;

   my $result = "";
   my $start = $chart_start;
   while ($start < $chart_end) {
      my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
      my @best_romanizations = ();
      my ($orig_segment, $next_start);
      if (($end && ($start < $end))
       && (@best_romanizations = $this->best_romanizations($start, $end, *chart_ht))) {
         $orig_segment = $this->orig_string_at_span($start, $end, *chart_ht);
         $next_start = $end;
      } else {
         # Fallback: pass the original character through unchanged.
         # (Previously this stored the never-assigned $orig.)
         $orig_segment = $chart_ht{ORIG_CHAR}->{$start};
         @best_romanizations = ($orig_segment);
         $next_start = $start + 1;
      }
      # Derive the last covered position from where the walk continues, not
      # from $end, which is "" (or an unused segment end) in the fallback case.
      my $last_covered = $next_start - 1;
      my $guarded_orig = $this->string_to_json_string($orig_segment);
      $result .= "  { \"line\": $line_number, \"start\": $start, \"end\": $last_covered, \"orig\": $guarded_orig, \"roms\": [";
      foreach my $i ((0 .. $#best_romanizations)) {
         my $guarded_rom = $this->string_to_json_string($best_romanizations[$i]);
         $result .= " { \"rom\": $guarded_rom";
         $result .= " }";
         $result .= "," if $i < $#best_romanizations;
      }
      $result .= " ] },\n";
      $start = $next_start;
   }
   return $result;
}
|
1322 |
+
|
1323 |
+
# Post-process Tibetan letter tokens in the chart span
# [$chart_start, $chart_end): add a default vowel "a" to tokens whose
# romanization has no vowel, and apply a few token-level clean-ups.
#   *chart_ht    - chart hash (passed as a typeglob); new nodes are added to it
#   $lang_code, $line_number - accepted but not used in this sub
#
# For each position that is flagged CHAR_LETTER_PLUS, has CHAR_SCRIPT
# "Tibetan" and starts a LETTER_TOKEN_SEGMENT_START_TO_END token, the
# lowest-numbered node of each successive romanization segment is collected.
# Then, per token:
#   1. If no collected romanization contains [aeiou], "a" is appended to one
#      node (new node "tibetan-default-vowel"; old node demoted to "backup").
#   2. A leading "'" followed by "o" gets an empty-string replacement node
#      ("tibetan-delete-apostrophe").
#   3. With two or more nodes, a last romanization of consonant(s)+"y" gets
#      an "a" appended ("tibetan-syllable-final-vowel").
#   4. In every node, the first "-a" is rewritten to "a"
#      ("tibetan-syllable-delete-dash").
# In steps 2-4 the replaced node is kept but demoted to type "alt".
#
# NOTE(review): copy_slot_values is passed *chart_id (not *chart_ht), as at
# the other copy_slot_values call sites in this file - confirm against that
# sub's parameter list.
sub default_vowelize_tibetan {
   local($this, $chart_start, $chart_end, *chart_ht, $lang_code, $line_number) = @_;

   my $token_start = $chart_start;
   my $next_token_start = $chart_start;
   while (($token_start = $next_token_start) < $chart_end) {
      $next_token_start = $token_start + 1;

      next unless $chart_ht{CHAR_LETTER_PLUS}->{$token_start};
      my $current_script = $chart_ht{CHAR_SCRIPT}->{$token_start};
      next unless ($current_script eq "Tibetan");
      my $token_end = $chart_ht{LETTER_TOKEN_SEGMENT_START_TO_END}->{$token_start};
      next unless $token_end;
      next unless $token_end > $token_start;
      $next_token_start = $token_end;

      # Collect the first (lowest id) node of each romanization segment in the token.
      my $start = $token_start;
      my $end;
      my @node_ids = ();
      while ($start < $token_end) {
         $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
         last unless $end && ($end > $start);
         my @alt_node_ids = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}};
         last unless @alt_node_ids;
         push(@node_ids, $alt_node_ids[0]);
         $start = $end;
      }
      # Nothing to vowelize; without this guard the code below would pick
      # index 0 of an empty list and add a node with undefined start/end.
      next unless @node_ids;

      my $contains_vowel_p = 0;
      my @romanizations = ();
      foreach my $node_id (@node_ids) {
         my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
         $roman = "" unless defined($roman);
         push(@romanizations, $roman);
         $contains_vowel_p = 1 if $roman =~ /[aeiou]/i;
      }

      # Step 1: no vowel anywhere in the token -> add default vowel "a".
      unless ($contains_vowel_p) {
         my $default_vowel_target_index;
         if ($#node_ids <= 1) {
            # One or two nodes: vowel goes on the first.
            $default_vowel_target_index = 0;
         } elsif ($romanizations[$#romanizations] eq "s") {
            # Final "s": vowel goes before it, skipping one more node unless
            # the node before the "s" is "y".
            if ($romanizations[($#romanizations-1)] eq "y") {
               $default_vowel_target_index = $#romanizations-1;
            } else {
               $default_vowel_target_index = $#romanizations-2;
            }
         } else {
            $default_vowel_target_index = $#romanizations-1;
         }
         $romanizations[$default_vowel_target_index] .= "a";
         my $old_node_id = $node_ids[$default_vowel_target_index];
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
         my $new_roman = $old_roman . "a";
         my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-default-vowel");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "backup"; # keep, but demote
      }

      # Step 2: drop a token-initial apostrophe before "o".
      if (($romanizations[0] eq "'")
       && ($#romanizations >= 1)
       && ($romanizations[1] =~ /^[o]$/)) {
         my $old_node_id = $node_ids[0];
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $new_node_id = $this->add_node("", $old_start, $old_end, *chart_ht, "", "tibetan-delete-apostrophe");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
      }

      # Step 3: token-final consonant(s)+"y" gets a trailing "a".
      if (($#node_ids >= 1)
       && ($romanizations[$#romanizations] =~ /^[bcdfghjklmnpqrstvwxz]+y$/)) {
         my $old_node_id = $node_ids[$#romanizations];
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
         my $new_roman = $old_roman . "a";
         my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-final-vowel");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
      }

      # Step 4: rewrite the first "-a" to "a" in every collected node.
      foreach my $old_node_id (@node_ids) {
         my $old_roman = $chart_ht{NODE_ROMAN}->{$old_node_id};
         next unless $old_roman =~ /-a/;
         my $old_start = $chart_ht{NODE_START}->{$old_node_id};
         my $old_end = $chart_ht{NODE_END}->{$old_node_id};
         my $new_roman = $old_roman;
         $new_roman =~ s/-a/a/;
         my $new_node_id = $this->add_node($new_roman, $old_start, $old_end, *chart_ht, "", "tibetan-syllable-delete-dash");
         $this->copy_slot_values($old_node_id, $new_node_id, *chart_id, "all");
         $chart_ht{NODE_TYPE}->{$old_node_id} = "alt"; # keep, but demote
      }
   }
}
|
1418 |
+
|
1419 |
+
# Schwa deletion for Devanagari: drop a word-final simple "a"
# (e.g. nepaala -> nepaal); see the Wikipedia article
# "Schwa deletion in Indo-Aryan languages".
#   $chart_start, $chart_end - chart span to scan
#   *chart_ht                - chart hash (passed as a typeglob); modified in place
#   $lang_code               - accepted but not used in this sub
# For each Devanagari script segment of at least two positions whose last
# single-position node romanizes as consonant(s)+"a", and whose preceding node
# contains a vowel, the existing final node is demoted to type "alt" and a new
# node holding only the consonant(s) is added.
sub schwa_deletion {
   local($this, $chart_start, $chart_end, *chart_ht, $lang_code) = @_;

   # Nothing to do unless the chart holds Devanagari text at all.
   return unless $chart_ht{CHART_CONTAINS_SCRIPT}->{"Devanagari"};

   my $next_pos = $chart_start;
   while ($next_pos < $chart_end) {
      my $segment_start = $next_pos;
      $next_pos = $segment_start + 1;

      my $script_here = $chart_ht{CHAR_SCRIPT}->{$segment_start};
      next unless ($script_here eq "Devanagari");
      my $segment_end = $chart_ht{SCRIPT_SEGMENT_START_TO_END}->{$segment_start};
      next unless $segment_end && ($segment_end - $segment_start >= 2);
      # From here on the scan resumes after this script segment.
      $next_pos = $segment_end;

      my $final_node_id = $this->get_node_for_span($segment_end-1, $segment_end, *chart_ht);
      next unless $final_node_id;
      my $final_roman = $chart_ht{NODE_ROMAN}->{$final_node_id};
      # $end_consonant is assigned as a package variable, as in the original.
      ($end_consonant) = ($final_roman =~ /^([bcdfghjklmnpqrstvwxz]+)a$/i);
      next unless defined($end_consonant);

      # Node directly before the final position; widest span is tried first.
      my $before_final_node_id;
      foreach my $lookback (4, 3, 2) {
         $before_final_node_id = $this->get_node_for_span($segment_end-$lookback, $segment_end-1, *chart_ht);
         last if $before_final_node_id;
      }
      next unless $before_final_node_id;
      my $before_final_roman = $chart_ht{NODE_ROMAN}->{$before_final_node_id};
      next unless $before_final_roman =~ /[aeiou]/i;
      # TO DO: check further back for vowel (e.g. if the preceding romanization is "r" due to vowel cancelation)

      $chart_ht{NODE_TYPE}->{$final_node_id} = "alt"; # keep, but demote
      $this->add_node($end_consonant, $segment_end-1, $segment_end, *chart_ht, "", "devanagari-with-deleted-final-schwa");
   }
}
|
1454 |
+
|
1455 |
+
# Assemble the romanization of the chart span [$chart_start, $chart_end) by
# concatenating, segment by segment, the first entry returned by
# best_romanizations; positions without a usable romanization contribute
# their original character and advance the walk by one position.
#   *chart_ht         - chart hash (passed as a typeglob)
#   $control          - optional; if it contains the phrase
#                       "return offset mappings", offsets are tracked
#   $orig_char_offset - optional starting offset on the original side (default 0)
#   $rom_char_offset  - optional starting offset on the romanized side (default 0)
# Returns the romanized string; with offset tracking enabled returns the list
#   (string, "orig:rom,orig:rom,...", final orig offset, final rom offset)
# where the mapping string starts with the initial offset pair and gains one
# pair per emitted segment.
sub best_romanized_string {
   local($this, $chart_start, $chart_end, *chart_ht, $control, $orig_char_offset, $rom_char_offset) = @_;

   $control = "" unless defined($control);
   my $want_offsets_p = ($control =~ /\breturn offset mappings\b/);
   my $orig_offset = $orig_char_offset || 0;
   my $rom_offset = $rom_char_offset || 0;
   my @offset_pairs = ("$orig_offset:$rom_offset");
   my $romanized = "";
   my $pos = $chart_start;
   while ($pos < $chart_end) {
      my $segment_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
      my ($segment_rom, $n_orig_chars, $next_pos);
      my $found_p = 0;
      if ($segment_end && ($pos < $segment_end)) {
         my @candidates = $this->best_romanizations($pos, $segment_end, *chart_ht);
         if (@candidates && defined($candidates[0])) {
            $segment_rom = $candidates[0];
            $n_orig_chars = $segment_end - $pos;
            $next_pos = $segment_end;
            $found_p = 1;
         }
      }
      unless ($found_p) {
         # No segment, or no romanization for it: keep the original character.
         $segment_rom = $chart_ht{ORIG_CHAR}->{$pos};
         $n_orig_chars = 1;
         $next_pos = $pos + 1;
      }
      $romanized .= $segment_rom;
      $pos = $next_pos;
      if ($want_offsets_p) {
         $orig_offset += $n_orig_chars;
         $rom_offset += $utf8->length_in_utf8_chars($segment_rom);
         push(@offset_pairs, "$orig_offset:$rom_offset");
      }
   }
   return ($romanized, join(",", @offset_pairs), $orig_offset, $rom_offset) if $want_offsets_p;
   return $romanized;
}
|
1510 |
+
|
1511 |
+
# Return the original text covered by chart positions $start .. $end-1,
# i.e. the concatenation of the ORIG_CHAR entries of *chart_ht for that span.
# An empty span ($end <= $start) yields the empty string.
sub orig_string_at_span {
   local($this, $start, $end, *chart_ht) = @_;

   return join("", map { $chart_ht{ORIG_CHAR}->{$_} } ($start .. ($end-1)));
}
|
1520 |
+
|
1521 |
+
# Find where the romanization segment starting at chart position $start ends.
#   $start     - chart position at which the segment must start
#   $chart_end - upper bound; node ends beyond it are ignored
#   *chart_ht  - chart hash (passed as a typeglob)
# Looks at the end positions recorded under
# $chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start} and returns the largest
# one that is <= $chart_end and > $start, or "" if there is none.
sub find_end_of_rom_segment {
   local($this, $start, $chart_end, *chart_ht) = @_;

   my @ends = sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}};
   # Walk down from the largest end until one fits inside the chart span.
   my $end_index = $#ends;
   while (($end_index >= 0) && ($ends[$end_index] > $chart_end)) {
      $end_index--;
   }
   return "" unless $end_index >= 0;
   # Lexical on purpose: assigning to the package variable $end here would
   # overwrite the $end of any caller that holds it via local().
   my $end = $ends[$end_index];
   return (defined($end) && ($start < $end)) ? $end : "";
}
|
1537 |
+
|
1538 |
+
# Return the candidate romanizations for the chart span $start .. $end,
# best first, as a list of distinct strings.
#   *chart_ht - chart hash (passed as a typeglob)
# Nodes spanning exactly $start..$end are grouped by their NODE_TYPE:
#   "backup" - used only if there is nothing else
#   "alt"    - listed after the regular romanizations
#   other    - regular romanizations
# Nodes without a NODE_ROMAN value are ignored. The result is the sorted
# regular romanizations followed by the sorted alt romanizations not already
# listed; if that is empty, the sorted backup romanizations.
#
# The classification depends only on the node's own type. It used to be
# additionally gated on the package variables $backup_romanization and
# $alt_romanization being undefined, so a value left in either of them by
# unrelated code would have promoted backup/alt nodes to regular ones.
sub best_romanizations {
   local($this, $start, $end, *chart_ht) = @_;

   my @regular_romanizations = ();
   my @alt_romanizations = ();
   my @backup_romanizations = ();

   foreach my $node_id (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) {
      my $roman = $chart_ht{NODE_ROMAN}->{$node_id};
      next unless defined($roman);
      my $type = $chart_ht{NODE_TYPE}->{$node_id};
      $type = "" unless defined($type);
      if ($type eq "backup") {
         push(@backup_romanizations, $roman) unless $util->member($roman, @backup_romanizations);
      } elsif ($type eq "alt") {
         push(@alt_romanizations, $roman) unless $util->member($roman, @alt_romanizations);
      } else {
         push(@regular_romanizations, $roman) unless $util->member($roman, @regular_romanizations);
      }
   }

   my @regular_alt_romanizations = sort @regular_romanizations;
   foreach my $alt_romanization (sort @alt_romanizations) {
      push(@regular_alt_romanizations, $alt_romanization) unless $util->member($alt_romanization, @regular_alt_romanizations);
   }
   return @regular_alt_romanizations if @regular_alt_romanizations;
   # Sort into an array first: "return sort ..." is undefined in scalar context.
   my @sorted_backup_romanizations = sort @backup_romanizations;
   return @sorted_backup_romanizations;
}
|
1565 |
+
|
1566 |
+
sub join_alt_romanizations_for_viz {
   # Joins the given romanizations with ", " for display; an empty romanization
   # is shown as "-" so that it remains visible.
   local($this, @list) = @_;

   return join(", ", map { ($_ eq "") ? "-" : $_ } @list);
}
|
1580 |
+
|
1581 |
+
sub markup_orig_rom_strings {
   # Renders chart positions $chart_start .. $chart_end-1 as two parallel HTML
   # strings: the romanization and the original text. The chart is cut into
   # segments (a romanized span, extended by any following spans that start with a
   # combining character). Each segment becomes one <span> in each string, with ids
   # "span-N-1" (romanization) and "span-N-2" (original), mouse-over handlers calling
   # highlight_elems, and a title attribute describing the characters involved.
   # N is $last_group_id_index, incremented once per segment.
   # Returns ($marked_up_rom, $marked_up_orig, $last_group_id_index).
   local($this, $chart_start, $chart_end, *ht, *chart_ht, *pinyin_ht, $last_group_id_index) = @_;

   my $marked_up_rom = "";
   my $marked_up_orig = "";
   my $start = $chart_start;
   while ($start < $chart_end) {
      my $segment_start = $start;
      my $segment_end = $start+1;
      my $end = $this->find_end_of_rom_segment($start, $chart_end, *chart_ht);
      my $rom_segment = "";
      my $orig_segment = "";
      my $rom_title = "";
      my $orig_title = "";
      my $contains_alt_romanizations = 0;
      if ($end) {
         $segment_end = $end;
         my @best_romanizations = $this->best_romanizations($start, $end, *chart_ht);
         my $best_romanization = (@best_romanizations) ? $best_romanizations[0] : undef;
         if (defined($best_romanization)) {
            $rom_segment .= $best_romanization;
            $orig_segment .= $this->orig_string_at_span($start, $end, *chart_ht);
            $segment_end = $end;
            if ($#best_romanizations >= 1) {
               $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@best_romanizations) . "\n");
               $contains_alt_romanizations = 1;
            }
         } else {
            # A span exists but carries no romanization: copy a single character.
            my $segment = $this->orig_string_at_span($start, $start+1, *chart_ht);
            $rom_segment .= $segment;
            $orig_segment .= $segment;
            $segment_end = $start+1;
         }
         $start = $segment_end;
      } else {
         # No span starts here: copy a single character.
         $rom_segment .= $chart_ht{ORIG_CHAR}->{$start};
         $orig_segment .= $this->orig_string_at_span($start, $start+1, *chart_ht);
         $segment_end = $start+1;
         $start = $segment_end;
      }

      # Keep following combining characters in the same segment as their base.
      my $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
      my $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
      my @more_romanizations;
      my $more_romanization;
      while ($next_char_is_combining_p
             && ($segment_end < $chart_end)
             && ($end = $this->find_end_of_rom_segment($segment_end, $chart_end, *chart_ht))
             && ($end > $segment_end)
             && (@more_romanizations = $this->best_romanizations($segment_end, $end, *chart_ht))
             && defined($more_romanization = $more_romanizations[0])) {
         $orig_segment .= $this->orig_string_at_span($segment_end, $end, *chart_ht);
         $rom_segment .= $more_romanization;
         if ($#more_romanizations >= 1) {
            $rom_title .= $util->guard_html("Alternative romanizations: " . $this->join_alt_romanizations_for_viz(@more_romanizations) . "\n");
            $contains_alt_romanizations = 1;
         }
         $segment_end = $end;
         $start = $segment_end;
         $next_char = $chart_ht{ORIG_CHAR}->{$segment_end};
         $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
      }

      # Title for the original text: one description line per original character.
      foreach my $i (($segment_start .. ($segment_end-1))) {
         # Separator between characters: "+", U+200E, space, U+200E (as UTF-8 bytes).
         # NOTE(review): these may have been HTML entities before this text was extracted - confirm.
         $orig_title .= "+\xE2\x80\x8E \xE2\x80\x8E" unless $orig_title eq "";
         my $char = $chart_ht{ORIG_CHAR}->{$i};
         my $numeric = $ht{UTF_TO_NUMERIC}->{$char};
         $numeric = "" unless defined($numeric);
         my $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
         $pic_descr = "" unless defined($pic_descr);
         my $char_name = $ht{UTF_TO_CHAR_NAME}->{$char};
         if ($char =~ /^\xE4\xB7[\x80-\xBF]$/) {
            # U+4DC0-U+4DFF (Yijing hexagram symbols). The name must be looked up for
            # this very character; previously a name left over from an earlier
            # character (or nothing) was shown here.
            if ($char_name) {
               $orig_title .= "$char_name\n";
            } else {
               my $unicode = $utf8->utf8_to_unicode($char);
               $orig_title .= "Unicode character U+" . (uc sprintf("%04x", $unicode)) . "\n";
            }
         } elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
            my $unicode = $utf8->utf8_to_unicode($char);
            $orig_title .= "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode)) . "\n";
            my $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "");
            $orig_title .= "Chinese: $tonal_translit\n" if $tonal_translit;
            $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
         } elsif ($char_name) {
            $orig_title .= "$char_name\n";
            $orig_title .= "Number: $numeric\n" if $numeric =~ /\d/;
            $orig_title .= "Picture: $pic_descr\n" if $pic_descr =~ /\S/;
         } else {
            my $unicode = $utf8->utf8_to_unicode($char);
            if (($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
               $orig_title .= "Hangul syllable U+" . (uc sprintf("%04x", $unicode)) . "\n";
            } else {
               $orig_title .= "Unicode character U+" . (uc sprintf("%04x", $unicode)) . "\n";
            }
         }
      }

      # Title for the romanization: name (or code point) of each non-ASCII character in it.
      my @non_ascii_roms = ($rom_segment =~ /([\xC0-\xFF][\x80-\xBF]*)/g);
      foreach my $rom_char (@non_ascii_roms) {
         my $rom_char_name = $ht{UTF_TO_CHAR_NAME}->{$rom_char};
         my $unicode = $utf8->utf8_to_unicode($rom_char);
         my $unicode_s = "U+" . (uc sprintf("%04x", $unicode));
         if ($rom_char_name) {
            $rom_title .= "$rom_char_name\n";
         } else {
            $rom_title .= "$unicode_s\n";
         }
      }

      $last_group_id_index++;
      $rom_title =~ s/\s*$//;
      # NOTE(review): as it stands this replaces a newline by a newline; the replacement
      # was possibly an HTML entity (such as &#xA;) before extraction - confirm.
      $rom_title =~ s/\n/\n/g;
      $orig_title =~ s/\s*$//;
      # Each newline is followed by U+200E (as UTF-8 bytes).
      $orig_title =~ s/\n/\n\xE2\x80\x8E/g;
      # Wrap the title in U+202D ... U+202C (as UTF-8 bytes).
      $orig_title = "\xE2\x80\xAD" . $orig_title . "\xE2\x80\xAC";
      my $rom_title_clause = ($rom_title eq "") ? "" : " title=\"$rom_title\"";
      my $orig_title_clause = ($orig_title eq "") ? "" : " title=\"$orig_title\"";
      my $alt_rom_clause = ($contains_alt_romanizations) ? "border-bottom:1px dotted;" : "";
      $marked_up_rom .= "<span id=\"span-$last_group_id_index-1\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\" style=\"color:#00BB00;$alt_rom_clause\"$rom_title_clause>" . $util->guard_html($rom_segment) . "<\/span>";
      $marked_up_orig .= "<span id=\"span-$last_group_id_index-2\" onmouseover=\"highlight_elems('span-$last_group_id_index','1');\" onmouseout=\"highlight_elems('span-$last_group_id_index','0');\"$orig_title_clause>" . $util->guard_html($orig_segment) . "<\/span>";

      # Allow a line break after certain punctuation that ends the segment.
      my $last_char = $chart_ht{ORIG_CHAR}->{($segment_end-1)};
      my $last_char_name = ($last_char) ? $ht{UTF_TO_CHAR_NAME}->{$last_char} : undef;
      if ($last_char_name
          && ($last_char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET|BRAILLE PATTERN BLANK|TIBETAN MARK .*)$/)) {
         $marked_up_orig .= "<wbr>";
         $marked_up_rom .= "<wbr>";
      }
   }
   return ($marked_up_rom, $marked_up_orig, $last_group_id_index);
}
|
1699 |
+
|
1700 |
+
sub romanizations_with_alternatives {
   # Returns the romanization of chart positions $chart_start .. $chart_end-1
   # (defaults: 0 and $chart_ht{N_CHARS}). A span with exactly one best
   # romanization contributes that romanization; a span with several contributes
   # "{rom1|rom2|...}"; positions without a romanization contribute the original
   # character unchanged.
   local($this, *ht, *chart_ht, *pinyin_ht, $chart_start, $chart_end) = @_;

   $chart_start = 0 unless defined($chart_start);
   $chart_end = $chart_ht{N_CHARS} unless defined($chart_end);

   my @pieces = ();
   my $pos = $chart_start;
   while ($pos < $chart_end) {
      my $span_end = $this->find_end_of_rom_segment($pos, $chart_end, *chart_ht);
      unless ($span_end) {
         # no span starts here: pass the original character through
         push(@pieces, $chart_ht{ORIG_CHAR}->{$pos});
         $pos = $pos + 1;
         next;
      }
      my @options = $this->best_romanizations($pos, $span_end, *chart_ht);
      if (! @options) {
         # span without any romanization: pass one original character through
         push(@pieces, $this->orig_string_at_span($pos, $pos+1, *chart_ht));
         $pos = $pos + 1;
      } elsif (@options == 1) {
         push(@pieces, $options[0]);
         $pos = $span_end;
      } else {
         push(@pieces, "{" . join("|", @options) . "}");
         $pos = $span_end;
      }
   }
   return join("", @pieces);
}
|
1742 |
+
|
1743 |
+
sub quick_romanize {
   # Table-driven romanization of string $s without building a chart.
   # At each position, tries the longest substring first (4 characters down to 1)
   # and looks it up in the language-specific mapping table for $lang_code, then in
   # the general mapping table. The first key of the matching table entry is
   # appended to the result. Characters without any mapping are copied unchanged.
   local($this, $s, $lang_code, *ht) = @_;

   my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
   my $romanized = "";
   CHAR: while (@chars) {
      for (my $len = 4; $len >= 1; $len--) {
         next if $len > scalar(@chars);   # not enough characters left
         # $multi_char_substring is a package variable (not "my").
         $multi_char_substring = join("", @chars[0..($len-1)]);
         my @mappings = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$multi_char_substring}};
         @mappings = keys %{$ht{UTF_CHAR_MAPPING}->{$multi_char_substring}} unless @mappings;
         next unless @mappings;
         $romanized .= $mappings[0];
         splice(@chars, 0, $len);          # consume the matched characters
         next CHAR;
      }
      # nothing matched at this position
      $romanized .= shift(@chars);
   }
   return $romanized;
}
|
1772 |
+
|
1773 |
+
sub char_is_combining_char {
   # True if the category recorded for character $c in $ht{UTF_TO_CAT} starts
   # with "M". Returns 0 for an empty/false $c or a character without a category.
   local($this, $c, *ht) = @_;

   my $category = $c && $ht{UTF_TO_CAT}->{$c};
   return 0 unless $category;
   return $category =~ /^M/;
}
|
1781 |
+
|
1782 |
+
sub mark_up_string_for_mouse_over {
   # Returns $s as HTML in which characters are wrapped in <span title="...">
   # elements so that a description appears on mouse-over:
   #  - CJK unified ideographs: code point, tonal pinyin (if any), numeric value
   #  - characters with an entry in $ht{UTF_TO_CHAR_NAME}: name, numeric value,
   #    picture description, plus the names of directly following combining
   #    characters, which are included in the same span
   #  - Hangul syllables (U+AC00-U+D7A3): code point
   # Anything else is only HTML-guarded. If $control contains "NO-ASCII", ASCII
   # characters not followed by a combining character are left without a span.
   # Most working variables in this sub are package variables (not "my").
   local($this, $s, *ht, $control, *pinyin_ht) = @_;

   $control = "" unless defined($control);
   $no_ascii_p = ($control =~ /NO-ASCII/);
   my $result = "";
   @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
   while (@chars) {
      $char = shift @chars;
      $numeric = $ht{UTF_TO_NUMERIC}->{$char};
      $numeric = "" unless defined($numeric);
      $pic_descr = $ht{UTF_TO_PICTURE_DESCR}->{$char};
      $pic_descr = "" unless defined($pic_descr);
      $next_char = @chars ? $chars[0] : "";
      $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);

      if ($no_ascii_p
          && ! $next_char_is_combining_p
          && ($char =~ /^[\x00-\x7F]*$/)) {
         # plain ASCII, no mark-up requested
         $result .= $util->guard_html($char);
      } elsif (($char =~ /^[\xE3-\xE9][\x80-\xBF]{2,2}$/) && $chinesePM->string_contains_utf8_cjk_unified_ideograph_p($char)) {
         $unicode = $utf8->utf8_to_unicode($char);
         $title = "CJK Unified Ideograph U+" . (uc sprintf("%04x", $unicode));
         $tonal_translit = $chinesePM->tonal_pinyin($char, *pinyin_ht, "");
         $title .= "\nChinese: $tonal_translit" if $tonal_translit;
         $title .= "\nNumber: $numeric" if $numeric =~ /\d/;
         $result .= "<span title=\"$title\">" . $util->guard_html($char) . "</span>";
      } elsif ($char_name = $ht{UTF_TO_CHAR_NAME}->{$char}) {
         $title = $char_name;
         $title .= "\nNumber: $numeric" if $numeric =~ /\d/;
         $title .= "\nPicture: $pic_descr" if $pic_descr =~ /\S/;
         $char_plus = $char;
         # Absorb following combining marks (category M: Mn non-spacing,
         # Mc spacing combining, Me enclosing) into the same span.
         while ($next_char_is_combining_p) {
            $next_char_name = $ht{UTF_TO_CHAR_NAME}->{$next_char};
            $title .= "\n + $next_char_name";
            $char = shift @chars;
            $char_plus .= $char;
            $next_char = @chars ? $chars[0] : "";
            $next_char_is_combining_p = $this->char_is_combining_char($next_char, *ht);
         }
         $result .= "<span title=\"$title\">" . $util->guard_html($char_plus) . "</span>";
         # allow a line break after certain East Asian punctuation
         if ($char_name =~ /^(FULLWIDTH COLON|FULLWIDTH COMMA|FULLWIDTH RIGHT PARENTHESIS|IDEOGRAPHIC COMMA|IDEOGRAPHIC FULL STOP|RIGHT CORNER BRACKET)$/) {
            $result .= "<wbr>";
         }
      } elsif (($unicode = $utf8->utf8_to_unicode($char))
               && ($unicode >= 0xAC00) && ($unicode <= 0xD7A3)) {
         $title = "Hangul syllable U+" . (uc sprintf("%04x", $unicode));
         $result .= "<span title=\"$title\">" . $util->guard_html($char) . "</span>";
      } else {
         $result .= $util->guard_html($char);
      }
   }
   return $result;
}
|
1833 |
+
|
1834 |
+
sub romanize_char_at_position_incl_multi {
   # Romanizes the character at chart position $i, consulting the mapping tables
   # first: the language-specific table for $lang_code, then the general table.
   # The first key of a matching table entry is returned. Without a table entry,
   # falls back to romanize_char_at_position. Returns "" if there is no character
   # at position $i.
   local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

   my $char = $chart_ht{ORIG_CHAR}->{$i};
   return "" unless defined($char);

   my @lang_specific = keys %{$ht{UTF_CHAR_MAPPING_LANG_SPEC}->{$lang_code}->{$char}};
   my @general = @lang_specific ? () : keys %{$ht{UTF_CHAR_MAPPING}->{$char}};
   if (@lang_specific || @general) {
      my ($first_mapping) = (@lang_specific, @general);
      return $first_mapping;
   }
   return $this->romanize_char_at_position($i, $lang_code, $output_style, *ht, *chart_ht);
}
|
1845 |
+
|
1846 |
+
sub romanize_char_at_position {
   # Romanizes the single character at chart position $i:
   #  - "" if there is no character at that position
   #  - an ASCII character is returned unchanged
   #  - otherwise the entry in $ht{UTF_TO_CHAR_ROMANIZATION}, if any
   #  - otherwise a romanization derived from the character's name
   #    ($chart_ht{CHAR_NAME}) by romanize_charname
   # Name-derived romanizations that match none of the known-good patterns below
   # are counted in $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization}.
   local($this, $i, $lang_code, $output_style, *ht, *chart_ht) = @_;

   my $char = $chart_ht{ORIG_CHAR}->{$i};
   return "" unless defined($char);
   return $char if $char =~ /^[\x00-\x7F]$/; # ASCII

   my $romanization = $ht{UTF_TO_CHAR_ROMANIZATION}->{$char};
   return $romanization if $romanization;

   my $char_name = $chart_ht{CHAR_NAME}->{$i};
   $romanization = $this->romanize_charname($char_name, $lang_code, $output_style, *ht, $char);

   my $plausible_p = (length($romanization) < 4)
                  || ($romanization =~ /\s/)
                  || ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,3}[aeiou]-$/) # Khmer ngo-/nyo-/pho- OK
                  || ($romanization =~ /^[bcdfghjklmnpqrstvwxyz]{2,2}[aeiougw][aeiou]{1,2}$/) # Canadian, Ethiopic syllable OK
                  || ($romanization =~ /^(allah|bbux|nyaa|nnya|quuv|rrep|shch|shur|syrx)$/i) # Arabic; Yi; Ethiopic syllable nyaa; Cyrillic letter shcha
                  || (($char_name =~ /^(YI SYLLABLE|VAI SYLLABLE|ETHIOPIC SYLLABLE|CANADIAN SYLLABICS|CANADIAN SYLLABICS CARRIER)\s+(\S+)$/) && (length($romanization) <= 5));
   unless ($plausible_p) {
      $ht{SUSPICIOUS_ROMANIZATION}->{$char_name}->{$romanization} += 1;
   }
   return $romanization;
}
|
1867 |
+
|
1868 |
+
sub romanize_charname {
   # Derives a romanization from a Unicode character name, e.g.
   # "CYRILLIC SMALL LETTER SHCHA" -> "shch". The name is reduced step by step:
   # script/class prefixes (... LETTER, ... SYLLABLE, ... VOWEL SIGN etc.) and
   # qualifying words are stripped, then script-specific rules pick the part of the
   # remaining name that serves as romanization. The result is upper-cased if the
   # original name contains the word CAPITAL, lower-cased otherwise.
   # $char (the character itself, in UTF-8) is only consulted for Thai characters.
   # Results are cached in $ht{ROMANIZE_CHARNAME} under the original character
   # name, $lang_code and $output_style.
   local($this, $char_name, $lang_code, $output_style, *ht, $char) = @_;

   my $cached_result = $ht{ROMANIZE_CHARNAME}->{$char_name}->{$lang_code}->{$output_style};
   # print STDERR "(C) romanize_charname($char_name): $cached_result\n" if $cached_result && ($char_name =~ /middle/i);
   # Fixed: this used to test a misspelled variable, so the cache was never used.
   return $cached_result if defined($cached_result);
   # $orig_char_name is a package variable (not "my").
   $orig_char_name = $char_name;
   $char_name =~ s/^.* LETTER\s+([A-Z]+)-\d+$/$1/; # HENTAIGANA LETTER A-3
   $char_name =~ s/^.* LETTER\s+//;
   $char_name =~ s/^.* SYLLABLE\s+B\d\d\d\s+//; # Linear B syllables
   $char_name =~ s/^.* SYLLABLE\s+//;
   $char_name =~ s/^.* SYLLABICS\s+//;
   $char_name =~ s/^.* LIGATURE\s+//;
   $char_name =~ s/^.* VOWEL SIGN\s+//;
   $char_name =~ s/^.* CONSONANT SIGN\s+//;
   $char_name =~ s/^.* CONSONANT\s+//;
   $char_name =~ s/^.* VOWEL\s+//;
   $char_name =~ s/ WITH .*$//;
   $char_name =~ s/ WITHOUT .*$//;
   $char_name =~ s/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$//;
   $char_name =~ s/^([A-Z]+)\d+$/$1/; # Linear B syllables etc.
   # Strip leading qualifiers; repeated since a name can carry several of them.
   foreach $_ ((1 .. 3)) {
      $char_name =~ s/^.*\b(?:ABKHASIAN|ACADEMY|AFRICAN|AIVILIK|AITON|AKHMIMIC|ALEUT|ALI GALI|ALPAPRAANA|ALTERNATE|ALTERNATIVE|AMBA|ARABIC|ARCHAIC|ASPIRATED|ATHAPASCAN|BASELINE|BLACKLETTER|BARRED|BASHKIR|BERBER|BHATTIPROLU|BIBLE-CREE|BIG|BINOCULAR|BLACKFOOT|BLENDED|BOTTOM|BROAD|BROKEN|CANDRA|CAPITAL|CARRIER|CHILLU|CLOSE|CLOSED|COPTIC|CROSSED|CRYPTOGRAMMIC|CURLED|CURLY|CYRILLIC|DANTAJA|DENTAL|DIALECT-P|DIAERESIZED|DOTLESS|DOUBLE|DOUBLE-STRUCK|EASTERN PWO KAREN|EGYPTOLOGICAL|FARSI|FINAL|FLATTENED|GLOTTAL|GREAT|GREEK|HALF|HIGH|INITIAL|INSULAR|INVERTED|IOTIFIED|JONA|KANTAJA|KASHMIRI|KHAKASSIAN|KHAMTI|KHANDA|KINNA|KIRGHIZ|KOMI|L-SHAPED|LATINATE|LITTLE|LONG|LONG-LEGGED|LOOPED|LOW|MAHAAPRAANA|MALAYALAM|MANCHU|MANDAILING|MATHEMATICAL|MEDIAL|MIDDLE-WELSH|MON|MONOCULAR|MOOSE-CREE|MULTIOCULAR|MUURDHAJA|N-CREE|NARROW|NASKAPI|NDOLE|NEUTRAL|NIKOLSBURG|NORTHERN|NUBIAN|NUNAVIK|NUNAVUT|OJIBWAY|OLD|OPEN|ORKHON|OVERLONG|PALI|PERSIAN|PHARYNGEAL|PRISHTHAMATRA|R-CREE|REDUPLICATION|REVERSED|ROMANIAN|ROUND|ROUNDED|RUDIMENTA|RUMAI PALAUNG|SANSKRIT|SANYAKA|SARA|SAYISI|SCRIPT|SEBATBEIT|SEMISOFT|SGAW KAREN|SHAN|SHARP|SHWE PALAUNG|SHORT|SIBE|SIDEWAYS|SIMALUNGUN|SMALL|SOGDIAN|SOFT|SOUTH-SLAVEY|SOUTHERN|SPIDERY|STIRRUP|STRAIGHT|STRETCHED|SUBSCRIPT|SWASH|TAI LAING|TAILED|TAILLESS|TAALUJA|TH-CREE|TALL|THREE-LEGGED|TURNED|TODO|TOP|TROKUTASTI|TUAREG|UKRAINIAN|UNBLENDED|VISIGOTHIC|VOCALIC|VOICED|VOICELESS|VOLAPUK|WAVY|WESTERN PWO KAREN|WEST-CREE|WESTERN|WIDE|WOODS-CREE|Y-CREE|YENISEI|YIDDISH)\s+//;
   }
   $char_name =~ s/\s+(ABOVE|AGUNG|BAR|BARREE|BELOW|CEDILLA|CEREK|DIGRAPH|DOACHASHMEE|FINAL FORM|GHUNNA|GOAL|INITIAL FORM|ISOLATED FORM|KAWI|LELET|LELET RASWADI|LONSUM|MAHAPRANA|MEDIAL FORM|MURDA|MURDA MAHAPRANA|REVERSED|ROTUNDA|SASAK|SUNG|TAM|TEDUNG|TYPE ONE|TYPE TWO|WOLOSO)\s*$//;
   if ($char_name =~ /THAI CHARACTER/) {
      $char_name =~ s/^THAI CHARACTER\s+//;
      if ($char =~ /^\xE0\xB8[\x81-\xAE]/) {
         # Thai consonants
         $char_name =~ s/^([^AEIOU]*).*/$1/i;
      } elsif ($char_name =~ /^SARA [AEIOU]/) {
         # Thai vowels
         $char_name =~ s/^SARA\s+//;
      } else {
         # other Thai characters: keep the character itself
         $char_name = $char;
      }
   }
   if ($orig_char_name =~ /(HIRAGANA LETTER|KATAKANA LETTER|SYLLABLE|LIGATURE)/) {
      $char_name = lc $char_name;
   } elsif ($char_name =~ /\b(ANUSVARA|ANUSVARAYA|NIKAHIT|SIGN BINDI|TIPPI)\b/) {
      $char_name = "+m";
   } elsif ($char_name =~ /\bSCHWA\b/) {
      $char_name = "e";
   } elsif ($char_name =~ /\bIOTA\b/) {
      $char_name = "i";
   } elsif ($char_name =~ /\s/) {
      # remaining multi-word names are used as they are
   } elsif ($orig_char_name =~ /KHMER LETTER/) {
      $char_name .= "-";
   } elsif ($orig_char_name =~ /CHEROKEE LETTER/) {
      # use whole letter as is
   } elsif ($orig_char_name =~ /KHMER INDEPENDENT VOWEL/) {
      $char_name =~ s/q//;
   } elsif ($orig_char_name =~ /LETTER/) {
      $char_name =~ s/^[AEIOU]+([^AEIOU]+)$/$1/i;
      $char_name =~ s/^([^-AEIOUY]+)[AEIOU].*/$1/i;
      $char_name =~ s/^(Y)[AEIOU].*/$1/i if $orig_char_name =~ /\b(?:BENGALI|DEVANAGARI|GURMUKHI|GUJARATI|KANNADA|MALAYALAM|MODI|MYANMAR|ORIYA|TAMIL|TELUGU|TIBETAN)\b.*\bLETTER YA\b/;
      $char_name =~ s/^(Y[AEIOU]+)[^AEIOU].*$/$1/i;
      $char_name =~ s/^([AEIOU]+)[^AEIOU]+[AEIOU].*/$1/i;
   }

   my $result = ($orig_char_name =~ /\bCAPITAL\b/) ? (uc $char_name) : (lc $char_name);
   # print STDERR "(R) romanize_charname($orig_char_name): $result\n" if $orig_char_name =~ /middle/i;
   # Fixed: store under the original name (the lookup key above), not under the
   # stripped-down $char_name, which by now usually equals the result itself.
   $ht{ROMANIZE_CHARNAME}->{$orig_char_name}->{$lang_code}->{$output_style} = $result;
   return $result;
}
|
1933 |
+
|
1934 |
+
sub assemble_numbers_in_chart {
   # For every span registered in $chart_ht{COMPLEX_NUMERIC_START_END}
   # (start position => end position), collects the numeric values of the
   # single-character "numeric-value" nodes inside the span, combines them with
   # assemble_number and adds the result to the chart as a "complex-number" node
   # covering the whole span. A "." or "," without a node romanization is passed
   # through as decimal point or separator.
   local($this, *chart_ht, $line_number) = @_;

   foreach $start (sort { $a <=> $b } keys %{$chart_ht{COMPLEX_NUMERIC_START_END}}) {
      my $end = $chart_ht{COMPLEX_NUMERIC_START_END}->{$start};
      my @numbers = ();
      foreach $i (($start .. ($end-1))) {
         my $orig_char = $chart_ht{ORIG_CHAR}->{$i};
         # Fixed: pass the chart itself; this used to pass *chart_id, a glob that
         # is not bound to anything in this sub.
         my $node_id = $this->get_node_for_span_with_slot($i, $i+1, "numeric-value", *chart_ht);
         if (defined($node_id)) {
            my $number = $chart_ht{NODE_ROMAN}->{$node_id};
            if (defined($number)) {
               push(@numbers, $number);
            } elsif ($orig_char =~ /^[.,]$/) { # decimal point, comma separator
               push(@numbers, $orig_char);
            } else {
               print STDERR "Found no romanization for node_id $node_id ($i-" . ($i+1) . ") in assemble_numbers_in_chart\n" if $verbosePM;
            }
         } else {
            print STDERR "Found no node_id for span $i-" . ($i+1) . " in assemble_numbers_in_chart\n" if $verbosePM;
         }
      }
      # assemble_number expects its tokens separated by middle dots (U+00B7)
      my $complex_number = $this->assemble_number(join("\xC2\xB7", @numbers), $line_number);
      # print STDERR "assemble_numbers_in_chart l.$line_number $start-$end $complex_number (@numbers)\n";
      $this->add_node($complex_number, $start, $end, *chart_ht, "", "complex-number");
   }
}
|
1961 |
+
|
1962 |
+
sub assemble_number {
   # Combines a sequence of number tokens, separated by middle dots (U+00B7),
   # into fewer numbers, e.g. 10 9 100 7 10 8 = 1978.
   # Returns the remaining tokens joined by middle dots (a single number if
   # everything could be combined).
   local($this, $s, $line_number) = @_;

   my $middot = "\xC2\xB7";   # middle dot U+00B7
   my @parts = split(/$middot/, $s);

   # Pass 1: glue runs of single digits (with embedded "." and ","), e.g. 1 7 5 -> 175;
   # commas are dropped from the glued token.
   my $pos = 0;
   while ($pos < $#parts) {
      if ($parts[$pos] =~ /^\d$/) {
         my $last = $pos;
         $last++ while (($last < $#parts) && ($parts[($last+1)] =~ /^[0-9.,]$/));
         if ($last > $pos) {
            my $glued = join("", @parts[$pos .. $last]);
            $glued =~ s/,//g;
            splice(@parts, $pos, $last-$pos+1, $glued);
         }
      }
      $pos++;
   }

   # Pass 2: for each power of ten (in increasing order), multiply it with a smaller
   # number in front of it and add a smaller number behind it.
   foreach my $magnitude ((10, 100, 1000, 10000, 100000, 1000000, 100000000, 1000000000, 1000000000000)) {
      # pattern for a single-digit multiple of the power, e.g. [1-9]00
      my $multiple_pattern = $magnitude;
      $multiple_pattern =~ s/^1/\[1-9\]/;
      for (my $k=0; $k <= $#parts; $k++) {
         if (($parts[$k] == $magnitude)
             && ($k > 0)
             && ($parts[($k-1)] < $magnitude)) {
            splice(@parts, $k-1, 2, ($parts[($k-1)] * $parts[$k]));
            $k--;
            if (($k < $#parts) && ($parts[($k+1)] < $magnitude)) {
               splice(@parts, $k, 2, ($parts[$k] + $parts[($k+1)]));
               $k--;
            }
         }
         # 400 30 (e.g. Egyptian)
         if (($parts[$k] =~ /^$multiple_pattern$/)
             && ($k < $#parts)
             && ($parts[($k+1)] < $magnitude)) {
            splice(@parts, $k, 2, ($parts[$k] + $parts[($k+1)]));
            $k--;
         }
      }
      last if $#parts == 0;   # everything has been combined into one number
   }

   my $result = join($middot, @parts);
   if ($verbosePM) {
      my $logfile = "/nfs/isd/ulf/cgi-mt/amr-tmp/uroman-number-log.txt";
      $util->append_to_file($logfile, "$s -> $result\n") if -r $logfile;
   }
   return $result;
}
|
2018 |
+
|
2019 |
+
1;
|
2020 |
+
|
uroman/lib/NLP/UTF8.pm
ADDED
@@ -0,0 +1,1404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# UTF8 #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::UTF8;
|
8 |
+
|
9 |
+
use NLP::utilities;
|
10 |
+
$util = NLP::utilities;
|
11 |
+
|
12 |
+
%empty_ht = ();
|
13 |
+
|
14 |
+
sub new {
    # Constructor. Works both as a class method (NLP::UTF8->new) and as an
    # instance method ($obj->new); in the latter case the new object is
    # blessed into the class of the invoking object.
    # output: a fresh, empty, blessed hash reference
    local($caller) = @_;

    my $class = ref( $caller ) || $caller;
    return bless({}, $class);
}
|
22 |
+
|
23 |
+
sub unicode_string2string {
    # input: string that might contain unicode escapes such as "U+0627" or "\u0627"
    #        (marker followed by exactly four hex digits)
    # output: the same string with every such escape replaced by its utf-8 encoding
    #
    # The conversion is done in a single left-to-right pass. Escapes cannot
    # overlap (neither "+", "\" nor "u" is a hex digit), so this finds the same
    # escapes as a recursive split would, but it also works on strings that
    # contain newlines and never drops a trailing newline.
    my ($caller, $s) = @_;

    return $s unless defined($s);
    $s =~ s/(?:U\+|\\u)([0-9A-Fa-f]{4})/$caller->unicode_hex_string2string($1)/ge;
    return $s;
}
|
43 |
+
|
44 |
+
sub unicode_hex_string2string {
    # input: hex code point as a string, e.g. "0627"
    # output: utf-8 string for that code point (e.g. Arabic letter alef);
    #         empty string if no code point was given
    local($caller,$unicode) = @_;

    return defined($unicode) ? $caller->unicode2string(hex($unicode)) : "";
}
|
52 |
+
|
53 |
+
sub unicode2string {
    # input: non-negative integer code point, e.g. 0x627
    # output: utf-8 byte string for that code point (e.g. Arabic letter alef);
    #         empty string for undefined, negative or too large input
    local($caller,$d) = @_;

    return "" unless defined($d) && $d >= 0;
    return sprintf("%c",$d) if $d <= 0x7F;   # plain ASCII

    # One entry per sequence length (2 to 6 bytes):
    # [ marker bits of the lead byte, largest payload the lead byte can hold ]
    my @lead_forms = ([0xC0, 0x1F], [0xE0, 0xF], [0xF0, 0x7], [0xF8, 0x3], [0xFC, 0x1]);

    my $continuation_bytes = "";
    foreach my $form (@lead_forms) {
        my ($marker, $max_payload) = @{$form};
        # peel off the lowest six bits as one more continuation byte
        $continuation_bytes = sprintf("%c", ($d & 0x3F) | 0x80) . $continuation_bytes;
        $d >>= 6;
        return sprintf("%c", $d | $marker) . $continuation_bytes if $d <= $max_payload;
    }
    return ""; # bad input
}
|
81 |
+
|
82 |
+
sub html2utf8 {
    # input: string that might contain decimal HTML character references
    #        such as "&#233;"
    # output: string in which the references from the supported ranges
    #         (160-255, 1500-1699, 19968-40879) are replaced by their utf-8
    #         encoding; all other references are left untouched
    #
    # Every reference is examined independently in one pass, so an
    # unsupported reference (e.g. "&#38;") no longer prevents earlier
    # references in the same string from being converted, and strings that
    # span several lines are handled as well.
    my ($caller, $string) = @_;

    # cheap pre-check: all supported values have 3 to 5 digits
    return $string unless defined($string) && $string =~ /\&\#\d{3,5};/;

    my $decode = sub {
        my ($d) = @_;
        my $convertible = ((($d >= 160) && ($d <= 255))
                        || (($d >= 1500) && ($d <= 1699))
                        || (($d >= 19968) && ($d <= 40879)));
        return $convertible ? $caller->unicode2string($d) : '&#' . $d . ';';
    };
    $string =~ s/\&\#(\d+);/$decode->($1)/ge;
    return $string;
}
|
102 |
+
|
103 |
+
sub xhtml2utf8 {
    # input: string that might contain hexadecimal character references
    #        such as "&#xE9;" (2 to 5 hex digits)
    # output: string with those references replaced by their utf-8 encoding
    #
    # The substitution is repeated until the string no longer changes, so a
    # reference that only appears after a decoding step is decoded as well
    # (as before). Unlike a "^(.*)...(.*)$" match, the global substitution
    # also works on strings that contain newlines.
    my ($caller, $string) = @_;

    return $string unless defined($string) && $string =~ /\&\#x[0-9a-fA-F]{2,5};/;

    # terminates: every replacement is shorter than the reference it replaces
    1 while $string =~ s/\&\#x([0-9a-fA-F]{2,5});/$caller->unicode_hex_string2string($1)/ge;
    return $string;
}
|
119 |
+
|
120 |
+
sub utf8_marker {
    # output: the three-byte UTF-8 byte order mark (EF BB BF) followed by a newline
    return "\xEF\xBB\xBF\n";
}
|
123 |
+
|
124 |
+
sub enforcer {
    # input: string that might not conform to utf-8
    #        optional flag $no_repair (default 0)
    # output: string in pure utf-8, with a few "smart replacements" and possibly "?"
    #
    # Well-formed sequences (ASCII and 2- to 5-byte sequences) are copied
    # unchanged. Any other byte is
    #   - replaced by "?" if $no_repair is true, otherwise
    #   - mapped to "...", "'" or a curly double quote for the bytes
    #     0x85, 0x91, 0x92, 0x93, 0x94,
    #   - re-encoded as a Latin-1 letter (C3 xx) for the bytes 0xC0-0xFF,
    #   - replaced by "?" for everything else.
    #
    # The string is scanned in place with \G, so newlines inside the string
    # are ordinary characters; input spanning several lines is converted
    # completely instead of being cut off at the first line break.
    my ($caller, $s, $no_repair) = @_;

    return $s if !defined($s) || $s =~ /^[\x00-\x7F]*$/;

    $no_repair = 0 unless defined($no_repair);
    my $result = "";

    pos($s) = 0;
    while (pos($s) < length($s)) {
        # run of ASCII characters
        if ($s =~ /\G([\x00-\x7F]+)/gc) {
            $result .= $1;
            next;
        }
        # one well-formed multi-byte sequence
        if ($s =~ /\G([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3}|[\xF8-\xFB][\x80-\xBF]{4})/gc) {
            $result .= $1;
            next;
        }
        # a single byte that is not part of a well-formed sequence
        $s =~ /\G(.)/gcs;
        my $c = $1;
        if ($no_repair) { $result .= "?"; }
        elsif ($c =~ /\x85/) { $result .= "..."; }
        elsif ($c =~ /\x91/) { $result .= "'"; }
        elsif ($c =~ /\x92/) { $result .= "'"; }
        elsif ($c =~ /\x93/) { $result .= $caller->unicode2string(0x201C); }
        elsif ($c =~ /\x94/) { $result .= $caller->unicode2string(0x201D); }
        elsif ($c =~ /[\xC0-\xFF]/) {
            # Latin-1 letter: U+00C0..U+00FF is encoded as C3 80 .. C3 BF
            $result .= "\xC3" . chr(ord($c) - 0x40);
        } else {
            $result .= "?";
        }
    }
    return $result;
}
|
181 |
+
|
182 |
+
sub split_into_utf8_characters {
    # input: utf8 string
    #        $group_control: free-text list of options (matched by substring):
    #          "ASCII numbers", "ASCII spaces", "ASCII punct", "ASCII chars",
    #          "ASCII all" (= the four before), "XML chars", "XML tags",
    #          "XML chars and tags", "return only chars",
    #          "return trailing whitespaces"
    #        *ht: typeglob of a hash; $ht{HTML_ENTITY_NAME_TO_DECUNICODE} is
    #          consulted for named entities when "XML chars" is active
    # output: list of sub-strings, each representing a utf8 character (or a
    #         grouped token); unless "return only chars" is given, the list is
    #         preceded by (1) the bytes that were skipped because they are not
    #         valid utf8 and (2) a string of "0"/"1" flags, one per token,
    #         telling whether the token is followed by whitespace/end ("1")
    #
    # Fixes over the previous version:
    #  - the attribute group of the XML tag pattern is non-capturing, so the
    #    text after a tag is kept (it used to be replaced by the last attribute)
    #  - "a.m." / "p.m." are matched with literal dots
    #  - a character reference that decodes to "0" is accepted
    local($caller,$string,$group_control, *ht) = @_;

    my @characters = ();
    my $end_of_token_p_string = "";
    my $skipped_bytes = "";
    $group_control = "" unless defined($group_control);
    my $group_ascii_numbers = ($group_control =~ /ASCII numbers/);
    my $group_ascii_spaces = ($group_control =~ /ASCII spaces/);
    my $group_ascii_punct = ($group_control =~ /ASCII punct/);
    my $group_ascii_chars = ($group_control =~ /ASCII chars/);
    my $group_xml_chars = ($group_control =~ /XML chars/);
    my $group_xml_tags = ($group_control =~ /XML tags/);
    my $return_only_chars = ($group_control =~ /return only chars/);
    my $return_trailing_whitespaces = ($group_control =~ /return trailing whitespaces/);
    if ($group_control =~ /ASCII all/) {
        $group_ascii_numbers = 1;
        $group_ascii_spaces = 1;
        $group_ascii_chars = 1;
        $group_ascii_punct = 1;
    }
    if ($group_control =~ /(XML chars and tags|XML tags and chars)/) {
        $group_xml_chars = 1;
        $group_xml_tags = 1;
    }
    my $orig_string = $string;
    $string .= " ";   # sentinel, so that token patterns can rely on a following space
    while ($string =~ /\S/) {
        my ($token, $rest, $dec_unicode, $hex_unicode, $html_entity_name, $utf8_char);

        # one-character UTF-8 = ASCII
        if ($string =~ /^[\x00-\x7F]/) {
            if ($group_xml_chars
                && (($dec_unicode, $rest) = ($string =~ /^&#(\d+);(.*)$/s))
                && (($utf8_char = $caller->unicode2string($dec_unicode)) ne "")) {
                # decimal character reference
                push(@characters, $utf8_char);
                $string = $rest;
            } elsif ($group_xml_chars
                && (($hex_unicode, $rest) = ($string =~ /^&#x([0-9a-f]{1,6});(.*)$/is))
                && (($utf8_char = $caller->unicode_hex_string2string($hex_unicode)) ne "")) {
                # hexadecimal character reference
                push(@characters, $utf8_char);
                $string = $rest;
            } elsif ($group_xml_chars
                && (($html_entity_name, $rest) = ($string =~ /^&([a-z]{1,6});(.*)$/is))
                && ($dec_unicode = $ht{HTML_ENTITY_NAME_TO_DECUNICODE}->{$html_entity_name})
                && (($utf8_char = $caller->unicode2string($dec_unicode)) ne "")) {
                # named entity, looked up in the caller-supplied table
                push(@characters, $utf8_char);
                $string = $rest;
            } elsif ($group_xml_tags
                && (($token, $rest) = ($string =~ /^(<\/?[a-zA-Z][-_:a-zA-Z0-9]*(?:\s+[a-zA-Z][-_:a-zA-Z0-9]*=\"[^"]*\")*\s*\/?>)(.*)$/s))) {
                # complete XML tag, kept as one token
                push(@characters, $token);
                $string = $rest;
            } elsif ($group_ascii_numbers && ($string =~ /^[12]\d\d\d\.[01]?\d.[0-3]?\d([^0-9].*)?$/)) {
                # date such as 2004.12.15
                # NOTE(review): the second separator is an unescaped "." (any
                # character) in both patterns -- presumably "\." was intended;
                # behavior kept as is, please confirm.
                ($token) = ($string =~ /^(\d\d\d\d\.\d?\d.\d?\d)([^0-9].*)?$/);
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_numbers && ($string =~ /^\d/)) {
                # number, optionally with thousands separators and decimals
                ($token) = ($string =~ /^(\d+(?:,\d\d\d)*(?:\.\d+)?)/);
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_spaces && (($token) = ($string =~ /^(\s+)/))) {
                # whitespace is dropped, not returned
                $string = substr($string, length($token));
            } elsif ($group_ascii_punct && (($token) = ($string =~ /^(-+|\.+|[:,%()"])/))) {
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^(\$[A-Z]*|[A-Z]{1,3}\$)/))) {
                # currency such as $, US$, C$
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^((?:Jan|Feb|Febr|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|Mr|Mrs|Dr|a\.m|p\.m)\.)/))) {
                # abbreviation including its period
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^(second|minute|hour|day|week|month|year|inch|foot|yard|meter|kilometer|mile)-(?:long|old)/i))) {
                # unit in compounds such as "year-old" (only the unit is consumed)
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)-/i))) {
                # number word followed by a hyphen (only the word is consumed)
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^([a-zA-Z]+)(?:[ ,;%?|()"]|'s |' |\. |\d+[:hms][0-9 ])/))) {
                # plain word followed by a typical word boundary
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^([\x21-\x27\x2A-\x7E]+)/))) {
                # ASCII black-characters, excluding ( and )
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^([\x21-\x7E]+)/))) {
                # ASCII black-characters
                push(@characters, $token);
                $string = substr($string, length($token));
            } elsif ($group_ascii_chars && (($token) = ($string =~ /^([\x00-\x7F]+)/))) {
                push(@characters, $token);
                $string = substr($string, length($token));
            } else {
                push(@characters, substr($string, 0, 1));
                $string = substr($string, 1);
            }

        # two-character UTF-8
        } elsif ($string =~ /^[\xC0-\xDF][\x80-\xBF]/) {
            push(@characters, substr($string, 0, 2));
            $string = substr($string, 2);

        # three-character UTF-8
        } elsif ($string =~ /^[\xE0-\xEF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters, substr($string, 0, 3));
            $string = substr($string, 3);

        # four-character UTF-8
        } elsif ($string =~ /^[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters, substr($string, 0, 4));
            $string = substr($string, 4);

        # five-character UTF-8
        } elsif ($string =~ /^[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters, substr($string, 0, 5));
            $string = substr($string, 5);

        # six-character UTF-8
        } elsif ($string =~ /^[\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) {
            push(@characters, substr($string, 0, 6));
            $string = substr($string, 6);

        # not a UTF-8 character
        } else {
            $skipped_bytes .= substr($string, 0, 1);
            $string = substr($string, 1);
        }

        # one flag per token: "1" if the token just added ends at whitespace
        $end_of_token_p_string .= ($string =~ /^\S/) ? "0" : "1"
            if $#characters >= length($end_of_token_p_string);
    }
    $string =~ s/ $//; # remove previously added space, but keep original spaces
    if ($return_trailing_whitespaces) {
        while ($string =~ /^[ \t]/) {
            push(@characters, substr($string, 0, 1));
            $string = substr($string, 1);
        }
        push(@characters, "\n") if $orig_string =~ /\n$/;
    }
    return ($return_only_chars) ? @characters : ($skipped_bytes, $end_of_token_p_string, @characters);
}
|
325 |
+
|
326 |
+
sub max_substring_info {
    # Finds the longest common substring (measured in utf8 characters) of
    # $s1 and $s2.
    # input: two utf8 strings and an info type
    # output, depending on $info_type:
    #   "max-ratio1": length of the best substring divided by the length of $s1
    #   "max-ratio2": length of the best substring divided by the length of $s2
    #   "substring":  the best substring itself
    #   otherwise:    a "key=value;..." description of the result
    #   0 if either string contains bytes that are not valid utf8
    local($caller,$s1,$s2,$info_type) = @_;

    my ($skipped1, $token_flags1, @chars1) = $caller->split_into_utf8_characters($s1, "", *empty_ht);
    my ($skipped2, $token_flags2, @chars2) = $caller->split_into_utf8_characters($s2, "", *empty_ht);
    return 0 if $skipped1 || $skipped2;

    my ($last1, $last2) = ($#chars1, $#chars2);
    my ($best_length, $best_start1, $best_start2) = (0, 0, 0);

    foreach my $start2 (0 .. $last2) {
        # stop once no longer match can fit into the rest of the string
        last if $start2 + $best_length > $last2;
        foreach my $start1 (0 .. $last1) {
            last if $start1 + $best_length > $last1;
            my $run = 0;
            $run++ while (($start1 + $run <= $last1)
                       && ($start2 + $run <= $last2)
                       && ($chars1[$start1+$run] eq $chars2[$start2+$run]));
            ($best_length, $best_start1, $best_start2) = ($run, $start1, $start2)
                if $run > $best_length;
        }
    }

    my $length1 = $last1 + 1;
    my $length2 = $last2 + 1;
    if ($info_type =~ /^max-ratio1$/) {
        return ($length1 > 0) ? ($best_length / $length1) : 0;
    }
    if ($info_type =~ /^max-ratio2$/) {
        return ($length2 > 0) ? ($best_length / $length2) : 0;
    }
    if ($info_type =~ /^substring$/) {
        return join("", @chars1[$best_start1 .. $best_start1+$best_length-1]);
    }
    return join(";", "s1=$s1", "s2=$s2",
                     "best_substring_length=$best_length",
                     "best_substring_start1=$best_start1",
                     "best_substring_start2=$best_start2",
                     "length1=$length1",
                     "length2=$length2");
}
|
374 |
+
|
375 |
+
sub n_shared_chars_at_start {
    # input: two utf8 strings
    # output: number of utf8 characters the two strings have in common at
    #         their start
    #
    # A character is a lead byte plus any following continuation bytes
    # (0x80-0xBF). Both strings are scanned in place with \G and /s, so a
    # newline is compared like any other character.
    my ($caller, $s1, $s2) = @_;

    return 0 unless defined($s1) && defined($s2);

    my $n = 0;
    while ($s1 =~ /\G(.[\x80-\xBF]*)/gcs) {
        my $c1 = $1;
        last unless ($s2 =~ /\G(.[\x80-\xBF]*)/gcs) && ($1 eq $c1);
        $n++;
    }
    return $n;
}
|
392 |
+
|
393 |
+
sub char_length {
    # input: utf8 string and optional byte offset into it
    # output: number of bytes (1-6) that the utf8 character starting at the
    #         offset occupies according to its lead byte;
    #         0 if there is no character there or the byte is not a lead byte
    local($caller,$string,$byte_offset) = @_;

    my $lead = ($byte_offset) ? substr($string, $byte_offset, 1) : $string;
    return 0 if !defined($lead) || $lead eq "";

    my $code = ord($lead);
    return 1 if $code <= 0x7F;
    return 0 if $code <= 0xBF;   # continuation byte
    return 2 if $code <= 0xDF;
    return 3 if $code <= 0xEF;
    return 4 if $code <= 0xF7;
    return 5 if $code <= 0xFB;
    return 6 if $code <= 0xFD;
    return 0;
}
|
405 |
+
|
406 |
+
sub length_in_utf8_chars {
    # input: utf8 string
    # output: number of utf8 characters, i.e. the number of bytes that are
    #         not continuation bytes (0x80-0xBF)
    local($caller,$s) = @_;

    my $n_chars = length($s);
    # tr with an empty replacement list only counts, it does not modify $s
    $n_chars -= ($s =~ tr/\x80-\xBF//) if defined($n_chars);
    return $n_chars;
}
|
413 |
+
|
414 |
+
sub byte_length_of_n_chars {
    # input: number of utf8 characters, utf8 string, optional byte offset
    #        (default 0), optional value to return on failure (default -1)
    # output: number of bytes occupied by the first $char_length characters
    #         of $string starting at $byte_offset; the failure value if the
    #         string ends early or an invalid lead byte is encountered
    local($caller,$char_length,$string,$byte_offset,$undef_return_value) = @_;

    $byte_offset = 0 if !defined($byte_offset);
    $undef_return_value = -1 if !defined($undef_return_value);

    my $n_bytes = 0;
    for my $nth_char (1 .. $char_length) {
        my $width = $caller->char_length($string, $byte_offset + $n_bytes);
        return $undef_return_value if !$width;
        $n_bytes += $width;
    }
    return $n_bytes;
}
|
428 |
+
|
429 |
+
sub replace_non_ASCII_bytes {
    # input: string and a replacement mode (default "HEX")
    # output: string in which ASCII control characters (other than tab and
    #         linefeed) and all non-ASCII characters are made visible:
    #   "Unicode": <U1575>     (decimal code point)
    #   "\u":      \u0627
    #   "U+4":     <U+0627>
    #   "HEX":     <HEX-D8A7>  (the raw bytes)
    #   any other value: every byte 0x80-0xFF is replaced by that value
    #
    # A lead byte that is not followed by the continuation bytes it requires
    # is escaped on its own; it no longer stops the conversion of the rest
    # of the string.
    my ($caller, $string, $replacement) = @_;

    $replacement = "HEX" unless defined($replacement);
    unless ($replacement =~ /^(Unicode|U\+4|\\u|HEX)$/) {
        my $new_string = $string;
        $new_string =~ s/[\x80-\xFF]/$replacement/g;
        return $new_string;
    }

    my $escape;
    if ($replacement =~ /Unicode/) {
        $escape = sub { "<U" . uc($caller->utf8_to_unicode($_[0])) . ">" };
    } elsif ($replacement =~ /\\u/) {
        $escape = sub { "\\u" . uc(sprintf("%04x", $caller->utf8_to_unicode($_[0]))) };
    } elsif ($replacement =~ /U\+4/) {
        $escape = sub { "<U+" . uc($caller->utf8_to_4hex_unicode($_[0])) . ">" };
    } else {
        $escape = sub { "<HEX-" . $caller->utf8_to_hex($_[0]) . ">" };
    }

    my $new_string = defined($string) ? $string : "";
    # alternatives: control character | well-formed 2-, 3-, 4-byte sequence |
    # longer sequence | any remaining stray non-ASCII byte
    $new_string =~ s/([\x00-\x08\x0B-\x1F\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF8-\xFF][\x80-\xBF]+|[\x80-\xFF])/$escape->($1)/ge;
    return $new_string;
}
|
454 |
+
|
455 |
+
sub valid_utf8_string_p {
    # input: string
    # output: true iff the string consists only of tab, linefeed, printable
    #         ASCII and well-formed 2-, 3- or 4-byte utf8 sequences
    local($caller,$string) = @_;

    return $string =~ /^(?: [\x09\x0A\x20-\x7E]
                          | [\xC0-\xDF][\x80-\xBF]
                          | [\xE0-\xEF][\x80-\xBF]{2}
                          | [\xF0-\xF7][\x80-\xBF]{3}
                        )*$/x;
}
|
460 |
+
|
461 |
+
sub valid_utf8_string_incl_ascii_control_p {
    # input: string
    # output: true iff the string consists only of ASCII characters
    #         (including control characters) and well-formed 2-, 3- or
    #         4-byte utf8 sequences
    local($caller,$string) = @_;

    return $string =~ /^(?: [\x00-\x7F]
                          | [\xC0-\xDF][\x80-\xBF]
                          | [\xE0-\xEF][\x80-\xBF]{2}
                          | [\xF0-\xF7][\x80-\xBF]{3}
                        )*$/x;
}
|
466 |
+
|
467 |
+
sub utf8_to_hex {
    # input: string, e.g. the three bytes E2 80 BA
    # output: upper-case hex dump of the string, two digits per byte,
    #         e.g. "E280BA"
    local($caller,$s) = @_;

    return join("", map { sprintf("%02X", ord($_)) } split(//, defined($s) ? $s : ""));
}
|
476 |
+
|
477 |
+
sub hex_to_utf8 {
    # input: surface string of hex byte codes, each optionally preceded by
    #        "\x", e.g. "\xE2\x80\xBA" (12 characters) or "E280BA"
    # output: the corresponding bytes; decoding stops at the first position
    #         that does not start a two-digit hex code
    #
    # The input is scanned in place with \G, so whatever follows the hex
    # codes (including a line break and further text) cannot prevent the
    # leading codes from being decoded.
    my ($caller, $s) = @_;

    my $utf8 = "";
    return $utf8 unless defined($s);
    while ($s =~ /\G(?:\\x)?([0-9A-Fa-f]{2})/gc) {
        $utf8 .= chr(hex($1));
    }
    return $utf8;
}
|
488 |
+
|
489 |
+
sub utf8_to_4hex_unicode {
    # input: a single utf8 character
    # output: its code point as lower-case hex, zero-padded to at least
    #         four digits, e.g. "0627"
    local($caller,$s) = @_;

    my $code_point = $caller->utf8_to_unicode($s);
    return sprintf("%04x", $code_point);
}
|
494 |
+
|
495 |
+
sub utf8_to_unicode {
    # input: a single utf8 character (as a byte string)
    # output: its code point as an integer, e.g. 0x627 for the bytes D8 A7
    #
    # Each byte contributes its payload bits: 7 for an ASCII byte, 6 for a
    # continuation byte, 5/4/3/2/1 for the lead byte of a 2/3/4/5/6-byte
    # sequence. ASCII bytes are included, so control characters and other
    # ASCII characters map to their own code instead of 0. The bytes FE and
    # FF, which never occur in utf8, are ignored.
    my ($caller, $s) = @_;

    my $unicode = 0;
    return $unicode unless defined($s);

    foreach my $byte (map { ord($_) } split(//, $s)) {
        if    ($byte <= 0x7F) { $unicode = $unicode * 128 +  $byte;         }
        elsif ($byte <= 0xBF) { $unicode = $unicode * 64  + ($byte & 0x3F); }
        elsif ($byte <= 0xDF) { $unicode = $unicode * 32  + ($byte & 0x1F); }
        elsif ($byte <= 0xEF) { $unicode = $unicode * 16  + ($byte & 0x0F); }
        elsif ($byte <= 0xF7) { $unicode = $unicode * 8   + ($byte & 0x07); }
        elsif ($byte <= 0xFB) { $unicode = $unicode * 4   + ($byte & 0x03); }
        elsif ($byte <= 0xFD) { $unicode = $unicode * 2   + ($byte & 0x01); }
    }
    return $unicode;
}
|
517 |
+
|
518 |
+
sub charhex {
    # input: string
    # output: string in which every character outside the printable ASCII
    #         range (space to tilde) is replaced by <HEX-XX>, XX being its
    #         upper-case hex code
    local($caller,$string) = @_;

    return "" unless defined($string) && $string ne "";

    (my $visible = $string) =~ s/([^ -~])/sprintf("<HEX-%02X>", ord($1))/ge;
    return $visible;
}
|
535 |
+
|
536 |
+
sub windows1252_to_utf8 {
    # input: string in Windows-1252 (possibly mixed with utf8)
    #        $norm_to_ascii_p (default 1): map ellipsis, single quotes,
    #          dashes and small tilde to plain ASCII instead of utf8
    #        $preserve_potential_utf8s_p (default 1): copy byte sequences
    #          that already form a well-formed 2- to 5-byte utf8 sequence
    # output: utf8 string; bytes without a Windows-1252 meaning become "?"
    local($caller,$s, $norm_to_ascii_p, $preserve_potential_utf8s_p) = @_;

    return $s if $s =~ /^[\x00-\x7F]*$/;  # all ASCII

    $norm_to_ascii_p = 1 if !defined($norm_to_ascii_p);
    $preserve_potential_utf8s_p = 1 if !defined($preserve_potential_utf8s_p);

    # Windows-1252 bytes 0x80-0x9F (0x81, 0x8D, 0x8F, 0x90, 0x9D are unassigned)
    my %utf8_for = (
        "\x80" => "\xE2\x82\xAC",  # Euro sign
        "\x82" => "\xE2\x80\x9A",  # single low quotation mark
        "\x83" => "\xC6\x92",      # Latin small letter f with hook
        "\x84" => "\xE2\x80\x9E",  # double low quotation mark
        "\x85" => "\xE2\x80\xA6",  # horizontal ellipsis (three dots)
        "\x86" => "\xE2\x80\xA0",  # dagger
        "\x87" => "\xE2\x80\xA1",  # double dagger
        "\x88" => "\xCB\x86",      # circumflex
        "\x89" => "\xE2\x80\xB0",  # per mille sign
        "\x8A" => "\xC5\xA0",      # Latin capital letter S with caron
        "\x8B" => "\xE2\x80\xB9",  # single left-pointing angle quotation mark
        "\x8C" => "\xC5\x92",      # OE ligature
        "\x8E" => "\xC5\xBD",      # Latin capital letter Z with caron
        "\x91" => "\xE2\x80\x98",  # left single quotation mark
        "\x92" => "\xE2\x80\x99",  # right single quotation mark
        "\x93" => "\xE2\x80\x9C",  # left double quotation mark
        "\x94" => "\xE2\x80\x9D",  # right double quotation mark
        "\x95" => "\xE2\x80\xA2",  # bullet
        "\x96" => "\xE2\x80\x93",  # n dash
        "\x97" => "\xE2\x80\x94",  # m dash
        "\x98" => "\xCB\x9C",      # small tilde
        "\x99" => "\xE2\x84\xA2",  # trade mark sign
        "\x9A" => "\xC5\xA1",      # Latin small letter s with caron
        "\x9B" => "\xE2\x80\xBA",  # single right-pointing angle quotation mark
        "\x9C" => "\xC5\x93",      # oe ligature
        "\x9E" => "\xC5\xBE",      # Latin small letter z with caron
        "\x9F" => "\xC5\xB8",      # Latin capital letter Y with diaeresis
    );
    # ASCII stand-ins used instead of the above when $norm_to_ascii_p is set
    my %ascii_for = (
        "\x85" => "...",
        "\x91" => "`",
        "\x92" => "'",
        "\x96" => "-",
        "\x97" => "-",
        "\x98" => "~",
    );

    my $converted = "";
    pos($s) = 0;
    while (pos($s) < length($s)) {
        if ($s =~ /\G([\x00-\x7F]+)/gc) {
            $converted .= $1;  # ASCII
        } elsif ($preserve_potential_utf8s_p
                 && ($s =~ /\G([\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF7][\x80-\xBF]{3}|[\xF8-\xFB][\x80-\xBF]{4})/gc)) {
            $converted .= $1;  # already valid utf8
        } else {
            $s =~ /\G(.)/gcs;
            my $byte = $1;
            my $code = ord($byte);
            if (($code >= 0xA0) && ($code <= 0xBF)) {
                $converted .= "\xC2$byte";                    # U+00A0..U+00BF
            } elsif (($code >= 0xC0) && ($code <= 0xFF)) {
                $converted .= "\xC3" . chr($code - 0x40);     # U+00C0..U+00FF
            } elsif ($norm_to_ascii_p && exists($ascii_for{$byte})) {
                $converted .= $ascii_for{$byte};
            } elsif (exists($utf8_for{$byte})) {
                $converted .= $utf8_for{$byte};
            } else {
                $converted .= "?";
            }
        }
    }
    return $converted;
}
|
629 |
+
|
630 |
+
sub delete_weird_stuff {
    # input: utf8 string
    # output: the string without
    #   - ASCII control characters (except tab and linefeed)
    #   - C1 control characters (C2 80 .. C2 9F)
    #   - Arabic tatweel (D9 80)
    #   - zero-width characters and directional marks (E2 80 8B .. E2 80 8F)
    #   - variation selectors (EF B8 80 .. EF B8 8F, F3 A0 84..87 xx)
    #   - byte order mark (EF BB BF)
    local($caller, $s) = @_;

    $s =~ s/(?: [\x00-\x08\x0B-\x1F\x7F]
              | \xC2[\x80-\x9F]
              | \xD9\x80
              | \xE2\x80[\x8B-\x8F]
              | \xEF\xB8[\x80-\x8F]
              | \xEF\xBB\xBF
              | \xF3\xA0[\x84-\x87][\x80-\xBF]
            )//gx;
    return $s;
}
|
638 |
+
|
639 |
+
sub number_of_utf8_character {
    # input: utf8 string
    # output: number of utf8 characters, counted as the number of bytes
    #         that remain after all continuation bytes (0x80-0xBF) are removed
    local($caller, $s) = @_;

    (my $without_continuation_bytes = $s) =~ s/[\x80-\xBF]+//g;
    return length($without_continuation_bytes);
}
|
646 |
+
|
647 |
+
sub cap_letter_reg_exp {
    # output: regular expression source (as a string) matching one capital
    #         letter in utf8: A-Z and other Latin-based capital letters with
    #         accents, umlauts and other decorations (Latin-1 and Latin
    #         Extended-A).
    # The result is a bare alternation; callers have to wrap it in a group.
    #
    # Compared to the earlier list, a stray literal "4" has been removed from
    # the C4 class (it made "\xC4" followed by the digit 4 match), and the
    # capitals C4 92, C5 94 and C5 AE, which were missing, have been added.
    return "[A-Z]"
         . "|\xC3[\x80-\x96\x98-\x9E]"
         . "|\xC4[\x80\x82\x84\x86\x88\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB9\xBB\xBD\xBF]"
         . "|\xC5[\x81\x83\x85\x87\x8A\x8C\x8E\x90\x92\x94\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB8\xB9\xBB\xBD]";
}
|
651 |
+
|
652 |
+
sub regex_extended_case_expansion {
    # input: regular expression source containing lower-case accented
    #        letters in utf8
    # output: the expression with each such letter replaced by a byte class
    #         that matches both cases, e.g. C3 A9 becomes C3 [89 A9]
    # Covered: the Latin-1 letters C3 A0 .. C3 BE (except the division sign
    # C3 B7) and the three Latin Extended-A letters C5 91, C5 A1, C5 B1.
    local($caller, $s) = @_;

    # Latin-1: the capital letter is 0x20 below the small letter
    $s =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3[" . chr(ord($1) - 0x20) . $1 . "]"/ge;

    # Latin Extended-A: the capital letter directly precedes the small letter
    $s =~ s/\xC5([\x91\xA1\xB1])/"\xC5[" . chr(ord($1) - 1) . $1 . "]"/ge;

    return $s;
}
|
695 |
+
|
696 |
+
sub extended_lower_case {
|
697 |
+
local($caller, $s) = @_;
|
698 |
+
|
699 |
+
$s =~ tr/A-Z/a-z/;
|
700 |
+
|
701 |
+
# Latin-1
|
702 |
+
if ($s =~ /\xC3[\x80-\x9F]/) {
|
703 |
+
$s =~ s/À/à/g;
|
704 |
+
$s =~ s/Á/á/g;
|
705 |
+
$s =~ s/Â/â/g;
|
706 |
+
$s =~ s/Ã/ã/g;
|
707 |
+
$s =~ s/Ä/ä/g;
|
708 |
+
$s =~ s/Å/å/g;
|
709 |
+
$s =~ s/Æ/æ/g;
|
710 |
+
$s =~ s/Ç/ç/g;
|
711 |
+
$s =~ s/È/è/g;
|
712 |
+
$s =~ s/É/é/g;
|
713 |
+
$s =~ s/Ê/ê/g;
|
714 |
+
$s =~ s/Ë/ë/g;
|
715 |
+
$s =~ s/Ì/ì/g;
|
716 |
+
$s =~ s/Í/í/g;
|
717 |
+
$s =~ s/Î/î/g;
|
718 |
+
$s =~ s/Ï/ï/g;
|
719 |
+
$s =~ s/Ð/ð/g;
|
720 |
+
$s =~ s/Ñ/ñ/g;
|
721 |
+
$s =~ s/Ò/ò/g;
|
722 |
+
$s =~ s/Ó/ó/g;
|
723 |
+
$s =~ s/Ô/ô/g;
|
724 |
+
$s =~ s/Õ/õ/g;
|
725 |
+
$s =~ s/Ö/ö/g;
|
726 |
+
$s =~ s/Ø/ø/g;
|
727 |
+
$s =~ s/Ù/ù/g;
|
728 |
+
$s =~ s/Ú/ú/g;
|
729 |
+
$s =~ s/Û/û/g;
|
730 |
+
$s =~ s/Ü/ü/g;
|
731 |
+
$s =~ s/Ý/ý/g;
|
732 |
+
$s =~ s/Þ/þ/g;
|
733 |
+
}
|
734 |
+
# Latin Extended-A
|
735 |
+
if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) {
|
736 |
+
$s =~ s/Ā/ā/g;
|
737 |
+
$s =~ s/Ă/ă/g;
|
738 |
+
$s =~ s/Ą/ą/g;
|
739 |
+
$s =~ s/Ć/ć/g;
|
740 |
+
$s =~ s/Ĉ/ĉ/g;
|
741 |
+
$s =~ s/Ċ/ċ/g;
|
742 |
+
$s =~ s/Č/č/g;
|
743 |
+
$s =~ s/Ď/ď/g;
|
744 |
+
$s =~ s/Đ/đ/g;
|
745 |
+
$s =~ s/Ē/ē/g;
|
746 |
+
$s =~ s/Ĕ/ĕ/g;
|
747 |
+
$s =~ s/Ė/ė/g;
|
748 |
+
$s =~ s/Ę/ę/g;
|
749 |
+
$s =~ s/Ě/ě/g;
|
750 |
+
$s =~ s/Ĝ/ĝ/g;
|
751 |
+
$s =~ s/Ğ/ğ/g;
|
752 |
+
$s =~ s/Ġ/ġ/g;
|
753 |
+
$s =~ s/Ģ/ģ/g;
|
754 |
+
$s =~ s/Ĥ/ĥ/g;
|
755 |
+
$s =~ s/Ħ/ħ/g;
|
756 |
+
$s =~ s/Ĩ/ĩ/g;
|
757 |
+
$s =~ s/Ī/ī/g;
|
758 |
+
$s =~ s/Ĭ/ĭ/g;
|
759 |
+
$s =~ s/Į/į/g;
|
760 |
+
$s =~ s/İ/ı/g;
|
761 |
+
$s =~ s/IJ/ij/g;
|
762 |
+
$s =~ s/Ĵ/ĵ/g;
|
763 |
+
$s =~ s/Ķ/ķ/g;
|
764 |
+
$s =~ s/Ĺ/ĺ/g;
|
765 |
+
$s =~ s/Ļ/ļ/g;
|
766 |
+
$s =~ s/Ľ/ľ/g;
|
767 |
+
$s =~ s/Ŀ/ŀ/g;
|
768 |
+
$s =~ s/Ł/ł/g;
|
769 |
+
$s =~ s/Ń/ń/g;
|
770 |
+
$s =~ s/Ņ/ņ/g;
|
771 |
+
$s =~ s/Ň/ň/g;
|
772 |
+
$s =~ s/Ŋ/ŋ/g;
|
773 |
+
$s =~ s/Ō/ō/g;
|
774 |
+
$s =~ s/Ŏ/ŏ/g;
|
775 |
+
$s =~ s/Ő/ő/g;
|
776 |
+
$s =~ s/Œ/œ/g;
|
777 |
+
$s =~ s/Ŕ/ŕ/g;
|
778 |
+
$s =~ s/Ŗ/ŗ/g;
|
779 |
+
$s =~ s/Ř/ř/g;
|
780 |
+
$s =~ s/Ś/ś/g;
|
781 |
+
$s =~ s/Ŝ/ŝ/g;
|
782 |
+
$s =~ s/Ş/ş/g;
|
783 |
+
$s =~ s/Š/š/g;
|
784 |
+
$s =~ s/Ţ/ţ/g;
|
785 |
+
$s =~ s/Ť/ť/g;
|
786 |
+
$s =~ s/Ŧ/ŧ/g;
|
787 |
+
$s =~ s/Ũ/ũ/g;
|
788 |
+
$s =~ s/Ū/ū/g;
|
789 |
+
$s =~ s/Ŭ/ŭ/g;
|
790 |
+
$s =~ s/Ů/ů/g;
|
791 |
+
$s =~ s/Ű/ű/g;
|
792 |
+
$s =~ s/Ų/ų/g;
|
793 |
+
$s =~ s/Ŵ/ŵ/g;
|
794 |
+
$s =~ s/Ŷ/ŷ/g;
|
795 |
+
$s =~ s/Ź/ź/g;
|
796 |
+
$s =~ s/Ż/ż/g;
|
797 |
+
$s =~ s/Ž/ž/g;
|
798 |
+
}
|
799 |
+
# Greek letters
|
800 |
+
if ($s =~ /\xCE[\x86-\xAB]/) {
|
801 |
+
$s =~ s/Α/α/g;
|
802 |
+
$s =~ s/Β/β/g;
|
803 |
+
$s =~ s/Γ/γ/g;
|
804 |
+
$s =~ s/Δ/δ/g;
|
805 |
+
$s =~ s/Ε/ε/g;
|
806 |
+
$s =~ s/Ζ/ζ/g;
|
807 |
+
$s =~ s/Η/η/g;
|
808 |
+
$s =~ s/Θ/θ/g;
|
809 |
+
$s =~ s/Ι/ι/g;
|
810 |
+
$s =~ s/Κ/κ/g;
|
811 |
+
$s =~ s/Λ/λ/g;
|
812 |
+
$s =~ s/Μ/μ/g;
|
813 |
+
$s =~ s/Ν/ν/g;
|
814 |
+
$s =~ s/Ξ/ξ/g;
|
815 |
+
$s =~ s/Ο/ο/g;
|
816 |
+
$s =~ s/Π/π/g;
|
817 |
+
$s =~ s/Ρ/ρ/g;
|
818 |
+
$s =~ s/Σ/σ/g;
|
819 |
+
$s =~ s/Τ/τ/g;
|
820 |
+
$s =~ s/Υ/υ/g;
|
821 |
+
$s =~ s/Φ/φ/g;
|
822 |
+
$s =~ s/Χ/χ/g;
|
823 |
+
$s =~ s/Ψ/ψ/g;
|
824 |
+
$s =~ s/Ω/ω/g;
|
825 |
+
$s =~ s/Ϊ/ϊ/g;
|
826 |
+
$s =~ s/Ϋ/ϋ/g;
|
827 |
+
$s =~ s/Ά/ά/g;
|
828 |
+
$s =~ s/Έ/έ/g;
|
829 |
+
$s =~ s/Ή/ή/g;
|
830 |
+
$s =~ s/Ί/ί/g;
|
831 |
+
$s =~ s/Ό/ό/g;
|
832 |
+
$s =~ s/Ύ/ύ/g;
|
833 |
+
$s =~ s/Ώ/ώ/g;
|
834 |
+
}
|
835 |
+
# Cyrillic letters
|
836 |
+
if ($s =~ /\xD0[\x80-\xAF]/) {
|
837 |
+
$s =~ s/А/а/g;
|
838 |
+
$s =~ s/Б/б/g;
|
839 |
+
$s =~ s/В/в/g;
|
840 |
+
$s =~ s/Г/г/g;
|
841 |
+
$s =~ s/Д/д/g;
|
842 |
+
$s =~ s/Е/е/g;
|
843 |
+
$s =~ s/Ж/ж/g;
|
844 |
+
$s =~ s/З/з/g;
|
845 |
+
$s =~ s/И/и/g;
|
846 |
+
$s =~ s/Й/й/g;
|
847 |
+
$s =~ s/К/к/g;
|
848 |
+
$s =~ s/Л/л/g;
|
849 |
+
$s =~ s/М/м/g;
|
850 |
+
$s =~ s/Н/н/g;
|
851 |
+
$s =~ s/О/о/g;
|
852 |
+
$s =~ s/П/п/g;
|
853 |
+
$s =~ s/Р/р/g;
|
854 |
+
$s =~ s/С/с/g;
|
855 |
+
$s =~ s/Т/т/g;
|
856 |
+
$s =~ s/У/у/g;
|
857 |
+
$s =~ s/Ф/ф/g;
|
858 |
+
$s =~ s/Х/х/g;
|
859 |
+
$s =~ s/Ц/ц/g;
|
860 |
+
$s =~ s/Ч/ч/g;
|
861 |
+
$s =~ s/Ш/ш/g;
|
862 |
+
$s =~ s/Щ/щ/g;
|
863 |
+
$s =~ s/Ъ/ъ/g;
|
864 |
+
$s =~ s/Ы/ы/g;
|
865 |
+
$s =~ s/Ь/ь/g;
|
866 |
+
$s =~ s/Э/э/g;
|
867 |
+
$s =~ s/Ю/ю/g;
|
868 |
+
$s =~ s/Я/я/g;
|
869 |
+
$s =~ s/Ѐ/ѐ/g;
|
870 |
+
$s =~ s/Ё/ё/g;
|
871 |
+
$s =~ s/Ђ/ђ/g;
|
872 |
+
$s =~ s/Ѓ/ѓ/g;
|
873 |
+
$s =~ s/Є/є/g;
|
874 |
+
$s =~ s/Ѕ/ѕ/g;
|
875 |
+
$s =~ s/І/і/g;
|
876 |
+
$s =~ s/Ї/ї/g;
|
877 |
+
$s =~ s/Ј/ј/g;
|
878 |
+
$s =~ s/Љ/љ/g;
|
879 |
+
$s =~ s/Њ/њ/g;
|
880 |
+
$s =~ s/Ћ/ћ/g;
|
881 |
+
$s =~ s/Ќ/ќ/g;
|
882 |
+
$s =~ s/Ѝ/ѝ/g;
|
883 |
+
$s =~ s/Ў/ў/g;
|
884 |
+
$s =~ s/Џ/џ/g;
|
885 |
+
}
|
886 |
+
# Fullwidth A-Z
|
887 |
+
if ($s =~ /\xEF\xBC[\xA1-\xBA]/) {
|
888 |
+
$s =~ s/A/a/g;
|
889 |
+
$s =~ s/B/b/g;
|
890 |
+
$s =~ s/C/c/g;
|
891 |
+
$s =~ s/D/d/g;
|
892 |
+
$s =~ s/E/e/g;
|
893 |
+
$s =~ s/F/f/g;
|
894 |
+
$s =~ s/G/g/g;
|
895 |
+
$s =~ s/H/h/g;
|
896 |
+
$s =~ s/I/i/g;
|
897 |
+
$s =~ s/J/j/g;
|
898 |
+
$s =~ s/K/k/g;
|
899 |
+
$s =~ s/L/l/g;
|
900 |
+
$s =~ s/M/m/g;
|
901 |
+
$s =~ s/N/n/g;
|
902 |
+
$s =~ s/O/o/g;
|
903 |
+
$s =~ s/P/p/g;
|
904 |
+
$s =~ s/Q/q/g;
|
905 |
+
$s =~ s/R/r/g;
|
906 |
+
$s =~ s/S/s/g;
|
907 |
+
$s =~ s/T/t/g;
|
908 |
+
$s =~ s/U/u/g;
|
909 |
+
$s =~ s/V/v/g;
|
910 |
+
$s =~ s/W/w/g;
|
911 |
+
$s =~ s/X/x/g;
|
912 |
+
$s =~ s/Y/y/g;
|
913 |
+
$s =~ s/Z/z/g;
|
914 |
+
}
|
915 |
+
|
916 |
+
return $s;
|
917 |
+
}
|
918 |
+
|
919 |
+
sub extended_upper_case {
   # Upper-case a UTF-8 encoded byte string.
   #   $caller  invocant (class or object); not used here
   #   $s       string to convert
   # Returns the converted copy. ASCII a-z is handled with tr///; beyond that,
   # only the Latin-1 Supplement small letters (lead byte 0xC3, second byte
   # 0xA0-0xB6 and 0xB8-0xBE) and the three 0xC5-lead letters 0x91, 0xA1, 0xB1
   # are mapped. Everything else is left untouched.
   local($caller, $s) = @_;

   $s =~ tr/a-z/A-Z/;

   # Only do the multi-byte work when a plausible 2-byte sequence is present.
   if ($s =~ /[\xC3-\xC5][\x80-\xBF]/) {
      # Latin-1 Supplement: the capital sits 0x20 below the small letter.
      # 0xB7 (division sign) is deliberately skipped; 0xBF is not mapped.
      $s =~ s/\xC3([\xA0-\xB6\xB8-\xBE])/"\xC3" . chr(ord($1) - 0x20)/ge;

      # Three Latin Extended-A letters whose capital is the preceding code point.
      $s =~ s/\xC5([\x91\xA1\xB1])/"\xC5" . chr(ord($1) - 1)/ge;
   }

   return $s;
}
|
963 |
+
|
964 |
+
sub extended_first_upper_case {
   # Upper-case only the first character of a UTF-8 encoded byte string.
   #   $caller  invocant; must provide extended_upper_case()
   #   $s       string to convert
   # Returns $s with its first character (1-, 2- or 3-byte UTF-8 sequence)
   # passed through extended_upper_case(); a string that does not start with
   # such a sequence (including the empty string) is returned unchanged.
   local($caller, $s) = @_;

   # /s lets (.*) run across newlines. Without it a trailing "\n" was silently
   # dropped from the result, and a string with an embedded newline did not
   # match at all and was returned un-capitalised.
   # Lexical captures avoid leaking $first_char/$rest into package globals.
   if (my ($first_char, $rest) = ($s =~ /^([\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF])(.*)$/s)) {
      return $caller->extended_upper_case($first_char) . $rest;
   }
   return $s;
}
|
973 |
+
|
974 |
+
sub repair_doubly_converted_utf8_strings {
   # Undo a second Latin1-to-UTF8 pass over text that was already UTF-8.
   #   $caller  invocant; not used here
   #   $s       string to repair
   # A 2-byte character with lead byte 0xC2-0xC5 that went through the
   # converter twice shows up as 0xC3, (lead - 0x40), 0xC2, <continuation>.
   # Each such 4-byte run is collapsed back to <lead><continuation>.
   # Returns the repaired copy.
   local($caller, $s) = @_;

   # Nothing to repair: skip the substitutions entirely.
   return $s unless $s =~ /\xC3[\x82-\x85]\xC2[\x80-\xBF]/;

   # One pass per lead byte, in ascending order.
   foreach my $lead ("\xC2", "\xC3", "\xC4", "\xC5") {
      my $mangled_prefix = "\xC3" . chr(ord($lead) - 0x40) . "\xC2";
      $s =~ s/$mangled_prefix([\x80-\xBF])/$lead$1/g;
   }

   return $s;
}
|
985 |
+
|
986 |
+
sub repair_misconverted_windows_to_utf8_strings {
   # Repair text damaged by running the wrong single-byte-to-UTF8 converter.
   #   $caller  invocant; must provide windows1252_to_utf8()
   #   $s       string to repair
   # Three independent passes, applied in this order:
   #   1. UTF-8 "E2 80 xx" punctuation that was pushed through a
   #      Latin1-to-UTF8 converter.
   #   2. Windows-1252 bytes 0x80-0x9F that were converted as if Latin-1
   #      (they show up as C2 80 .. C2 9F).
   #   3. UTF-8 that was read as Windows-1252 and converted again (table below).
   # Returns the repaired copy.
   #
   # Passes 1 and 2 used to peel one match at a time off the front of the
   # string with /^(.*?)...(.*)$/s, which re-copies the tail on every match
   # (quadratic) and left its temporaries in package globals. A global
   # substitution visits the same matches left to right with the same
   # replacements.
   local($caller, $s) = @_;

   # correcting conversions of UTF8 using Latin1-to-UTF converter
   if ($s =~ /\xC3\xA2\xC2\x80\xC2[\x90-\xEF]/) {
      $s =~ s/\xC3\xA2\xC2\x80\xC2([\x90-\xEF])/\xE2\x80$1/g;
   }

   # correcting conversions of Windows1252-to-UTF8 using Latin1-to-UTF converter
   if ($s =~ /\xC2[\x80-\x9F]/) {
      $s =~ s{\xC2([\x80-\x9F])}{
         # Copy the capture before calling out, so the helper's own regex
         # work cannot disturb it.
         my $c_windows = $1;
         my $c_utf8 = $caller->windows1252_to_utf8($c_windows, 0);
         # When the helper answers "?", the original two bytes are kept.
         ($c_utf8 eq "?") ? ("\xC2" . $c_windows) : $c_utf8;
      }ge;
   }

   # correcting UTF8 that was read as Windows-1252 and converted once more
   if ($s =~ /\xC3/) {
      $s =~ s/\xC3\xA2\xE2\x80\x9A\xC2\xAC/\xE2\x82\xAC/g;     # x80 -> Euro sign
      # x81 codepoint undefined in Windows 1252
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xA1/\xE2\x80\x9A/g;     # x82 -> single low-9 quotation mark
      $s =~ s/\xC3\x86\xE2\x80\x99/\xC6\x92/g;                 # x83 -> Latin small letter f with hook
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xBE/\xE2\x80\x9E/g;     # x84 -> double low-9 quotation mark
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA6/\xE2\x80\xA6/g;     # x85 -> horizontal ellipsis
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA0/\xE2\x80\xA0/g;     # x86 -> dagger
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA1/\xE2\x80\xA1/g;     # x87 -> double dagger
      $s =~ s/\xC3\x8B\xE2\x80\xA0/\xCB\x86/g;                 # x88 -> modifier letter circumflex accent
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB0/\xE2\x80\xB0/g;     # x89 -> per mille sign
      $s =~ s/\xC3\x85\xC2\xA0/\xC5\xA0/g;                     # x8A -> Latin capital letter S with caron
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB9/\xE2\x80\xB9/g;     # x8B -> single left-pointing angle quotation mark
      $s =~ s/\xC3\x85\xE2\x80\x99/\xC5\x92/g;                 # x8C -> Latin capital ligature OE
      # x8D codepoint undefined in Windows 1252
      $s =~ s/\xC3\x85\xC2\xBD/\xC5\xBD/g;                     # x8E -> Latin capital letter Z with caron
      # x8F codepoint undefined in Windows 1252
      # x90 codepoint undefined in Windows 1252
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xCB\x9C/\xE2\x80\x98/g;     # x91 a-circumflex+euro+small tilde -> left single quotation mark
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2/\xE2\x80\x99/g; # x92 a-circumflex+euro+trademark -> right single quotation mark
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\x93/\xE2\x80\x9C/g;     # x93 a-circumflex+euro+Latin small ligature oe -> left double quotation mark
      # x94 maps through undefined intermediate code point
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA2/\xE2\x80\xA2/g;     # x95 a-circumflex+euro+cent sign -> bullet
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C/\xE2\x80\x93/g; # x96 a-circumflex+euro+left double quotation mark -> en dash
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D/\xE2\x80\x94/g; # x97 a-circumflex+euro+right double quotation mark -> em dash
      $s =~ s/\xC3\x8B\xC5\x93/\xCB\x9C/g;                     # x98 Latin capital e diaeresis+Latin small ligature oe -> small tilde
      $s =~ s/\xC3\xA2\xE2\x80\x9E\xC2\xA2/\xE2\x84\xA2/g;     # x99 -> trade mark sign
      $s =~ s/\xC3\x85\xC2\xA1/\xC5\xA1/g;                     # x9A -> Latin small letter s with caron
      $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xBA/\xE2\x80\xBA/g;     # x9B -> single right-pointing angle quotation mark
      $s =~ s/\xC3\x85\xE2\x80\x9C/\xC5\x93/g;                 # x9C -> Latin small ligature oe
      # x9D codepoint undefined in Windows 1252
      $s =~ s/\xC3\x85\xC2\xBE/\xC5\xBE/g;                     # x9E -> Latin small letter z with caron
      $s =~ s/\xC3\x85\xC2\xB8/\xC5\xB8/g;                     # x9F -> Latin capital letter Y with diaeresis
      $s =~ s/\xC3\xAF\xC2\xBF\xC2\xBD/\xEF\xBF\xBD/g;         # replacement character
   }

   return $s;
}
|
1048 |
+
|
1049 |
+
sub latin1_to_utf {
   # Convert a Latin-1 (ISO-8859-1) byte string to UTF-8.
   #   $caller  invocant; not used here
   #   $s       Latin-1 encoded string
   # Returns the UTF-8 encoded copy. Bytes 0x00-0x7F pass through unchanged;
   # 0x80-0xBF become C2 xx; 0xC0-0xFF become C3 (xx - 0x40).
   #
   # Done as one global substitution. The previous version peeled one byte at
   # a time off the front with /^(.*?)...(.*)$/s (quadratic on long input),
   # wrote its temporaries to package globals, and used
   # tr/[\xC0-\xFF]/[\x80-\xBF]/, where the brackets are literal characters
   # rather than class delimiters.
   local($caller, $s) = @_;

   # An undefined argument has always produced the empty string.
   $s = "" unless defined $s;

   $s =~ s{([\x80-\xFF])}{
      my $byte = $1;
      (ord($byte) < 0xC0) ? ("\xC2" . $byte)
                          : ("\xC3" . chr(ord($byte) - 0x40));
   }ge;

   return $s;
}
|
1066 |
+
|
1067 |
+
sub character_type_is_letter_type {
   # Test whether a character-type label names a letter-like class.
   #   $caller     invocant; not used here
   #   $char_type  a label such as those returned by character_type()
   # True when the label contains, as whole words, "letter", "syllable",
   # "diacritic", or "<CJK|hiragana|kana|katakana> character".
   # The match result is returned as-is, so in list context the caller gets
   # the captured groups rather than a plain boolean.
   local($caller, $char_type) = @_;

   my $letter_like = qr/\b((CJK|hiragana|kana|katakana)\s+character|diacritic|letter|syllable)\b/;
   return $char_type =~ $letter_like;
}
|
1072 |
+
|
1073 |
+
sub character_type {
   # Classify a single UTF-8 encoded character.
   #   $caller  invocant; not used here
   #   $c       one character as a UTF-8 byte string (or an XML tag "<...>")
   # Returns a descriptive label such as "ASCII digit", "Greek letter" or
   # "CJK character". Malformed input yields "non-UTF8 (invalid)" or
   # "non-shortest-UTF8 (invalid)"; a well-formed character that no rule
   # covers yields "other character".
   #
   # The rules inside each branch are tried top to bottom and the first hit
   # wins, so a narrow rule (punctuation, digits) must stay above the broad
   # rule for the same block.
   local($caller, $c) = @_;

   if ($c =~ /^[\x00-\x7F]/) {
      # ---- 1 byte: ASCII ----
      return "XML tag" if $c =~ /^<.*>$/;
      return "ASCII Latin letter" if $c =~ /^[a-z]$/i;
      return "ASCII digit" if $c =~ /^[0-9]$/i;
      return "ASCII whitespace" if $c =~ /^[\x09-\x0D\x20]$/;
      return "ASCII control-character" if $c =~ /^[\x00-\x1F\x7F]$/;
      return "ASCII currency" if $c eq "\$";
      return "ASCII punctuation";
   } elsif ($c =~ /^[\xC0-\xDF]/) {
      # ---- 2 bytes ----
      return "non-UTF8 (invalid)" unless $c =~ /^[\xC0-\xDF][\x80-\xBF]$/;
      return "non-shortest-UTF8 (invalid)" if $c =~ /[\xC0-\xC1]/;
      return "non-ASCII control-character" if $c =~ /\xC2[\x80-\x9F]/;
      return "non-ASCII whitespace" if $c =~ /\xC2\xA0/;
      return "non-ASCII currency" if $c =~ /\xC2[\xA2-\xA5]/;
      return "fraction" if $c =~ /\xC2[\xBC-\xBE]/; # NEW
      return "superscript digit" if $c =~ /\xC2[\xB2\xB3\xB9]/;
      return "non-ASCII Latin letter" if $c =~ /\xC2\xB5/; # micro sign
      return "non-ASCII punctuation" if $c =~ /\xC2[\xA0-\xBF]/;
      return "non-ASCII punctuation" if $c =~ /\xC3[\x97\xB7]/;
      return "non-ASCII Latin letter" if $c =~ /\xC3[\x80-\xBF]/;
      return "Latin ligature letter" if $c =~ /\xC4[\xB2\xB3]/;
      return "Latin ligature letter" if $c =~ /\xC5[\x92\x93]/;
      return "non-ASCII Latin letter" if $c =~ /[\xC4-\xC8]/;
      return "non-ASCII Latin letter" if $c =~ /\xC9[\x80-\x8F]/;
      return "IPA" if $c =~ /\xC9[\x90-\xBF]/;
      return "IPA" if $c =~ /\xCA[\x80-\xBF]/;
      return "IPA" if $c =~ /\xCB[\x80-\xBF]/;
      return "combining-diacritic" if $c =~ /\xCC[\x80-\xBF]/;
      return "combining-diacritic" if $c =~ /\xCD[\x80-\xAF]/;
      return "Greek punctuation" if $c =~ /\xCD[\xBE]/; # Greek question mark
      return "Greek punctuation" if $c =~ /\xCE[\x87]/; # Greek semicolon
      return "Greek letter" if $c =~ /\xCD[\xB0-\xBF]/;
      return "Greek letter" if $c =~ /\xCE/;
      return "Greek letter" if $c =~ /\xCF[\x80-\xA1\xB3\xB7\xB8\xBA\xBB]/;
      return "Coptic letter" if $c =~ /\xCF[\xA2-\xAF]/;
      return "Cyrillic letter" if $c =~ /[\xD0-\xD3]/;
      return "Cyrillic letter" if $c =~ /\xD4[\x80-\xAF]/;
      return "Armenian punctuation" if $c =~ /\xD5[\x9A-\x9F]/;
      return "Armenian punctuation" if $c =~ /\xD6[\x89-\x8F]/;
      return "Armenian letter" if $c =~ /\xD4[\xB0-\xBF]/;
      return "Armenian letter" if $c =~ /\xD5/;
      return "Armenian letter" if $c =~ /\xD6[\x80-\x8F]/;
      return "Hebrew accent" if $c =~ /\xD6[\x91-\xAE]/;
      return "Hebrew punctuation" if $c =~ /\xD6\xBE/;
      return "Hebrew punctuation" if $c =~ /\xD7[\x80\x83\x86\xB3\xB4]/;
      return "Hebrew point" if $c =~ /\xD6[\xB0-\xBF]/;
      return "Hebrew point" if $c =~ /\xD7[\x81\x82\x87]/;
      return "Hebrew letter" if $c =~ /\xD7[\x90-\xB2]/;
      return "other Hebrew" if $c =~ /\xD6[\x90-\xBF]/;
      return "other Hebrew" if $c =~ /\xD7/;
      return "Arabic currency" if $c =~ /\xD8\x8B/; # Afghani sign
      return "Arabic punctuation" if $c =~ /\xD8[\x89-\x8D\x9B\x9E\x9F]/;
      return "Arabic punctuation" if $c =~ /\xD9[\xAA-\xAD]/;
      return "Arabic punctuation" if $c =~ /\xDB[\x94]/;
      return "Arabic tatweel" if $c =~ /\xD9\x80/;
      return "Arabic letter" if $c =~ /\xD8[\xA0-\xBF]/;
      return "Arabic letter" if $c =~ /\xD9[\x81-\x9F]/;
      return "Arabic letter" if $c =~ /\xD9[\xAE-\xBF]/;
      return "Arabic letter" if $c =~ /\xDA[\x80-\xBF]/;
      return "Arabic letter" if $c =~ /\xDB[\x80-\x95]/;
      return "Arabic Indic digit" if $c =~ /\xD9[\xA0-\xA9]/;
      return "Arabic Indic digit" if $c =~ /\xDB[\xB0-\xB9]/;
      return "other Arabic" if $c =~ /[\xD8-\xDB]/;
      return "Syriac punctuation" if $c =~ /\xDC[\x80-\x8F]/;
      return "Syriac letter" if $c =~ /\xDC[\x90-\xAF]/;
      return "Syriac diacritic" if $c =~ /\xDC[\xB0-\xBF]/;
      return "Syriac diacritic" if $c =~ /\xDD[\x80-\x8A]/;
      return "Thaana letter" if $c =~ /\xDE/;
   } elsif ($c =~ /^[\xE0-\xEF]/) {
      # ---- 3 bytes ----
      return "non-UTF8 (invalid)" unless $c =~ /^[\xE0-\xEF][\x80-\xBF]{2,2}$/;
      return "non-shortest-UTF8 (invalid)" if $c =~ /\xE0[\x80-\x9F]/;
      return "Arabic letter" if $c =~ /\xE0\xA2[\xA0-\xBF]/; # extended letters
      return "other Arabic" if $c =~ /\xE0\xA3/; # extended characters
      return "Devanagari punctuation" if $c =~ /\xE0\xA5[\xA4\xA5]/; # danda, double danda
      return "Devanagari digit" if $c =~ /\xE0\xA5[\xA6-\xAF]/;
      return "Devanagari letter" if $c =~ /\xE0[\xA4-\xA5]/;
      return "Bengali digit" if $c =~ /\xE0\xA7[\xA6-\xAF]/;
      return "Bengali currency" if $c =~ /\xE0\xA7[\xB2-\xB9]/;
      return "Bengali letter" if $c =~ /\xE0[\xA6-\xA7]/;
      return "Gurmukhi digit" if $c =~ /\xE0\xA9[\xA6-\xAF]/;
      return "Gurmukhi letter" if $c =~ /\xE0[\xA8-\xA9]/;
      return "Gujarati digit" if $c =~ /\xE0\xAB[\xA6-\xAF]/;
      return "Gujarati letter" if $c =~ /\xE0[\xAA-\xAB]/;
      return "Oriya digit" if $c =~ /\xE0\xAD[\xA6-\xAF]/;
      return "Oriya fraction" if $c =~ /\xE0\xAD[\xB2-\xB7]/;
      return "Oriya letter" if $c =~ /\xE0[\xAC-\xAD]/;
      return "Tamil digit" if $c =~ /\xE0\xAF[\xA6-\xAF]/;
      return "Tamil number" if $c =~ /\xE0\xAF[\xB0-\xB2]/; # number (10, 100, 1000)
      return "Tamil letter" if $c =~ /\xE0[\xAE-\xAF]/;
      return "Telegu digit" if $c =~ /\xE0\xB1[\xA6-\xAF]/;
      return "Telegu fraction" if $c =~ /\xE0\xB1[\xB8-\xBE]/;
      return "Telegu letter" if $c =~ /\xE0[\xB0-\xB1]/;
      return "Kannada digit" if $c =~ /\xE0\xB3[\xA6-\xAF]/;
      return "Kannada letter" if $c =~ /\xE0[\xB2-\xB3]/;
      return "Malayalam digit" if $c =~ /\xE0\xB5[\x98-\x9E\xA6-\xB8]/;
      return "Malayalam punctuation" if $c =~ /\xE0\xB5\xB9/; # date mark
      return "Malayalam letter" if $c =~ /\xE0[\xB4-\xB5]/;
      return "Sinhala digit" if $c =~ /\xE0\xB7[\xA6-\xAF]/;
      return "Sinhala punctuation" if $c =~ /\xE0\xB7\xB4/;
      return "Sinhala letter" if $c =~ /\xE0[\xB6-\xB7]/;
      return "Thai currency" if $c =~ /\xE0\xB8\xBF/;
      return "Thai digit" if $c =~ /\xE0\xB9[\x90-\x99]/;
      return "Thai character" if $c =~ /\xE0[\xB8-\xB9]/;
      return "Lao punctuation" if $c =~ /\xE0\xBA\xAF/; # Lao ellipsis
      return "Lao digit" if $c =~ /\xE0\xBB[\x90-\x99]/;
      return "Lao character" if $c =~ /\xE0[\xBA-\xBB]/;
      return "Tibetan punctuation" if $c =~ /\xE0\xBC[\x81-\x94]/;
      return "Tibetan sign" if $c =~ /\xE0\xBC[\x95-\x9F]/;
      return "Tibetan digit" if $c =~ /\xE0\xBC[\xA0-\xB3]/;
      return "Tibetan punctuation" if $c =~ /\xE0\xBC[\xB4-\xBD]/;
      return "Tibetan letter" if $c =~ /\xE0[\xBC-\xBF]/;
      return "Myanmar digit" if $c =~ /\xE1\x81[\x80-\x89]/;
      return "Myanmar digit" if $c =~ /\xE1\x82[\x90-\x99]/; # Myanmar Shan digits
      return "Myanmar punctuation" if $c =~ /\xE1\x81[\x8A-\x8B]/;
      return "Myanmar letter" if $c =~ /\xE1[\x80-\x81]/;
      return "Myanmar letter" if $c =~ /\xE1\x82[\x80-\x9F]/;
      return "Georgian punctuation" if $c =~ /\xE1\x83\xBB/;
      return "Georgian letter" if $c =~ /\xE1\x82[\xA0-\xBF]/;
      return "Georgian letter" if $c =~ /\xE1\x83/;
      return "Georgian letter" if $c =~ /\xE1\xB2[\x90-\xBF]/; # Georgian Mtavruli capital letters
      return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/; # Georgian small letters (Khutsuri)
      return "Korean Hangul letter" if $c =~ /\xE1[\x84-\x87]/;
      return "Ethiopic punctuation" if $c =~ /\xE1\x8D[\xA0-\xA8]/;
      return "Ethiopic digit" if $c =~ /\xE1\x8D[\xA9-\xB1]/;
      return "Ethiopic number" if $c =~ /\xE1\x8D[\xB2-\xBC]/;
      return "Ethiopic syllable" if $c =~ /\xE1[\x88-\x8D]/;
      return "Cherokee letter" if $c =~ /\xE1\x8E[\xA0-\xBF]/;
      return "Cherokee letter" if $c =~ /\xE1\x8F/;
      return "Canadian punctuation" if $c =~ /\xE1\x90\x80/; # Canadian Syllabics hyphen
      return "Canadian punctuation" if $c =~ /\xE1\x99\xAE/; # Canadian Syllabics full stop
      return "Canadian syllable" if $c =~ /\xE1[\x90-\x99]/;
      return "Canadian syllable" if $c =~ /\xE1\xA2[\xB0-\xBF]/;
      return "Canadian syllable" if $c =~ /\xE1\xA3/;
      return "Ogham whitespace" if $c =~ /\xE1\x9A\x80/;
      return "Ogham letter" if $c =~ /\xE1\x9A[\x81-\x9A]/;
      return "Ogham punctuation" if $c =~ /\xE1\x9A[\x9B-\x9C]/;
      return "Runic punctuation" if $c =~ /\xE1\x9B[\xAB-\xAD]/;
      return "Runic letter" if $c =~ /\xE1\x9A[\xA0-\xBF]/;
      return "Runic letter" if $c =~ /\xE1\x9B/;
      return "Khmer currency" if $c =~ /\xE1\x9F\x9B/;
      return "Khmer digit" if $c =~ /\xE1\x9F[\xA0-\xA9]/;
      return "Khmer letter" if $c =~ /\xE1[\x9E-\x9F]/;
      return "Mongolian punctuation" if $c =~ /\xE1\xA0[\x80-\x8A]/;
      return "Mongolian digit" if $c =~ /\xE1\xA0[\x90-\x99]/;
      return "Mongolian letter" if $c =~ /\xE1[\xA0-\xA1]/;
      return "Mongolian letter" if $c =~ /\xE1\xA2[\x80-\xAF]/;
      return "Buginese letter" if $c =~ /\xE1\xA8[\x80-\x9B]/;
      return "Buginese punctuation" if $c =~ /\xE1\xA8[\x9E-\x9F]/;
      return "Balinese letter" if $c =~ /\xE1\xAC/;
      return "Balinese letter" if $c =~ /\xE1\xAD[\x80-\x8F]/;
      return "Balinese digit" if $c =~ /\xE1\xAD[\x90-\x99]/;
      # NOTE(review): the label below is misspelled ("puncutation"), so it
      # does not contain the word "punctuation". Kept as-is because callers
      # may compare against the exact string; confirm before correcting.
      return "Balinese puncutation" if $c =~ /\xE1\xAD[\x9A-\xA0]/;
      return "Balinese symbol" if $c =~ /\xE1\xAD[\xA1-\xBF]/;
      return "Sundanese digit" if $c =~ /\xE1\xAE[\xB0-\xB9]/;
      return "Sundanese letter" if $c =~ /\xE1\xAE/;
      return "Cyrillic letter" if $c =~ /\xE1\xB2[\x80-\x8F]/;
      return "Sundanese punctuation" if $c =~ /\xE1\xB3[\x80-\x8F]/;
      return "IPA" if $c =~ /\xE1[\xB4-\xB6]/;
      return "non-ASCII Latin letter" if $c =~ /\xE1[\xB8-\xBB]/;
      return "Greek letter" if $c =~ /\xE1[\xBC-\xBF]/;
      return "non-ASCII whitespace" if $c =~ /\xE2\x80[\x80-\x8A\xAF]/;
      return "zero-width space" if $c =~ /\xE2\x80\x8B/;
      return "zero-width non-space" if $c =~ /\xE2\x80\x8C/;
      return "zero-width joiner" if $c =~ /\xE2\x80\x8D/;
      return "directional mark" if $c =~ /\xE2\x80[\x8E-\x8F\xAA-\xAE]/;
      return "non-ASCII punctuation" if $c =~ /\xE2\x80[\x90-\xBF]/;
      return "non-ASCII punctuation" if $c =~ /\xE2\x81[\x80-\x9E]/;
      return "superscript letter" if $c =~ /\xE2\x81[\xB1\xBF]/;
      return "superscript digit" if $c =~ /\xE2\x81[\xB0-\xB9]/;
      return "superscript punctuation" if $c =~ /\xE2\x81[\xBA-\xBE]/;
      return "subscript digit" if $c =~ /\xE2\x82[\x80-\x89]/;
      return "subscript punctuation" if $c =~ /\xE2\x82[\x8A-\x8E]/;
      return "non-ASCII currency" if $c =~ /\xE2\x82[\xA0-\xBF]/;
      return "letterlike symbol" if $c =~ /\xE2\x84/;
      return "letterlike symbol" if $c =~ /\xE2\x85[\x80-\x8F]/;
      return "fraction" if $c =~ /\xE2\x85[\x90-\x9E]/; # NEW
      return "Roman number" if $c =~ /\xE2\x85[\xA0-\xBF]/; # NEW
      return "arrow symbol" if $c =~ /\xE2\x86[\x90-\xBF]/;
      return "arrow symbol" if $c =~ /\xE2\x87/;
      return "mathematical operator" if $c =~ /\xE2[\x88-\x8B]/;
      return "technical symbol" if $c =~ /\xE2[\x8C-\x8F]/;
      return "enclosed alphanumeric" if $c =~ /\xE2\x91[\xA0-\xBF]/;
      return "enclosed alphanumeric" if $c =~ /\xE2[\x92-\x93]/;
      return "box drawing" if $c =~ /\xE2[\x94-\x95]/;
      return "geometric shape" if $c =~ /\xE2\x96[\xA0-\xBF]/;
      return "geometric shape" if $c =~ /\xE2\x97/;
      return "pictograph" if $c =~ /\xE2[\x98-\x9E]/;
      return "arrow symbol" if $c =~ /\xE2\xAC[\x80-\x91\xB0-\xBF]/;
      return "geometric shape" if $c =~ /\xE2\xAC[\x92-\xAF]/;
      return "arrow symbol" if $c =~ /\xE2\xAD[\x80-\x8F\x9A-\xBF]/;
      return "geometric shape" if $c =~ /\xE2\xAD[\x90-\x99]/;
      return "arrow symbol" if $c =~ /\xE2\xAE[\x80-\xB9]/;
      return "geometric shape" if $c =~ /\xE2\xAE[\xBA-\xBF]/;
      return "geometric shape" if $c =~ /\xE2\xAF[\x80-\x88\x8A-\x8F]/;
      return "symbol" if $c =~ /\xE2[\xAC-\xAF]/;
      return "Coptic fraction" if $c =~ /\xE2\xB3\xBD/;
      return "Coptic punctuation" if $c =~ /\xE2\xB3[\xB9-\xBF]/;
      return "Coptic letter" if $c =~ /\xE2[\xB2-\xB3]/;
      return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/;
      return "Tifinagh punctuation" if $c =~ /\xE2\xB5\xB0/;
      return "Tifinagh letter" if $c =~ /\xE2\xB4[\xB0-\xBF]/;
      return "Tifinagh letter" if $c =~ /\xE2\xB5/;
      return "Ethiopic syllable" if $c =~ /\xE2\xB6/;
      return "Ethiopic syllable" if $c =~ /\xE2\xB7[\x80-\x9F]/;
      return "non-ASCII punctuation" if $c =~ /\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]/;
      return "symbol" if $c =~ /\xE3\x80[\x91\x92\xA0\xB6\xB7]/;
      return "Japanese hiragana character" if $c =~ /\xE3\x81/;
      return "Japanese hiragana character" if $c =~ /\xE3\x82[\x80-\x9F]/;
      return "Japanese katakana character" if $c =~ /\xE3\x82[\xA0-\xBF]/;
      return "Japanese katakana character" if $c =~ /\xE3\x83/;
      return "Bopomofo letter" if $c =~ /\xE3\x84[\x80-\xAF]/;
      return "Korean Hangul letter" if $c =~ /\xE3\x84[\xB0-\xBF]/;
      return "Korean Hangul letter" if $c =~ /\xE3\x85/;
      return "Korean Hangul letter" if $c =~ /\xE3\x86[\x80-\x8F]/;
      return "Bopomofo letter" if $c =~ /\xE3\x86[\xA0-\xBF]/;
      return "CJK stroke" if $c =~ /\xE3\x87[\x80-\xAF]/;
      return "Japanese kana character" if $c =~ /\xE3\x87[\xB0-\xBF]/;
      return "CJK symbol" if $c =~ /\xE3[\x88-\x8B]/;
      return "CJK square Latin abbreviation" if $c =~ /\xE3\x8D[\xB1-\xBA]/;
      return "CJK square Latin abbreviation" if $c =~ /\xE3\x8E/;
      return "CJK square Latin abbreviation" if $c =~ /\xE3\x8F[\x80-\x9F\xBF]/;
      return "CJK character" if $c =~ /\xE4[\xB8-\xBF]/;
      return "CJK character" if $c =~ /[\xE5-\xE9]/;
      return "Yi syllable" if $c =~ /\xEA[\x80-\x92]/;
      return "Lisu letter" if $c =~ /\xEA\x93[\x90-\xBD]/;
      return "Lisu punctuation" if $c =~ /\xEA\x93[\xBE-\xBF]/;
      return "Cyrillic letter" if $c =~ /\xEA\x99/;
      return "Cyrillic letter" if $c =~ /\xEA\x9A[\x80-\x9F]/;
      return "modifier tone" if $c =~ /\xEA\x9C[\x80-\xA1]/;
      return "Javanese punctuation" if $c =~ /\xEA\xA7[\x81-\x8D\x9E-\x9F]/;
      return "Javanese digit" if $c =~ /\xEA\xA7[\x90-\x99]/;
      return "Javanese letter" if $c =~ /\xEA\xA6/;
      return "Javanese letter" if $c =~ /\xEA\xA7[\x80-\x9F]/;
      return "Ethiopic syllable" if $c =~ /\xEA\xAC[\x80-\xAF]/;
      return "Cherokee letter" if $c =~ /\xEA\xAD[\xB0-\xBF]/;
      return "Cherokee letter" if $c =~ /\xEA\xAE/;
      return "Meetai Mayek digit" if $c =~ /\xEA\xAF[\xB0-\xB9]/;
      return "Meetai Mayek letter" if $c =~ /\xEA\xAF/;
      return "Korean Hangul syllable" if $c =~ /\xEA[\xB0-\xBF]/;
      return "Korean Hangul syllable" if $c =~ /[\xEB-\xEC]/;
      return "Korean Hangul syllable" if $c =~ /\xED[\x80-\x9E]/;
      return "Klingon letter" if $c =~ /\xEF\xA3[\x90-\xA9]/;
      return "Klingon digit" if $c =~ /\xEF\xA3[\xB0-\xB9]/;
      return "Klingon punctuation" if $c =~ /\xEF\xA3[\xBD-\xBE]/;
      return "Klingon symbol" if $c =~ /\xEF\xA3\xBF/;
      return "private use character" if $c =~ /\xEE/;
      return "Latin typographic ligature" if $c =~ /\xEF\xAC[\x80-\x86]/;
      return "Hebrew presentation letter" if $c =~ /\xEF\xAC[\x9D-\xBF]/;
      return "Hebrew presentation letter" if $c =~ /\xEF\xAD[\x80-\x8F]/;
      return "Arabic presentation letter" if $c =~ /\xEF\xAD[\x90-\xBF]/;
      return "Arabic presentation letter" if $c =~ /\xEF[\xAE-\xB7]/;
      return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\x90-\x99]/;
      return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\xB0-\xBF]/;
      return "non-ASCII punctuation" if $c =~ /\xEF\xB9[\x80-\xAB]/;
      return "Arabic presentation letter" if $c =~ /\xEF\xB9[\xB0-\xBF]/;
      return "Arabic presentation letter" if $c =~ /\xEF\xBA/;
      return "Arabic presentation letter" if $c =~ /\xEF\xBB[\x80-\xBC]/;
      return "byte-order mark/zero-width no-break space" if $c eq "\xEF\xBB\xBF";
      return "fullwidth currency" if $c =~ /\xEF\xBC\x84/;
      return "fullwidth digit" if $c =~ /\xEF\xBC[\x90-\x99]/;
      return "fullwidth Latin letter" if $c =~ /\xEF\xBC[\xA1-\xBA]/;
      return "fullwidth Latin letter" if $c =~ /\xEF\xBD[\x81-\x9A]/;
      return "fullwidth punctuation" if $c =~ /\xEF\xBC/;
      # Fullwidth punctuation stops at U+FF60 (EF BD A0). The range used to
      # run through A4, which swallowed U+FF61-U+FF64 and made the halfwidth
      # rule on the next line unreachable.
      return "fullwidth punctuation" if $c =~ /\xEF\xBD[\x9B-\xA0]/;
      return "halfwidth Japanese punctuation" if $c =~ /\xEF\xBD[\xA1-\xA4]/;
      return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBD[\xA5-\xBF]/;
      return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBE[\x80-\x9F]/;
      return "fullwidth currency" if $c =~ /\xEF\xBF[\xA0-\xA6]/;
      return "replacement character" if $c eq "\xEF\xBF\xBD";
   } elsif ($c =~ /[\xF0-\xF7]/) {
      # ---- 4 bytes ----
      return "non-UTF8 (invalid)" unless $c =~ /[\xF0-\xF7][\x80-\xBF]{3,3}$/;
      return "non-shortest-UTF8 (invalid)" if $c =~ /\xF0[\x80-\x8F]/;
      return "Linear B syllable" if $c =~ /\xF0\x90\x80/;
      return "Linear B syllable" if $c =~ /\xF0\x90\x81[\x80-\x8F]/;
      return "Linear B symbol" if $c =~ /\xF0\x90\x81[\x90-\x9F]/;
      return "Linear B ideogram" if $c =~ /\xF0\x90[\x82-\x83]/;
      return "Gothic letter" if $c =~ /\xF0\x90\x8C[\xB0-\xBF]/;
      return "Gothic letter" if $c =~ /\xF0\x90\x8D[\x80-\x8F]/;
      return "Phoenician letter" if $c =~ /\xF0\x90\xA4[\x80-\x95]/;
      return "Phoenician number" if $c =~ /\xF0\x90\xA4[\x96-\x9B]/;
      return "Phoenician punctuation" if $c =~ /\xF0\x90\xA4\x9F/; # word separator
      return "Old Hungarian number" if $c =~ /\xF0\x90\xB3[\xBA-\xBF]/;
      return "Old Hungarian letter" if $c =~ /\xF0\x90[\xB2-\xB3]/;
      return "Cuneiform digit" if $c =~ /\xF0\x92\x90/; # numeric sign
      return "Cuneiform digit" if $c =~ /\xF0\x92\x91[\x80-\xAF]/; # numeric sign
      return "Cuneiform punctuation" if $c =~ /\xF0\x92\x91[\xB0-\xBF]/;
      return "Cuneiform sign" if $c =~ /\xF0\x92[\x80-\x95]/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x81\xA8/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x82[\xAD-\xB6]/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x86[\x90\xBC-\xBF]/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x87[\x80-\x84]/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8D[\xA2-\xAB]/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8E[\x86-\x92]/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x8F[\xBA-\xBF]/;
      return "Egyptian hieroglyph number" if $c =~ /\xF0\x93\x90[\x80-\x83]/;
      return "Egyptian hieroglyph" if $c =~ /\xF0\x93[\x80-\x90]/;
      return "enclosed alphanumeric" if $c =~ /\xF0\x9F[\x84-\x87]/;
      return "Mahjong symbol" if $c =~ /\xF0\x9F\x80[\x80-\xAF]/;
      return "Domino symbol" if $c =~ /\xF0\x9F\x80[\xB0-\xBF]/;
      return "Domino symbol" if $c =~ /\xF0\x9F\x81/;
      return "Domino symbol" if $c =~ /\xF0\x9F\x82[\x80-\x9F]/;
      return "Playing card symbol" if $c =~ /\xF0\x9F\x82[\xA0-\xBF]/;
      return "Playing card symbol" if $c =~ /\xF0\x9F\x83/;
      return "CJK symbol" if $c =~ /\xF0\x9F[\x88-\x8B]/;
      return "pictograph" if $c =~ /\xF0\x9F[\x8C-\x9B]/;
      return "geometric shape" if $c =~ /\xF0\x9F[\x9E-\x9F]/;
      return "non-ASCII punctuation" if $c =~ /\xF0\x9F[\xA0-\xA3]/;
      return "pictograph" if $c =~ /\xF0\x9F[\xA4-\xAB]/;
      return "CJK character" if $c =~ /\xF0[\xA0-\xAF]/;
      return "tag" if $c =~ /\xF3\xA0[\x80-\x81]/;
      return "variation selector" if $c =~ /\xF3\xA0[\x84-\x87]/;
      return "private use character" if $c =~ /\xF3[\xB0-\xBF]/;
      return "private use character" if $c =~ /\xF4[\x80-\x8F]/;
      # ...
   } elsif ($c =~ /[\xF8-\xFB]/) {
      # ---- 5 bytes: only the length is checked ----
      return "non-UTF8 (invalid)" unless $c =~ /[\xF8-\xFB][\x80-\xBF]{4,4}$/;
   } elsif ($c =~ /[\xFC-\xFD]/) {
      # ---- 6 bytes: only the length is checked ----
      return "non-UTF8 (invalid)" unless $c =~ /[\xFC-\xFD][\x80-\xBF]{5,5}$/;
   } elsif ($c =~ /\xFE/) {
      # ---- 7 bytes: only the length is checked ----
      # A stray "]" after \xFE used to demand a literal bracket here, so this
      # check could never succeed; it now mirrors the two branches above.
      return "non-UTF8 (invalid)" unless $c =~ /\xFE[\x80-\xBF]{6,6}$/;
   } else {
      return "non-UTF8 (invalid)";
   }
   return "other character";
}
|
1401 |
+
|
1402 |
+
1;
|
1403 |
+
|
1404 |
+
|
uroman/lib/NLP/stringDistance.pm
ADDED
@@ -0,0 +1,724 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
################################################################
|
2 |
+
# #
|
3 |
+
# stringDistance #
|
4 |
+
# #
|
5 |
+
################################################################
|
6 |
+
|
7 |
+
package NLP::stringDistance;
|
8 |
+
|
9 |
+
use List::Util qw(min max);
|
10 |
+
$utf8 = NLP::UTF8;
|
11 |
+
$util = NLP::utilities;
|
12 |
+
$romanizer = NLP::Romanizer;
|
13 |
+
|
14 |
+
%dummy_ht = ();
|
15 |
+
|
16 |
+
# Register, for rule string $s under $lang_code, which of its proper
# prefixes can still grow into a longer prefix of the same rule string:
#   $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$shorter}->{$longer} = 1
#   $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$shorter}        = 1
# *ht is the rule table, passed as a typeglob.
sub rule_string_expansion {
    local($this, *ht, $s, $lang_code) = @_;

    my @chars = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);

    # $prefixes[$k] is the concatenation of the first $k+1 characters.
    my @prefixes = ();
    my $so_far = "";
    foreach my $char (@chars) {
        $so_far = join("", $so_far, $char);
        push(@prefixes, $so_far);
    }

    # Every prefix except the full string has at least one longer prefix.
    foreach my $short_idx (0 .. ($#prefixes - 1)) {
        my $shorter = $prefixes[$short_idx];
        $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$shorter} = 1;
        foreach my $long_idx (($short_idx + 1) .. $#prefixes) {
            my $longer = $prefixes[$long_idx];
            $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$shorter}->{$longer} = 1;
        }
    }
}
|
31 |
+
|
32 |
+
# Load string-distance cost rules from $filename into the rule table %ht
# (passed as typeglob *ht).
#
# Blank lines and lines starting with '#' are skipped; a leading UTF-8 BOM
# and trailing whitespace are removed.  Slots are read from each line with
# $util->slot_value_in_double_colon_del_list: s1, s2, cost, lc1, lc2,
# left1, left2, right1, right2, in-lc1, in-lc2, out-lc1, out-lc2.
# A line is ignored (with a message on STDERR) when both s1 and s2 are
# empty, when cost is missing or not a non-negative decimal number, or when
# it uses the unsupported slots ::left or ::right.
#
# For every (lc1, lc2) combination a rule is stored under a fresh numeric
# rule id in $ht{COST}, with its contexts in $ht{LEFT1}, {LEFT2}, {RIGHT1},
# {RIGHT2}, {INLC1}, {INLC2}, {OUTLC1}, {OUTLC2}.  Unless the rule is fully
# symmetric, a mirrored rule (s2 -> s1) is stored as well.
#
# Dies if $filename cannot be opened.  Prints a summary to STDERR if $verbose.
sub load_string_distance_data {
    local($this, $filename, *ht, $verbose) = @_;

    $verbose = 0 unless defined($verbose);
    # Lexical handle and 3-arg open: no clash with other users of a global
    # IN handle, and no special treatment of characters in $filename.
    open(my $in, "<", $filename) || die "Could not open $filename";
    my $line_number = 0;
    my $n_cost_rules = 0;
    while (<$in>) {
        $line_number++;
        my $line = $_;
        $line =~ s/^\xEF\xBB\xBF//;
        $line =~ s/\s*$//;
        next if $line =~ /^\s*(\#.*)?$/;
        print STDERR "** Warning: line $line_number contains suspicious control character: $line\n" if $line =~ /[\x00-\x1F]/;
        my $s1 = $util->slot_value_in_double_colon_del_list($line, "s1");
        my $s2 = $util->slot_value_in_double_colon_del_list($line, "s2");
        $s1 = $util->dequote_string($s1); # 'can\'t' => can't
        $s2 = $util->dequote_string($s2);
        my $cost = $util->slot_value_in_double_colon_del_list($line, "cost");
        if (($s1 eq "") && ($s2 eq "")) {
            print STDERR "Ignoring bad line $line_number in $filename, because both s1 and s2 are empty strings\n";
            next;
        }
        unless ($cost =~ /^\d+(\.\d+)?$/) {
            if ($cost eq "") {
                print STDERR "Ignoring bad line $line_number in $filename, because of missing cost\n";
            } else {
                print STDERR "Ignoring bad line $line_number in $filename, because of ill-formed cost $cost\n";
            }
            next;
        }
        my $lang_code1_s = $util->slot_value_in_double_colon_del_list($line, "lc1");
        my $lang_code2_s = $util->slot_value_in_double_colon_del_list($line, "lc2");
        # No language code means one rule under the generic code "".
        my @lang_codes_1 = ($lang_code1_s eq "") ? ("") : split(/,\s*/, $lang_code1_s);
        my @lang_codes_2 = ($lang_code2_s eq "") ? ("") : split(/,\s*/, $lang_code2_s);
        my $left_context1 = $util->slot_value_in_double_colon_del_list($line, "left1");
        my $left_context2 = $util->slot_value_in_double_colon_del_list($line, "left2");
        my $right_context1 = $util->slot_value_in_double_colon_del_list($line, "right1");
        my $right_context2 = $util->slot_value_in_double_colon_del_list($line, "right2");
        my $bad_left = $util->slot_value_in_double_colon_del_list($line, "left");
        if ($bad_left) {
            print STDERR "** Warning: slot '::left $bad_left' in line $line_number\n";
            next;
        }
        my $bad_right = $util->slot_value_in_double_colon_del_list($line, "right");
        if ($bad_right) {
            print STDERR "** Warning: slot '::right $bad_right' in line $line_number\n";
            next;
        }
        my $in_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "in-lc1");
        my $in_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "in-lc2");
        my $out_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "out-lc1");
        my $out_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "out-lc2");

        # Left contexts must be written as /regex/; the slashes are stripped.
        if ($left_context1) {
            if ($left_context1 =~ /^\/.*\/$/) {
                $left_context1 =~ s/^\///;
                $left_context1 =~ s/\/$//;
            } else {
                print STDERR "Ignoring unrecognized non-regular-express ::left1 $left_context1 in $line_number of $filename\n";
                $left_context1 = "";
            }
        }
        if ($left_context2) {
            if ($left_context2 =~ /^\/.*\/$/) {
                $left_context2 =~ s/^\///;
                $left_context2 =~ s/\/$//;
            } else {
                # Report the offending value before discarding it.
                print STDERR "Ignoring unrecognized non-regular-express ::left2 $left_context2 in $line_number of $filename\n";
                $left_context2 = "";
            }
        }
        # Right contexts must be a sequence of bracketed items: [..][..]...
        if ($right_context1) {
            unless ($right_context1 =~ /^(\[[^\[\]]*\])+$/) {
                print STDERR "Ignoring unrecognized right-context ::right1 $right_context1 in $line_number of $filename\n";
                $right_context1 = "";
            }
        }
        if ($right_context2) {
            unless ($right_context2 =~ /^(\[[^\[\]]*\])+$/) {
                print STDERR "Ignoring unrecognized right-context ::right2 $right_context2 in $line_number of $filename\n";
                $right_context2 = "";
            }
        }

        foreach my $lang_code1 (@lang_codes_1) {
            foreach my $lang_code2 (@lang_codes_2) {
                $n_cost_rules++;
                my $cost_rule_id = $n_cost_rules;
                $ht{COST}->{$lang_code1}->{$lang_code2}->{$s1}->{$s2}->{$cost_rule_id} = $cost;
                $ht{RULE_STRING}->{$lang_code1}->{$s1} = 1;
                $ht{RULE_STRING}->{$lang_code2}->{$s2} = 1;
                $ht{LEFT1}->{$cost_rule_id} = $left_context1;
                $ht{LEFT2}->{$cost_rule_id} = $left_context2;
                $ht{RIGHT1}->{$cost_rule_id} = $right_context1;
                $ht{RIGHT2}->{$cost_rule_id} = $right_context2;
                $ht{INLC1}->{$cost_rule_id} = $in_lang_codes1;
                $ht{INLC2}->{$cost_rule_id} = $in_lang_codes2;
                $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes1;
                $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes2;
                # Store the mirrored rule too, unless it would be identical.
                unless (($s1 eq $s2)
                     && ($lang_code1 eq $lang_code2)
                     && ($left_context1 eq $left_context2)
                     && ($right_context1 eq $right_context2)
                     && ($in_lang_codes1 eq $in_lang_codes2)
                     && ($out_lang_codes1 eq $out_lang_codes2)) {
                    $n_cost_rules++;
                    $cost_rule_id = $n_cost_rules;
                    $ht{COST}->{$lang_code2}->{$lang_code1}->{$s2}->{$s1}->{$cost_rule_id} = $cost;
                    $ht{LEFT1}->{$cost_rule_id} = $left_context2;
                    $ht{LEFT2}->{$cost_rule_id} = $left_context1;
                    $ht{RIGHT1}->{$cost_rule_id} = $right_context2;
                    $ht{RIGHT2}->{$cost_rule_id} = $right_context1;
                    $ht{INLC1}->{$cost_rule_id} = $in_lang_codes2;
                    $ht{INLC2}->{$cost_rule_id} = $in_lang_codes1;
                    $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes2;
                    $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes1;
                    # print STDERR " Flip rule in line $line: $line\n";
                }
                $this->rule_string_expansion(*ht, $s1, $lang_code1);
                $this->rule_string_expansion(*ht, $s2, $lang_code2);
            }
        }
    }
    close($in);
    print STDERR "Read in $n_cost_rules rules from $line_number lines in $filename\n" if $verbose;
}
|
157 |
+
|
158 |
+
# Build a trivial chart for the already-romanized string $s in %chart_ht
# (passed as typeglob *chart_ht): one node per character, spanning
# positions k to k+1, added through $romanizer->add_node.
# Sets $chart_ht{N_CHARS} to the character count and resets
# $chart_ht{N_NODES} to 0 before the nodes are added.
sub romanized_string_to_simple_chart {
    local($this, $s, *chart_ht) = @_;

    my @chars = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht);
    $chart_ht{N_CHARS} = scalar(@chars);
    $chart_ht{N_NODES} = 0;

    my $pos = 0;
    foreach my $char (@chars) {
        my $next_pos = $pos + 1;
        $romanizer->add_node($char, $pos, $next_pos, *chart_ht, "", "");
        $pos = $next_pos;
    }
}
|
168 |
+
|
169 |
+
# Map the positions of the chart %chart_ht onto a finer "linear" position
# scale in %sd_ht for chart $chart_id, so that each character of a
# multi-character node romanization gets its own arc.
#
# Pass 1 numbers the linear positions:
#   $sd_ht{POS2LINPOS}/{LINPOS2POS}            chart position <-> linear position
#   $sd_ht{SPLITPOS2LINPOS}/{LINPOS2SPLITPOS}  positions inside a node whose
#                                              romanization has 2+ characters
#   $sd_ht{MAXPOS}                             largest node end position
# Pass 2 records one arc per romanization character in
#   $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}->{$to}->{$char} = 1
# (an arc labelled "" for a node with an empty romanization), then sets
# $sd_ht{MAXLINPOS}->{$chart_id}.
#
# Both hashes are passed as typeglobs.  $verbose enables tracing on STDERR.
sub linearize_chart_points {
    local($this, *chart_ht, $chart_id, *sd_ht, $verbose) = @_;

    $verbose = 0 unless defined($verbose);
    print STDERR "Linearize $chart_id\n" if $verbose;
    my $current_chart_pos = 0;
    my $current_linear_chart_pos = 0;
    $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
    $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
    print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
    my @end_chart_positions = keys %{$chart_ht{NODES_ENDING_AT}};
    my $end_chart_pos = (@end_chart_positions) ? max(@end_chart_positions) : 0;
    $sd_ht{MAXPOS}->{$chart_id} = $end_chart_pos;
    print STDERR " Chart span: $current_chart_pos-$end_chart_pos\n" if $verbose;

    # Pass 1: assign linear positions.
    while ($current_chart_pos < $end_chart_pos) {
        my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
        foreach my $node_id (@node_ids) {
            my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
            my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
            print STDERR " $current_chart_pos/$current_linear_chart_pos node: $node_id $roman_s (@roman_chars)\n" if $verbose;
            # Every character boundary inside the node gets its own linear position.
            if ($#roman_chars >= 1) {
                foreach my $i ((1 .. $#roman_chars)) {
                    $current_linear_chart_pos++;
                    $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i} = $current_linear_chart_pos;
                    $sd_ht{LINPOS2SPLITPOS}->{$chart_id}->{$current_linear_chart_pos}->{$current_chart_pos}->{$node_id}->{$i} = 1;
                    print STDERR " LINPOS2SPLITPOS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id I: $i\n" if $verbose;
                }
            }
        }
        $current_chart_pos++;
        # Only chart positions where some node ends get a linear position.
        if ($util->member($current_chart_pos, @end_chart_positions)) {
            $current_linear_chart_pos++;
            $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos;
            $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos;
            print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose;
        }
    }

    # Pass 2: record the arcs between linear positions.
    $current_chart_pos = 0;
    while ($current_chart_pos <= $end_chart_pos) {
        # Used for tracing only; "?" marks a chart position without a linear position.
        my $shown_linear_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
        $shown_linear_pos = "?" unless defined($shown_linear_pos);
        my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}};
        # print STDERR " LINROM.$chart_id LIN: $shown_linear_pos POS: $current_chart_pos NODES: @node_ids\n" if $verbose;
        foreach my $node_id (@node_ids) {
            my $end_pos = $chart_ht{NODE_END}->{$node_id};
            my $end_linpos = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_pos};
            my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
            my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht);
            print STDERR " LINROM.$chart_id LIN: $shown_linear_pos POS: $current_chart_pos NODE: $node_id CHARS: @roman_chars\n" if $verbose;
            if (@roman_chars) {
                foreach my $i ((0 .. $#roman_chars)) {
                    # First character starts at the node start, last one ends at the
                    # node end; the boundaries in between are the split positions.
                    my $from_linear_chart_pos
                        = (($i == 0)
                            ? $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos}
                            : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i});
                    print STDERR " FROM.$chart_id I: $i POS: $current_chart_pos NODE: $node_id FROM: $from_linear_chart_pos\n" if $verbose;
                    my $to_linear_chart_pos
                        = (($i == $#roman_chars)
                            ? $end_linpos
                            : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{($i+1)});
                    print STDERR " TO.$chart_id I: $i POS: $current_chart_pos NODE: $node_id TO: $to_linear_chart_pos\n" if $verbose;
                    my $roman_char = $roman_chars[$i];
                    $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{$roman_char} = 1;
                }
            } else {
                # Empty romanization: add an arc labelled "" to the next chart
                # position that has a linear position.
                my $from_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos};
                my $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+1)};
                # Stop at the end of the chart: no position beyond it has a
                # linear position, so an unbounded search would never terminate.
                my $offset = 1;
                while ((! defined($to_linear_chart_pos))
                       && (($current_chart_pos + $offset) < $end_chart_pos)) {
                    $offset++;
                    $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+$offset)};
                }
                if (defined($from_linear_chart_pos) && defined($to_linear_chart_pos)) {
                    $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{""} = 1
                } else {
                    print STDERR " UNDEF.$chart_id from: "
                        . ((defined($from_linear_chart_pos)) ? $from_linear_chart_pos : "?")
                        . " to: "
                        . ((defined($to_linear_chart_pos)) ? $to_linear_chart_pos : "?")
                        . "\n";
                }
            }
        }
        $current_chart_pos++;
    }
    $sd_ht{MAXLINPOS}->{$chart_id} = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_chart_pos};
}
|
257 |
+
|
258 |
+
# For chart $chart_id, visit every arc recorded in
# $sd_ht{LIN_IJ_ROMAN} (start and end positions in ascending numeric order,
# labels in string order) and, for each label that is marked in
# $ht{RULE_STRING_HAS_EXPANSION} for $lang_code or for the generic code "",
# let expand_lin_ij_roman_rec add the longer arcs.
# *sd_ht and *ht are typeglobs of the chart table and the rule table.
sub expand_lin_ij_roman {
    local($this, *sd_ht, $chart_id, $lang_code, *ht) = @_;

    foreach my $from (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}}) {
        foreach my $to (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}}) {
            foreach my $label (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from}->{$to}}) {
                # Skip labels that are not a proper prefix of any rule string.
                next unless $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$label}
                         || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$label};
                $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $from, $to, $label, $lang_code, *ht);
            }
        }
    }
}
|
272 |
+
|
273 |
+
# Recursive worker for expand_lin_ij_roman.
# Given an arc $start-$end labelled $roman, try to extend it with each arc
# that leaves $end.  If the concatenated label is a rule string (for
# $lang_code or the generic code ""), record it as a new arc
#   $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}->{$new_end}->{$label} = 1
# and, if the concatenated label can itself still grow, recurse on it.
# Does nothing unless $roman is marked in $ht{RULE_STRING_HAS_EXPANSION}.
sub expand_lin_ij_roman_rec {
    local($this, *sd_ht, $chart_id, $start, $end, $roman, $lang_code, *ht) = @_;

    # print STDERR " expand_lin_ij_roman_rec.$chart_id $start-$end $lang_code $roman\n";
    my $expandable = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$roman}
                  || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman};
    return unless $expandable;

    foreach my $next_end (keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}}) {
        foreach my $suffix (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}->{$next_end}}) {
            my $longer = $roman . $suffix;

            my $is_rule_string = $ht{RULE_STRING}->{$lang_code}->{$longer}
                              || $ht{RULE_STRING}->{""}->{$longer};
            # print STDERR " Expansion ($start-$next_end) $longer\n" if $is_rule_string;
            $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}->{$next_end}->{$longer} = 1 if $is_rule_string;

            my $can_grow_further = $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$longer}
                                || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$longer};
            $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $start, $next_end, $longer, $lang_code, *ht)
                if $can_grow_further;
        }
    }
}
|
294 |
+
|
295 |
+
# Walk back along the best path stored by string_distance for the chart
# pair ($chart1_id, $chart2_id), following $sd_ht{PREC_I}/{PREC_J} from the
# end cell ($sd_ht{MAXLINPOS} of both charts) towards cell 0/0.
#
# $control is a string of options:
#   "verbose" - the trace lists every step as
#               "prec_i-i/prec_j-j:roman1/roman2:cost";
#   "chunks"  - instead of returning a trace, consecutive steps with a
#               non-zero cost are merged into chunks "roman1/roman2/cost"
#               (suffixed with "*" if $cost > 5) and counted in
#               $sd_ht{N_COST_CHUNK}, with $line_number recorded in
#               $sd_ht{EX_COST_CHUNK}.
# Without "chunks", returns the space-separated trace in left-to-right
# order; without "verbose" only steps with a non-zero cost are listed as
# "roman1/roman2:cost".  Returns "mismatch" if the pair is flagged in
# $sd_ht{MISMATCH}.  *sd_ht is the typeglob of the chart table.
sub trace_string_distance {
    local($this, *sd_ht, $chart1_id, $chart2_id, $control, $line_number, $cost) = @_;

    my $chart_comb_id = join("/", $chart1_id, $chart2_id);
    return "mismatch" if $sd_ht{MISMATCH}->{$chart_comb_id};
    my $chart1_end = $sd_ht{MAXLINPOS}->{$chart1_id};
    my $chart2_end = $sd_ht{MAXLINPOS}->{$chart2_id};
    my $verbose = ($control =~ /verbose/);
    my $chunks_p = ($control =~ /chunks/);
    my @traces = ();
    # Per-step romanizations and incremental costs, collected end-to-start.
    my @r1_s = ();
    my @r2_s = ();
    my @ic_s = ();

    # print STDERR "trace_string_distance $chart1_id $chart2_id $line_number\n";
    while ($chart1_end || $chart2_end) {
        my $incr_cost = $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        my $prec_i = $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        my $prec_j = $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
        if ($incr_cost || $verbose || $chunks_p) {
            my $roman1 = $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
            my $roman2 = $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
            if ($verbose) {
                push(@traces, "$prec_i-$chart1_end/$prec_j-$chart2_end:$roman1/$roman2:$incr_cost");
            } elsif (defined($roman1)) {
                push(@traces, "$roman1/$roman2:$incr_cost");
            } else {
                # No step recorded for this cell: report it, showing "?" for
                # an unknown predecessor instead of an undefined value.
                my $print_prec_i = (defined($prec_i)) ? $prec_i : "?";
                my $print_prec_j = (defined($prec_j)) ? $prec_j : "?";
                print STDERR " $print_prec_i-$chart1_end, $print_prec_j-$chart2_end\n";
            }
            if ($chunks_p) {
                push(@r1_s, $roman1);
                push(@r2_s, $roman2);
                push(@ic_s, $incr_cost);
            }
        }
        # An undefined predecessor ends the walk (both ends become false).
        $chart1_end = $prec_i;
        $chart2_end = $prec_j;
    }

    return join(" ", reverse @traces) unless $chunks_p;

    # Chunk mode: merge runs of costly steps.  Steps were collected from the
    # end of the strings, so each step is prepended to the running chunk.
    my $r1 = "";
    my $r2 = "";
    my $tc = 0;
    my $in_chunk = 0;
    my $record_chunk = sub {
        my $chunk = "$r1/$r2/$tc";
        $chunk .= "*" if $cost > 5;
        $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1;
        $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1;
    };
    foreach my $i ((0 .. $#ic_s)) {
        if ($ic_s[$i]) {
            $r1 = $r1_s[$i] . $r1;
            $r2 = $r2_s[$i] . $r2;
            $tc += $ic_s[$i];
            $in_chunk = 1;
        } elsif ($in_chunk) {
            # A zero-cost step closes the current chunk.
            $record_chunk->();
            $r1 = "";
            $r2 = "";
            $tc = 0;
            $in_chunk = 0;
        }
    }
    $record_chunk->() if $in_chunk;
}
|
377 |
+
|
378 |
+
# Check whether the right-context rule $right_context_rule matches the arcs
# of chart $chart_id (in $sd_ht{LIN_IJ_ROMAN}) that start at linear position
# $start_pos.
#
# A rule is a sequence of bracketed items, e.g. "[aeiou][n$]".  Each item is
# used as a character class against the first character of an arc label;
# the rest of the rule is then matched from that arc's end position.  An
# empty item, or an item containing "$", also matches where no arc leaves
# the current position.  The empty rule always matches.
#
# Returns 1 if some path through the chart satisfies the whole rule, else 0.
# All arcs are tried, so the result does not depend on hash ordering.
sub right_context_match {
    local($this, $right_context_rule, *sd_ht, $chart_id, $start_pos) = @_;

    return 1 if $right_context_rule eq "";
    # Lexical on purpose: these must survive the recursive calls below.
    my ($item, $rest) = ($right_context_rule =~ /^\[([^\[\]]*)\]*(.*)$/);
    return 0 unless defined($item);

    # "$" stands for end of string in the rule; inside the character class
    # below it must be taken literally.
    my $guarded_item = $item;
    $guarded_item =~ s/\$/\\\$/g;
    my @end_positions = keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}};
    return 1 if ($#end_positions == -1)
             && (($item eq "")
              || ($item =~ /\$/));
    # An empty item can only match at the end; "[]" is not a valid class.
    return 0 if $guarded_item eq "";

    foreach my $end_pos (@end_positions) {
        foreach my $roman (keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}->{$end_pos}}) {
            next unless $roman =~ /^[$guarded_item]/;
            # Keep looking at other arcs if the rest fails along this one.
            return 1 if $this->right_context_match($rest, *sd_ht, $chart_id, $end_pos);
        }
    }
    return 0;
}
|
400 |
+
|
401 |
+
# Compute the cheapest alignment cost between two linearized charts
# ($chart1_id, $chart2_id in %sd_ht) using the cost rules in %ht.
#
# Cells are pairs of linear positions (i in chart 1, j in chart 2).
# $sd_ht{COST_IJ}->{"$chart1_id/$chart2_id"}->{i}->{j} holds the cheapest
# cost found so far for reaching that cell from 0/0.  A cell is extended by
# pairing a chart-1 arc label (or "" for no advance) with a chart-2 arc
# label (or ""):
#   - identical labels cost 0 (rule "IDENTITY");
#   - otherwise the cost comes from
#     $ht{COST}->{lc1}->{lc2}->{roman1}->{roman2}->{rule_id}, tried for the
#     given language codes and for the generic code "", and only if the
#     rule's left contexts (regular expressions matched against the
#     romanization accumulated along the path) and right contexts (see
#     right_context_match) are satisfied.
# Whenever a cell improves, its predecessor, labels, incremental cost and
# rule are stored in PREC_I, PREC_J, ROMAN1, ROMAN2, COMB_LEFT_ROMAN1,
# COMB_LEFT_ROMAN2, INCR_COST_IJ and COST_RULE for trace_string_distance.
#
# Returns the cost of the end cell, or 99.9999 (and flags
# $sd_ht{MISMATCH}) if the end cell was never reached.
# $control containing "verbose" (case-insensitive) enables tracing on STDERR.
sub string_distance {
    local($this, *sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control) = @_;

    my $verbose = ($control =~ /verbose/i);
    my $chart_comb_id = join("/", $chart1_id, $chart2_id);

    my $chart1_end_pos = $sd_ht{MAXLINPOS}->{$chart1_id};
    my $chart2_end_pos = $sd_ht{MAXLINPOS}->{$chart2_id};
    print STDERR "string_distance.$chart_comb_id $chart1_end_pos/$chart2_end_pos\n" if $verbose;
    $sd_ht{COST_IJ}->{$chart_comb_id}->{0}->{0} = 0;
    $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{0}->{0} = "";
    $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{0}->{0} = "";
    # HHERE
    foreach my $chart1_start ((0 .. $chart1_end_pos)) {
        # print STDERR " C1 $chart1_start- ($chart1_start .. $chart1_end_pos)\n";
        my $prev_further_expansion_possible = 0;
        my @chart1_ends = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}};
        my $max_chart1_ends = (@chart1_ends) ? $chart1_ends[$#chart1_ends] : -1;
        foreach my $chart1_end (($chart1_start .. $chart1_end_pos)) {
            my $further_expansion_possible = ($chart1_start == $chart1_end)
                || defined($sd_ht{LINPOS2SPLITPOS}->{$chart1_id}->{$chart1_start})
                || ($chart1_end < $max_chart1_ends);
            # An empty span stands for "chart 1 does not advance".
            my @romans1 = (($chart1_start == $chart1_end)
                           ? ("")
                           : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}->{$chart1_end}}));
            if ($#romans1 == -1) {
                $further_expansion_possible = 1 if $prev_further_expansion_possible;
            } else {
                $prev_further_expansion_possible = 0;
            }
            # print STDERR " C1 $chart1_start-$chart1_end romans1: @romans1 {$further_expansion_possible} *l*\n";
            foreach my $roman1 (@romans1) {
                # print STDERR " C1 $chart1_start-$chart1_end $roman1 {$further_expansion_possible} *?*\n";
                next unless $ht{RULE_STRING}->{$lang_code1}->{$roman1}
                         || $ht{RULE_STRING}->{""}->{$roman1};
                # print STDERR " C1 $chart1_start-$chart1_end $roman1 {$further_expansion_possible} ***\n";
                foreach my $lang_code1o (($lang_code1, "")) {
                    foreach my $lang_code2o (($lang_code2, "")) {
                        # Chart-2 positions already reachable together with $chart1_start.
                        # The list grows while it is being walked (see the pushes below).
                        my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}});
                        foreach my $chart2_start (@chart2_starts) {
                            # print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start- (@chart2_starts)\n";
                            foreach my $chart2_end (($chart2_start .. $chart2_end_pos)) {
                                print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end\n" if $verbose;
                                my @romans2 = (($chart2_start == $chart2_end)
                                               ? ("")
                                               : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart2_id}->{$chart2_start}->{$chart2_end}}));
                                foreach my $roman2 (@romans2) {
                                    if ($roman1 eq $roman2) {
                                        print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2 (IDENTITY)\n" if $verbose;
                                        my $cost = 0;
                                        my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                        my $combined_cost = $preceding_cost + $cost;
                                        my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
                                        if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                            $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                            push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                                            $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
                                            $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
                                            $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
                                            $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
                                            $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
                                            $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
                                            $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
                                            $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = "IDENTITY";
                                            print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
                                        }
                                    } else {
                                        next unless $ht{RULE_STRING}->{$lang_code2o}->{$roman2};
                                        print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2\n" if $verbose;
                                        next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2});
                                        my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}};
                                        foreach my $cost_rule_id (@cost_rule_ids) {
                                            ## check whether any context requirements are satisfied
                                            # left context rules are regular expressions
                                            my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id};
                                            if ($left_context_rule1) {
                                                my $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                                if (defined($comb_left_roman1)) {
                                                    next unless $comb_left_roman1 =~ /$left_context_rule1/;
                                                } else {
                                                    print STDERR " No comb_left_roman1 value for $chart_comb_id $chart1_start,$chart2_start\n";
                                                }
                                            }
                                            my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id};
                                            if ($left_context_rule2) {
                                                my $comb_left_roman2 = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                                if (defined($comb_left_roman2)) {
                                                    next unless $comb_left_roman2 =~ /$left_context_rule2/;
                                                } else {
                                                    print STDERR " No comb_left_roman2 value for $chart_comb_id $chart1_start,$chart2_start\n";
                                                }
                                            }
                                            my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id};
                                            if ($right_context_rule1) {
                                                my $match_p = $this->right_context_match($right_context_rule1, *sd_ht, $chart1_id, $chart1_end);
                                                # print STDERR " Match?($right_context_rule1, 1, $chart1_end) = $match_p\n";
                                                next unless $match_p;
                                            }
                                            my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id};
                                            if ($right_context_rule2) {
                                                my $match_p = $this->right_context_match($right_context_rule2, *sd_ht, $chart2_id, $chart2_end);
                                                # print STDERR " Match?($right_context_rule2, 2, $chart2_end) = $match_p\n";
                                                next unless $match_p;
                                            }
                                            my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}->{$cost_rule_id};
                                            my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start};
                                            my $combined_cost = $preceding_cost + $cost;
                                            my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end};
                                            if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                                                $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost;
                                                push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts);
                                                $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start;
                                                $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start;
                                                $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1;
                                                $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2;
                                                $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                    = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1;
                                                $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}
                                                    = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2;
                                                $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost;
                                                $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost_rule_id;
                                                print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                # A label that is a proper prefix of a rule string may match
                # with a longer chart-1 span, so keep extending $chart1_end.
                $further_expansion_possible = 1
                    if $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code1}->{$roman1}
                    || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman1};
                # print STDERR " further_expansion_possible: $further_expansion_possible (lc: $lang_code1 r1: $roman1) ***\n";
            }
            # print STDERR " last C1 $chart1_start-$chart1_end (@romans1)\n" unless $further_expansion_possible;
            last unless $further_expansion_possible;
            $prev_further_expansion_possible = 1 if $further_expansion_possible;
        }
    }
    my $total_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end_pos}->{$chart2_end_pos};
    unless (defined($total_cost)) {
        # End cell never reached: report a fixed high cost and flag the pair.
        $total_cost = 99.9999;
        $sd_ht{MISMATCH}->{$chart_comb_id} = 1;
    }
    return $total_cost;
}
|
553 |
+
|
554 |
+
# Debugging aid: write the linearized romanization spans of two
# string-distance charts to a filehandle.
#   *sd_ht      string-distance hash (passed as a typeglob); only its
#               LIN_IJ_ROMAN table is read
#   $chart1_id  id of the first chart to print
#   $chart2_id  id of the second chart to print
#   *OUT        output filehandle (passed as a typeglob)
# Spans are listed by ascending start, then ascending end position (numeric
# order); the romanizations recorded for one span are listed in string order.
sub print_sd_ht {
    local($this, *sd_ht, $chart1_id, $chart2_id, *OUT) = @_;

    print OUT "string-distance chart:\n";
    foreach my $id ($chart1_id, $chart2_id) {
        print OUT "SD chart $id:\n";
        foreach my $from (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$id}}) {
            foreach my $to (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$id}->{$from}}) {
                my @romans = sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$id}->{$from}->{$to}};
                print OUT " Lnode($from-$to): $_\n" foreach @romans;
            }
        }
    }
}
|
569 |
+
|
570 |
+
# Debugging aid: write the nodes of a uroman chart to a filehandle.
#   *chart_ht   chart hash (passed as a typeglob); the tables read are
#               NODES_STARTING_AT, NODES_STARTING_AND_ENDING_AT and NODE_ROMAN
#   *OUT        output filehandle (passed as a typeglob)
# Nodes are listed by ascending start, then ascending end position (numeric
# order), then by node id (string order), so the output is reproducible from
# run to run.  The chart itself is only read: no entries are created in it
# for start positions that have no span table.
sub print_chart_ht {
    local($this, *chart_ht, *OUT) = @_;

    print OUT "uroman chart:\n";
    # "|| {}" keeps the lookups below from autovivifying entries in the
    # caller's chart.
    my $span_table = $chart_ht{NODES_STARTING_AND_ENDING_AT} || {};
    foreach my $start (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AT} || {}}) {
        my $nodes_by_end = $span_table->{$start} || {};
        foreach my $end (sort { $a <=> $b } keys %$nodes_by_end) {
            # Sorted: plain hash order would make the listing differ between runs.
            foreach my $node_id (sort keys %{$nodes_by_end->{$end} || {}}) {
                # Lexical on purpose, so nothing leaks into a package global.
                my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id};
                print OUT " Node $node_id ($start-$end): $roman_s\n";
            }
        }
    }
}
|
583 |
+
|
584 |
+
# Return a normalized copy of $s.  The string is treated as raw UTF-8 bytes.
# Applied in this order:
#   1. en-dash / em-dash (bytes E2 80 93 / E2 80 94) become an ASCII hyphen;
#   2. a run of the same character (one lead byte plus its continuation
#      bytes) is capped at two occurrences;
#   3. every run of spaces and tabs becomes a single space.
sub normalize_string {
    my ($this, $s) = @_;

    for ($s) {
        # s/(\xE2\x80\x8C)//g; # delete zero width non-joiner (currently disabled)
        s/(\xE2\x80[\x93-\x94])/-/g;
        s/([\x00-\x7F\xC0-\xFE][\x80-\xBF]*)\1+/$1$1/g;
        s/[ \t]+/ /g;
    }

    return $s;
}
|
594 |
+
|
595 |
+
# Source of chart ids: incremented once for every string that gets charted,
# so the two charts built by one call always have distinct ids.
my $string_distance_chart_id = 0;

# Compute the string distance between $s1 and $s2.
#   $s1, $s2                  the strings to compare
#   $lang_code1, $lang_code2  language codes handed to the romanizer and to
#                             the distance computation
#   *ht                       rule hash (typeglob), passed through unchanged
#   *pinyin_ht                typeglob passed through to the romanizer
#   $control                  optional control string (defaults to "")
# Each string is normalized, lower-cased and romanized into a chart; both
# charts are linearized and expanded into the shared %sd_ht, which
# string_distance() then scores.  Returns whatever string_distance() returns.
# %sd_ht and *chart_ht are not lexical to this sub: %sd_ht is emptied at the
# start of every call and still holds the charts of this call afterwards.
sub string_distance_by_chart {
    local($this, $s1, $s2, $lang_code1, $lang_code2, *ht, *pinyin_ht, $control) = @_;

    $control = "" unless defined($control);
    %sd_ht = ();

    # Chart for the first string.
    $s1 = $this->normalize_string($s1);
    my $lowercased1 = $utf8->extended_lower_case($s1);
    my $chart1_id = ++$string_distance_chart_id;
    *chart_ht = $romanizer->romanize($lowercased1, $lang_code1, "", *ht, *pinyin_ht, 0, "return chart", $chart1_id);
    $this->linearize_chart_points(*chart_ht, $chart1_id, *sd_ht);
    $this->expand_lin_ij_roman(*sd_ht, $chart1_id, $lang_code1, *ht);

    # Chart for the second string, built the same way under a fresh id.
    $s2 = $this->normalize_string($s2);
    my $lowercased2 = $utf8->extended_lower_case($s2);
    my $chart2_id = ++$string_distance_chart_id;
    *chart_ht = $romanizer->romanize($lowercased2, $lang_code2, "", *ht, *pinyin_ht, 0, "return chart", $chart2_id);
    $this->linearize_chart_points(*chart_ht, $chart2_id, *sd_ht);
    $this->expand_lin_ij_roman(*sd_ht, $chart2_id, $lang_code2, *ht);

    my $distance = $this->string_distance(*sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control);
    return $distance;
}
|
621 |
+
|
622 |
+
# Number of distances actually computed by
# quick_romanized_string_distance_by_chart (cache hits are not counted).
my $n_quick_romanized_string_distance = 0;

# Look up the rule-id => cost table for rewriting $substr1 into $substr2 under
# one pair of language codes.  Returns undef when there is no such table.
# Walks the nested hash one level at a time so that probing never creates
# (autovivifies) entries in the shared rule table.
sub _qrsd_rule_costs {
    my ($cost_ht, $lang_code1, $lang_code2, $substr1, $substr2) = @_;

    my $node = $cost_ht;
    foreach my $key ($lang_code1, $lang_code2, $substr1, $substr2) {
        return undef unless (ref($node) eq "HASH") && defined($node->{$key});
        $node = $node->{$key};
    }
    return (ref($node) eq "HASH") ? $node : undef;
}

# True iff the text to the left of a rewrite satisfies the rule's
# left-context regex.  An empty/false rule imposes no constraint.
sub _qrsd_left_context_ok {
    my ($rule, $left_context) = @_;

    return 1 unless $rule;
    return ($left_context =~ /$rule/) ? 1 : 0;
}

# True iff the text to the right of a rewrite satisfies the rule's
# right-context regex (anchored at the start of that text).  A rule that is a
# character class containing "$" is also satisfied by an empty right context.
# An empty/false rule imposes no constraint.
sub _qrsd_right_context_ok {
    my ($rule, $right_context) = @_;

    return 1 unless $rule;
    return 1 if $right_context =~ /^$rule/;
    return (($rule =~ /^\[[^\[\]]*\$/) && ($right_context eq "")) ? 1 : 0;
}

# Distance between two already-romanized strings, computed by dynamic
# programming directly over substrings (no romanization chart is built).
#   $s1, $s2                  strings to compare; lower-cased before use
#   *ht                       rule hash (typeglob); the tables read are COST,
#                             LEFT1, LEFT2, RIGHT1, RIGHT2; CACHED_QRSD is
#                             read and written when caching is on
#   $control                  optional; caching is on if it matches /cache/
#   $lang_code1, $lang_code2  optional language codes; rules filed under the
#                             given code and under "" are both tried
# Returns the cheapest total cost of rewriting all of $s1 into all of $s2,
# where equal substrings cost 0 and unequal ones need an applicable rule, or
# 99.99 when no sequence of rewrites covers both strings.
# NOTE(review): the cache is keyed on the two strings only, not on the
# language codes -- confirm callers never mix language codes with caching.
sub quick_romanized_string_distance_by_chart {
    local($this, $s1, $s2, *ht, $control, $lang_code1, $lang_code2) = @_;

    $s1 = lc $s1;
    $s2 = lc $s2;
    $control = "" unless defined($control);
    $lang_code1 = "" unless defined($lang_code1);
    $lang_code2 = "" unless defined($lang_code2);

    my $cache_p = ($control =~ /cache/);
    if ($cache_p) {
        my $cached_cost = $ht{CACHED_QRSD}->{$s1}->{$s2};
        return $cached_cost if defined($cached_cost);
    }

    # Language-specific rules first, then the language-independent ("") ones.
    my @lang_codes1 = ($lang_code1 eq "") ? ("") : ($lang_code1, "");
    my @lang_codes2 = ($lang_code2 eq "") ? ("") : ($lang_code2, "");
    my $chart1_end_pos = length($s1);
    my $chart2_end_pos = length($s2);

    # $sd_ht{COST_IJ}->{$i}->{$j}: cheapest known cost of rewriting the first
    # $i characters of $s1 into the first $j characters of $s2.
    my %sd_ht = ();
    $sd_ht{COST_IJ}->{0}->{0} = 0;

    foreach my $chart1_start (0 .. $chart1_end_pos) {
        my $left_context1 = substr($s1, 0, $chart1_start);
        foreach my $chart1_end ($chart1_start .. $chart1_end_pos) {
            my $substr1 = substr($s1, $chart1_start, $chart1_end - $chart1_start);
            my $right_context1 = substr($s1, $chart1_end);

            my @chart2_starts = sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart1_start} || {}};
            my %is_queued = map { ($_ => 1) } @chart2_starts;

            # Index-based loop because the work list grows while it is being
            # processed; foreach over an array that is pushed onto is not
            # well defined.
            for (my $i = 0; $i < scalar(@chart2_starts); $i++) {
                my $chart2_start = $chart2_starts[$i];
                my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start};
                # Only extend from cells that are actually reachable.  Without
                # this check an unreachable cell would count as cost 0.
                next unless defined($preceding_cost);
                my $left_context2 = substr($s2, 0, $chart2_start);

                foreach my $chart2_end ($chart2_start .. $chart2_end_pos) {
                    my $substr2 = substr($s2, $chart2_start, $chart2_end - $chart2_start);

                    # Cheapest way to rewrite $substr1 into $substr2 in one
                    # step; stays undefined if there is none.
                    my $step_cost;
                    if ($substr1 eq $substr2) {
                        $step_cost = 0;
                    } else {
                        my $right_context2 = substr($s2, $chart2_end);
                        foreach my $lang_code1o (@lang_codes1) {
                            foreach my $lang_code2o (@lang_codes2) {
                                my $rule_costs = _qrsd_rule_costs($ht{COST}, $lang_code1o, $lang_code2o, $substr1, $substr2);
                                next unless defined($rule_costs);
                                foreach my $cost_rule_id (keys %$rule_costs) {
                                    my $cost = $rule_costs->{$cost_rule_id};
                                    next unless defined($cost);   # ignore malformed entries
                                    next unless _qrsd_left_context_ok($ht{LEFT1}->{$cost_rule_id}, $left_context1);
                                    next unless _qrsd_left_context_ok($ht{LEFT2}->{$cost_rule_id}, $left_context2);
                                    next unless _qrsd_right_context_ok($ht{RIGHT1}->{$cost_rule_id}, $right_context1);
                                    next unless _qrsd_right_context_ok($ht{RIGHT2}->{$cost_rule_id}, $right_context2);
                                    $step_cost = $cost if (! defined($step_cost)) || ($cost < $step_cost);
                                }
                            }
                        }
                    }
                    next unless defined($step_cost);

                    my $combined_cost = $preceding_cost + $step_cost;
                    my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end};
                    if ((! defined($old_cost)) || ($combined_cost < $old_cost)) {
                        $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost;
                        # When $substr1 is empty the cell just filled lies in
                        # the row being processed, so it must be visited too.
                        unless ($is_queued{$chart2_end}) {
                            push(@chart2_starts, $chart2_end);
                            $is_queued{$chart2_end} = 1;
                        }
                    }
                }
            }
        }
    }

    my $total_cost = $sd_ht{COST_IJ}->{$chart1_end_pos}->{$chart2_end_pos};
    $total_cost = 99.99 unless defined($total_cost);
    $ht{CACHED_QRSD}->{$s1}->{$s2} = $total_cost if $cache_p;
    $n_quick_romanized_string_distance++;
    return $total_cost;
}
|
718 |
+
|
719 |
+
# Accessor for the file-level counter that
# quick_romanized_string_distance_by_chart increments each time it computes
# a distance (calls answered from its cache return before the increment).
sub get_n_quick_romanized_string_distance {
    my $n_computed = $n_quick_romanized_string_distance;
    return $n_computed;
}
|
722 |
+
|
723 |
+
1;
|
724 |
+
|
uroman/lib/NLP/utilities.pm
ADDED
The diff for this file is too large to render.
See raw diff
|
|
uroman/tarballs/uroman-v1.0.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:912655beef069e5abb43c8fc4c3c4428fd0af6f4a1697accc98277933d3e1ee5
|
3 |
+
size 440252
|
uroman/tarballs/uroman-v1.1.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df990f6096a10e093ac5f28c2b86d5ef9e9098ef7472855843f9a841bb3b963d
|
3 |
+
size 507234
|
uroman/tarballs/uroman-v1.2.4.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:77d707f3c17d5c45869b80fe71caee6023d1d9949ccffb446626f374605a25e2
|
3 |
+
size 503690
|
uroman/tarballs/uroman-v1.2.5.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2e9044afff8b4483f43a99b1fb1279889336760d76245ee93f300e660a46660
|
3 |
+
size 575581
|
uroman/tarballs/uroman-v1.2.6.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02f6f73b067b972a8f7d408da2f9b22741629af67f55b2ea768d11710fbf40a4
|
3 |
+
size 567522
|
uroman/tarballs/uroman-v1.2.7.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fbb51506ed3ea6dcb902c824e62bea39b3741f6526564ba05d6e0083d8d876e5
|
3 |
+
size 566800
|
uroman/tarballs/uroman-v1.2.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c69e56d9c5eea9416ae00ca4dd859a1ef5129c1867778b66ad2f811f0fd33c9
|
3 |
+
size 494625
|
uroman/test/multi-script.txt
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::lcode deu Grüße aus Bordeaux
|
2 |
+
::lcode tur İstanbul, Türkiye'de yer alan şehir ve ülkenin 81 ilinden biri.
|
3 |
+
::lcode eng ⠠⠺⠑⠀⠓⠕⠇⠙⠀⠘⠮⠀⠞⠗⠥⠹⠎⠀⠞⠕⠀⠆⠀⠎⠑⠇⠋⠤⠑⠧⠊⠙⠢⠞⠂⠀⠞⠀⠁⠇⠇⠀⠍⠑⠝⠀⠜⠑⠀⠉⠗⠂⠞⠫⠀⠑⠟⠥⠁⠇⠂⠀⠞⠀⠮⠽⠀⠜⠑⠀⠑⠝⠙⠪⠫⠀⠃⠽⠀⠸⠮⠀⠠⠉⠗⠑⠁⠞⠕⠗⠀⠾⠀⠉⠻⠞⠁⠔⠀⠥⠝⠁⠇⠊⠑⠝⠁⠃⠇⠑⠀⠠⠐⠗⠎⠂⠀⠞⠀⠁⠍⠰⠛⠀⠘⠮⠀⠜⠑⠀⠠⠇⠊⠋⠑⠂⠀⠠⠇⠊⠃⠻⠞⠽⠀⠯⠀⠮⠀⠏⠥⠗⠎⠥⠊⠞⠀⠷⠀⠠⠓⠁⠏⠏⠊⠰⠎⠲
|
4 |
+
::lcode ell Το Λος Άντζελες (στα ισπανικά Los Angeles = Οι Άγγελοι) ή στην Αμερικανική αργκό L.A., ελ έι) είναι η δεύτερη μεγαλύτερη πόλη των Ηνωμένων Πολιτειών από άποψη πληθυσμού, καθώς και ένα από τα σημαντικότερα οικονομικά, πολιτιστικά επιστημονικά και ψυχαγωγικά κέντρα του κόσμου.
|
5 |
+
::lcode rus Герма́ния (нем. Deutschland), официальное название — Федерати́вная Респу́блика Герма́ния (нем. Bundesrepublik Deutschland), ФРГ (нем. BRD) — государство в Западной Европе. Площадь территории — 357 021 км². Численность населения по переписи 2011 года — более 80 миллионов человек. [2][6].
|
6 |
+
::lcode ukr Володи́мир Олекса́ндрович Зеле́нський (нар. 25 січня 1978, Кривий Ріг) — український державний діяч, політик, шоумен, актор, комік, режисер, продюсер та сценарист, шостий Президент України з 20 травня 2019 року.
|
7 |
+
::lcode srp Сва људска бића рађају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства.
|
8 |
+
::lcode ara كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
|
9 |
+
::lcode fas کالیفرنیا (به انگلیسی: California) ایالتی در غرب آمریکا بر کرانهٔ اقیانوس آرام است. مرکز آن ساکرامنتو و شهرهای مهم آن لسآنجلس، سن دیگو، سن خوزه و سانفرانسیسکو هستند.همچنین این ایالت پر جمعیت ترین ایالت امریکا است.
|
10 |
+
::lcode uig ئامېرىكا قوشما شتاتلىرى بولسا شىمالىي ئامېرىكاغا جايلاشقان بىر دۆلەت. ئۇنىڭ پايتەختى بولسا ۋاشىنگتون، ئەڭ چوڭ شەھىرى بولسا نيۇيورك شەھىرى. دۆلەت تىلى بولسا ئېنگلىزتىلى. ھازىرقى زۇڭتۇڭ باراك ئوباما. بۇ دۆلەت ئەسلىدە ئەنگىلىيەنىڭ مۇستەملىكىسى بولۇپ ۋاشىنگىتوننىڭ رەھپەرلىكىدە 1776 يىلى 7 ئاينىڭ 4 كۇنى مۇستەقىل بولغان، يەر مەيدانى 9 مىلىيون 826 مىڭ 630 كۋادىرات كلومېتىر، نوپۇسى 306 مىللىيون 142 مىڭ، بۇلارنىڭ ئاسساسلىق دىنى خرىستىئان دىنى.
|
11 |
+
::lcode amh ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
|
12 |
+
::lcode hin कैलिफ़ोर्निया शब्द का पहला अर्थ था जो क्षेत्र जहाँ आज बाहा कैलिफ़ोर्निया प्रायद्वीप, नेवाडा, यूटा और एरिज़ोना, नया मेक्सिको, और वायोमिंग के कई विभाग स्थित हैं।
|
13 |
+
::lcode mar लंडन (इंग्लिश: London ) हे इंग्लंडचे व युनायटेड किंग्डमचे राजधानीचे व सर्वात मोठे शहर तसेच युरोपियन संघामधील सर्वात मोठे महानगर क्षेत्र आहे.
|
14 |
+
::lcode nep यसको उचाइ समुन्द्र सतहबाट ८,८४८ मीटर (२९,०२८ फीट) छ। यो नेपालको सोलुखुम्बु जिल्लाको खुम्जुङ्ग गा. वि. स. मा पर्छ ।
|
15 |
+
::lcode tam தமிழ்நாடு (Tamil Nadu) இந்தியாவின் 29 மாநிலங்களில் ஒன்றாகும். தமிழ்நாடு, தமிழகம் என்றும் பரவலாக அழைக்கப்படுகிறது.
|
16 |
+
::lcode mal ഇന്ത്യയുടെ തെക്കുപടിഞ്ഞാറെ അറ്റത്തുള്ള സംസ്ഥാനമാണ് കേരളം.
|
17 |
+
::lcode ori ଓଡ଼ିଶା ଭାରତର ପୂର୍ବ ଉପକୂଳରେ ଥିବା ଏକ ପ୍ରଶାସନିକ ରାଜ୍ୟ । ଏହାର ଉତ୍ତର-ପୂର୍ବରେ ପଶ୍ଚିମବଙ୍ଗ, ଉତ୍ତରରେ ଝାଡ଼ଖଣ୍ଡ, ପଶ୍ଚିମ ଓ ଉତ୍ତର-ପଶ୍ଚିମରେ ଛତିଶଗଡ଼, ଦକ୍ଷିଣ ଓ ଦକ୍ଷିଣ-ପଶ୍ଚିମରେ ଆନ୍ଧ୍ରପ୍ରଦେଶ ଅବସ୍ଥିତ । ଏହା ଆୟତନ ହିସାବରେ ନବମ ଓ ଜନସଂଖ୍ୟା ହିସାବରେ ଏଗାରତମ ରାଜ୍ୟ । ଓଡ଼ିଆ ଭାଷା ରାଜ୍ୟର ସରକାରୀ ଭାଷା । ୨୦୦୧ ଜନଗଣନା ଅନୁସାରେ ରାଜ୍ୟର ପ୍ରାୟ ୩୩.୨ ନିୟୁତ ଲୋକ ଓଡ଼ିଆ ଭାଷା ବ୍ୟବହାର କରନ୍ତି ।
|
18 |
+
::lcode zho 加拿大在一万四千年前即有原住民在此生活。
|
19 |
+
::lcode heb כֹּל עוֹד בַּלֵּבָב פְּנִימָה נֶפֶשׁ יְהוּדִי הוֹמִיָּה וּלְפַאֲתֵי מִזְרָח, קָדִימָה, עַיִן לְצִיּוֹן צוֹפִיָּה, עוֹד לֹא אָבְדָה תִּקְוָתֵנוּ, הַתִּקְוָה בַּת שְׁנוֹת אַלְפַּיִם לִהְיוֹת עַם חָפְשִׁי בְּאַרְצֵנוּ, אֶרֶץ צִיּוֹן וִירוּשָׁלַיִם.
|
20 |
+
::lcode yid דווקא איז אן העברעישער זשורנאל וואס באשרייבט די יידיש־שפראכיקע קולטור. עס איז דערשינען געווארן תמוז ה'תשס"ז (יולי 2006).
|
21 |
+
::lcode hye Տալնոեի շրջան (ուկր.՝ Тальнівський район), շրջան Ուկրաինայի Չերկասիի մարզում։ Ստեղծվել է 1923 թվականին։ Վարչական կենտրոնը՝ Տալնոե։ Աշխարհագրությունը Շրջանի տարածքի մակերեսը կազմում է 917 կմ²։ Բնակչություն
|
22 |
+
::lcode tai มีประเทศอิสระ 2 ประเทศ คือ ซานมารีโนและนครรัฐวาติกัน เป็นดินแดนที่ล้อมรอบไปด้วยพื้นที่ของอิตาลี ในขณะที่เมืองกัมปีโอเนดีตาเลีย เป็นดินแดนส่วนแยกของอิตาลีที่ถูกล้อมรอบด้วยพื้นที่ประเทศสวิตเซอร์แลนด์
|
23 |
+
북쪽에는 인도네시아와 동티모르, 파푸아 뉴기니, 북동쪽에는 솔로몬 제도와 바누아투, 누벨칼레도니, 그리고 남동쪽에는 뉴질랜드가 있다.
|
24 |
+
ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗೀ... ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗಿ ಭವ ಭವದಿ ಭತಿಸಿಹೇ ಭವತಿ ದೂರ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ || ಬಾ ಇಲ್ಲಿ ||
|
25 |
+
ვეპხის ტყაოსანი შოთა რუსთაველი ღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსა და მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე, მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა.
|
26 |
+
᚛ᚐᚅᚋ ᚋᚖᚂᚓᚌᚖᚋᚏᚔᚇ ᚋᚐᚉᚔ ᚍᚓᚉᚒᚋᚓᚅ᚜
|
27 |
+
ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬
|
28 |
+
𓊪𓏏𓍯𓃭𓐝𓇌𓋴
|
29 |
+
チェコスロバキア
|
30 |
+
ལྷ་ས་གྲོང་ཁྱེར
|
31 |
+
ᓵᓕ ᓴᕕᐊᕐᔪᒃ ᐃᒻᒥᓂᒃ ᓂᓪᓕᕈᑎᖃᓲᖑᕗᖅ ᑕᐃᑦᓱᒪᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ. ᐃᒻᒥᓂᓪᓗᑕᐅᖅ ᓂᓪᓕᕈᑎᖃᓱᖑᒻᒥᓱᓂ ᐅᓪᓗᒥᓂᑕᑦᓴᔭᐅᓂᕋᕐᓱᓂ.
|
32 |
+
ⴰⵎⴰⴳⵔⴰⴷ 1 ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ.
|
uroman/test/multi-script.uroman-ref.txt
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
::lcode deu Gruesse aus Bordeaux
|
2 |
+
::lcode tur Istanbul, Tuerkiye'de yer alan shehir ve uelkenin 81 ilinden biri.
|
3 |
+
::lcode eng We hold ⠘e truos to ; self-evid⠢t, t all men aee cr,te equal, t ey aee endoee by ⠸e Creator u cita⠔ unalienable ⠠⠐rs, t amg ⠘e aee Life, Libity ⠯ e pursuit a Happis.
|
4 |
+
::lcode ell To Los Andzeles (sta ispanika Los Angeles = Oi Angeloi) e sten Amerikanike arngo L.A., el ei) einai e deutere megalutere pole ton Enomenon Politeion apo apopse plethysmou, kathos kai ena apo ta semandikotera oikonomika, politistika epistemonika kai psychagogika kendra tou kosmou.
|
5 |
+
::lcode rus Germaniya (nem. Deutschland), ofitsialnoe nazvanie — Federativnaya Respublika Germaniya (nem. Bundesrepublik Deutschland), FRG (nem. BRD) — gosudarstvo v Zapadnoi Evrope. Ploshchad territorii — 357 021 km². Chislennost naseleniya po perepisi 2011 goda — bolee 80 millionov chelovek. [2][6].
|
6 |
+
::lcode ukr Volodimir Oleksandrovich Zelensky (nar. 25 sichnya 1978, Krivy Rig) — ukrayinsky derzhavny diyach, politik, shoumen, aktor, komik, rezhiser, prodyuser ta stsenarist, shosty Prezident Ukrayini z 20 travnya 2019 roku.
|
7 |
+
::lcode srp Sva ljudska bitsha radjaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sveshtshu i treba jedni prema drugima da postupaju u dukhu bratstva.
|
8 |
+
::lcode ara knda (balinjlyzya: Canada) hy dwla fy amryka alshmalya ttalf mn 10 mqat'at wthlatha aqalym. tq' fy alqsm alshmaly mn alqara wtmtd mn almhyt alatlsy fy alshrq ila almhyt alhadye fy alghrb wtmtd shmalan fy almhyt almtjmd alshmaly. knda hy albld althany 'almyan mn hyth almsaha alklya. kma an hdwd knda almshtrka m' alwlayat almthda mn aljnwb walshmal alghrby hy alatwl fy al'alm.
|
9 |
+
::lcode fas kalifrnia (bh anglisi: California) ialti dr ghrb amrika br kranh' aqianws aram ast. mrkz an sakramntw w shhrhai mhm an lsanjls, sn digw, sn khwzh w sanfransiskw hstnd.hmtchnin in ialt pr jm'it trin ialt amrika ast.
|
10 |
+
::lcode uig yeameraka qwshma shtatlara bwlsa shamalay yeamerakagha jaylashqan bar doelaet. yeunang paytaekhta bwlsa vashangtwn, yeaeng tchwng shaehara bwlsa nyuywrk shaehara. doelaet tala bwlsa yeenglaztala. hazarqa zungtung barak yewbama. bu doelaet yeaesladae yeaengalayaenang mustaemlakasa bwlup vashangatwnnang raehpaerlakadae 1776 yala 7 yeaynang 4 kuna mustaeqal bwlghan, yaer maeydana 9 malaywn 826 mang 630 kvadarat klwmetar, nwpusa 306 mallaywn 142 mang, bularnang yeassaslaq dana khrastayean dana.
|
11 |
+
::lcode amh iteyopheyaa kaaalame sosetu teleqe yaaberehaame hayemaanotoche gaare taarikaawi genenyunate alaate.
|
12 |
+
::lcode hin kailiphorniyaa shabda kaa pahalaa artha thaa jo kssetra jahaam aaj baahaa kailiphorniyaa praayadviip, nevaaddaa, yuuttaa aur erijonaa, nayaa meksiko, aur vaayomimga ke kaii vibhaag sthit haim.
|
13 |
+
::lcode mar lamddan (imglish: London ) he imglamddace va yunaayattedd kimgddamace raajadhaaniice va sarvaat motthe shahar tasec yuropiyan samghaamadhiil sarvaat motthe mahaanagar kssetra aahe.
|
14 |
+
::lcode nep yasako ucaai samundra satahabaatt 8,848 miittar (29,028 phiitt) cha. yo nepaalako solukhumbu jillaako khumjungga gaa. vi. sa. maa parcha .
|
15 |
+
::lcode tam tamilnaadu (Tamil Nadu) intiyaavin 29 maanilangkalil onraakum. tamilnaadu, tamilakam enrum paravalaaka alaikkappadukiratu.
|
16 |
+
::lcode mal intyayutte tekkupattinynyaarre arrrrattulllla samsthaanamaann keerallam.
|
17 |
+
::lcode ori oddishaa bhaaratara puurba upakuullare thibaa eka prashaasanika raajya . ehaara uttara-puurbare pashcimabangga, uttarare jhaaddakhanndda, pashcima o uttara-pashcimare chatishagadda, dakssinna o dakssinna-pashcimare aandhrapradesha abasthita . ehaa aayatana hisaabare nabama o janasamkhyaa hisaabare egaaratama raajya . oddiaa bhaassaa raajyara sarakaarii bhaassaa . 2001 janagannanaa anusaare raajyara praaya 33.2 niyuta loka oddiaa bhaassaa byabahaara karanti .
|
18 |
+
::lcode zho jianadazai14000nianqianjiyouyuanzhuminzaicishenghuo.
|
19 |
+
::lcode heb kol 'od balevav penimah nefesh yehudi homiyah ulefa'ate mizerach, qadimah, 'ayin letsiyon tsofiyah, 'od lo avedah tiqvatenu, hatiqvah bat shenot 'alepayim liheyot 'am chafeshiy be'aretsenu, erets tsiyon virushalayim.
|
20 |
+
::lcode yid dvvqa ayz an h'vr'ysh'r zshvrnal vvas vashryyvt dy yydysh-shfrakyq' qvltvr. 's ayz d'rshyn'n g'vvarn tmvz h'tshs"z (yvly 2006).
|
21 |
+
::lcode hye Talnoei shrjan (ukr., Talnivsky raion), shrjan Ukrainayi Cherkasii marzum. Steghtsvel e 1923 tvakanin. Varchakan kentrone, Talnoe. Ashkharhagrutyune Shrjani taratski makerese kazmum e 917 km². Bnakchutyun
|
22 |
+
::lcode tai miipratesisra 2 prates kuee saanmaariinolaeankrratwaatikan peondindaentiilomrobpaidwypueentiikongitaalii naiknatiimeueengkampiionediitaaleiiy peondindaenswnyaekkongitaaliitiituuklomrobdwypueentiipratesswitserlaend
|
23 |
+
bugjjogeneun indonesiawa dongtimoreu, papua nyugini, bugdongjjogeneun solromon jedowa banuatu, nubelkalredoni, geurigo namdongjjogeneun nyujilraendeuga issda.
|
24 |
+
baa illi sambhavisu imdenna hrdayadali nityavuu avataripa satyaavataara mannnnaagi maravaagi migavaagi kagavaagii... mannnnaagi maravaagi migavaagi kagavaagi bhava bhavadi bhatisihee bhavati duura nityavuu avataripa satyaavataara || baa illi ||
|
25 |
+
vepxis tqaosani shota rustaveli ghmertsi shemvedre, nutu kvla damxsnas sophlisa shromasa, tsetsxls, tsqalsa da mitsasa, haerta tana mromasa; momtsnes phrteni da aghvphrinde, mivhxvde mas chemsa ndomasa, dghisit da ghamit vhxedvide mzisa elvata krtomaasa.
|
26 |
+
anm moilegoimrid maki vekumen
|
27 |
+
ic mag glas eotan ond hit ne hearmiath me.
|
28 |
+
ptolmys
|
29 |
+
chekosurobakia
|
30 |
+
lha·sa·grong·khyer
|
31 |
+
saali safiaryok imminik nillirotiqasoongofoq taitsomanitatsayaonirarsoni. imminillotaoq nillirotiqasongommisoni ollominitatsayaonirarsoni.
|
32 |
+
amagrad 1 ar d ttlalan middn gan ilellitn mgaddan gh waddur d izrfan, yili ak darsn unlli d ufrak, illa flla sn ad ttmyawasn ngratsn s tagmat.
|
uroman/test/string-similarity-test-input.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
trap strap
|
2 |
+
colour color
|
3 |
+
labeling labelling
|
4 |
+
organisation organization
|
5 |
+
Philadelphia Filadelfia
|
6 |
+
Vladimir Volodymyr
|
7 |
+
Moskva Moskvoy
|
uroman/test/string-similarity-test-output-ref.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Lang-code-1: eng Lang-code-2: eng
|
2 |
+
trap strap 1
|
3 |
+
colour color 0.1
|
4 |
+
labeling labelling 0.02
|
5 |
+
organisation organization 0.1
|
6 |
+
Philadelphia Filadelfia 0.02
|
7 |
+
Vladimir Volodymyr 0.5
|
8 |
+
Moskva Moskvoy 0.5
|
uroman/text/amh.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ኢትዮጵያ ከዓለም ሶስቱ ትልቅ የአብርሃም ሀይማኖቶች ጋር ታሪካዊ ግንኙነት አላት።
|
2 |
+
ክርስትናን በአራተኛው ምዕተ-ዓመት ተቀብላለች።
|
3 |
+
ከሕዝቡ አንድ ሶስተኛው እስላም ነው።
|
4 |
+
የመጀመሪያው የእስላም ሂጅራ ወደ ኢትዮጵያ ነው የተከናወነው።
|
5 |
+
ነጋሽ በአፍሪካ የመጀመሪያው የእስላም መቀመጫ ናት።
|
6 |
+
እስከ ፲፱፻፸ ዎቹ ድረስ ብዙ ቤተ-እስራኤሎች በኢትዮጵያ ይኖሩ ነበር።
|
7 |
+
የራስ ተፈሪ እንቅስቃሴ ኢትዮጵያን በትልቅ ክብር ነው የሚያያት።
|
uroman/text/ara.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
كندا (بالإنجليزية: Canada) هي دولة في أمريكا الشمالية تتألف من 10 مقاطعات وثلاثة أقاليم. تقع في القسم الشمالي من القارة وتمتد من المحيط الأطلسي في الشرق إلى المحيط الهادئ في الغرب وتمتد شمالاً في المحيط المتجمد الشمالي. كندا هي البلد الثاني عالمياً من حيث المساحة الكلية. كما أن حدود كندا المشتركة مع الولايات المتحدة من الجنوب والشمال الغربي هي الأطول في العالم.
|
2 |
+
أراضي كندا مأهولة منذ آلاف السنين من قبل مجموعات مختلفة من السكان الأصليين. مع حلول أواخر القرن الخامس عشر بدأت الحملات البريطانية والفرنسية استكشاف المنطقة ومن ثم استوطنتها على طول ساحل المحيط الأطلسي. تنازلت فرنسا عن ما يقرب من جميع مستعمراتها في أمريكا الشمالية في عام 1763 بعد حرب السنوات السبع. في عام 1867، مع اتحاد ثلاثة مستعمرات بريطانية في أمريكا الشمالية عبر كونفدرالية تشكلت كندا باعتبارها كيانًا فدراليًا ذا سيادة يضم أربع مقاطعات. بدأ ذلك عملية اتسعت فيها مساحة كندا وتوسع حكمها الذاتي عن المملكة المتحدة. تجلت هذه الاستقلالية من خلال تشريع وستمنستر عام 1931 وبلغت ذروتها في صورة قانون كندا عام 1982 والذي قطع الاعتماد القانوني لكندا على البرلمان البريطاني.
|
3 |
+
كندا دولة فيدرالية يحكمها نظام ديمقراطي تمثيلي وملكية دستورية حيث الملكة إليزابيث الثانية قائدة للدولة. الأمة الكندية أمة ثنائية اللغة حيث الإنكليزية والفرنسية لغتان رسميتان على المستوى الاتحادي. تعد كندا واحدة من أكثر دول العالم تطوراً، حيث تمتلك اقتصاداً متنوعاً وتعتمد على مواردها الطبيعية الوفيرة، وعلى التجارة وبخاصة مع الولايات المتحدة اللتان تربطهما علاقة طويلة ومعقدة. كندا عضو في مجموعة الدول الصناعية السبع ومجموعة الثماني ومجموعة العشرين وحلف شمال الأطلسي ومنظمة التعاون والتنمية الاقتصادية ومنظمة التجارة العالمية ودول الكومنولث والفرنكوفونية ومنظمة الدول الأمريكية والإبيك والأمم المتحدة. تمتلك كندا واحداً من أعلى مستويات المعيشة في العالم حيث مؤشر التنمية البشرية يضعها في المرتبة الثامنة عالمياً.
|