Spaces:
Runtime error
Runtime error
Commit
•
ef23634
0
Parent(s):
Duplicate from ai4bharat/IndicTrans-MultilingualTranslation
Browse filesCo-authored-by: Umashankar <Shanks0465@users.noreply.huggingface.co>
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +31 -0
- README.md +14 -0
- api/api.py +152 -0
- api/punctuate.py +220 -0
- app.py +107 -0
- indic_nlp_library/LICENSE +9 -0
- indic_nlp_library/README.md +142 -0
- indic_nlp_library/contrib/README.md +7 -0
- indic_nlp_library/contrib/correct_moses_tokenizer.py +29 -0
- indic_nlp_library/contrib/hindi_to_kannada_transliterator.py +62 -0
- indic_nlp_library/contrib/indic_scraper_project_sample.ipynb +569 -0
- indic_nlp_library/docs/Makefile +153 -0
- indic_nlp_library/docs/cmd.rst +8 -0
- indic_nlp_library/docs/code.rst +5 -0
- indic_nlp_library/docs/conf.py +242 -0
- indic_nlp_library/docs/index.rst +22 -0
- indic_nlp_library/docs/indicnlp.MD +122 -0
- indic_nlp_library/docs/indicnlp.cli.rst +11 -0
- indic_nlp_library/docs/indicnlp.morph.rst +11 -0
- indic_nlp_library/docs/indicnlp.normalize.rst +15 -0
- indic_nlp_library/docs/indicnlp.pdf +0 -0
- indic_nlp_library/docs/indicnlp.rst +47 -0
- indic_nlp_library/docs/indicnlp.script.rst +26 -0
- indic_nlp_library/docs/indicnlp.syllable.rst +11 -0
- indic_nlp_library/docs/indicnlp.tokenize.rst +26 -0
- indic_nlp_library/docs/indicnlp.transliterate.rst +34 -0
- indic_nlp_library/docs/make.bat +35 -0
- indic_nlp_library/docs/modules.rst +7 -0
- indic_nlp_library/indicnlp/__init__.py +10 -0
- indic_nlp_library/indicnlp/cli/__init__.py +0 -0
- indic_nlp_library/indicnlp/cli/cliparser.py +266 -0
- indic_nlp_library/indicnlp/common.py +58 -0
- indic_nlp_library/indicnlp/langinfo.py +488 -0
- indic_nlp_library/indicnlp/loader.py +35 -0
- indic_nlp_library/indicnlp/morph/__init__.py +0 -0
- indic_nlp_library/indicnlp/morph/unsupervised_morph.py +142 -0
- indic_nlp_library/indicnlp/normalize/__init__.py +0 -0
- indic_nlp_library/indicnlp/normalize/indic_normalize.py +984 -0
- indic_nlp_library/indicnlp/script/__init__.py +0 -0
- indic_nlp_library/indicnlp/script/english_script.py +154 -0
- indic_nlp_library/indicnlp/script/indic_scripts.py +301 -0
- indic_nlp_library/indicnlp/script/phonetic_sim.py +59 -0
- indic_nlp_library/indicnlp/syllable/__init__.py +0 -0
- indic_nlp_library/indicnlp/syllable/syllabifier.py +302 -0
- indic_nlp_library/indicnlp/test/__init__.py +0 -0
- indic_nlp_library/indicnlp/test/unit/__init__.py +0 -0
- indic_nlp_library/indicnlp/tokenize/__init__.py +0 -0
- indic_nlp_library/indicnlp/tokenize/indic_detokenize.py +134 -0
- indic_nlp_library/indicnlp/tokenize/indic_tokenize.py +111 -0
- indic_nlp_library/indicnlp/tokenize/sentence_tokenize.py +268 -0
.gitattributes
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
23 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: IndicTrans MultilingualTranslation
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.3.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
duplicated_from: ai4bharat/IndicTrans-MultilingualTranslation
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
api/api.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
import re
|
4 |
+
from math import floor, ceil
|
5 |
+
from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
|
6 |
+
# from nltk.tokenize import sent_tokenize
|
7 |
+
from flask import Flask, request, jsonify
|
8 |
+
from flask_cors import CORS, cross_origin
|
9 |
+
import webvtt
|
10 |
+
from io import StringIO
|
11 |
+
from mosestokenizer import MosesSentenceSplitter
|
12 |
+
|
13 |
+
from indicTrans.inference.engine import Model
|
14 |
+
from punctuate import RestorePuncts
|
15 |
+
from indicnlp.tokenize.sentence_tokenize import sentence_split
|
16 |
+
|
17 |
+
app = Flask(__name__)
|
18 |
+
cors = CORS(app)
|
19 |
+
app.config['CORS_HEADERS'] = 'Content-Type'
|
20 |
+
|
21 |
+
indic2en_model = Model(expdir='models/v3/indic-en')
|
22 |
+
en2indic_model = Model(expdir='models/v3/en-indic')
|
23 |
+
m2m_model = Model(expdir='models/m2m')
|
24 |
+
|
25 |
+
rpunct = RestorePuncts()
|
26 |
+
|
27 |
+
indic_language_dict = {
|
28 |
+
'Assamese': 'as',
|
29 |
+
'Hindi' : 'hi',
|
30 |
+
'Marathi' : 'mr',
|
31 |
+
'Tamil' : 'ta',
|
32 |
+
'Bengali' : 'bn',
|
33 |
+
'Kannada' : 'kn',
|
34 |
+
'Oriya' : 'or',
|
35 |
+
'Telugu' : 'te',
|
36 |
+
'Gujarati' : 'gu',
|
37 |
+
'Malayalam' : 'ml',
|
38 |
+
'Punjabi' : 'pa',
|
39 |
+
}
|
40 |
+
|
41 |
+
splitter = MosesSentenceSplitter('en')
|
42 |
+
|
43 |
+
def get_inference_params():
    """Pick the translation model and language codes for the current request.

    Reads ``source_language`` and ``target_language`` (full language names
    such as ``'Hindi'`` or ``'English'``) from the form data of the active
    Flask request.

    Returns:
        A ``(model, source_lang, target_lang)`` tuple, where the language
        values are the short codes from ``indic_language_dict`` (or ``'en'``).

    Raises:
        ValueError: if the language pair is not supported (for example
            English -> English, or a name missing from
            ``indic_language_dict``). Previously this case fell through
            every branch and failed with ``UnboundLocalError``.
    """
    source_language = request.form['source_language']
    target_language = request.form['target_language']

    source_is_indic = source_language in indic_language_dict
    target_is_indic = target_language in indic_language_dict

    if source_is_indic and target_language == 'English':
        model = indic2en_model
        source_lang = indic_language_dict[source_language]
        target_lang = 'en'
    elif source_language == 'English' and target_is_indic:
        model = en2indic_model
        source_lang = 'en'
        target_lang = indic_language_dict[target_language]
    elif source_is_indic and target_is_indic:
        model = m2m_model
        source_lang = indic_language_dict[source_language]
        target_lang = indic_language_dict[target_language]
    else:
        raise ValueError(
            "Unsupported language pair: {!r} -> {!r}".format(source_language, target_language)
        )

    return model, source_lang, target_lang
|
61 |
+
|
62 |
+
@app.route('/', methods=['GET'])
def main():
    """Landing route: reply with a fixed banner string naming the API."""
    banner = "IndicTrans API"
    return banner
|
65 |
+
|
66 |
+
@app.route('/supported_languages', methods=['GET'])
@cross_origin()
def supported_languages():
    """Return the language-name -> language-code table as a JSON response."""
    response = jsonify(indic_language_dict)
    return response
|
70 |
+
|
71 |
+
@app.route("/translate", methods=['POST'])
@cross_origin()
def infer_indic_en():
    """Translate the ``text`` form field and report how long it took.

    The model and language codes come from ``get_inference_params()``.
    Only the ``translate_paragraph`` call itself is timed.

    Returns:
        A dict with the translated ``text`` and the ``duration`` in seconds,
        rounded to two decimals.
    """
    model, source_lang, target_lang = get_inference_params()
    source_text = request.form['text']

    started = time.time()
    translation = model.translate_paragraph(source_text, source_lang, target_lang)
    elapsed = time.time() - started

    return {'text': translation, 'duration': round(elapsed, 2)}
|
81 |
+
|
82 |
+
@app.route("/translate_vtt", methods=['POST'])
@cross_origin()
def infer_vtt_indic_en():
    """Translate a WebVTT document while keeping its cue timings.

    The ``text`` form field is parsed as WebVTT. All caption texts are joined,
    lower-cased, stripped of punctuation, re-punctuated with ``rpunct``, split
    into sentences, and translated in one batch. The translated words are then
    dealt back to the captions in proportion to each caption's share of the
    source words.

    Returns:
        A dict whose ``text`` entry is the content of the updated WebVTT
        document.
    """
    start_time = time.time()
    model, source_lang, target_lang = get_inference_params()
    source_text = request.form['text']

    vad = webvtt.read_buffer(StringIO(source_text))
    source_sentences = [v.text.replace('\r', '').replace('\n', ' ') for v in vad]

    large_sentence = ' '.join(source_sentences).lower()
    large_sentence = re.sub(r'[^\w\s]', '', large_sentence)

    # Nothing to translate (no cues, or cues without any word characters):
    # return the document untouched instead of feeding empty text to the
    # punctuation model.
    if not large_sentence.strip():
        return {'text': vad.content}

    punctuated = rpunct.punctuate(large_sentence, batch_size=32)
    end_time = time.time()
    print("Time Taken for punctuation: {} s".format(end_time - start_time))

    start_time = time.time()
    split_sents = splitter([punctuated])
    output_sents = model.batch_translate(split_sents, source_lang, target_lang)

    # Join the sentence lists directly. The earlier dict keyed by source
    # sentence collapsed repeated sentences and lost their translations.
    punct_para = ' '.join(split_sents)
    nmt_words = ' '.join(output_sents).split(' ')

    len_punct = len(punct_para.split(' '))
    len_nmt = len(nmt_words)

    # Captions with empty text are skipped and left as they are.
    filled = [i for i in range(len(vad)) if vad[i].text != '']
    last_filled = filled[-1] if filled else None

    start = 0
    for i in filled:
        if i == last_filled:
            # The per-caption floor() below rounds down, so hand whatever is
            # left to the final caption rather than dropping translated words.
            vad[i].text = ' '.join(nmt_words[start:])
            break

        # split() with no argument also breaks on newlines, so multi-line
        # captions are counted the same way as in the joined source text.
        len_caption = len(vad[i].text.split())
        frac = len_caption / len_punct
        req_nmt_size = floor(frac * len_nmt)

        vad[i].text = ' '.join(nmt_words[start:start + req_nmt_size])
        start += req_nmt_size

    end_time = time.time()
    print("Time Taken for translation: {} s".format(end_time - start_time))

    return {
        'text': vad.content,
    }
|
api/punctuate.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# 💾⚙️🔮
|
3 |
+
|
4 |
+
# taken from https://github.com/Felflare/rpunct/blob/master/rpunct/punctuate.py
|
5 |
+
# modified to support batching during gpu inference
|
6 |
+
|
7 |
+
|
8 |
+
__author__ = "Daulet N."
|
9 |
+
__email__ = "daulet.nurmanbetov@gmail.com"
|
10 |
+
|
11 |
+
import time
|
12 |
+
import logging
|
13 |
+
import webvtt
|
14 |
+
import torch
|
15 |
+
from io import StringIO
|
16 |
+
from nltk.tokenize import sent_tokenize
|
17 |
+
#from langdetect import detect
|
18 |
+
from simpletransformers.ner import NERModel
|
19 |
+
|
20 |
+
|
21 |
+
class RestorePuncts:
    """Restore punctuation and capitalisation in unpunctuated text.

    Wraps the ``felflare/bert-restore-punctuation`` model loaded through
    ``NERModel``. Each predicted label is two characters: the first is the
    punctuation mark to append to the word (``"O"`` for none), the last is
    ``"U"`` when the word should be capitalised.
    """

    def __init__(self, wrds_per_pred=250):
        """Load the model and run one warm-up prediction.

        Args:
            wrds_per_pred: number of words placed in each slice sent to the
                model (a further ``overlap_wrds`` words of lookahead are
                appended to every slice).
        """
        self.wrds_per_pred = wrds_per_pred
        self.overlap_wrds = 30
        self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U']
        self.model = NERModel("bert", "felflare/bert-restore-punctuation", labels=self.valid_labels,
                              args={"silent": True, "max_seq_length": 512})
        # use_cuda isn't working and this hack seems to load the model correctly to the gpu
        # NOTE(review): the device index is hard-coded to the second GPU;
        # confirm the deployment host actually has one.
        self.model.device = torch.device("cuda:1")
        # dummy punctuate to load the model onto gpu
        self.punctuate("hello how are you")

    def punctuate(self, text: str, batch_size:int=32, lang:str=''):
        """Perform punctuation restoration on arbitrarily large text.

        Args:
            text (str): Text to punctuate, can be a few words or as large as
                you want.
            batch_size (int): Number of slices passed to the model per
                ``predict`` call.
            lang (str): Accepted for interface compatibility; it is not used
                because language detection is disabled in this version.

        Returns:
            The punctuated text, or ``""`` when ``text`` has no words.
        """
        # Empty input has nothing to predict; skip the model call entirely.
        if not text.strip():
            return ""

        # split up large text into bert digestible chunks
        splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
        texts = [piece["text"] for piece in splits]

        # predict slices in batches, keeping the labels and discarding logits
        preds_lst = []
        for offset in range(0, len(texts), batch_size):
            batch_preds, _ = self.model.predict(texts[offset:offset + batch_size])
            preds_lst.extend(batch_preds)

        # join text slices
        combined_preds = self.combine_results(text, preds_lst)
        # create punctuated prediction
        return self.punctuate_texts(combined_preds)

    def predict(self, input_slice):
        """Pass a single unpunctuated slice to the model.

        Returns:
            The ``(predictions, raw_outputs)`` pair from ``model.predict``.
        """
        predictions, raw_outputs = self.model.predict([input_slice])
        return predictions, raw_outputs

    @staticmethod
    def split_on_toks(text, length, overlap):
        """Split text into overlapping slices with character offsets.

        Each slice holds ``length`` words plus up to ``overlap`` words of
        lookahead taken from the start of the next slice. This is done to
        bypass the 512 token limit on transformer models by sequentially
        feeding chunks of < 512 toks.

        Example output:
        [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]

        Raises:
            ValueError: if ``length`` is not positive (the slicing loop would
                otherwise never terminate).
        """
        if length <= 0:
            raise ValueError("length must be a positive number of words")

        wrds = text.replace('\n', ' ').split(" ")
        resp = []
        lst_chunk_idx = 0
        i = 0

        while True:
            # words in the chunk and the overlapping portion
            chunk_start = length * i
            chunk_end = length * (i + 1)
            wrds_len = wrds[chunk_start:chunk_end]
            wrds_ovlp = wrds[chunk_end:chunk_end + overlap]
            wrds_split = wrds_len + wrds_ovlp

            # Break loop if no more words
            if not wrds_split:
                break

            wrds_str = " ".join(wrds_split)
            nxt_chunk_start_idx = len(" ".join(wrds_len))

            resp.append({
                "text": wrds_str,
                "start_idx": lst_chunk_idx,
                "end_idx": len(wrds_str) + lst_chunk_idx,
            })
            # +1 accounts for the space that separates this chunk from the next
            lst_chunk_idx += nxt_chunk_start_idx + 1
            i += 1
        logging.info(f"Sliced transcript into {len(resp)} slices.")
        return resp

    @staticmethod
    def combine_results(full_text: str, text_slices):
        """Merge per-slice predictions back into one prediction list.

        Walks the slices in order and keeps each predicted word that matches
        the next expected word of ``full_text``. For every slice except the
        last, the final two predictions are not consumed, so those words are
        taken from the following slice instead.

        Args:
            full_text: the text that was sliced and predicted.
            text_slices: list of slices, each a list of one-item
                ``{word: label}`` dicts.

        Returns:
            A list of ``(word, label)`` tuples covering ``full_text``.

        Raises:
            AssertionError: if the combined words do not reproduce
                ``full_text``.
        """
        split_full_text = [w for w in full_text.replace('\n', ' ').split(" ") if w]
        split_full_text_len = len(split_full_text)
        output_text = []
        index = 0

        # A very short trailing slice is dropped; the previous slice then
        # becomes the last one. Checking the count first also keeps an empty
        # slice list from raising IndexError.
        if len(text_slices) > 1 and len(text_slices[-1]) <= 3:
            text_slices = text_slices[:-1]

        for _slice in text_slices:
            slice_wrds = len(_slice)
            is_last = text_slices[-1] == _slice
            for ix, wrd in enumerate(_slice):
                if index == split_full_text_len:
                    break

                pred_item_tuple = list(wrd.items())[0]
                if split_full_text[index] != str(pred_item_tuple[0]):
                    continue
                if is_last or ix <= slice_wrds - 3:
                    index += 1
                    output_text.append(pred_item_tuple)

        assert [i[0] for i in output_text] == split_full_text
        return output_text

    @staticmethod
    def punctuate_texts(full_pred: list):
        """Apply predicted labels to their words and join them into text.

        Args:
            full_pred: list of ``(word, label)`` pairs.

        Returns:
            The punctuated text. A trailing period is appended when the text
            ends in an alphanumeric character. An empty prediction list
            yields ``""``.
        """
        words = []
        for word, label in full_pred:
            punct_wrd = word.capitalize() if label[-1] == "U" else word
            if label[0] != "O":
                punct_wrd += label[0]
            words.append(punct_wrd)

        punct_resp = " ".join(words).strip()
        # Append trailing period if it doesn't exist. The emptiness check
        # avoids indexing into an empty string.
        if punct_resp and punct_resp[-1].isalnum():
            punct_resp += "."
        return punct_resp
|
184 |
+
|
185 |
+
|
186 |
+
if __name__ == "__main__":
    # Manual smoke test: punctuate the captions of sample.vtt, write one
    # sentence per cue to my_captions.vtt, and print timings.
    start = time.time()
    punct_model = RestorePuncts()

    load_model = time.time()
    print(f'Time to load model: {load_model - start}')

    captions = webvtt.read('sample.vtt')
    source_sentences = [caption.text.replace('\r', '').replace('\n', ' ') for caption in captions]

    sent = ' '.join(source_sentences)
    punctuated = punct_model.punctuate(sent)

    tokenised = sent_tokenize(punctuated)

    # zip() stops at the shorter sequence, so having more sentences than cues
    # no longer raises IndexError; surplus sentences are simply not written.
    for caption, sentence in zip(captions, tokenised):
        caption.text = sentence
    captions.save('my_captions.vtt')

    end = time.time()
    print(f'Time for run: {end - load_model}')
    print(f'Total time: {end - start}')
|
app.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
from inference.engine import Model
|
4 |
+
|
5 |
+
e2i_model_download = "wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\\n/p')&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue\" -O en-indic.zip && rm -rf /tmp/cookies.txt"
|
6 |
+
os.system(e2i_model_download)
|
7 |
+
os.system('unzip /home/user/app/en-indic.zip')
|
8 |
+
|
9 |
+
i2e_model_download = "wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-hzy09qi-OEogyge7rQG79K7iV4xsNWa' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\\n/p')&id=1-hzy09qi-OEogyge7rQG79K7iV4xsNWa\" -O indic-en.zip && rm -rf /tmp/cookies.txt"
|
10 |
+
os.system(i2e_model_download)
|
11 |
+
os.system('unzip /home/user/app/indic-en.zip')
|
12 |
+
|
13 |
+
i2i_model_download = "wget --show-progress -O m2m.tar https://ai4b-my.sharepoint.com/:u:/g/personal/sumanthdoddapaneni_ai4bharat_org/Eajn_jJIp5NEqeyqZ0GW4FgBdiANlZNQiy7dlwkaNr8DHw?download=1"
|
14 |
+
os.system(i2i_model_download)
|
15 |
+
os.system("tar -xvf /home/user/app/m2m.tar")
|
16 |
+
|
17 |
+
en2indic_model = Model(expdir='/home/user/app/en-indic')
|
18 |
+
indic2en_model = Model(expdir='/home/user/app/indic-en')
|
19 |
+
indic2indic_model = Model(expdir='/home/user/app/m2m')
|
20 |
+
|
21 |
+
LANGUAGES = {"Assamese": "as", "Bengali": "bn", "Gujarati": "gu", "Hindi": "hi", "Kannada": "kn",
|
22 |
+
"Malayalam": "ml", "Marathi": "mr", "Odia": "or", "Punjabi": "pa", "Tamil": "ta", "Telugu": "te", "English": "en"}
|
23 |
+
|
24 |
+
|
25 |
+
def translate(text, fromLang, toLang):
    """Translate ``text`` between two languages chosen by display name.

    The model is picked by direction: Indic -> English, English -> Indic, or
    Indic -> Indic. When both languages are English the text is returned
    unchanged.
    """
    source_is_english = fromLang == "English"
    target_is_english = toLang == "English"

    # English -> English: nothing to do.
    if source_is_english and target_is_english:
        return text

    if target_is_english:
        engine = indic2en_model
    elif source_is_english:
        engine = en2indic_model
    else:
        engine = indic2indic_model

    return engine.translate_paragraph(text, LANGUAGES[fromLang], LANGUAGES[toLang])
|
34 |
+
|
35 |
+
|
36 |
+
languages = list(LANGUAGES.keys())
|
37 |
+
|
38 |
+
fromChoice = gr.inputs.Dropdown(
|
39 |
+
languages, type="value", default="Hindi", label="Select Source Language")
|
40 |
+
|
41 |
+
toChoice = gr.inputs.Dropdown(
|
42 |
+
languages, type="value", default="Tamil", label="Select Target Language")
|
43 |
+
|
44 |
+
text_output = gr.outputs.Textbox(
|
45 |
+
type="auto", label=f"Translation")
|
46 |
+
|
47 |
+
text = gr.inputs.Textbox(lines=5, placeholder="Enter Text to translate",
|
48 |
+
default="", label="Enter Text in Source Language")
|
49 |
+
|
50 |
+
supported_lang = ', '.join(languages)
|
51 |
+
|
52 |
+
interface_description = f"""
|
53 |
+
<html>
|
54 |
+
<body>
|
55 |
+
<h1>
|
56 |
+
Usage:
|
57 |
+
</h1>
|
58 |
+
<ul>
|
59 |
+
<li>Choose the Source Language and Target Language for translation.</li>
|
60 |
+
<li>Enter your text in source language in the textbox.</li>
|
61 |
+
<li>Click Submit and view your translated output.</li>
|
62 |
+
</ul>
|
63 |
+
<br/>
|
64 |
+
<span>Currently the model supports {supported_lang} </span>
|
65 |
+
</body>
|
66 |
+
</html>
|
67 |
+
"""
|
68 |
+
|
69 |
+
interface_article = """
|
70 |
+
<html>
|
71 |
+
<body>
|
72 |
+
<div>
|
73 |
+
<h1>
|
74 |
+
About
|
75 |
+
</h1>
|
76 |
+
<h4>
|
77 |
+
Original repository can be found at <a href="https://github.com/AI4Bharat/indicTrans">here</a>.
|
78 |
+
</h4>
|
79 |
+
<br/>
|
80 |
+
<span>
|
81 |
+
The models used in this interface are multilingual single-script transformer based models for translating between English and Indian languages. The models are trained using the Samanantar corpus and at the time of their release was the state of the art open source model as evaluated on Facebook's FLORES benchmark.
|
82 |
+
</span>
|
83 |
+
<br/>
|
84 |
+
<h4>
|
85 |
+
These models are currently being used on AI Tools/Platforms such as:
|
86 |
+
</h4>
|
87 |
+
<ul>
|
88 |
+
<li><a href="https://ai4bharat.org/shoonya">Shoonya</a></li>
|
89 |
+
<li><a href="https://ai4bharat.org/chitralekha">Chitralekha</a> (deployed for NPTEL)</li>
|
90 |
+
<li><a href="https://ai4bharat.org/anuvaad">Anuvaad</a> (deployed for Supreme Court of India & Bangladesh)</li>
|
91 |
+
<li>Pratham Books</li>
|
92 |
+
</ul>
|
93 |
+
</div>
|
94 |
+
</body>
|
95 |
+
</html>
|
96 |
+
"""
|
97 |
+
|
98 |
+
examples = [
|
99 |
+
["A farmer lives in a village", "English", "Hindi"],
|
100 |
+
["एक गाव मे एक किसान रहता ता", "Hindi", "English"],
|
101 |
+
["एक गाव मे एक किसान रहता ता", "Hindi", "Tamil"]
|
102 |
+
]
|
103 |
+
|
104 |
+
|
105 |
+
iface = gr.Interface(fn=translate, inputs=[text, fromChoice, toChoice], outputs=text_output,
|
106 |
+
title='IndicTrans - Multilingual Translation', description=interface_description, article=interface_article, examples=examples)
|
107 |
+
iface.launch(enable_queue=True)
|
indic_nlp_library/LICENSE
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2013-present Anoop Kunchukuttan
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
6 |
+
|
7 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
8 |
+
|
9 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
indic_nlp_library/README.md
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Indic NLP Library
|
2 |
+
|
3 |
+
The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
|
4 |
+
|
5 |
+
The library provides the following functionalities:
|
6 |
+
|
7 |
+
- Text Normalization
|
8 |
+
- Script Information
|
9 |
+
- Word Tokenization and Detokenization
|
10 |
+
- Sentence Splitting
|
11 |
+
- Word Segmentation
|
12 |
+
- Syllabification
|
13 |
+
- Script Conversion
|
14 |
+
- Romanization
|
15 |
+
- Indicization
|
16 |
+
- Transliteration
|
17 |
+
- Translation
|
18 |
+
|
19 |
+
The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
|
20 |
+
|
21 |
+
**If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/indicnlpweb/indicnlp_catalog) for pointers.**
|
22 |
+
|
23 |
+
## Pre-requisites
|
24 |
+
|
25 |
+
- Python 3.x
|
26 |
+
- (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
|
27 |
+
- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
|
28 |
+
- [Urduhack](https://github.com/urduhack/urduhack): Needed only if Urdu normalization is required. It has other dependencies like Tensorflow.
|
29 |
+
- Other dependencies are listed in setup.py
|
30 |
+
|
31 |
+
|
32 |
+
## Configuration
|
33 |
+
|
34 |
+
- Installation from pip:
|
35 |
+
|
36 |
+
`pip install indic-nlp-library`
|
37 |
+
|
38 |
+
- If you want to use the project from the github repo, add the project to the Python Path:
|
39 |
+
|
40 |
+
- Clone this repository
|
41 |
+
- Install dependencies: `pip install -r requirements.txt`
|
42 |
+
- Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
|
43 |
+
|
44 |
+
- In either case, export the path to the _Indic NLP Resources_ directory
|
45 |
+
|
46 |
+
Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
|
47 |
+
|
48 |
+
## Usage
|
49 |
+
|
50 |
+
You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
|
51 |
+
|
52 |
+
### Getting Started
|
53 |
+
|
54 |
+
Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API.
|
55 |
+
- You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
|
56 |
+
|
57 |
+
### Documentation
|
58 |
+
|
59 |
+
You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
|
60 |
+
|
61 |
+
This documents the Python API as well as the commandline reference.
|
62 |
+
|
63 |
+
## Citing
|
64 |
+
|
65 |
+
If you use this library, please include the following citation:
|
66 |
+
|
67 |
+
```
|
68 |
+
@misc{kunchukuttan2020indicnlp,
|
69 |
+
author = "Anoop Kunchukuttan",
|
70 |
+
title = "{The IndicNLP Library}",
|
71 |
+
year = "2020",
|
72 |
+
howpublished={\url{https://github.com/anoopkunchukuttan/indic_nlp_library/blob/master/docs/indicnlp.pdf}}
|
73 |
+
}
|
74 |
+
```
|
75 |
+
You can find the document [HERE](docs/indicnlp.pdf)
|
76 |
+
|
77 |
+
## Website
|
78 |
+
|
79 |
+
`http://anoopkunchukuttan.github.io/indic_nlp_library`
|
80 |
+
|
81 |
+
## Author
|
82 |
+
Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](mailto:anoop.kunchukuttan@gmail.com))
|
83 |
+
|
84 |
+
## Companies, Organizations, Projects using IndicNLP Library
|
85 |
+
|
86 |
+
- [AI4Bharat-IndicNLPSuite](https://indicnlp.ai4bharat.org)
|
87 |
+
- [The Classical Language Toolkit](http://cltk.org)
|
88 |
+
- [Microsoft NLP Recipes](https://github.com/microsoft/nlp-recipes)
|
89 |
+
- [Facebook M2M-100](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100)
|
90 |
+
|
91 |
+
## Revision Log
|
92 |
+
|
93 |
+
|
94 |
+
0.81 : 26 May 2021
|
95 |
+
|
96 |
+
- Bug fix in version number extraction
|
97 |
+
|
98 |
+
0.80 : 24 May 2021
|
99 |
+
|
100 |
+
- Improved sentence splitting
|
101 |
+
- Bug fixes
|
102 |
+
- Support for Urdu Normalizer
|
103 |
+
|
104 |
+
0.71 : 03 Sep 2020
|
105 |
+
|
106 |
+
- Improved documentation
|
107 |
+
- Bug fixes
|
108 |
+
|
109 |
+
0.7 : 02 Apr 2020:
|
110 |
+
|
111 |
+
- Unified commandline
|
112 |
+
- Improved documentation
|
113 |
+
- Added setup.py
|
114 |
+
|
115 |
+
0.6 : 16 Dec 2019:
|
116 |
+
|
117 |
+
- New romanizer and indicizer
|
118 |
+
- Script Unifiers
|
119 |
+
- Improved script normalizers
|
120 |
+
- Added contrib directory for sample uses
|
121 |
+
- changed to MIT license
|
122 |
+
|
123 |
+
0.5 : 03 Jun 2019:
|
124 |
+
|
125 |
+
- Improved word tokenizer to handle dates and numbers.
|
126 |
+
- Added sentence splitter that can handle common prefixes/honorifics and uses some heuristics.
|
127 |
+
- Added detokenizer
|
128 |
+
- Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
|
129 |
+
|
130 |
+
0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
|
131 |
+
|
132 |
+
0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
|
133 |
+
|
134 |
+
0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
|
135 |
+
|
136 |
+
0.1 : 12 Mar 2014: Initial version. Supports text normalization.
|
137 |
+
|
138 |
+
## LICENSE
|
139 |
+
|
140 |
+
Indic NLP Library is released under the MIT license
|
141 |
+
|
142 |
+
|
indic_nlp_library/contrib/README.md
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Contrib
|
2 |
+
|
3 |
+
Contains additional utilities and applications using Indic NLP library core
|
4 |
+
|
5 |
+
- `indic_scraper_project_sample.ipynb`: A simple pipeline for building monolingual corpora for Indian languages from crawled web content, Wikipedia, etc. An extensible framework which allows incorporation of website specific extractors, whereas generic NLP tasks like tokenization, sentence splitting, normalization, etc. are handled by the framework.
|
6 |
+
- `correct_moses_tokenizer.py`: This script corrects the incorrect tokenization done by Moses tokenizer. The Moses tokenizer splits on nukta and halant characters.
|
7 |
+
- `hindi_to_kannada_transliterator.py`: This script transliterates Hindi to Kannada. It removes/remaps characters only found in Hindi. It also adds halanta to words ending with consonant - as is the convention in Kannada.
|
indic_nlp_library/contrib/correct_moses_tokenizer.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
from indicnlp import langinfo
|
3 |
+
from indicnlp import loader
|
4 |
+
|
5 |
+
if __name__ == '__main__':
    """
    Repair Moses tokenizer output for Indic scripts.

    The Moses tokenizer splits text around the nukta and halant characters;
    this script re-attaches those characters by removing the spaces that
    were inserted around them.

    Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>
    """

    loader.load()

    infname, outfname, lang = sys.argv[1], sys.argv[2], sys.argv[3]

    # Script-specific halant and nukta characters for the requested language
    halant = langinfo.offset_to_char(langinfo.HALANTA_OFFSET, lang)
    nukta = langinfo.offset_to_char(langinfo.NUKTA_OFFSET, lang)

    # (pattern, replacement) pairs; the order matters because each
    # replacement operates on the result of the previous one
    fixes = [
        (' {} '.format(halant), halant),
        (' {} '.format(nukta), nukta),
        (' {}{}'.format(nukta, halant), '{}{}'.format(nukta, halant)),
    ]

    with open(infname, 'r', encoding='utf-8') as infile:
        with open(outfname, 'w', encoding='utf-8') as outfile:
            for line in infile:
                for pattern, replacement in fixes:
                    line = line.replace(pattern, replacement)
                outfile.write(line)
|
indic_nlp_library/contrib/hindi_to_kannada_transliterator.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
Transliterate Hindi text to Kannada.

Characters found only in Hindi are removed or remapped, and a halant is
appended to every word ending with a consonant - as is the convention in
Kannada.

Usage: python hindi_to_kannada_transliterator.py <infname> <outfname>

The input file has one sentence/word per line; sentences should be
space-tokenized. The path to the Indic NLP Resources directory is read from
the INDIC_RESOURCES_PATH environment variable.
"""
import os
import sys

from indicnlp import common

# The resources path must be configured before the library is loaded.
# It is taken from the INDIC_RESOURCES_PATH environment variable (the variable
# the README asks users to export) instead of an undefined global name.
INDIC_NLP_RESOURCES = os.environ.get('INDIC_RESOURCES_PATH')
if INDIC_NLP_RESOURCES:
    common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
from indicnlp.normalize import indic_normalize
# NOTE(review): `isc` was used but never imported; indic_scripts is assumed to
# be the module providing is_consonant/get_phonetic_feature_vector - confirm.
from indicnlp.script import indic_scripts as isc
from indicnlp.transliterate import unicode_transliterate

if __name__ == '__main__':

    if len(sys.argv) < 3:
        sys.exit('Usage: python hindi_to_kannada_transliterator.py <infname> <outfname>')

    infname = sys.argv[1]  # one sentence/word per line. Sentences should be space-tokenized
    outfname = sys.argv[2]
    loader.load()

    normalizer_factory = indic_normalize.IndicNormalizerFactory()
    normalizer = normalizer_factory.get_normalizer('hi')

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = line.strip()
            line = normalizer.normalize(line)

            ## replace chandrabindus with anusvara
            line = line.replace('\u0900', '\u0902')
            line = line.replace('\u0901', '\u0902')

            ### replace chandra e and o diacritics with a diacritic
            ## this seems to be general usage
            ## (the alternative is mapping them to e '\u0947' and o '\u094b')
            line = line.replace('\u0945', '\u093e')
            line = line.replace('\u0949', '\u093e')

            ## remove nukta
            line = line.replace('\u093c', '')

            ## add halant if word ends with consonant
            outwords = []
            for word in line.split(' '):
                # Empty tokens occur for blank lines and repeated spaces;
                # they have no last character to inspect, so pass them through.
                if word and isc.is_consonant(isc.get_phonetic_feature_vector(word[-1], 'hi')):
                    word = word + '\u094d'
                outwords.append(word)
            line = ' '.join(outwords)

            ## script conversion
            line = unicode_transliterate.UnicodeIndicTransliterator.transliterate(line, 'hi', 'kn')

            outfile.write(line + '\n')
|
61 |
+
|
62 |
+
|
indic_nlp_library/contrib/indic_scraper_project_sample.ipynb
ADDED
@@ -0,0 +1,569 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Pre-requisites\n",
|
8 |
+
"\n",
|
9 |
+
"- Python 3.5+\n",
|
10 |
+
"- Python packages: \n",
|
11 |
+
" - `pip install bs4 pandas mmh3`\n",
|
12 |
+
"- [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library)\n",
|
13 |
+
"- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"cell_type": "markdown",
|
18 |
+
"metadata": {},
|
19 |
+
"source": [
|
20 |
+
"# Initialize the Indic NLP Library\n",
|
21 |
+
"\n",
|
22 |
+
"Run the cell below to initialize the Indic NLP Library"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "code",
|
27 |
+
"execution_count": null,
|
28 |
+
"metadata": {},
|
29 |
+
"outputs": [],
|
30 |
+
"source": [
|
31 |
+
"# The path to the local git repo for Indic NLP Library\n",
|
32 |
+
"INDIC_NLP_LIB_HOME=\"/disk1/src/indic_nlp_library\"\n",
|
33 |
+
"\n",
|
34 |
+
"# The path to the local git repo for Indic NLP Resources\n",
|
35 |
+
"INDIC_NLP_RESOURCES=\"/disk1/src/indic_nlp_resources\"\n",
|
36 |
+
"\n",
|
37 |
+
"import sys\n",
|
38 |
+
"sys.path.append('{}/src'.format(INDIC_NLP_LIB_HOME))\n",
|
39 |
+
"\n",
|
40 |
+
"from indicnlp import common\n",
|
41 |
+
"common.set_resources_path(INDIC_NLP_RESOURCES)\n",
|
42 |
+
"\n",
|
43 |
+
"from indicnlp import loader\n",
|
44 |
+
"loader.load()"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"cell_type": "code",
|
49 |
+
"execution_count": null,
|
50 |
+
"metadata": {},
|
51 |
+
"outputs": [],
|
52 |
+
"source": [
|
53 |
+
"from bs4 import BeautifulSoup\n",
|
54 |
+
"import os\n",
|
55 |
+
"import string\n",
|
56 |
+
"import indicnlp\n",
|
57 |
+
"from indicnlp.tokenize import indic_tokenize\n",
|
58 |
+
"from indicnlp.normalize import indic_normalize\n",
|
59 |
+
"from indicnlp.transliterate import unicode_transliterate\n",
|
60 |
+
"from indicnlp.tokenize import sentence_tokenize\n",
|
61 |
+
"import re\n",
|
62 |
+
"import collections\n",
|
63 |
+
"import random\n",
|
64 |
+
"import mmh3"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "markdown",
|
69 |
+
"metadata": {},
|
70 |
+
"source": [
|
71 |
+
"# Common Functions"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"cell_type": "code",
|
76 |
+
"execution_count": null,
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [],
|
79 |
+
"source": [
|
80 |
+
"def preprocess_sent(text,lang,normalizer):\n",
|
81 |
+
" \"\"\"\n",
|
82 |
+
" Pre-process text (normalization and tokenization)\n",
|
83 |
+
" \n",
|
84 |
+
" text: text string to preprocess\n",
|
85 |
+
" lang: language code (2-letter ISO code)\n",
|
86 |
+
" normalizer: normalizer object for language\n",
|
87 |
+
" \n",
|
88 |
+
" returns the processed text string\n",
|
89 |
+
" \"\"\"\n",
|
90 |
+
" return ' '.join(indic_tokenize.trivial_tokenize(normalizer.normalize(text.replace('\\n',' ')),lang)) \n",
|
91 |
+
"\n",
|
92 |
+
"def sent_split(text,lang):\n",
|
93 |
+
" \"\"\"\n",
|
94 |
+
" Sentence splitter\n",
|
95 |
+
" \n",
|
96 |
+
" text: text to sentence split \n",
|
97 |
+
" lang: language\n",
|
98 |
+
" \n",
|
99 |
+
" returns list of sentences \n",
|
100 |
+
" \"\"\"\n",
|
101 |
+
" return sentence_tokenize.sentence_split(text,lang)\n",
|
102 |
+
"\n",
|
103 |
+
"def extract_all_content(indir,lang,\n",
|
104 |
+
" article_extract_fn,\n",
|
105 |
+
" preprocess_fn=preprocess_sent,\n",
|
106 |
+
" narticles=-1,\n",
|
107 |
+
" start_artid=0):\n",
|
108 |
+
" \"\"\"\n",
|
109 |
+
" This method reads all files from the input directory, extracts text content from each file,\n",
|
110 |
+
" and pre-processes the text. This method is a generator. \n",
|
111 |
+
" For each sentence, the method yields a tuple of the format: \n",
|
112 |
+
" \n",
|
113 |
+
" (artid, fname, paraid, sentid, processed_text)\n",
|
114 |
+
" \n",
|
115 |
+
" indir: path to input directory containing files to be parsed \n",
|
116 |
+
" \n",
|
117 |
+
" lang: language of the files in the input directory\n",
|
118 |
+
" \n",
|
119 |
+
" article_extract_fn: the function to extract text content from each file. \n",
|
120 |
+
" Signature of the function: get_article_contents(fname,lang,encoding) \n",
|
121 |
+
" `fname` is name of the file, `lang` is langcode, \n",
|
122 |
+
" `encoding` is text-encoding (default=utf-8). \n",
|
123 |
+
" The function yields a tuple (paraid, sentid, extracted_text) \n",
|
124 |
+
" for each sentence.\n",
|
125 |
+
" \n",
|
126 |
+
" preprocess_fn: pre-processing function to apply to the extracted text. \n",
|
127 |
+
" The function takes a string as input and returns processed string as output.\n",
|
128 |
+
" \n",
|
129 |
+
" narticles: extract and process the first `narticles` from input directory. \n",
|
130 |
+
" if narticles=-1 (default), all files are extracted\n",
|
131 |
+
" \n",
|
132 |
+
" start_artid: the start of the article id to assign to extracted articles (default=0)\n",
|
133 |
+
" \n",
|
134 |
+
" \"\"\"\n",
|
135 |
+
"\n",
|
136 |
+
" fnames = os.listdir(indir)\n",
|
137 |
+
" if narticles>0:\n",
|
138 |
+
" fnames=fnames[:narticles]\n",
|
139 |
+
" nsent=0\n",
|
140 |
+
"\n",
|
141 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
142 |
+
" normalizer=normalizer_factory.get_normalizer(lang)\n",
|
143 |
+
" \n",
|
144 |
+
" print('Number of articles: {}'.format(len(fnames)))\n",
|
145 |
+
" for artid, fname in enumerate(fnames,start_artid):\n",
|
146 |
+
"# print(fname)\n",
|
147 |
+
" if artid%100 == 0:\n",
|
148 |
+
" print('({}|{})'.format(artid,nsent),end=' ... ')\n",
|
149 |
+
" \n",
|
150 |
+
" try:\n",
|
151 |
+
" fpath=os.sep.join([indir,fname])\n",
|
152 |
+
" for paraid, sentid, sent in article_extract_fn(fpath,lang):\n",
|
153 |
+
" nsent+=1\n",
|
154 |
+
" yield( ( artid, fname, paraid, sentid, preprocess_fn(sent,lang,normalizer) ) )\n",
|
155 |
+
" except:\n",
|
156 |
+
" print('Cannot parse {}'.format(fname))\n",
|
157 |
+
" \n",
|
158 |
+
"def write_corpus(corpus_iterator,content_fname,article_mapping_fname,delimiter=' ||| ', encoding='utf-8'):\n",
|
159 |
+
" \"\"\"\n",
|
160 |
+
" Writes the extracted corpus to a file. The extracted data is organized in terms of articles, paragraphs \n",
|
161 |
+
" and sentences. The following is the format of the output file: \n",
|
162 |
+
" - one line per sentence\n",
|
163 |
+
" - format of line: article_id, para_id, sent_id, sentence\n",
|
164 |
+
" In addition to the content file mentioned above, a metadata file which maps the article id to the filename is also written. \n",
|
165 |
+
" \n",
|
166 |
+
" corpus_iterator: iterator over the corpus, yielding tuple (artid, fname, paraid, sentid, processed_text). \n",
|
167 |
+
" The function `extract_all_content` yields a generator in this format. \n",
|
168 |
+
" content_fname: output content file to write the extracted data to in the format mentioned above\n",
|
169 |
+
" article_mapping_fname: output metadata file to write article id to filename mapping.\n",
|
170 |
+
" delimiter=' ||| ': delimiter for the content file. The default delimiter is the same \n",
|
171 |
+
" as used in the Moses phrase table\n",
|
172 |
+
" encoding: text encoding default - 'utf-8'\n",
|
173 |
+
" \n",
|
174 |
+
" \"\"\"\n",
|
175 |
+
" \n",
|
176 |
+
" artid_name_mapping={}\n",
|
177 |
+
" with open(content_fname,'w',encoding=encoding) as contentfile:\n",
|
178 |
+
" for artid, fname, paraid, sentid, text in corpus_iterator:\n",
|
179 |
+
" contentfile.write(delimiter.join([str(artid), str(paraid), str(sentid), text]) + '\\n')\n",
|
180 |
+
" artid_name_mapping[artid]=fname\n",
|
181 |
+
"\n",
|
182 |
+
" with open(article_mapping_fname,'w',encoding=encoding) as artmappingfile:\n",
|
183 |
+
" for artid, name in sorted(artid_name_mapping.items(),key=lambda x: x[0]):\n",
|
184 |
+
" artmappingfile.write('{} {} {}\\n'.format(artid,delimiter,name))\n",
|
185 |
+
"\n",
|
186 |
+
"def convert_txt_to_csv_format(infname, outfname, encoding='utf-8'):\n",
|
187 |
+
" \"\"\"\n",
|
188 |
+
" convert txt file to csv format. This method is used when the text file is directly available.\n",
|
189 |
+
" The input file has one sentence per line. Assumed to be preprocessed (tokenized, normalized)\n",
|
190 |
+
" \n",
|
191 |
+
" \"\"\"\n",
|
192 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
193 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
194 |
+
" for i, line in enumerate(infile):\n",
|
195 |
+
" outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,line.strip()))\n",
|
196 |
+
" \n",
|
197 |
+
"def preprocess_convert_txt_to_csv_format(infname, outfname, lang, encoding='utf-8'):\n",
|
198 |
+
" \"\"\"\n",
|
199 |
+
" Convert raw text file to csv format\n",
|
200 |
+
" \"\"\"\n",
|
201 |
+
" \n",
|
202 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
203 |
+
" normalizer=normalizer_factory.get_normalizer(lang)\n",
|
204 |
+
" \n",
|
205 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
206 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
207 |
+
" i=0\n",
|
208 |
+
" for line in infile:\n",
|
209 |
+
" sents = sent_split(line.strip(),lang)\n",
|
210 |
+
" for sent in sents:\n",
|
211 |
+
" outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,\n",
|
212 |
+
" preprocess_sent(sent.strip(), lang, normalizer)) )\n",
|
213 |
+
" i=i+1\n",
|
214 |
+
"\n",
|
215 |
+
"def print_txt(infnames, outfname, encoding='utf-8'):\n",
|
216 |
+
" \"\"\"\n",
|
217 |
+
" Extract only the text from the content csv file. The output file has one sentence per line.\n",
|
218 |
+
" \"\"\"\n",
|
219 |
+
" with open(outfname,'w',encoding=encoding) as outfile: \n",
|
220 |
+
" for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
|
221 |
+
" with open(infname,'r',encoding=encoding) as infile:\n",
|
222 |
+
" for i, line in enumerate(infile):\n",
|
223 |
+
" fields=line.strip().split('|||')\n",
|
224 |
+
" if len(fields) >=4:\n",
|
225 |
+
" outfile.write('{}\\n'.format(fields[3].strip()))\n",
|
226 |
+
" \n",
|
227 |
+
"# def dedup_and_print_txt(infnames, outfname, encoding='utf-8'):\n",
|
228 |
+
" \n",
|
229 |
+
"# total=0\n",
|
230 |
+
"# unique=0\n",
|
231 |
+
"# hash_codes=set()\n",
|
232 |
+
" \n",
|
233 |
+
"# with open(outfname,'w',encoding=encoding) as outfile: \n",
|
234 |
+
"# for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
|
235 |
+
"# with open(infname,'r',encoding=encoding) as infile:\n",
|
236 |
+
"# for i, line in enumerate(infile):\n",
|
237 |
+
"# fields=line.strip().split('|||')\n",
|
238 |
+
"# if len(fields) >=4:\n",
|
239 |
+
"# sent=fields[3].strip()\n",
|
240 |
+
"# total+=1\n",
|
241 |
+
"# hs=hash(sent)\n",
|
242 |
+
"# if hs not in hash_codes:\n",
|
243 |
+
"# outfile.write('{}\\n'.format(sent))\n",
|
244 |
+
"# hash_codes.add(hs)\n",
|
245 |
+
"# unique+=1\n",
|
246 |
+
" \n",
|
247 |
+
"# print('Total: {}'.format(total))\n",
|
248 |
+
"# print('Unique: {}'.format(unique))\n",
|
249 |
+
"\n",
|
250 |
+
"def dedup_shuffle_and_print_txt(infnames, outfname, max_buf_size=100000,encoding='utf-8'):\n",
|
251 |
+
" \"\"\"\n",
|
252 |
+
" The method creates a sentence level corpora from multiple content csv files.\n",
|
253 |
+
" All sentences are extracted, they are de-duplicated using murmurhash and shuffled\n",
|
254 |
+
" before writing the entire corpus to the output file. The output file has one sentence per line.\n",
|
255 |
+
"\n",
|
256 |
+
" \"\"\"\n",
|
257 |
+
" \n",
|
258 |
+
" total=0\n",
|
259 |
+
" unique=0\n",
|
260 |
+
" hash_codes=set()\n",
|
261 |
+
" sent_buffer=[]\n",
|
262 |
+
" \n",
|
263 |
+
" with open(outfname,'w',encoding=encoding) as outfile: \n",
|
264 |
+
" for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
|
265 |
+
" print('Processing: {}'.format(infname))\n",
|
266 |
+
" with open(infname,'r',encoding=encoding) as infile:\n",
|
267 |
+
" for i, line in enumerate(infile):\n",
|
268 |
+
" fields=line.strip().split('|||')\n",
|
269 |
+
" if len(fields) >=4:\n",
|
270 |
+
" sent=fields[3].strip()\n",
|
271 |
+
" total+=1\n",
|
272 |
+
"# hs=hash(sent)\n",
|
273 |
+
" hs=mmh3.hash128(sent)\n",
|
274 |
+
" if hs not in hash_codes:\n",
|
275 |
+
"# outfile.write('{}\\n'.format(sent))\n",
|
276 |
+
" sent_buffer.append(sent)\n",
|
277 |
+
" hash_codes.add(hs)\n",
|
278 |
+
" unique+=1\n",
|
279 |
+
" if len(sent_buffer)>=max_buf_size:\n",
|
280 |
+
" random.shuffle(sent_buffer)\n",
|
281 |
+
" for sent in sent_buffer: \n",
|
282 |
+
" outfile.write('{}\\n'.format(sent))\n",
|
283 |
+
" sent_buffer.clear()\n",
|
284 |
+
" \n",
|
285 |
+
" if len(sent_buffer)>0:\n",
|
286 |
+
" random.shuffle(sent_buffer)\n",
|
287 |
+
" for sent in sent_buffer: \n",
|
288 |
+
" outfile.write('{}\\n'.format(sent))\n",
|
289 |
+
" sent_buffer.clear() \n",
|
290 |
+
" \n",
|
291 |
+
" print('Total: {}'.format(total))\n",
|
292 |
+
" print('Unique: {}'.format(unique))\n",
|
293 |
+
"\n",
|
294 |
+
"def extract_wikiextractor_file(infname, outfname, lang, \n",
|
295 |
+
" encoding='utf-8', delimiter=' ||| ', preprocess_fn=preprocess_sent):\n",
|
296 |
+
" \"\"\"\n",
|
297 |
+
" Extract text content into a content csv file from wikipedia article page. \n",
|
298 |
+
" The wikipedia article page is the output from `wikiextractor` [https://github.com/attardi/wikiextractor] \n",
|
299 |
+
" \n",
|
300 |
+
" \"\"\"\n",
|
301 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
302 |
+
" normalizer=normalizer_factory.get_normalizer(lang)\n",
|
303 |
+
" \n",
|
304 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
305 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
306 |
+
" artid=-1\n",
|
307 |
+
" paraid=0\n",
|
308 |
+
" for line in infile:\n",
|
309 |
+
" if line.find('<doc')==0:\n",
|
310 |
+
" artid+=1\n",
|
311 |
+
" paraid=0\n",
|
312 |
+
" continue\n",
|
313 |
+
" if line.find('</doc')==0:\n",
|
314 |
+
" continue\n",
|
315 |
+
" if len(line.strip())>0:\n",
|
316 |
+
" for sentid, sent in enumerate(sent_split(line.strip(),lang)):\n",
|
317 |
+
" sent=sent.strip()\n",
|
318 |
+
" if sent!='':\n",
|
319 |
+
" sent = preprocess_fn(sent,lang,normalizer)\n",
|
320 |
+
" outfile.write(delimiter.join([str(artid), str(paraid), str(sentid), sent]) + '\\n')\n",
|
321 |
+
" paraid+=1\n",
|
322 |
+
"\n",
|
323 |
+
" \n",
|
324 |
+
"def extract_leipzig_corpus(infname,outfname,lang,encoding='utf-8'):\n",
|
325 |
+
" \"\"\"\n",
|
326 |
+
" Extractor for files from the Leipzig corpus\n",
|
327 |
+
" [http://wortschatz.uni-leipzig.de/en/download/]\n",
|
328 |
+
" \n",
|
329 |
+
" \"\"\"\n",
|
330 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
331 |
+
" normalizer=normalizer_factory.get_normalizer(lang) \n",
|
332 |
+
"\n",
|
333 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
334 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
335 |
+
" for i, line in enumerate(infile):\n",
|
336 |
+
" outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,preprocess_sent(line,lang,normalizer))) \n",
|
337 |
+
" \n",
|
338 |
+
"def dataset_stats(fname):\n",
|
339 |
+
" \"\"\"\n",
|
340 |
+
" Extracts dataset statistics from the final extracted file. This input file contains\n",
|
341 |
+
" one sentence per line. The sentences are tokenized.\n",
|
342 |
+
" \"\"\"\n",
|
343 |
+
"\n",
|
344 |
+
" all_puncs=set(string.punctuation+'\\u0964\\u0965')\n",
|
345 |
+
" \n",
|
346 |
+
" sent_count=0\n",
|
347 |
+
" token_cnt=0\n",
|
348 |
+
" true_token_cnt=0\n",
|
349 |
+
" tokens=set()\n",
|
350 |
+
" \n",
|
351 |
+
" with open(fname,'r',encoding='utf-8') as infile:\n",
|
352 |
+
" for line in infile:\n",
|
353 |
+
" sent_count+=1\n",
|
354 |
+
" a=line.strip().split(' ')\n",
|
355 |
+
" token_cnt+=len(a)\n",
|
356 |
+
" b=list(filter(lambda x: x not in all_puncs,a))\n",
|
357 |
+
" true_token_cnt+=len(b)\n",
|
358 |
+
" tokens.update(b)\n",
|
359 |
+
" \n",
|
360 |
+
" print('== Stats ==')\n",
|
361 |
+
" print('Sent count: {}'.format(sent_count))\n",
|
362 |
+
" print('Token count: {}'.format(token_cnt))\n",
|
363 |
+
" print('True Token count: {}'.format(true_token_cnt))\n",
|
364 |
+
" print('Unique Token count: {}'.format(len(tokens)))\n"
|
365 |
+
]
|
366 |
+
},
|
367 |
+
{
|
368 |
+
"cell_type": "markdown",
|
369 |
+
"metadata": {},
|
370 |
+
"source": [
|
371 |
+
"# Marathi"
|
372 |
+
]
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"cell_type": "markdown",
|
376 |
+
"metadata": {},
|
377 |
+
"source": [
|
378 |
+
"## Wikipedia"
|
379 |
+
]
|
380 |
+
},
|
381 |
+
{
|
382 |
+
"cell_type": "markdown",
|
383 |
+
"metadata": {},
|
384 |
+
"source": [
|
385 |
+
"### Wikipedia extraction commands using wikiextractor\n",
|
386 |
+
"\n",
|
387 |
+
"```\n",
|
388 |
+
"### This uses WikiExtractor (https://github.com/attardi/wikiextractor)\n",
|
389 |
+
"\n",
|
390 |
+
"x=/disk1/crawl_project/ta/wikipedia\n",
|
391 |
+
"mkdir $x\n",
|
392 |
+
"cd $x\n",
|
393 |
+
"wget https://dumps.wikimedia.org/tawiki/20190501/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
|
394 |
+
"cd /disk1/src/wikiextractor\n",
|
395 |
+
"python3 WikiExtractor.py -cb 250k -o $x/extracted $x/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
|
396 |
+
"cd -\n",
|
397 |
+
"find extracted -name '*bz2' -exec bunzip2 -c {} \\; > text.xml\n",
|
398 |
+
"rm text.xml\n",
|
399 |
+
"rm tawiki-20190501-pages-articles-multistream.xml.bz2\n",
|
400 |
+
"rm -rf extracted\n",
|
401 |
+
"```"
|
402 |
+
]
|
403 |
+
},
|
404 |
+
{
|
405 |
+
"cell_type": "markdown",
|
406 |
+
"metadata": {},
|
407 |
+
"source": [
|
408 |
+
"mrwiki-20190401-pages-articles-multistream.xml.bz2\n",
|
409 |
+
"\n",
|
410 |
+
"INFO: Finished 1-process extraction of 53715 articles in 123.6s (434.7 art/s)\n",
|
411 |
+
"\n",
|
412 |
+
"INFO: total of page: 102025, total of articl page: 53715; total of used articl page: 53715"
|
413 |
+
]
|
414 |
+
},
|
415 |
+
{
|
416 |
+
"cell_type": "markdown",
|
417 |
+
"metadata": {},
|
418 |
+
"source": [
|
419 |
+
"### Post-processing output generated by wikiextractor"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"cell_type": "code",
|
424 |
+
"execution_count": null,
|
425 |
+
"metadata": {},
|
426 |
+
"outputs": [],
|
427 |
+
"source": [
|
428 |
+
"## text.xml is extracted as shown in commands above\n",
|
429 |
+
"extract_wikiextractor_file('text.xml',\n",
|
430 |
+
" 'content_fname1.csv',\n",
|
431 |
+
" 'mr')"
|
432 |
+
]
|
433 |
+
},
|
434 |
+
{
|
435 |
+
"cell_type": "markdown",
|
436 |
+
"metadata": {},
|
437 |
+
"source": [
|
438 |
+
"## Loksatta"
|
439 |
+
]
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"cell_type": "markdown",
|
443 |
+
"metadata": {},
|
444 |
+
"source": [
|
445 |
+
"**Extractor function for Marathi Loksatta page**"
|
446 |
+
]
|
447 |
+
},
|
448 |
+
{
|
449 |
+
"cell_type": "code",
|
450 |
+
"execution_count": null,
|
451 |
+
"metadata": {},
|
452 |
+
"outputs": [],
|
453 |
+
"source": [
|
454 |
+
"def get_article_contents_mr_loksatta(fname,lang,encoding='utf-8'):\n",
|
455 |
+
" with open(fname,'r',encoding=encoding) as infile: \n",
|
456 |
+
" soup = BeautifulSoup(infile)\n",
|
457 |
+
" for elem in soup.find_all('div'):\n",
|
458 |
+
" if 'itemprop' in elem.attrs and 'articleBody' in elem['itemprop']:\n",
|
459 |
+
" filtered_paras=list(filter(lambda x: x.name=='p' and len(x.attrs)==0,elem.children))\n",
|
460 |
+
" paraid=0\n",
|
461 |
+
" for blockid, block in enumerate(filtered_paras):\n",
|
462 |
+
"# print('Para: {}'.format(blockid))\n",
|
463 |
+
"# print(list(block.strings))\n",
|
464 |
+
" text=' '.join(block.strings)\n",
|
465 |
+
" if blockid==0 and text.find(':')>=0 and text.find(':')<20:\n",
|
466 |
+
" text=':'.join(text.split(':')[1:])\n",
|
467 |
+
" for para_text in text.split('\\n'): \n",
|
468 |
+
" for sentid, sent in enumerate(sent_split(para_text,lang)):\n",
|
469 |
+
" sent=sent.strip()\n",
|
470 |
+
" if sent!='':\n",
|
471 |
+
" # print('{}: {}'.format(sentid, sent))\n",
|
472 |
+
" yield((paraid,sentid,sent))\n",
|
473 |
+
" # yield((paraid,sentid,preprocess_sent(sent,'ml',normalizer)))\n",
|
474 |
+
" # print() \n",
|
475 |
+
" paraid+=1"
|
476 |
+
]
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"cell_type": "markdown",
|
480 |
+
"metadata": {},
|
481 |
+
"source": [
|
482 |
+
"**Extracting data from crawled HTML files**"
|
483 |
+
]
|
484 |
+
},
|
485 |
+
{
|
486 |
+
"cell_type": "code",
|
487 |
+
"execution_count": null,
|
488 |
+
"metadata": {},
|
489 |
+
"outputs": [],
|
490 |
+
"source": [
|
491 |
+
"lang='mr'\n",
|
492 |
+
"posts_dir='directory_containing_crawled_html_pages'\n",
|
493 |
+
"content_fname='content_fname2.csv'\n",
|
494 |
+
"article_mapping_fname='article_mapping_fname'\n",
|
495 |
+
"get_article_contents=get_article_contents_mr_loksatta\n",
|
496 |
+
"narticles=-1"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"cell_type": "code",
|
501 |
+
"execution_count": null,
|
502 |
+
"metadata": {},
|
503 |
+
"outputs": [],
|
504 |
+
"source": [
|
505 |
+
"write_corpus(\n",
|
506 |
+
" extract_all_content(posts_dir, lang, article_extract_fn=get_article_contents,narticles=narticles),\n",
|
507 |
+
" content_fname,\n",
|
508 |
+
" article_mapping_fname\n",
|
509 |
+
" )"
|
510 |
+
]
|
511 |
+
},
|
512 |
+
{
|
513 |
+
"cell_type": "markdown",
|
514 |
+
"metadata": {},
|
515 |
+
"source": [
|
516 |
+
"## Aggregating all crawled data"
|
517 |
+
]
|
518 |
+
},
|
519 |
+
{
|
520 |
+
"cell_type": "code",
|
521 |
+
"execution_count": null,
|
522 |
+
"metadata": {},
|
523 |
+
"outputs": [],
|
524 |
+
"source": [
|
525 |
+
"### aggregating, de-duplicating and shuffling all the data \n",
|
526 |
+
"dedup_shuffle_and_print_txt([ 'content_fname1.csv', 'content_fname2.csv' ], 'output_fname.txt' )\n",
|
527 |
+
"### extract dataset statistics\n",
|
528 |
+
"dataset_stats('output_fname.txt')"
|
529 |
+
]
|
530 |
+
}
|
531 |
+
],
|
532 |
+
"metadata": {
|
533 |
+
"kernelspec": {
|
534 |
+
"display_name": "Python 3",
|
535 |
+
"language": "python",
|
536 |
+
"name": "python3"
|
537 |
+
},
|
538 |
+
"language_info": {
|
539 |
+
"codemirror_mode": {
|
540 |
+
"name": "ipython",
|
541 |
+
"version": 3
|
542 |
+
},
|
543 |
+
"file_extension": ".py",
|
544 |
+
"mimetype": "text/x-python",
|
545 |
+
"name": "python",
|
546 |
+
"nbconvert_exporter": "python",
|
547 |
+
"pygments_lexer": "ipython3",
|
548 |
+
"version": "3.6.7"
|
549 |
+
},
|
550 |
+
"toc": {
|
551 |
+
"base_numbering": 1,
|
552 |
+
"nav_menu": {
|
553 |
+
"height": "703px",
|
554 |
+
"width": "326px"
|
555 |
+
},
|
556 |
+
"number_sections": true,
|
557 |
+
"sideBar": true,
|
558 |
+
"skip_h1_title": false,
|
559 |
+
"title_cell": "Table of Contents",
|
560 |
+
"title_sidebar": "Contents",
|
561 |
+
"toc_cell": false,
|
562 |
+
"toc_position": {},
|
563 |
+
"toc_section_display": true,
|
564 |
+
"toc_window_display": false
|
565 |
+
}
|
566 |
+
},
|
567 |
+
"nbformat": 4,
|
568 |
+
"nbformat_minor": 2
|
569 |
+
}
|
indic_nlp_library/docs/Makefile
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Makefile for Sphinx documentation
|
2 |
+
#
|
3 |
+
|
4 |
+
# You can set these variables from the command line.
|
5 |
+
SPHINXOPTS =
|
6 |
+
SPHINXBUILD = sphinx-build
|
7 |
+
PAPER =
|
8 |
+
BUILDDIR = _build
|
9 |
+
|
10 |
+
# Internal variables.
|
11 |
+
PAPEROPT_a4 = -D latex_paper_size=a4
|
12 |
+
PAPEROPT_letter = -D latex_paper_size=letter
|
13 |
+
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
14 |
+
# the i18n builder cannot share the environment and doctrees with the others
|
15 |
+
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
16 |
+
|
17 |
+
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
|
18 |
+
|
19 |
+
help:
|
20 |
+
@echo "Please use \`make <target>' where <target> is one of"
|
21 |
+
@echo " html to make standalone HTML files"
|
22 |
+
@echo " dirhtml to make HTML files named index.html in directories"
|
23 |
+
@echo " singlehtml to make a single large HTML file"
|
24 |
+
@echo " pickle to make pickle files"
|
25 |
+
@echo " json to make JSON files"
|
26 |
+
@echo " htmlhelp to make HTML files and a HTML help project"
|
27 |
+
@echo " qthelp to make HTML files and a qthelp project"
|
28 |
+
@echo " devhelp to make HTML files and a Devhelp project"
|
29 |
+
@echo " epub to make an epub"
|
30 |
+
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
31 |
+
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
32 |
+
@echo " text to make text files"
|
33 |
+
@echo " man to make manual pages"
|
34 |
+
@echo " texinfo to make Texinfo files"
|
35 |
+
@echo " info to make Texinfo files and run them through makeinfo"
|
36 |
+
@echo " gettext to make PO message catalogs"
|
37 |
+
@echo " changes to make an overview of all changed/added/deprecated items"
|
38 |
+
@echo " linkcheck to check all external links for integrity"
|
39 |
+
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
40 |
+
|
41 |
+
clean:
|
42 |
+
-rm -rf $(BUILDDIR)/*
|
43 |
+
|
44 |
+
html:
|
45 |
+
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
46 |
+
@echo
|
47 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
48 |
+
|
49 |
+
dirhtml:
|
50 |
+
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
51 |
+
@echo
|
52 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
53 |
+
|
54 |
+
singlehtml:
|
55 |
+
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
56 |
+
@echo
|
57 |
+
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
58 |
+
|
59 |
+
pickle:
|
60 |
+
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
61 |
+
@echo
|
62 |
+
@echo "Build finished; now you can process the pickle files."
|
63 |
+
|
64 |
+
json:
|
65 |
+
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
66 |
+
@echo
|
67 |
+
@echo "Build finished; now you can process the JSON files."
|
68 |
+
|
69 |
+
htmlhelp:
|
70 |
+
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
71 |
+
@echo
|
72 |
+
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
73 |
+
".hhp project file in $(BUILDDIR)/htmlhelp."
|
74 |
+
|
75 |
+
qthelp:
|
76 |
+
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
77 |
+
@echo
|
78 |
+
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
79 |
+
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
80 |
+
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IndicNLPLibrary.qhcp"
|
81 |
+
@echo "To view the help file:"
|
82 |
+
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IndicNLPLibrary.qhc"
|
83 |
+
|
84 |
+
devhelp:
|
85 |
+
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
86 |
+
@echo
|
87 |
+
@echo "Build finished."
|
88 |
+
@echo "To view the help file:"
|
89 |
+
@echo "# mkdir -p $$HOME/.local/share/devhelp/IndicNLPLibrary"
|
90 |
+
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IndicNLPLibrary"
|
91 |
+
@echo "# devhelp"
|
92 |
+
|
93 |
+
epub:
|
94 |
+
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
95 |
+
@echo
|
96 |
+
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
97 |
+
|
98 |
+
latex:
|
99 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
100 |
+
@echo
|
101 |
+
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
102 |
+
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
103 |
+
"(use \`make latexpdf' here to do that automatically)."
|
104 |
+
|
105 |
+
latexpdf:
|
106 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
107 |
+
@echo "Running LaTeX files through pdflatex..."
|
108 |
+
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
109 |
+
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
110 |
+
|
111 |
+
text:
|
112 |
+
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
113 |
+
@echo
|
114 |
+
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
115 |
+
|
116 |
+
man:
|
117 |
+
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
118 |
+
@echo
|
119 |
+
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
120 |
+
|
121 |
+
texinfo:
|
122 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
123 |
+
@echo
|
124 |
+
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
125 |
+
@echo "Run \`make' in that directory to run these through makeinfo" \
|
126 |
+
"(use \`make info' here to do that automatically)."
|
127 |
+
|
128 |
+
info:
|
129 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
130 |
+
@echo "Running Texinfo files through makeinfo..."
|
131 |
+
make -C $(BUILDDIR)/texinfo info
|
132 |
+
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
133 |
+
|
134 |
+
gettext:
|
135 |
+
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
136 |
+
@echo
|
137 |
+
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
138 |
+
|
139 |
+
changes:
|
140 |
+
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
141 |
+
@echo
|
142 |
+
@echo "The overview file is in $(BUILDDIR)/changes."
|
143 |
+
|
144 |
+
linkcheck:
|
145 |
+
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
146 |
+
@echo
|
147 |
+
@echo "Link check complete; look for any errors in the above output " \
|
148 |
+
"or in $(BUILDDIR)/linkcheck/output.txt."
|
149 |
+
|
150 |
+
doctest:
|
151 |
+
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
152 |
+
@echo "Testing of doctests in the sources finished, look at the " \
|
153 |
+
"results in $(BUILDDIR)/doctest/output.txt."
|
indic_nlp_library/docs/cmd.rst
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Commandline
|
2 |
+
===========
|
3 |
+
|
4 |
+
.. argparse::
|
5 |
+
:module: indicnlp.cli.cliparser
|
6 |
+
:func: get_parser
|
7 |
+
:prog: cliparser.py
|
8 |
+
|
indic_nlp_library/docs/code.rst
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Auto Generated Documentation
|
2 |
+
============================
|
3 |
+
|
4 |
+
.. automodule:: indicnlp.langinfo indicnlp.common
|
5 |
+
:members:
|
indic_nlp_library/docs/conf.py
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
#
|
3 |
+
# Indic NLP Library documentation build configuration file, created by
|
4 |
+
# sphinx-quickstart on Tue Nov 3 01:50:37 2015.
|
5 |
+
#
|
6 |
+
# This file is execfile()d with the current directory set to its containing dir.
|
7 |
+
#
|
8 |
+
# Note that not all possible configuration values are present in this
|
9 |
+
# autogenerated file.
|
10 |
+
#
|
11 |
+
# All configuration values have a default; values that are commented out
|
12 |
+
# serve to show the default.
|
13 |
+
|
14 |
+
import sys, os
|
15 |
+
|
16 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
17 |
+
# add these directories to sys.path here. If the directory is relative to the
|
18 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
19 |
+
sys.path.insert(0, os.path.abspath('..'))
|
20 |
+
|
21 |
+
# -- General configuration -----------------------------------------------------
|
22 |
+
|
23 |
+
# If your documentation needs a minimal Sphinx version, state it here.
|
24 |
+
#needs_sphinx = '1.0'
|
25 |
+
|
26 |
+
# Add any Sphinx extension module names here, as strings. They can be extensions
|
27 |
+
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
28 |
+
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', 'sphinxarg.ext']
|
29 |
+
|
30 |
+
# Add any paths that contain templates here, relative to this directory.
|
31 |
+
templates_path = ['_templates']
|
32 |
+
|
33 |
+
# The suffix of source filenames.
|
34 |
+
source_suffix = '.rst'
|
35 |
+
|
36 |
+
# The encoding of source files.
|
37 |
+
#source_encoding = 'utf-8-sig'
|
38 |
+
|
39 |
+
# The master toctree document.
|
40 |
+
master_doc = 'index'
|
41 |
+
|
42 |
+
# General information about the project.
|
43 |
+
project = 'Indic NLP Library'
|
44 |
+
copyright = '2015, Anoop Kunchukuttan'
|
45 |
+
|
46 |
+
# The version info for the project you're documenting, acts as replacement for
|
47 |
+
# |version| and |release|, also used in various other places throughout the
|
48 |
+
# built documents.
|
49 |
+
#
|
50 |
+
# The short X.Y version.
|
51 |
+
version = '0.2'
|
52 |
+
# The full version, including alpha/beta/rc tags.
|
53 |
+
release = '0.2'
|
54 |
+
|
55 |
+
# The language for content autogenerated by Sphinx. Refer to documentation
|
56 |
+
# for a list of supported languages.
|
57 |
+
#language = None
|
58 |
+
|
59 |
+
# There are two options for replacing |today|: either, you set today to some
|
60 |
+
# non-false value, then it is used:
|
61 |
+
#today = ''
|
62 |
+
# Else, today_fmt is used as the format for a strftime call.
|
63 |
+
#today_fmt = '%B %d, %Y'
|
64 |
+
|
65 |
+
# List of patterns, relative to source directory, that match files and
|
66 |
+
# directories to ignore when looking for source files.
|
67 |
+
exclude_patterns = ['_build']
|
68 |
+
|
69 |
+
# The reST default role (used for this markup: `text`) to use for all documents.
|
70 |
+
#default_role = None
|
71 |
+
|
72 |
+
# If true, '()' will be appended to :func: etc. cross-reference text.
|
73 |
+
#add_function_parentheses = True
|
74 |
+
|
75 |
+
# If true, the current module name will be prepended to all description
|
76 |
+
# unit titles (such as .. function::).
|
77 |
+
#add_module_names = True
|
78 |
+
|
79 |
+
# If true, sectionauthor and moduleauthor directives will be shown in the
|
80 |
+
# output. They are ignored by default.
|
81 |
+
#show_authors = False
|
82 |
+
|
83 |
+
# The name of the Pygments (syntax highlighting) style to use.
|
84 |
+
pygments_style = 'sphinx'
|
85 |
+
|
86 |
+
# A list of ignored prefixes for module index sorting.
|
87 |
+
#modindex_common_prefix = []
|
88 |
+
|
89 |
+
|
90 |
+
# -- Options for HTML output ---------------------------------------------------
|
91 |
+
|
92 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
93 |
+
# a list of builtin themes.
|
94 |
+
html_theme = 'sphinx_rtd_theme'
|
95 |
+
|
96 |
+
# Theme options are theme-specific and customize the look and feel of a theme
|
97 |
+
# further. For a list of options available for each theme, see the
|
98 |
+
# documentation.
|
99 |
+
#html_theme_options = {}
|
100 |
+
|
101 |
+
# Add any paths that contain custom themes here, relative to this directory.
|
102 |
+
#html_theme_path = []
|
103 |
+
|
104 |
+
# The name for this set of Sphinx documents. If None, it defaults to
|
105 |
+
# "<project> v<release> documentation".
|
106 |
+
#html_title = None
|
107 |
+
|
108 |
+
# A shorter title for the navigation bar. Default is the same as html_title.
|
109 |
+
#html_short_title = None
|
110 |
+
|
111 |
+
# The name of an image file (relative to this directory) to place at the top
|
112 |
+
# of the sidebar.
|
113 |
+
#html_logo = None
|
114 |
+
|
115 |
+
# The name of an image file (within the static path) to use as favicon of the
|
116 |
+
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
117 |
+
# pixels large.
|
118 |
+
#html_favicon = None
|
119 |
+
|
120 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
121 |
+
# relative to this directory. They are copied after the builtin static files,
|
122 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
123 |
+
html_static_path = ['_static']
|
124 |
+
|
125 |
+
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
126 |
+
# using the given strftime format.
|
127 |
+
#html_last_updated_fmt = '%b %d, %Y'
|
128 |
+
|
129 |
+
# If true, SmartyPants will be used to convert quotes and dashes to
|
130 |
+
# typographically correct entities.
|
131 |
+
#html_use_smartypants = True
|
132 |
+
|
133 |
+
# Custom sidebar templates, maps document names to template names.
|
134 |
+
#html_sidebars = {}
|
135 |
+
|
136 |
+
# Additional templates that should be rendered to pages, maps page names to
|
137 |
+
# template names.
|
138 |
+
#html_additional_pages = {}
|
139 |
+
|
140 |
+
# If false, no module index is generated.
|
141 |
+
#html_domain_indices = True
|
142 |
+
|
143 |
+
# If false, no index is generated.
|
144 |
+
#html_use_index = True
|
145 |
+
|
146 |
+
# If true, the index is split into individual pages for each letter.
|
147 |
+
#html_split_index = False
|
148 |
+
|
149 |
+
# If true, links to the reST sources are added to the pages.
|
150 |
+
#html_show_sourcelink = True
|
151 |
+
|
152 |
+
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
153 |
+
#html_show_sphinx = True
|
154 |
+
|
155 |
+
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
156 |
+
#html_show_copyright = True
|
157 |
+
|
158 |
+
# If true, an OpenSearch description file will be output, and all pages will
|
159 |
+
# contain a <link> tag referring to it. The value of this option must be the
|
160 |
+
# base URL from which the finished HTML is served.
|
161 |
+
#html_use_opensearch = ''
|
162 |
+
|
163 |
+
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
164 |
+
#html_file_suffix = None
|
165 |
+
|
166 |
+
# Output file base name for HTML help builder.
|
167 |
+
htmlhelp_basename = 'IndicNLPLibrarydoc'
|
168 |
+
|
169 |
+
|
170 |
+
# -- Options for LaTeX output --------------------------------------------------
|
171 |
+
|
172 |
+
latex_elements = {
|
173 |
+
# The paper size ('letterpaper' or 'a4paper').
|
174 |
+
#'papersize': 'letterpaper',
|
175 |
+
|
176 |
+
# The font size ('10pt', '11pt' or '12pt').
|
177 |
+
#'pointsize': '10pt',
|
178 |
+
|
179 |
+
# Additional stuff for the LaTeX preamble.
|
180 |
+
#'preamble': '',
|
181 |
+
}
|
182 |
+
|
183 |
+
# Grouping the document tree into LaTeX files. List of tuples
|
184 |
+
# (source start file, target name, title, author, documentclass [howto/manual]).
|
185 |
+
latex_documents = [
|
186 |
+
('index', 'IndicNLPLibrary.tex', 'Indic NLP Library Documentation',
|
187 |
+
'Anoop Kunchukuttan', 'manual'),
|
188 |
+
]
|
189 |
+
|
190 |
+
# The name of an image file (relative to this directory) to place at the top of
|
191 |
+
# the title page.
|
192 |
+
#latex_logo = None
|
193 |
+
|
194 |
+
# For "manual" documents, if this is true, then toplevel headings are parts,
|
195 |
+
# not chapters.
|
196 |
+
#latex_use_parts = False
|
197 |
+
|
198 |
+
# If true, show page references after internal links.
|
199 |
+
#latex_show_pagerefs = False
|
200 |
+
|
201 |
+
# If true, show URL addresses after external links.
|
202 |
+
#latex_show_urls = False
|
203 |
+
|
204 |
+
# Documents to append as an appendix to all manuals.
|
205 |
+
#latex_appendices = []
|
206 |
+
|
207 |
+
# If false, no module index is generated.
|
208 |
+
#latex_domain_indices = True
|
209 |
+
|
210 |
+
|
211 |
+
# -- Options for manual page output --------------------------------------------
|
212 |
+
|
213 |
+
# One entry per manual page. List of tuples
|
214 |
+
# (source start file, name, description, authors, manual section).
|
215 |
+
man_pages = [
|
216 |
+
('index', 'indicnlplibrary', 'Indic NLP Library Documentation',
|
217 |
+
['Anoop Kunchukuttan'], 1)
|
218 |
+
]
|
219 |
+
|
220 |
+
# If true, show URL addresses after external links.
|
221 |
+
#man_show_urls = False
|
222 |
+
|
223 |
+
|
224 |
+
# -- Options for Texinfo output ------------------------------------------------
|
225 |
+
|
226 |
+
# Grouping the document tree into Texinfo files. List of tuples
|
227 |
+
# (source start file, target name, title, author,
|
228 |
+
# dir menu entry, description, category)
|
229 |
+
texinfo_documents = [
|
230 |
+
('index', 'IndicNLPLibrary', 'Indic NLP Library Documentation',
|
231 |
+
'Anoop Kunchukuttan', 'IndicNLPLibrary', 'NLP library for Indian languages',
|
232 |
+
'NLP'),
|
233 |
+
]
|
234 |
+
|
235 |
+
# Documents to append as an appendix to all manuals.
|
236 |
+
#texinfo_appendices = []
|
237 |
+
|
238 |
+
# If false, no module index is generated.
|
239 |
+
#texinfo_domain_indices = True
|
240 |
+
|
241 |
+
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
242 |
+
#texinfo_show_urls = 'footnote'
|
indic_nlp_library/docs/index.rst
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.. Indic NLP Library documentation master file, created by
|
2 |
+
sphinx-quickstart on Tue Nov 3 01:50:37 2015.
|
3 |
+
You can adapt this file completely to your liking, but it should at least
|
4 |
+
contain the root `toctree` directive.
|
5 |
+
|
6 |
+
:github_url: https://github.com/anoopkunchukuttan/indic_nlp_library
|
7 |
+
|
8 |
+
.. toctree::
|
9 |
+
:maxdepth: 2
|
10 |
+
:caption: Packages
|
11 |
+
|
12 |
+
indicnlp
|
13 |
+
|
14 |
+
.. toctree::
|
15 |
+
:maxdepth: 2
|
16 |
+
:caption: Commandline
|
17 |
+
|
18 |
+
cmd
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
|
indic_nlp_library/docs/indicnlp.MD
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Indic NLP Library
|
2 |
+
## A unified approach to NLP for Indian languages
|
3 |
+
|
4 |
+
### Anoop Kunchukuttan (`anoop.kunchukuttan@gmail.com`)
|
5 |
+
|
6 |
+
The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
|
7 |
+
|
8 |
+
The library provides the following functionalities:
|
9 |
+
|
10 |
+
- Text Normalization
|
11 |
+
- Script Information
|
12 |
+
- Word Tokenization and Detokenization
|
13 |
+
- Sentence Splitting
|
14 |
+
- Word Segmentation
|
15 |
+
- Syllabification
|
16 |
+
- Script Conversion
|
17 |
+
- Romanization
|
18 |
+
- Indicization
|
19 |
+
- Transliteration
|
20 |
+
- Translation
|
21 |
+
|
22 |
+
The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
|
23 |
+
|
24 |
+
**If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/AI4Bharat/indicnlp_catalog) for pointers.**
|
25 |
+
|
26 |
+
## Pre-requisites
|
27 |
+
|
28 |
+
- Python 3.x
|
29 |
+
- (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
|
30 |
+
- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
|
31 |
+
- Other dependencies are listed in setup.py
|
32 |
+
|
33 |
+
|
34 |
+
## Configuration
|
35 |
+
|
36 |
+
- Installation from pip:
|
37 |
+
|
38 |
+
`pip install indic-nlp-library`
|
39 |
+
|
40 |
+
- If you want to use the project from the github repo, add the project to the Python Path:
|
41 |
+
|
42 |
+
- Clone this repository
|
43 |
+
- Install dependencies: `pip install -r requirements.txt`
|
44 |
+
- Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
|
45 |
+
|
46 |
+
- In either case, export the path to the _Indic NLP Resources_ directory
|
47 |
+
|
48 |
+
Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
|
49 |
+
|
50 |
+
## Usage
|
51 |
+
|
52 |
+
You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
|
53 |
+
|
54 |
+
### Getting Started
|
55 |
+
|
56 |
+
Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples of how to use the Python API.
|
57 |
+
- You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
|
58 |
+
|
59 |
+
### Documentation
|
60 |
+
|
61 |
+
You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
|
62 |
+
|
63 |
+
This documents the Python API as well as the commandline reference.
|
64 |
+
|
65 |
+
## Citing
|
66 |
+
|
67 |
+
If you use this library, please include the following citation:
|
68 |
+
|
69 |
+
```
|
70 |
+
@unpublished{kunchukuttan2020indicnlp,
|
71 |
+
author = "Anoop Kunchukuttan",
|
72 |
+
title = "The IndicNLP Library",
|
73 |
+
year = "2020",
|
74 |
+
}
|
75 |
+
```
|
76 |
+
You can find the document [HERE](docs/indicnlp.pdf)
|
77 |
+
|
78 |
+
## Website
|
79 |
+
|
80 |
+
`http://anoopkunchukuttan.github.io/indic_nlp_library`
|
81 |
+
|
82 |
+
## Author
|
83 |
+
Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](mailto:anoop.kunchukuttan@gmail.com))
|
84 |
+
|
85 |
+
## Version: 0.7
|
86 |
+
|
87 |
+
## Revision Log
|
88 |
+
|
89 |
+
0.7 : 02 Apr 2020:
|
90 |
+
|
91 |
+
- Unified commandline
|
92 |
+
- Improved documentation
|
93 |
+
- Added setup.py
|
94 |
+
|
95 |
+
0.6 : 16 Dec 2019:
|
96 |
+
|
97 |
+
- New romanizer and indicizer
|
98 |
+
- Script Unifiers
|
99 |
+
- Improved script normalizers
|
100 |
+
- Added contrib directory for sample uses
|
101 |
+
- changed to MIT license
|
102 |
+
|
103 |
+
0.5 : 03 Jun 2019:
|
104 |
+
|
105 |
+
- Improved word tokenizer to handle dates and numbers.
|
106 |
+
- Added sentence splitter that can handle common prefixes/honorifics and uses some heuristics.
|
107 |
+
- Added detokenizer
|
108 |
+
- Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
|
109 |
+
|
110 |
+
0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
|
111 |
+
|
112 |
+
0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
|
113 |
+
|
114 |
+
0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
|
115 |
+
|
116 |
+
0.1 : 12 Mar 2014: Initial version. Supports text normalization.
|
117 |
+
|
118 |
+
## LICENSE
|
119 |
+
|
120 |
+
Indic NLP Library is released under the MIT license
|
121 |
+
|
122 |
+
|
indic_nlp_library/docs/indicnlp.cli.rst
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cli Package
|
2 |
+
=============
|
3 |
+
|
4 |
+
:mod:`cliparser` Module
|
5 |
+
--------------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.cli.cliparser
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
indic_nlp_library/docs/indicnlp.morph.rst
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
morph Package
|
2 |
+
=============
|
3 |
+
|
4 |
+
:mod:`unsupervised_morph` Module
|
5 |
+
--------------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.morph.unsupervised_morph
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
indic_nlp_library/docs/indicnlp.normalize.rst
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
normalize Package
|
2 |
+
=================
|
3 |
+
|
4 |
+
:mod:`indic_normalize` Module
|
5 |
+
-----------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.normalize.indic_normalize
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
.. autoclass:: indicnlp.normalize.indic_normalize.
|
13 |
+
:members:
|
14 |
+
:undoc-members:
|
15 |
+
:show-inheritance:
|
indic_nlp_library/docs/indicnlp.pdf
ADDED
Binary file (38.1 kB). View file
|
|
indic_nlp_library/docs/indicnlp.rst
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
indicnlp Package
|
2 |
+
================
|
3 |
+
|
4 |
+
:mod:`common` Module
|
5 |
+
--------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.common
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`langinfo` Module
|
13 |
+
----------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.langinfo
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`loader` Module
|
21 |
+
--------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.loader
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
27 |
+
|
28 |
+
Subpackages
|
29 |
+
-----------
|
30 |
+
|
31 |
+
.. toctree::
|
32 |
+
|
33 |
+
indicnlp.cli
|
34 |
+
indicnlp.morph
|
35 |
+
indicnlp.normalize
|
36 |
+
indicnlp.script
|
37 |
+
indicnlp.syllable
|
38 |
+
indicnlp.tokenize
|
39 |
+
indicnlp.transliterate
|
40 |
+
|
41 |
+
Indices and tables
|
42 |
+
==================
|
43 |
+
|
44 |
+
* :ref:`genindex`
|
45 |
+
* :ref:`modindex`
|
46 |
+
* :ref:`search`
|
47 |
+
|
indic_nlp_library/docs/indicnlp.script.rst
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
script Package
|
2 |
+
==============
|
3 |
+
|
4 |
+
:mod:`indic_scripts` Module
|
5 |
+
---------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.script.indic_scripts
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`english_script` Module
|
13 |
+
----------------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.script.english_script
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`phonetic_sim` Module
|
21 |
+
---------------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.script.phonetic_sim
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
indic_nlp_library/docs/indicnlp.syllable.rst
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
syllable Package
|
2 |
+
================
|
3 |
+
|
4 |
+
:mod:`syllabifier` Module
|
5 |
+
---------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.syllable.syllabifier
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
indic_nlp_library/docs/indicnlp.tokenize.rst
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tokenize Package
|
2 |
+
================
|
3 |
+
|
4 |
+
:mod:`indic_tokenize` Module
|
5 |
+
----------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.tokenize.indic_tokenize
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`indic_detokenize` Module
|
13 |
+
------------------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.tokenize.indic_detokenize
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`sentence_tokenize` Module
|
21 |
+
-------------------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.tokenize.sentence_tokenize
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
indic_nlp_library/docs/indicnlp.transliterate.rst
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transliterate Package
|
2 |
+
=====================
|
3 |
+
|
4 |
+
:mod:`sinhala_transliterator` Module
|
5 |
+
------------------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.transliterate.sinhala_transliterator
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`unicode_transliterate` Module
|
13 |
+
-----------------------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.transliterate.unicode_transliterate
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`acronym_transliterator` Module
|
21 |
+
------------------------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.transliterate.acronym_transliterator
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
27 |
+
|
28 |
+
:mod:`script_unifier` Module
|
29 |
+
-----------------------------------
|
30 |
+
|
31 |
+
.. automodule:: indicnlp.transliterate.script_unifier
|
32 |
+
:members:
|
33 |
+
:undoc-members:
|
34 |
+
:show-inheritance:
|
indic_nlp_library/docs/make.bat
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@ECHO OFF
|
2 |
+
|
3 |
+
pushd %~dp0
|
4 |
+
|
5 |
+
REM Command file for Sphinx documentation
|
6 |
+
|
7 |
+
if "%SPHINXBUILD%" == "" (
|
8 |
+
set SPHINXBUILD=sphinx-build
|
9 |
+
)
|
10 |
+
set SOURCEDIR=.
|
11 |
+
set BUILDDIR=_build
|
12 |
+
|
13 |
+
if "%1" == "" goto help
|
14 |
+
|
15 |
+
%SPHINXBUILD% >NUL 2>NUL
|
16 |
+
if errorlevel 9009 (
|
17 |
+
echo.
|
18 |
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
19 |
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
20 |
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
21 |
+
echo.may add the Sphinx directory to PATH.
|
22 |
+
echo.
|
23 |
+
echo.If you don't have Sphinx installed, grab it from
|
24 |
+
echo.http://sphinx-doc.org/
|
25 |
+
exit /b 1
|
26 |
+
)
|
27 |
+
|
28 |
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
29 |
+
goto end
|
30 |
+
|
31 |
+
:help
|
32 |
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
33 |
+
|
34 |
+
:end
|
35 |
+
popd
|
indic_nlp_library/docs/modules.rst
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
indicnlp
|
2 |
+
========
|
3 |
+
|
4 |
+
.. toctree::
|
5 |
+
:maxdepth: 4
|
6 |
+
|
7 |
+
indicnlp
|
indic_nlp_library/indicnlp/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
try:
    # Use the sibling ``version`` module when it can be imported.
    from .version import __version__  # noqa
except ImportError:
    # Otherwise read the version string from version.txt located in the
    # same directory as this file.
    version_txt = os.path.join(os.path.dirname(__file__), "version.txt")
    with open(version_txt) as f:
        __version__ = f.read().strip()
|
10 |
+
|
indic_nlp_library/indicnlp/cli/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/cli/cliparser.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import sys
|
3 |
+
|
4 |
+
from indicnlp import loader
|
5 |
+
from indicnlp.tokenize import indic_tokenize
|
6 |
+
from indicnlp.tokenize import indic_detokenize
|
7 |
+
from indicnlp.normalize import indic_normalize
|
8 |
+
from indicnlp.morph import unsupervised_morph
|
9 |
+
from indicnlp.tokenize import sentence_tokenize
|
10 |
+
from indicnlp.syllable import syllabifier
|
11 |
+
from indicnlp.transliterate import unicode_transliterate
|
12 |
+
from indicnlp.transliterate import script_unifier
|
13 |
+
|
14 |
+
DEFAULT_ENCODING='utf-8'
|
15 |
+
|
16 |
+
def run_detokenize(args):
    """
    Pass every line of ``args.infile`` through
    ``indic_detokenize.trivial_detokenize`` for ``args.lang`` and write the
    result to ``args.outfile``.
    """
    for raw_line in args.infile:
        detokenized = indic_detokenize.trivial_detokenize(raw_line, args.lang)
        args.outfile.write(detokenized)
|
19 |
+
|
20 |
+
def run_tokenize(args):
    """
    Tokenize every line of ``args.infile`` with
    ``indic_tokenize.trivial_tokenize`` for ``args.lang`` and write the tokens,
    joined by single spaces, to ``args.outfile``.
    """
    for raw_line in args.infile:
        tokens = indic_tokenize.trivial_tokenize(raw_line, args.lang)
        args.outfile.write(' '.join(tokens))
|
24 |
+
|
25 |
+
def run_sentence_split(args):
    """
    Read all of ``args.infile``, drop CR/LF characters, join the lines with
    spaces and write each sentence returned by
    ``sentence_tokenize.sentence_split`` on its own line of ``args.outfile``.
    """
    unwrapped = [
        chunk.replace('\n', '').replace('\r', '') for chunk in args.infile
    ]
    text = ' '.join(unwrapped)
    sentences = sentence_tokenize.sentence_split(text, args.lang)
    for sentence in sentences:
        args.outfile.write(sentence + '\n')
|
30 |
+
|
31 |
+
def run_normalize(args):
    """
    Normalize every line of ``args.infile`` with the normalizer that
    ``IndicNormalizerFactory`` returns for ``args.lang`` and write the result
    to ``args.outfile``.
    """
    # TODO: add more options to cli
    normalizer = indic_normalize.IndicNormalizerFactory().get_normalizer(
        args.lang,
        remove_nuktas=False,
        nasals_mode='do_nothing',
    )

    for raw_line in args.infile:
        args.outfile.write(normalizer.normalize(raw_line))
|
47 |
+
|
48 |
+
def run_morph(args):
    """
    Split every line of ``args.infile`` on single spaces, run the words
    through ``UnsupervisedMorphAnalyzer.morph_analyze_document`` and write the
    returned tokens, space-joined and newline-terminated, to ``args.outfile``.
    """
    # Second positional argument is the add_marker flag, kept off here.
    analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer(args.lang, False)
    for raw_line in args.infile:
        words = raw_line.strip().split(' ')
        segments = analyzer.morph_analyze_document(words)
        args.outfile.write(' '.join(segments) + '\n')
|
55 |
+
|
56 |
+
def run_syllabify(args):
    """
    For every line of ``args.infile``, replace each space-separated word by
    the space-joined output of ``syllabifier.orthographic_syllabify`` and write
    the rebuilt line, newline-terminated, to ``args.outfile``.
    """
    for raw_line in args.infile:
        syllabified_words = []
        for word in raw_line.strip().split(' '):
            syllables = syllabifier.orthographic_syllabify(word, args.lang)
            syllabified_words.append(' '.join(syllables))
        args.outfile.write(' '.join(syllabified_words) + '\n')
|
63 |
+
|
64 |
+
def run_wc(args):
    """
    Print the line, word and character counts of ``args.infile`` on a single
    line, separated by spaces.

    Words are maximal runs of non-whitespace characters, so blank lines
    contribute no words and repeated spaces are not counted as empty words.
    Characters are counted per line as read, including the line terminator.
    """
    line_count = 0
    word_count = 0
    char_count = 0

    for line in args.infile:
        line_count += 1
        # str.split() without arguments splits on any whitespace run and
        # returns [] for a whitespace-only line.
        word_count += len(line.split())
        char_count += len(line)

    print('{} {} {}'.format(line_count, word_count, char_count))
|
78 |
+
|
79 |
+
def run_indic2roman(args):
    """
    Convert every line of ``args.infile`` with
    ``ItransTransliterator.to_itrans`` for ``args.lang`` and write it to
    ``args.outfile``.
    """
    to_itrans = unicode_transliterate.ItransTransliterator.to_itrans
    for raw_line in args.infile:
        args.outfile.write(to_itrans(raw_line, args.lang))
|
84 |
+
|
85 |
+
def run_roman2indic(args):
    """
    Convert every line of ``args.infile`` with
    ``ItransTransliterator.from_itrans`` for ``args.lang`` and write it to
    ``args.outfile``.
    """
    from_itrans = unicode_transliterate.ItransTransliterator.from_itrans
    for raw_line in args.infile:
        args.outfile.write(from_itrans(raw_line, args.lang))
|
90 |
+
|
91 |
+
def run_script_unify(args):
    """
    Build the script unifier selected by ``args.mode`` ('aggressive', 'basic'
    or 'naive') with ``args.common_lang`` and write ``unifier.transform`` of
    every line of ``args.infile`` to ``args.outfile``.

    An AssertionError is raised for any other mode.
    """
    mode = args.mode
    if mode == 'aggressive':
        unifier = script_unifier.AggressiveScriptUnifier(
            nasals_mode='to_anusvaara_relaxed',
            common_lang=args.common_lang,
        )
    elif mode == 'basic':
        unifier = script_unifier.BasicScriptUnifier(
            nasals_mode='do_nothing',
            common_lang=args.common_lang,
        )
    elif mode == 'naive':
        unifier = script_unifier.NaiveScriptUnifier(
            common_lang=args.common_lang,
        )
    else:
        unifier = None

    assert unifier is not None

    for raw_line in args.infile:
        args.outfile.write(unifier.transform(raw_line, args.lang))
|
110 |
+
|
111 |
+
def run_script_convert(args):
    """
    Convert every line of ``args.infile`` from ``args.srclang`` to
    ``args.tgtlang`` with ``UnicodeIndicTransliterator.transliterate`` and
    write it to ``args.outfile``.
    """
    transliterate = unicode_transliterate.UnicodeIndicTransliterator.transliterate
    for raw_line in args.infile:
        converted = transliterate(raw_line, args.srclang, args.tgtlang)
        args.outfile.write(converted)
|
116 |
+
|
117 |
+
def add_common_monolingual_args(task_parser):
    """
    Add the arguments shared by single-language subcommands: optional
    positional ``infile`` (default stdin) and ``outfile`` (default stdout),
    both opened with DEFAULT_ENCODING, plus ``-l/--lang``.
    """
    reader = argparse.FileType('r', encoding=DEFAULT_ENCODING)
    writer = argparse.FileType('w', encoding=DEFAULT_ENCODING)

    task_parser.add_argument(
        'infile', type=reader, nargs='?', default=sys.stdin,
        help='Input File path',
    )
    task_parser.add_argument(
        'outfile', type=writer, nargs='?', default=sys.stdout,
        help='Output File path',
    )
    task_parser.add_argument('-l', '--lang', help='Language')
|
133 |
+
|
134 |
+
def add_common_bilingual_args(task_parser):
    """
    Add the arguments shared by two-language subcommands: optional positional
    ``infile`` (default stdin) and ``outfile`` (default stdout), both opened
    with DEFAULT_ENCODING, plus ``-s/--srclang`` and ``-t/--tgtlang``.
    """
    reader = argparse.FileType('r', encoding=DEFAULT_ENCODING)
    writer = argparse.FileType('w', encoding=DEFAULT_ENCODING)

    task_parser.add_argument(
        'infile', type=reader, nargs='?', default=sys.stdin,
        help='Input File path',
    )
    task_parser.add_argument(
        'outfile', type=writer, nargs='?', default=sys.stdout,
        help='Output File path',
    )
    task_parser.add_argument('-s', '--srclang', help='Source Language')
    task_parser.add_argument('-t', '--tgtlang', help='Target Language')
|
154 |
+
|
155 |
+
def add_tokenize_parser(subparsers):
    """Register the ``tokenize`` subcommand and bind it to ``run_tokenize``."""
    tokenize_parser = subparsers.add_parser('tokenize', help='tokenizer help')
    add_common_monolingual_args(tokenize_parser)
    tokenize_parser.set_defaults(func=run_tokenize)
|
160 |
+
|
161 |
+
def add_detokenize_parser(subparsers):
    """Register the ``detokenize`` subcommand and bind it to ``run_detokenize``."""
    detokenize_parser = subparsers.add_parser(
        'detokenize', help='de-tokenizer help')
    add_common_monolingual_args(detokenize_parser)
    detokenize_parser.set_defaults(func=run_detokenize)
|
166 |
+
|
167 |
+
def add_sentence_split_parser(subparsers):
    """Register the ``sentence_split`` subcommand and bind it to ``run_sentence_split``."""
    split_parser = subparsers.add_parser(
        'sentence_split', help='sentence split help')
    add_common_monolingual_args(split_parser)
    split_parser.set_defaults(func=run_sentence_split)
|
171 |
+
|
172 |
+
def add_normalize_parser(subparsers):
    """Register the ``normalize`` subcommand and bind it to ``run_normalize``."""
    normalize_parser = subparsers.add_parser(
        'normalize', help='normalizer help')
    add_common_monolingual_args(normalize_parser)
    normalize_parser.set_defaults(func=run_normalize)
|
176 |
+
|
177 |
+
def add_morph_parser(subparsers):
    """Register the ``morph`` subcommand and bind it to ``run_morph``."""
    morph_parser = subparsers.add_parser('morph', help='morph help')
    add_common_monolingual_args(morph_parser)
    morph_parser.set_defaults(func=run_morph)
|
181 |
+
|
182 |
+
def add_syllabify_parser(subparsers):
    """Register the ``syllabify`` subcommand and bind it to ``run_syllabify``."""
    syllabify_parser = subparsers.add_parser(
        'syllabify', help='syllabify help')
    add_common_monolingual_args(syllabify_parser)
    syllabify_parser.set_defaults(func=run_syllabify)
|
186 |
+
|
187 |
+
def add_wc_parser(subparsers):
    """
    Register the ``wc`` subcommand and bind it to ``run_wc``.

    The subcommand takes only an optional positional ``infile`` (default
    stdin, opened with DEFAULT_ENCODING); no -l/-w/-c switches are defined.
    """
    wc_parser = subparsers.add_parser('wc', help='wc help')

    reader = argparse.FileType('r', encoding=DEFAULT_ENCODING)
    wc_parser.add_argument(
        'infile', type=reader, nargs='?', default=sys.stdin,
        help='Input File path',
    )

    wc_parser.set_defaults(func=run_wc)
|
204 |
+
|
205 |
+
def add_indic2roman_parser(subparsers):
    """Register the ``indic2roman`` subcommand and bind it to ``run_indic2roman``."""
    indic2roman_parser = subparsers.add_parser(
        'indic2roman', help='indic2roman help')
    add_common_monolingual_args(indic2roman_parser)
    indic2roman_parser.set_defaults(func=run_indic2roman)
|
209 |
+
|
210 |
+
def add_roman2indic_parser(subparsers):
    """
    Register the ``roman2indic`` subcommand and bind it to ``run_roman2indic``.

    The handler must be ``run_roman2indic``; binding ``run_indic2roman`` here
    would make the subcommand convert in the opposite direction.
    """
    task_parser = subparsers.add_parser('roman2indic', help='roman2indic help')
    add_common_monolingual_args(task_parser)
    task_parser.set_defaults(func=run_roman2indic)
|
214 |
+
|
215 |
+
def add_script_unify_parser(subparsers):
    """
    Register the ``script_unify`` subcommand and bind it to
    ``run_script_unify``.

    Besides the common monolingual arguments it accepts ``-m/--mode``
    (naive, basic or aggressive; default basic) and ``-c/--common_lang``
    (default 'hi').
    """
    unify_parser = subparsers.add_parser(
        'script_unify', help='script_unify help')
    add_common_monolingual_args(unify_parser)

    unify_parser.add_argument(
        '-m', '--mode',
        default='basic',
        choices=['naive', 'basic', 'aggressive'],
        help='Script unification mode',
    )
    unify_parser.add_argument(
        '-c', '--common_lang',
        default='hi',
        help='Common language in which all languages are represented',
    )

    unify_parser.set_defaults(func=run_script_unify)
|
229 |
+
|
230 |
+
def add_script_convert_parser(subparsers):
    """Register the ``script_convert`` subcommand and bind it to ``run_script_convert``."""
    convert_parser = subparsers.add_parser(
        'script_convert', help='script convert help')
    add_common_bilingual_args(convert_parser)
    convert_parser.set_defaults(func=run_script_convert)
|
234 |
+
|
235 |
+
def get_parser():
    """
    Build the top-level ``indicnlp`` argument parser and register every
    subcommand on it. The chosen subcommand name is stored in
    ``args.subcommand``.
    """
    parser = argparse.ArgumentParser(prog='indicnlp')
    subparsers = parser.add_subparsers(
        help='Invoke each operation with one of the subcommands',
        dest='subcommand',
    )

    # Registration order is the same as the order shown in the help output.
    registrars = (
        add_tokenize_parser,
        add_detokenize_parser,
        add_sentence_split_parser,
        add_normalize_parser,
        add_morph_parser,
        add_syllabify_parser,
        add_wc_parser,
        add_indic2roman_parser,
        add_roman2indic_parser,
        add_script_unify_parser,
        add_script_convert_parser,
    )
    for register in registrars:
        register(subparsers)

    return parser
|
256 |
+
|
257 |
+
def main():
    """
    Parse the command line and run the handler bound to the chosen
    subcommand.

    When no subcommand is given, argparse leaves ``func`` unset; in that case
    a usage error is reported instead of failing with AttributeError.
    """
    parser = get_parser()
    args = parser.parse_args()
    handler = getattr(args, 'func', None)
    if handler is None:
        # parser.error() prints the usage and the message to stderr and
        # exits with status 2.
        parser.error('a subcommand is required')
    handler(args)


if __name__ == '__main__':
    loader.load()
    main()
|
266 |
+
|
indic_nlp_library/indicnlp/common.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import os
|
10 |
+
|
11 |
+
"""
|
12 |
+
Path to the Indic NLP Resources directory
|
13 |
+
"""
|
14 |
+
INDIC_RESOURCES_PATH=''
|
15 |
+
|
16 |
+
def init():
    """
    Initialize the module. The following actions are performed:

    - Checks if the INDIC_RESOURCES_PATH variable is set. If not, tries to
      initialize it from the INDIC_RESOURCES_PATH environment variable.
      If it is still empty afterwards, an IndicNlpException is raised.
    """
    global INDIC_RESOURCES_PATH
    if INDIC_RESOURCES_PATH == '':
        # A missing environment variable is treated like an empty one, so
        # both cases end in the same error below.
        INDIC_RESOURCES_PATH = os.environ.get('INDIC_RESOURCES_PATH', '')

    if INDIC_RESOURCES_PATH == '':
        raise IndicNlpException('INDIC_RESOURCES_PATH not set')
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
def get_resources_path():
    """
    Return the current value of the module-level INDIC_RESOURCES_PATH
    (path to the Indic NLP Resources directory).
    """
    resources_path = INDIC_RESOURCES_PATH
    return resources_path
|
40 |
+
|
41 |
+
def set_resources_path(resources_path):
    """
    Store ``resources_path`` in the module-level INDIC_RESOURCES_PATH
    (path to the Indic NLP Resources directory).
    """
    global INDIC_RESOURCES_PATH
    INDIC_RESOURCES_PATH = resources_path
|
47 |
+
|
48 |
+
class IndicNlpException(Exception):
    """
    Exception type raised by Indic NLP Library components.

    The ``msg`` attribute holds the exception details; ``str()`` of the
    exception is the ``repr()`` of that message.
    """

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return '{!r}'.format(self.msg)
|
58 |
+
|
indic_nlp_library/indicnlp/langinfo.py
ADDED
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
## language codes
LC_TA = 'ta'

# [start, end] code point pair for each language code; get_offset() and
# offset_to_char() use the start value as the base of the script block.
SCRIPT_RANGES = {
    'pa': [0x0a00, 0x0a7f],
    'gu': [0x0a80, 0x0aff],
    'or': [0x0b00, 0x0b7f],
    'ta': [0x0b80, 0x0bff],
    'te': [0x0c00, 0x0c7f],
    'kn': [0x0c80, 0x0cff],
    'ml': [0x0d00, 0x0d7f],
    'si': [0x0d80, 0x0dff],
    'hi': [0x0900, 0x097f],
    'mr': [0x0900, 0x097f],
    'kK': [0x0900, 0x097f],
    'sa': [0x0900, 0x097f],
    'ne': [0x0900, 0x097f],
    'sd': [0x0900, 0x097f],
    'bn': [0x0980, 0x09ff],
    'as': [0x0980, 0x09ff],
}

DRAVIDIAN_LANGUAGES = ['ta', 'te', 'kn', 'ml']
IE_LANGUAGES = [
    'hi', 'mr', 'kK', 'sa', 'ne', 'sd', 'bn', 'as', 'pa', 'gu', 'or', 'si',
]
DANDA_DELIM_LANGUAGES = ['as', 'bn', 'hi', 'ne', 'or', 'pa', 'sa', 'sd']

URDU_RANGES = [
    [0x0600, 0x06ff],
    [0x0750, 0x077f],
    [0xfb50, 0xfdff],
    [0xfe70, 0xfeff],
]

# Offsets below are relative to the start of a script's range.
COORDINATED_RANGE_START_INCLUSIVE = 0
COORDINATED_RANGE_END_INCLUSIVE = 0x6f

NUMERIC_OFFSET_START = 0x66
NUMERIC_OFFSET_END = 0x6f

HALANTA_OFFSET = 0x4d
AUM_OFFSET = 0x50
NUKTA_OFFSET = 0x3c

# Absolute code points.
RUPEE_SIGN = 0x20b9

DANDA = 0x0964
DOUBLE_DANDA = 0x0965

#TODO: add missing fricatives and approximants
VELAR_RANGE = [0x15, 0x19]
PALATAL_RANGE = [0x1a, 0x1e]
RETROFLEX_RANGE = [0x1f, 0x23]
DENTAL_RANGE = [0x24, 0x29]
LABIAL_RANGE = [0x2a, 0x2e]
|
63 |
+
|
64 |
+
# verify
# Offsets (relative to the start of a script's range) of consonants grouped
# by phonetic property.
VOICED_LIST = [0x17, 0x18, 0x1c, 0x1d, 0x21, 0x22, 0x26, 0x27, 0x2c, 0x2d]
UNVOICED_LIST = [0x15, 0x16, 0x1a, 0x1b, 0x1f, 0x20, 0x24, 0x25, 0x2a, 0x2b]  #TODO: add sibilants/sonorants
ASPIRATED_LIST = [0x16, 0x18, 0x1b, 0x1d, 0x20, 0x22, 0x25, 0x27, 0x2b, 0x2d]
UNASPIRATED_LIST = [0x15, 0x17, 0x1a, 0x1c, 0x1f, 0x21, 0x24, 0x26, 0x2a, 0x2c]
# The labial nasal (ma) sits at offset 0x2e; 0x2d is bha, which is already
# listed above as voiced and aspirated and is not a nasal.
NASAL_LIST = [0x19, 0x1e, 0x23, 0x28, 0x29, 0x2e]
FRICATIVE_LIST = [0x36, 0x37, 0x38]
APPROXIMANT_LIST = [0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35]
|
72 |
+
|
73 |
+
#TODO: ha has to be properly categorized
|
74 |
+
|
75 |
+
def is_danda_delim(lang):
    """
    Return True if danda/double danda is a possible delimiter for ``lang``,
    i.e. if ``lang`` is listed in DANDA_DELIM_LANGUAGES.
    """
    uses_danda = lang in DANDA_DELIM_LANGUAGES
    return uses_danda
|
80 |
+
|
81 |
+
def get_offset(c, lang):
    """
    Return the code point of ``c`` minus the start of the SCRIPT_RANGES entry
    for ``lang``.

    Applicable to Brahmi derived Indic scripts.
    """
    code_point = ord(c)
    script_start = SCRIPT_RANGES[lang][0]
    return code_point - script_start
|
86 |
+
|
87 |
+
def offset_to_char(c, lang):
    """
    Return the character whose code point is offset ``c`` added to the start
    of the SCRIPT_RANGES entry for ``lang``.

    Applicable to Brahmi derived Indic scripts.
    """
    script_start = SCRIPT_RANGES[lang][0]
    return chr(c + script_start)
|
92 |
+
|
93 |
+
def in_coordinated_range(c_offset):
    """
    Return True if ``c_offset`` lies between
    COORDINATED_RANGE_START_INCLUSIVE and COORDINATED_RANGE_END_INCLUSIVE,
    both ends included.

    Applicable to Brahmi derived Indic scripts.
    """
    return (
        COORDINATED_RANGE_START_INCLUSIVE
        <= c_offset
        <= COORDINATED_RANGE_END_INCLUSIVE
    )
|
98 |
+
|
99 |
+
def is_indiclang_char(c, lang):
    """
    Return True if ``c`` has an offset between 0 and 0x7f (inclusive) in the
    script range of ``lang``, or if it is the danda or double danda.

    Applicable to Brahmi derived Indic scripts.
    """
    offset = get_offset(c, lang)
    if 0 <= offset <= 0x7f:
        return True
    return ord(c) in (DANDA, DOUBLE_DANDA)
|
105 |
+
|
106 |
+
# def is_vowel(c,lang):
|
107 |
+
# """
|
108 |
+
# Is the character a vowel
|
109 |
+
# """
|
110 |
+
# o=get_offset(c,lang)
|
111 |
+
# return (o>=0x04 and o<=0x14)
|
112 |
+
|
113 |
+
# def is_vowel_sign(c,lang):
|
114 |
+
# """
|
115 |
+
# Is the character a vowel sign (maatraa)
|
116 |
+
# """
|
117 |
+
# o=get_offset(c,lang)
|
118 |
+
# return (o>=0x3e and o<=0x4c)
|
119 |
+
|
120 |
+
# def is_halanta(c,lang):
|
121 |
+
# """
|
122 |
+
# Is the character the halanta character
|
123 |
+
# """
|
124 |
+
# o=get_offset(c,lang)
|
125 |
+
# return (o==HALANTA_OFFSET)
|
126 |
+
|
127 |
+
# def is_nukta(c,lang):
|
128 |
+
# """
|
129 |
+
# Is the character the halanta character
|
130 |
+
# """
|
131 |
+
# o=get_offset(c,lang)
|
132 |
+
# return (o==NUKTA_OFFSET)
|
133 |
+
|
134 |
+
# def is_aum(c,lang):
|
135 |
+
# """
|
136 |
+
# Is the character a vowel sign (maatraa)
|
137 |
+
# """
|
138 |
+
# o=get_offset(c,lang)
|
139 |
+
# return (o==AUM_OFFSET)
|
140 |
+
|
141 |
+
# def is_consonant(c,lang):
|
142 |
+
# """
|
143 |
+
# Is the character a consonant
|
144 |
+
# """
|
145 |
+
# o=get_offset(c,lang)
|
146 |
+
# return (o>=0x15 and o<=0x39)
|
147 |
+
|
148 |
+
# def is_velar(c,lang):
|
149 |
+
# """
|
150 |
+
# Is the character a velar
|
151 |
+
# """
|
152 |
+
# o=get_offset(c,lang)
|
153 |
+
# return (o>=VELAR_RANGE[0] and o<=VELAR_RANGE[1])
|
154 |
+
|
155 |
+
# def is_palatal(c,lang):
|
156 |
+
# """
|
157 |
+
# Is the character a palatal
|
158 |
+
# """
|
159 |
+
# o=get_offset(c,lang)
|
160 |
+
# return (o>=PALATAL_RANGE[0] and o<=PALATAL_RANGE[1])
|
161 |
+
|
162 |
+
# def is_retroflex(c,lang):
|
163 |
+
# """
|
164 |
+
# Is the character a retroflex
|
165 |
+
# """
|
166 |
+
# o=get_offset(c,lang)
|
167 |
+
# return (o>=RETROFLEX_RANGE[0] and o<=RETROFLEX_RANGE[1])
|
168 |
+
|
169 |
+
# def is_dental(c,lang):
|
170 |
+
# """
|
171 |
+
# Is the character a dental
|
172 |
+
# """
|
173 |
+
# o=get_offset(c,lang)
|
174 |
+
# return (o>=DENTAL_RANGE[0] and o<=DENTAL_RANGE[1])
|
175 |
+
|
176 |
+
# def is_labial(c,lang):
|
177 |
+
# """
|
178 |
+
# Is the character a labial
|
179 |
+
# """
|
180 |
+
# o=get_offset(c,lang)
|
181 |
+
# return (o>=LABIAL_RANGE[0] and o<=LABIAL_RANGE[1])
|
182 |
+
|
183 |
+
# def is_voiced(c,lang):
|
184 |
+
# """
|
185 |
+
# Is the character a voiced consonant
|
186 |
+
# """
|
187 |
+
# o=get_offset(c,lang)
|
188 |
+
# return o in VOICED_LIST
|
189 |
+
|
190 |
+
# def is_unvoiced(c,lang):
|
191 |
+
# """
|
192 |
+
# Is the character a unvoiced consonant
|
193 |
+
# """
|
194 |
+
# o=get_offset(c,lang)
|
195 |
+
# return o in UNVOICED_LIST
|
196 |
+
|
197 |
+
# def is_aspirated(c,lang):
|
198 |
+
# """
|
199 |
+
# Is the character a aspirated consonant
|
200 |
+
# """
|
201 |
+
# o=get_offset(c,lang)
|
202 |
+
# return o in ASPIRATED_LIST
|
203 |
+
|
204 |
+
# def is_unaspirated(c,lang):
|
205 |
+
# """
|
206 |
+
# Is the character a unaspirated consonant
|
207 |
+
# """
|
208 |
+
# o=get_offset(c,lang)
|
209 |
+
# return o in UNASPIRATED_LIST
|
210 |
+
|
211 |
+
# def is_nasal(c,lang):
|
212 |
+
# """
|
213 |
+
# Is the character a nasal consonant
|
214 |
+
# """
|
215 |
+
# o=get_offset(c,lang)
|
216 |
+
# return o in NASAL_LIST
|
217 |
+
|
218 |
+
# def is_fricative(c,lang):
|
219 |
+
# """
|
220 |
+
# Is the character a fricative consonant
|
221 |
+
# """
|
222 |
+
# o=get_offset(c,lang)
|
223 |
+
# return o in FRICATIVE_LIST
|
224 |
+
|
225 |
+
# def is_approximant(c,lang):
|
226 |
+
# """
|
227 |
+
# Is the character an approximant consonant
|
228 |
+
# """
|
229 |
+
# o=get_offset(c,lang)
|
230 |
+
# return o in APPROXIMANT_LIST
|
231 |
+
|
232 |
+
# def is_number(c,lang):
|
233 |
+
# """
|
234 |
+
# Is the character a number
|
235 |
+
# """
|
236 |
+
# o=get_offset(c,lang)
|
237 |
+
# return (o>=0x66 and o<=0x6f)
|
238 |
+
|
239 |
+
|
240 |
+
def is_vowel(c, lang):
    """
    Is the character a vowel (offset between 0x04 and 0x14 inclusive)
    """
    offset = get_offset(c, lang)
    return 0x04 <= offset <= 0x14
|
246 |
+
|
247 |
+
def is_vowel_sign(c, lang):
    """
    Is the character a vowel sign (maatraa), i.e. offset between 0x3e and
    0x4c inclusive
    """
    offset = get_offset(c, lang)
    return 0x3e <= offset <= 0x4c
|
253 |
+
|
254 |
+
def is_halanta(c, lang):
    """
    Is the character the halanta character (offset equal to HALANTA_OFFSET)
    """
    return get_offset(c, lang) == HALANTA_OFFSET
|
260 |
+
|
261 |
+
def is_nukta(c, lang):
    """
    Is the character the nukta character (offset equal to NUKTA_OFFSET)
    """
    return get_offset(c, lang) == NUKTA_OFFSET
|
267 |
+
|
268 |
+
def is_aum(c, lang):
    """
    Is the character the aum character (offset equal to AUM_OFFSET)
    """
    return get_offset(c, lang) == AUM_OFFSET
|
274 |
+
|
275 |
+
def is_consonant(c, lang):
    """
    Is the character a consonant (offset between 0x15 and 0x39 inclusive)
    """
    offset = get_offset(c, lang)
    return 0x15 <= offset <= 0x39
|
281 |
+
|
282 |
+
def is_velar(c, lang):
    """
    Is the character a velar (offset within VELAR_RANGE, ends included)
    """
    offset = get_offset(c, lang)
    return VELAR_RANGE[0] <= offset <= VELAR_RANGE[1]
|
288 |
+
|
289 |
+
def is_palatal(c, lang):
    """
    Is the character a palatal (offset within PALATAL_RANGE, ends included)
    """
    offset = get_offset(c, lang)
    return PALATAL_RANGE[0] <= offset <= PALATAL_RANGE[1]
|
295 |
+
|
296 |
+
def is_retroflex(c, lang):
    """
    Is the character a retroflex (offset within RETROFLEX_RANGE, ends
    included)
    """
    offset = get_offset(c, lang)
    return RETROFLEX_RANGE[0] <= offset <= RETROFLEX_RANGE[1]
|
302 |
+
|
303 |
+
def is_dental(c, lang):
    """
    Is the character a dental (offset within DENTAL_RANGE, ends included)
    """
    offset = get_offset(c, lang)
    return DENTAL_RANGE[0] <= offset <= DENTAL_RANGE[1]
|
309 |
+
|
310 |
+
def is_labial(c, lang):
    """
    Is the character a labial (offset within LABIAL_RANGE, ends included)
    """
    offset = get_offset(c, lang)
    return LABIAL_RANGE[0] <= offset <= LABIAL_RANGE[1]
|
316 |
+
|
317 |
+
def is_voiced(c, lang):
    """
    Is the character a voiced consonant (offset listed in VOICED_LIST)
    """
    return get_offset(c, lang) in VOICED_LIST
|
323 |
+
|
324 |
+
def is_unvoiced(c, lang):
    """
    Is the character an unvoiced consonant (offset listed in UNVOICED_LIST)
    """
    return get_offset(c, lang) in UNVOICED_LIST
|
330 |
+
|
331 |
+
def is_aspirated(c, lang):
    """
    Is the character an aspirated consonant (offset listed in ASPIRATED_LIST)
    """
    return get_offset(c, lang) in ASPIRATED_LIST
|
337 |
+
|
338 |
+
def is_unaspirated(c, lang):
    """
    Is the character an unaspirated consonant (offset listed in
    UNASPIRATED_LIST)
    """
    return get_offset(c, lang) in UNASPIRATED_LIST
|
344 |
+
|
345 |
+
def is_nasal(c, lang):
    """
    Is the character a nasal consonant (offset listed in NASAL_LIST)
    """
    return get_offset(c, lang) in NASAL_LIST
|
351 |
+
|
352 |
+
def is_fricative(c, lang):
    """
    Is the character a fricative consonant (offset listed in FRICATIVE_LIST)
    """
    return get_offset(c, lang) in FRICATIVE_LIST
|
358 |
+
|
359 |
+
def is_approximant(c, lang):
    """
    Is the character an approximant consonant (offset listed in
    APPROXIMANT_LIST)
    """
    return get_offset(c, lang) in APPROXIMANT_LIST
|
365 |
+
|
366 |
+
def is_number(c, lang):
    """
    Is the character a number (offset between 0x66 and 0x6f inclusive)
    """
    offset = get_offset(c, lang)
    return 0x66 <= offset <= 0x6f
|
372 |
+
|
373 |
+
|
374 |
+
##################################################
|
375 |
+
|
376 |
+
def is_vowel_offset(c_offset):
    """
    Is the offset a vowel (between 0x04 and 0x14 inclusive)
    """
    return 0x04 <= c_offset <= 0x14
|
381 |
+
|
382 |
+
def is_vowel_sign_offset(c_offset):
    """
    Is the offset a vowel sign (maatraa), i.e. between 0x3e and 0x4c
    inclusive
    """
    return 0x3e <= c_offset <= 0x4c
|
387 |
+
|
388 |
+
def is_halanta_offset(c_offset):
    """
    Is the offset the halanta offset (equal to HALANTA_OFFSET)
    """
    matches = c_offset == HALANTA_OFFSET
    return matches
|
393 |
+
|
394 |
+
def is_nukta_offset(c_offset):
    """
    Is the offset the nukta offset (equal to NUKTA_OFFSET)
    """
    matches = c_offset == NUKTA_OFFSET
    return matches
|
399 |
+
|
400 |
+
def is_aum_offset(c_offset):
    """
    Is the offset the aum offset (equal to AUM_OFFSET)
    """
    matches = c_offset == AUM_OFFSET
    return matches
|
405 |
+
|
406 |
+
def is_consonant_offset(c_offset):
    """
    Is the offset a consonant (between 0x15 and 0x39 inclusive)
    """
    return 0x15 <= c_offset <= 0x39
|
411 |
+
|
412 |
+
def is_velar_offset(c_offset):
    """
    Is the offset a velar (within VELAR_RANGE, ends included)
    """
    return VELAR_RANGE[0] <= c_offset <= VELAR_RANGE[1]
|
417 |
+
|
418 |
+
def is_palatal_offset(c_offset):
    """
    Is the offset a palatal (within PALATAL_RANGE, ends included)
    """
    return PALATAL_RANGE[0] <= c_offset <= PALATAL_RANGE[1]
|
423 |
+
|
424 |
+
def is_retroflex_offset(c_offset):
    """
    Is the offset a retroflex (within RETROFLEX_RANGE, ends included)
    """
    return RETROFLEX_RANGE[0] <= c_offset <= RETROFLEX_RANGE[1]
|
429 |
+
|
430 |
+
def is_dental_offset(c_offset):
    """
    Is the offset a dental (within DENTAL_RANGE, ends included)
    """
    return DENTAL_RANGE[0] <= c_offset <= DENTAL_RANGE[1]
|
435 |
+
|
436 |
+
def is_labial_offset(c_offset):
    """
    Is the offset a labial (within LABIAL_RANGE, ends included)
    """
    return LABIAL_RANGE[0] <= c_offset <= LABIAL_RANGE[1]
|
441 |
+
|
442 |
+
def is_voiced_offset(c_offset):
    """
    Is the offset a voiced consonant (listed in VOICED_LIST)
    """
    listed = c_offset in VOICED_LIST
    return listed
|
447 |
+
|
448 |
+
def is_unvoiced_offset(c_offset):
    """
    Is the offset an unvoiced consonant (listed in UNVOICED_LIST)
    """
    listed = c_offset in UNVOICED_LIST
    return listed
|
453 |
+
|
454 |
+
def is_aspirated_offset(c_offset):
    """
    Is the offset an aspirated consonant (listed in ASPIRATED_LIST)
    """
    listed = c_offset in ASPIRATED_LIST
    return listed
|
459 |
+
|
460 |
+
def is_unaspirated_offset(c_offset):
    """
    Is the offset an unaspirated consonant (listed in UNASPIRATED_LIST)
    """
    listed = c_offset in UNASPIRATED_LIST
    return listed
|
465 |
+
|
466 |
+
def is_nasal_offset(c_offset):
    """
    Is the offset a nasal consonant (listed in NASAL_LIST)
    """
    listed = c_offset in NASAL_LIST
    return listed
|
471 |
+
|
472 |
+
def is_fricative_offset(c_offset):
    """
    Is the offset a fricative consonant (listed in FRICATIVE_LIST)
    """
    listed = c_offset in FRICATIVE_LIST
    return listed
|
477 |
+
|
478 |
+
def is_approximant_offset(c_offset):
    """
    Is the offset an approximant consonant (listed in APPROXIMANT_LIST)
    """
    listed = c_offset in APPROXIMANT_LIST
    return listed
|
483 |
+
|
484 |
+
def is_number_offset(c_offset):
    """
    Is the offset a number (between 0x66 and 0x6f inclusive)
    """
    return 0x66 <= c_offset <= 0x6f
|
indic_nlp_library/indicnlp/loader.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
from indicnlp import common
|
10 |
+
from indicnlp.script import indic_scripts
|
11 |
+
from indicnlp.script import english_script
|
12 |
+
from indicnlp.transliterate import unicode_transliterate
|
13 |
+
|
14 |
+
def load():
    """
    Initialize the Indic NLP library. Clients must call this once before
    using the library.

    Each module that needs start-up work exposes an ``init()`` function and
    has to be listed in the sequence below.
    """
    # Initialization order may matter: ``common`` comes first because the
    # other modules need it to get access to the resources.
    modules_in_init_order = (
        common,                  # resource access
        indic_scripts,           # Indic scripts module
        english_script,          # English script module
        unicode_transliterate,   # unicode_transliterate module
    )
    for module in modules_in_init_order:
        module.init()
|
34 |
+
|
35 |
+
|
indic_nlp_library/indicnlp/morph/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/morph/unsupervised_morph.py
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import codecs, sys, itertools,re,os
|
10 |
+
import morfessor
|
11 |
+
|
12 |
+
from functools import lru_cache
|
13 |
+
|
14 |
+
from indicnlp import langinfo
|
15 |
+
from indicnlp import common
|
16 |
+
from indicnlp.tokenize import indic_tokenize
|
17 |
+
|
18 |
+
# Unsupervised Morphological Analyser for Indian languages.
|
19 |
+
#
|
20 |
+
# @author Anoop Kunchukuttan
|
21 |
+
#
|
22 |
+
|
23 |
+
class MorphAnalyzerI(object):
    """
    Interface for Morph Analyzer.

    Concrete analyzers override both methods; the versions here do nothing
    and return None.
    """

    def morph_analyze(self, word):
        """
        Morph-analyze a single word.

        @param word: string input word
        """
        pass

    def morph_analyze_document(self, tokens):
        """
        Morph-analyze a document given as a sequence of tokens.

        @param tokens: string sequence of words
        """
        pass
|
33 |
+
|
34 |
+
class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
    """
    Unsupervised morphological analyser built on a Morfessor 2.0 model.

    A word is handed to the model only when it consists solely of characters
    from the language's script range and contains no character whose offset
    falls in the numeric offset range; any other token is returned whole.
    """

    def __init__(self,lang,add_marker=False):
        """
        Load the Morfessor model for ``lang`` from the Indic resources path.

        @param lang: language code; selects the model file and the script range
        @param add_marker: if True, every returned piece carries a suffix:
                           ``_R_`` for the first segment, ``_S_`` for later
                           segments and ``_E_`` for a token left unsegmented
        """
        self.lang=lang
        self.add_marker=add_marker

        reader = morfessor.MorfessorIO()
        model_path=os.path.join(common.INDIC_RESOURCES_PATH,'morph','morfessor','{}.model'.format(lang))
        self._morfessor_model=reader.read_any_model(model_path)

        # A word qualifies for analysis only if every character lies inside
        # the script's code point range.
        first_char=chr(langinfo.SCRIPT_RANGES[lang][0])
        last_char=chr(langinfo.SCRIPT_RANGES[lang][1])
        self._script_range_pat=r'^[{}-{}]+$'.format(first_char,last_char)
        self._script_check_re=re.compile(self._script_range_pat)

    def _contains_number(self,text):
        """
        Return True if any character of ``text`` has an offset (relative to
        the start of the script range) within the numeric offset range.
        Always False for languages without an entry in SCRIPT_RANGES.
        """
        if self.lang not in langinfo.SCRIPT_RANGES:
            return False
        script_start=langinfo.SCRIPT_RANGES[self.lang][0]
        return any(
            langinfo.NUMERIC_OFFSET_START <= ord(ch)-script_start <= langinfo.NUMERIC_OFFSET_END
            for ch in text
        )

    def _morphanalysis_needed(self,word):
        """
        Truthy when ``word`` is entirely in the script range and has no
        numeric character; None when the script check fails.
        """
        in_script=self._script_check_re.match(word)
        if not in_script:
            return in_script
        return not self._contains_number(word)

    # NOTE: the cache is attached to the function itself, so it is shared by
    # all instances and keyed on (self, word). The list handed back is the
    # cached object; callers should not modify it in place.
    @lru_cache(maxsize=16384)
    def morph_analyze(self,word):
        """
        Morphanalyzes a single word and returns a list of component morphemes

        @param word: string input word
        """
        if not self._morphanalysis_needed(word):
            # Token is passed through unsegmented.
            if self.add_marker:
                return ['{}_E_'.format(word)]
            return [word]

        segments=self._morfessor_model.viterbi_segment(word)[0]
        if self.add_marker:
            segments=[('{}_R_' if idx==0 else '{}_S_').format(seg) for idx,seg in enumerate(segments)]
        return segments

    def morph_analyze_document(self,tokens):
        """
        Morphanalyzes a document, represented as a list of tokens.
        Each token is analyzed on its own and the resulting morphemes are
        concatenated in order.

        @param tokens: string sequence of words

        @return list of segments in the document after morph analysis
        """
        per_token_morphs=(self.morph_analyze(tok) for tok in tokens)
        return list(itertools.chain.from_iterable(per_token_morphs))
|
102 |
+
|
103 |
+
#### Older implementation
|
104 |
+
#out_tokens=[]
|
105 |
+
#for token in tokens:
|
106 |
+
# if self._morphanalysis_needed(token):
|
107 |
+
# morphs=self.morph_analyze(token)
|
108 |
+
# out_tokens.extend(morphs)
|
109 |
+
# else:
|
110 |
+
# if self.add_marker:
|
111 |
+
# token=u'{}_E_'.format(token)
|
112 |
+
# out_tokens.append(token)
|
113 |
+
#return out_tokens
|
114 |
+
|
115 |
+
|
116 |
+
if __name__ == '__main__':

    # Four positional arguments are mandatory (infile, outfile, language,
    # resources path), so argv needs at least five entries: sys.argv[4] is
    # read below.
    if len(sys.argv)<5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language=sys.argv[3]
    common.INDIC_RESOURCES_PATH=sys.argv[4]

    # The optional fifth argument switches markers on only when it is the
    # literal string 'True'.
    add_marker=len(sys.argv)==6 and sys.argv[5] == 'True'

    print('Loading morph analyser for ' + language)
    analyzer=UnsupervisedMorphAnalyzer(language,add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1],'r','utf-8') as ifile, \
            codecs.open(sys.argv[2],'w','utf-8') as ofile:
        # Stream the input line by line instead of loading the whole file.
        for line in ifile:
            line=line.strip()
            tokens=indic_tokenize.trivial_tokenize(line)
            morph_tokens=analyzer.morph_analyze_document(tokens)
            ofile.write(' '.join(morph_tokens))
            ofile.write('\n')
|
142 |
+
|
indic_nlp_library/indicnlp/normalize/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/normalize/indic_normalize.py
ADDED
@@ -0,0 +1,984 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
#
|
4 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
5 |
+
# All rights reserved.
|
6 |
+
#
|
7 |
+
# This source code is licensed under the MIT license found in the
|
8 |
+
# LICENSE file in the root directory of this source tree.
|
9 |
+
#
|
10 |
+
|
11 |
+
#Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts
|
12 |
+
#
|
13 |
+
# @author Anoop Kunchukuttan
|
14 |
+
#
|
15 |
+
|
16 |
+
import sys, codecs, string, itertools, re
|
17 |
+
from indicnlp import langinfo
|
18 |
+
|
19 |
+
|
20 |
+
class NormalizerI(object):
    """
    Base class for normalizers.

    The normalizer classes are meant to:

    * pick one standard representation for characters that have several
      Unicode encodings
    * delete some control characters
    * correct typical mistakes made when typing with a Latin keyboard

    Common normalization includes:

    * Byte order mark, word joiner, etc. removal
    * ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal
    * ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces

    Script specific normalizers should derive from this class and override
    the normalize() method. They can call the super class' normalize()
    method to get the common normalization.
    """

    # Characters that are removed
    BYTE_ORDER_MARK='\uFEFF'
    BYTE_ORDER_MARK_2='\uFFFE'
    WORD_JOINER='\u2060'
    SOFT_HYPHEN='\u00AD'

    # Characters that are replaced by a plain space
    ZERO_WIDTH_SPACE='\u200B'
    NO_BREAK_SPACE='\u00A0'

    # Joiner controls
    ZERO_WIDTH_NON_JOINER='\u200C'
    ZERO_WIDTH_JOINER='\u200D'

    def _normalize_punctuations(self, text):
        """
        Map typographic punctuation to plain ASCII equivalents.

        Applies many of the punctuation normalizations that are part of
        MosesNormalizer from sacremoses. The byte order mark is removed
        first; the replacements are then applied one after another in the
        order listed, so later entries see the output of earlier ones.
        """
        text=text.replace(NormalizerI.BYTE_ORDER_MARK,'')

        ordered_replacements=(
            ('„', r'"'),
            ('“', r'"'),
            ('”', r'"'),
            ('–', r'-'),
            ('—', r' - '),
            ('´', r"'"),
            ('‘', r"'"),
            ('‚', r"'"),
            ('’', r"'"),
            ("''", r'"'),
            ('´´', r'"'),
            ('…', r'...'),
        )
        for old,new in ordered_replacements:
            text=text.replace(old,new)

        return text

    def normalize(self,text):
        """
        Placeholder to be overridden by subclasses; does nothing here.
        """
        pass
|
74 |
+
|
75 |
+
|
76 |
+
class BaseNormalizer(NormalizerI):
    """
    Normalization steps shared by the script-specific normalizers.

    ``normalize()`` removes or replaces the control characters declared on
    NormalizerI, normalizes punctuation and then optionally normalizes
    chandras, nasals and word-final vowels, depending on the constructor
    options. All script-specific characters are derived from offsets via
    ``langinfo.offset_to_char`` for the configured language.
    """

    def __init__(self,lang,
            remove_nuktas=False,
            nasals_mode='do_nothing',
            do_normalize_chandras=False,
            do_normalize_vowel_ending=False):
        """
        @param lang: language code used to map offsets to characters
        @param remove_nuktas: stored on the instance; this class itself does
                              not read it
        @param nasals_mode: 'to_anusvaara_strict', 'to_anusvaara_relaxed' or
                            'to_nasal_consonants'; any other value (such as the
                            default 'do_nothing') leaves nasals untouched
        @param do_normalize_chandras: if True, apply the chandra substitutions
        @param do_normalize_vowel_ending: if True, normalize word endings
        """

        self.lang=lang
        self.remove_nuktas=remove_nuktas
        self.nasals_mode=nasals_mode
        self.do_normalize_chandras=do_normalize_chandras
        self.do_normalize_vowel_ending=do_normalize_vowel_ending

        self._init_normalize_chandras()
        self._init_normalize_nasals()
        self._init_normalize_vowel_ending()
        #self._init_visarga_correction()

    def _init_normalize_vowel_ending(self):
        """
        Select the per-word ending normalizer for the language family;
        languages in neither family get the identity function.
        """

        if self.lang in langinfo.IE_LANGUAGES:
            self.fn_vowel_ending=self._normalize_word_vowel_ending_ie
        elif self.lang in langinfo.DRAVIDIAN_LANGUAGES:
            self.fn_vowel_ending=self._normalize_word_vowel_ending_dravidian
        else:
            self.fn_vowel_ending=lambda x: x

    def _init_normalize_chandras(self):
        """
        Build the (match, replacement) character pairs used by
        ``_normalize_chandras`` from [source offset, target offset] pairs.
        """

        substitution_offsets =\
            [
                [0x0d , 0x0f], # chandra e, independent
                [0x11 , 0x13], # chandra o, independent
                [0x45 , 0x47], # chandra e, dependent
                [0x49 , 0x4b], # chandra o, dependent
                # [0x72 , 0x0f], # mr: chandra e, independent

                [0x00 , 0x02], # chandrabindu
                [0x01 , 0x02], # chandrabindu
            ]

        self.chandra_substitutions = [
                (langinfo.offset_to_char(x[0],self.lang), langinfo.offset_to_char(x[1],self.lang))
                    for x in substitution_offsets ]

    def _normalize_chandras(self,text):
        """
        Apply every chandra substitution, in order, to ``text``.
        """
        for match, repl in self.chandra_substitutions:
            text=text.replace(match,repl)
        return text

    def _init_to_anusvaara_strict(self):
        """
        Compile one pattern per nasal: nasal + halant followed by a consonant
        from the range given for that nasal. The match is replaced by
        anusvaara + that consonant.

        Example of a resulting pattern:
        `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
        """

        # Each entry: [nasal offset, first offset of range, last offset of range]
        pat_signatures=\
            [
                    [0x19,0x15,0x18],
                    [0x1e,0x1a,0x1d],
                    [0x23,0x1f,0x22],
                    [0x28,0x24,0x27],
                    [0x29,0x24,0x27],
                    [0x2e,0x2a,0x2d],
            ]

        halant_offset=0x4d
        anusvaara_offset=0x02

        pats=[]

        for pat_signature in pat_signatures:
            pat=re.compile(r'{nasal}{halant}([{start_r}-{end_r}])'.format(
                nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
                halant=langinfo.offset_to_char(halant_offset,self.lang),
                start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
                end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
            ))
            pats.append(pat)

        repl_string='{anusvaara}\\1'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))

        self.pats_repls=(pats,repl_string)

    def _to_anusvaara_strict(self,text):
        """
        Apply the patterns prepared by ``_init_to_anusvaara_strict``.
        """

        pats, repl_string = self.pats_repls
        for pat in pats:
            text=pat.sub(repl_string,text)

        return text

    def _init_to_anusvaara_relaxed(self):
        """
        Compile a single pattern matching any of the nasal consonants
        followed by halant, whatever comes next. The match is replaced by
        anusvaara.
        """

        nasals_list=[0x19,0x1e,0x23,0x28,0x29,0x2e]
        # The characters go inside a regex character class, so they must be
        # joined without a separator; a separator would itself become a
        # member of the class and be matched as if it were a nasal.
        nasals_list_str=''.join([langinfo.offset_to_char(x,self.lang) for x in nasals_list])

        halant_offset=0x4d
        anusvaara_offset=0x02

        pat=re.compile(r'[{nasals_list_str}]{halant}'.format(
                nasals_list_str=nasals_list_str,
                halant=langinfo.offset_to_char(halant_offset,self.lang),
            ))

        repl_string='{anusvaara}'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))

        self.pats_repls = (pat,repl_string)

    def _to_anusvaara_relaxed(self,text):
        """
        Apply the pattern prepared by ``_init_to_anusvaara_relaxed``.
        """
        pat, repl_string = self.pats_repls
        return pat.sub(repl_string,text)


    def _init_to_nasal_consonants(self):
        """
        Compile one pattern per nasal: anusvaara followed by a consonant from
        the range given for that nasal. The match is replaced by
        nasal + halant + that consonant (the inverse of the strict anusvaara
        conversion).
        """

        # Each entry: [nasal offset, first offset of range, last offset of range]
        pat_signatures=\
            [
                    [0x19,0x15,0x18],
                    [0x1e,0x1a,0x1d],
                    [0x23,0x1f,0x22],
                    [0x28,0x24,0x27],
                    [0x29,0x24,0x27],
                    [0x2e,0x2a,0x2d],
            ]

        halant_offset=0x4d
        anusvaara_offset=0x02

        pats=[]
        repl_strings=[]

        for pat_signature in pat_signatures:
            pat=re.compile(r'{anusvaara}([{start_r}-{end_r}])'.format(
                anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang),
                start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
                end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
            ))
            pats.append(pat)
            repl_string='{nasal}{halant}\\1'.format(
                nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
                halant=langinfo.offset_to_char(halant_offset,self.lang),
                )
            repl_strings.append(repl_string)

        self.pats_repls=list(zip(pats,repl_strings))

    def _to_nasal_consonants(self,text):
        """
        Apply the (pattern, replacement) pairs prepared by
        ``_init_to_nasal_consonants``, in order.
        """

        for pat, repl in self.pats_repls:
            text=pat.sub(repl,text)

        return text

    def _init_normalize_nasals(self):
        """
        Prepare ``self.pats_repls`` for the configured ``nasals_mode``;
        nothing is prepared for any other mode value.
        """

        if self.nasals_mode == 'to_anusvaara_strict':
            self._init_to_anusvaara_strict()
        elif self.nasals_mode == 'to_anusvaara_relaxed':
            self._init_to_anusvaara_relaxed()
        elif self.nasals_mode == 'to_nasal_consonants':
            self._init_to_nasal_consonants()

    def _normalize_nasals(self,text):
        """
        Run the nasal conversion selected by ``nasals_mode``; return the
        text unchanged for any other mode value.
        """
        if self.nasals_mode == 'to_anusvaara_strict':
            return self._to_anusvaara_strict(text)
        elif self.nasals_mode == 'to_anusvaara_relaxed':
            return self._to_anusvaara_relaxed(text)
        elif self.nasals_mode == 'to_nasal_consonants':
            return self._to_nasal_consonants(text)
        else:
            return text


    def _normalize_word_vowel_ending_dravidian(self,word):
        """
        for Dravidian
        - consonant ending: add 'a' ki maatra (the character at offset 0x3e)
        - halant ending: no change
        - 'a' ki maatra: no change
        """
        if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
            return word+langinfo.offset_to_char(0x3e,self.lang)
        else:
            return word

    def _normalize_word_vowel_ending_ie(self,word):
        """
        for IE
        - consonant ending: add halant
        - halant ending: no change
        - 'a' ki maatra: no change
        """
        if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
            return word+langinfo.offset_to_char(langinfo.HALANTA_OFFSET,self.lang)
        else:
            return word

    def _normalize_vowel_ending(self,text):
        """
        Apply the selected word-ending normalizer to every space-separated
        word of ``text``.
        """
        return ' '.join([ self.fn_vowel_ending(w) for w in text.split(' ') ])

    def normalize(self,text):
        """
        Common normalization for all scripts; script-specific subclasses
        call this and then add their own steps.

        @param text: string to normalize
        @return the normalized string
        """
        # characters that are dropped
        text=text.replace(NormalizerI.BYTE_ORDER_MARK,'')
        text=text.replace(NormalizerI.BYTE_ORDER_MARK_2,'')
        text=text.replace(NormalizerI.WORD_JOINER,'')
        text=text.replace(NormalizerI.SOFT_HYPHEN,'')

        # characters that become a plain space
        text=text.replace(NormalizerI.ZERO_WIDTH_SPACE,' ') # ??
        text=text.replace(NormalizerI.NO_BREAK_SPACE,' ')

        # joiner controls are dropped
        text=text.replace(NormalizerI.ZERO_WIDTH_NON_JOINER, '')
        text=text.replace(NormalizerI.ZERO_WIDTH_JOINER,'')

        text=self._normalize_punctuations(text)

        if self.do_normalize_chandras:
            text=self._normalize_chandras(text)
        text=self._normalize_nasals(text)
        if self.do_normalize_vowel_ending:
            text=self._normalize_vowel_ending(text)

        return text


    def get_char_stats(self,text):
        """
        Print, one count per line, how often each special character handled
        by ``normalize()`` occurs in ``text``.
        """
        print(len(re.findall(NormalizerI.BYTE_ORDER_MARK,text)))
        print(len(re.findall(NormalizerI.BYTE_ORDER_MARK_2,text)))
        print(len(re.findall(NormalizerI.WORD_JOINER,text)))
        print(len(re.findall(NormalizerI.SOFT_HYPHEN,text)))

        print(len(re.findall(NormalizerI.ZERO_WIDTH_SPACE,text) ))
        print(len(re.findall(NormalizerI.NO_BREAK_SPACE,text)))

        print(len(re.findall(NormalizerI.ZERO_WIDTH_NON_JOINER,text)))
        print(len(re.findall(NormalizerI.ZERO_WIDTH_JOINER,text)))

    def correct_visarga(self,text,visarga_char,char_range):
        """
        Replace a colon that directly follows a Devanagari character
        (U+0900-U+097F) with the Devanagari visarga (U+0903).

        NOTE(review): ``visarga_char`` and ``char_range`` are accepted but
        not used; the character range and the visarga are hard-coded to
        Devanagari.

        @return the corrected text
        """
        return re.sub(r'([\u0900-\u097f]):','\\1\u0903',text)
|
327 |
+
|
328 |
+
|
329 |
+
|
330 |
+
class DevanagariNormalizer(BaseNormalizer):
    """
    Normalizer for the Devanagari script. On top of the common normalization
    done by the super class it

    * replaces the composite characters containing nuktas by their
      decomposed form (base character followed by the nukta)
    * replaces the pipe character '|' by the poorna virama character
    * replaces a colon ':' by visarga if the colon follows a character in
      this script
    """

    NUKTA='\u093C'

    def __init__(self,lang='hi',remove_nuktas=False,nasals_mode='do_nothing',
            do_normalize_chandras=False,do_normalize_vowel_ending=False):
        """
        Forward all options to BaseNormalizer; ``lang`` defaults to 'hi'.
        """
        super().__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """
        Normalize Devanagari text.

        @param text: string to normalize
        @return the normalized string
        """
        # common normalization for Indic scripts
        text=super().normalize(text)

        # chandra a replacement for Marathi
        text=text.replace('\u0972','\u090f')

        # Precomposed nukta characters paired with their base characters;
        # each precomposed form is rewritten as base + NUKTA.
        nukta=DevanagariNormalizer.NUKTA
        composed_to_base=(
            ('\u0929','\u0928'),
            ('\u0931','\u0930'),
            ('\u0934','\u0933'),
            ('\u0958','\u0915'),
            ('\u0959','\u0916'),
            ('\u095A','\u0917'),
            ('\u095B','\u091C'),
            ('\u095C','\u0921'),
            ('\u095D','\u0922'),
            ('\u095E','\u092B'),
            ('\u095F','\u092F'),
        )
        for composed,base in composed_to_base:
            text=text.replace(composed,base+nukta)

        # optionally strip every nukta, including the ones just produced
        if self.remove_nuktas:
            text=text.replace(nukta,'')

        # pipe character becomes poorna virama
        text=text.replace('\u007c','\u0964')

        # a colon right after a Devanagari character becomes visarga
        return re.sub(r'([\u0900-\u097f]):','\\1\u0903',text)

    def get_char_stats(self,text):
        """
        Print the counts from the super class, followed by one count per
        line for each precomposed nukta character.
        """
        super().get_char_stats(text)

        precomposed_nukta_chars=(
            '\u0929','\u0931','\u0934','\u0958','\u0959','\u095A',
            '\u095B','\u095C','\u095D','\u095E','\u095F',
        )
        for ch in precomposed_nukta_chars:
            print((len(re.findall(ch,text))))
|
392 |
+
|
393 |
+
#print(len(re.findall(u'\u0928'+DevanagariNormalizer.NUKTA,text)))
|
394 |
+
#print(len(re.findall(u'\u0930'+DevanagariNormalizer.NUKTA,text)))
|
395 |
+
#print(len(re.findall(u'\u0933'+DevanagariNormalizer.NUKTA,text)))
|
396 |
+
#print(len(re.findall(u'\u0915'+DevanagariNormalizer.NUKTA,text)))
|
397 |
+
#print(len(re.findall(u'\u0916'+DevanagariNormalizer.NUKTA,text)))
|
398 |
+
#print(len(re.findall(u'\u0917'+DevanagariNormalizer.NUKTA,text)))
|
399 |
+
#print(len(re.findall(u'\u091C'+DevanagariNormalizer.NUKTA,text)))
|
400 |
+
#print(len(re.findall(u'\u0921'+DevanagariNormalizer.NUKTA,text)))
|
401 |
+
#print(len(re.findall(u'\u0922'+DevanagariNormalizer.NUKTA,text)))
|
402 |
+
#print(len(re.findall(u'\u092B'+DevanagariNormalizer.NUKTA,text)))
|
403 |
+
#print(len(re.findall(u'\u092F'+DevanagariNormalizer.NUKTA,text)))
|
404 |
+
|
405 |
+
class GurmukhiNormalizer(BaseNormalizer):
    """
    Normalizer for the Gurmukhi script. On top of the common normalization
    done by the super class it

    * replaces the composite characters containing nuktas by their
      decomposed form
    * replaces the reserved character for poorna virama (if used) with the
      recommended generic Indic scripts poorna virama
    * replaces the pipe character '|' by the poorna virama character
    * replaces a colon ':' by visarga if the colon follows a character in
      this script
    """

    NUKTA='\u0A3C'

    VOWEL_NORM_MAPS={
        ## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        ## Table 12-16
        '\u0a05\u0a3e': '\u0a06',
        '\u0a72\u0a3f': '\u0a07',
        '\u0a72\u0a40': '\u0a08',
        '\u0a73\u0a41': '\u0a09',
        '\u0a73\u0a42': '\u0a0a',
        '\u0a72\u0a47': '\u0a0f',
        '\u0a05\u0a48': '\u0a10',
        '\u0a73\u0a4b': '\u0a13',
        '\u0a05\u0a4c': '\u0a14',
    }

    def __init__(self,lang='pa',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_canonicalize_addak=False,
                do_canonicalize_tippi=False,
                do_replace_vowel_bases=False):
        """
        Forward the common options to BaseNormalizer (``lang`` defaults to
        'pa') and record the Gurmukhi-specific switches.

        @param do_canonicalize_addak: rewrite addak + character as
                                      character + halant + character
        @param do_canonicalize_tippi: replace tippi (U+0A70) by U+0A02
        @param do_replace_vowel_bases: replace the vowel bases U+0A72 and
                                       U+0A73 left over after the vowel
                                       mappings by U+0A07 and U+0A09
        """
        super().__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_canonicalize_addak=do_canonicalize_addak
        self.do_canonicalize_tippi=do_canonicalize_tippi
        self.do_replace_vowel_bases=do_replace_vowel_bases


    def _normalize_vowels(self,text):
        """
        Replace base + vowel sign sequences by the single independent vowel,
        following VOWEL_NORM_MAPS.
        """

        ## standard vowel replacements as per suggestions in
        ## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        ## Table 12-16
        for sequence,vowel in GurmukhiNormalizer.VOWEL_NORM_MAPS.items():
            text=text.replace(sequence,vowel)

        ## The mappings above should account for the majority of the
        ## variations. If the two special base characters still occur
        ## (i.e. without a mapped diacritic), optionally replace them with
        ## the closest equivalent vowels.
        ## TBD: don't see evidence for this in Wikipedia corpus
        if self.do_replace_vowel_bases:
            for base,vowel in (('\u0a72','\u0a07'),('\u0a73','\u0a09')):
                text=text.replace(base,vowel)

        return text


    def normalize(self,text):
        """
        Normalize Gurmukhi text.

        @param text: string to normalize
        @return the normalized string
        """

        # Addak: replace addak+consonant with consonant+halant+consonant
        if self.do_canonicalize_addak:
            text=re.sub(r'\u0a71(.)','\\1\u0a4d\\1',text)

        # Tippi
        if self.do_canonicalize_tippi:
            text=text.replace('\u0a70','\u0a02')

        # Vowels: Gurmukhi has multiple ways of representing independent
        # vowels due to the characters 'iri' and 'ura'.
        text=self._normalize_vowels(text)

        # common normalization for Indic scripts
        text=super().normalize(text)

        # Precomposed nukta characters paired with their base characters;
        # each precomposed form is rewritten as base + NUKTA.
        nukta=GurmukhiNormalizer.NUKTA
        composed_to_base=(
            ('\u0a33','\u0a32'),
            ('\u0a36','\u0a38'),
            ('\u0a59','\u0a16'),
            ('\u0a5a','\u0a17'),
            ('\u0a5b','\u0a1c'),
            ('\u0a5e','\u0a2b'),
        )
        for composed,base in composed_to_base:
            text=text.replace(composed,base+nukta)

        # optionally strip every nukta, including the ones just produced
        if self.remove_nuktas:
            text=text.replace(nukta,'')

        # script-specific poorna virama codes -> generic Indic script codes
        text=text.replace('\u0a64','\u0964')
        text=text.replace('\u0a65','\u0965')

        # pipe character becomes poorna virama
        text=text.replace('\u007c','\u0964')

        # a colon right after a Gurmukhi character becomes visarga
        return re.sub(r'([\u0a00-\u0a7f]):','\\1\u0a03',text)
|
509 |
+
|
510 |
+
|
511 |
+
class GujaratiNormalizer(BaseNormalizer):
    """
    Normalizer for the Gujarati script. On top of the common normalization
    done by the super class it

    * replaces the reserved character for poorna virama (if used) with the
      recommended generic Indic scripts poorna virama
    * replaces a colon ':' by visarga if the colon follows a character in
      this script
    """

    NUKTA='\u0ABC'

    def __init__(self,lang='gu',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                    do_normalize_vowel_ending=False):
        """
        Forward all options to BaseNormalizer; ``lang`` defaults to 'gu'.
        """
        super().__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """
        Normalize Gujarati text.

        @param text: string to normalize
        @return the normalized string
        """
        # common normalization for Indic scripts
        normalized=super().normalize(text)

        # optionally strip the nukta character
        if self.remove_nuktas:
            normalized=normalized.replace(GujaratiNormalizer.NUKTA,'')

        # script-specific poorna virama codes -> generic Indic script codes
        for script_code,generic_code in (('\u0ae4','\u0964'),('\u0ae5','\u0965')):
            normalized=normalized.replace(script_code,generic_code)

        # a colon right after a Gujarati character becomes visarga
        return re.sub(r'([\u0a80-\u0aff]):','\\1\u0a83',normalized)
|
544 |
+
|
545 |
+
|
546 |
+
class OriyaNormalizer(BaseNormalizer):
    """
    Normalizer for the Oriya script. In addition to basic normalization by the super class,

        * Replaces the composite characters containing nuktas by their decomposed form
        * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
        * Canonicalize two part dependent vowels
        * Replace 'va' with 'ba'
        * replace pipe character '|' by poorna virama character
        * replace colon ':' by visarga if the colon follows a character in this script
    """

    NUKTA='\u0B3C'

    VOWEL_NORM_MAPS={
        ## See Table 12-22 in http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        '\u0b05\u0b3e': '\u0b06',
        '\u0b0f\u0b57': '\u0b10',
        '\u0b13\u0b57': '\u0b14',
    }

    def __init__(self,lang='or',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                    do_normalize_vowel_ending=False,
                    do_remap_wa=False):
        """
        lang, remove_nuktas, nasals_mode, do_normalize_chandras and
        do_normalize_vowel_ending are forwarded to the super class.

        do_remap_wa: if True, normalize() also replaces U+0B71 ('wa') with U+0B2C ('ba')
        """
        super(OriyaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_remap_wa=do_remap_wa

    def normalize(self,text):
        """Return the normalized form of ``text``."""

        # common normalization for Indic scripts
        text=super(OriyaNormalizer,self).normalize(text)

        ## standard vowel replacements as per suggestions in Unicode documents
        for k,v in OriyaNormalizer.VOWEL_NORM_MAPS.items():
            text=text.replace(k,v)

        # decomposing Nukta based composite characters
        text=text.replace('\u0b5c','\u0b21'+OriyaNormalizer.NUKTA)
        text=text.replace('\u0b5d','\u0b22'+OriyaNormalizer.NUKTA)

        if self.remove_nuktas:
            text=text.replace(OriyaNormalizer.NUKTA,'')

        # replace the poorna virama codes specific to script
        # with generic Indic script codes
        text=text.replace('\u0b64','\u0964')
        text=text.replace('\u0b65','\u0965')

        # replace pipe character for poorna virama
        # (the pipe is U+007C; this previously targeted U+0B7C, so pipes were
        # never converted although the class documents that they are)
        text=text.replace('\u007c','\u0964')

        # replace wa with ba
        if self.do_remap_wa:
            text=text.replace('\u0b71','\u0b2c')

        # replace va with ba
        # NOTE: documentation (chapter on Indic scripts) and codepoint chart seem contradictory
        #   (this applied to wa to ba rule also above)
        text=text.replace('\u0b35','\u0b2c')

        # AI dependent vowel sign: U+0B47 U+0B56 composes to U+0B48 (VOWEL SIGN AI)
        # (previously mapped to U+0B58, which is not the AI sign)
        text=text.replace('\u0b47\u0b56','\u0b48')

        # two part dependent vowels
        text=text.replace('\u0b47\u0b3e','\u0b4b')
        text=text.replace('\u0b47\u0b57','\u0b4c')

        # additional consonant - not clear how to handle this
        # ignore

        # correct visarga
        text=re.sub(r'([\u0b00-\u0b7f]):','\\1\u0b03',text)

        return text
|
622 |
+
|
623 |
+
|
624 |
+
class BengaliNormalizer(BaseNormalizer):
    """
    Bengali script normalizer (the factory also uses it for Assamese).

    Runs the base-class normalization first and then:

        * decomposes the precomposed nukta letters into base letter + nukta
        * optionally strips nuktas (``remove_nuktas``)
        * optionally remaps the Assamese 'ra'/'va' letters (``do_remap_assamese_chars`` with lang 'as')
        * maps script-specific poorna virama code points, the pipe character and
          U+09F7 to the generic Indic poorna virama
        * canonicalizes two-part dependent vowels
        * turns a colon that directly follows a Bengali-block character into a visarga
    """

    NUKTA = '\u09BC'

    def __init__(self, lang='bn', remove_nuktas=False, nasals_mode='do_nothing', do_normalize_chandras=False,
                 do_normalize_vowel_ending=False,
                 do_remap_assamese_chars=False):
        super().__init__(lang, remove_nuktas, nasals_mode, do_normalize_chandras, do_normalize_vowel_ending)
        self.do_remap_assamese_chars = do_remap_assamese_chars

    def normalize(self, text):
        """Return the normalized form of ``text``."""
        nukta = BengaliNormalizer.NUKTA

        # script-independent normalization comes first
        out = super().normalize(text)

        # precomposed nukta letters -> base letter followed by nukta
        for composite, base in (('\u09dc', '\u09a1'), ('\u09dd', '\u09a2'), ('\u09df', '\u09af')):
            out = out.replace(composite, base + nukta)

        if self.remove_nuktas:
            out = out.replace(nukta, '')

        # Assamese-specific letters, only when asked for and only for lang 'as'
        if self.do_remap_assamese_chars and self.lang == 'as':
            out = out.replace('\u09f0', '\u09b0')  # 'ra' character
            out = out.replace('\u09f1', '\u09ac')  # 'va' character

        remaining_rules = (
            # script-specific danda / double danda -> generic Indic ones
            ('\u09e4', '\u0964'),
            ('\u09e5', '\u0965'),
            # pipe character used as poorna virama
            ('\u007c', '\u0964'),
            # bengali currency numerator four used as a look-alike poorna virama
            ('\u09f7', '\u0964'),
            # two part dependent vowels
            ('\u09c7\u09be', '\u09cb'),
            ('\u09c7\u09d7', '\u09cc'),
        )
        for old, new in remaining_rules:
            out = out.replace(old, new)

        # colon after a Bengali-block character is written as a visarga
        return re.sub(r'([\u0980-\u09ff]):', '\\1\u0983', out)
|
679 |
+
|
680 |
+
|
681 |
+
class TamilNormalizer(BaseNormalizer):
    """
    Tamil script normalizer.

    Runs the base-class normalization first and then:

        * maps the script-specific poorna virama code points to the generic Indic ones
        * canonicalizes two-part vowel sequences into their single code point
        * turns a colon that directly follows a Tamil-block character into a visarga
    """

    def __init__(self, lang='ta', remove_nuktas=False, nasals_mode='do_nothing',
                 do_normalize_chandras=False, do_normalize_vowel_ending=False):
        super().__init__(lang, remove_nuktas, nasals_mode, do_normalize_chandras, do_normalize_vowel_ending)

    def normalize(self, text):
        """Return the normalized form of ``text``."""
        # script-independent normalization comes first
        out = super().normalize(text)

        rules = (
            # script-specific danda / double danda -> generic Indic ones
            ('\u0be4', '\u0964'),
            ('\u0be5', '\u0965'),
            # two part vowels
            ('\u0b92\u0bd7', '\u0b94'),
            ('\u0bc6\u0bbe', '\u0bca'),
            ('\u0bc7\u0bbe', '\u0bcb'),
            ('\u0bc6\u0bd7', '\u0bcc'),
        )
        for old, new in rules:
            out = out.replace(old, new)

        # colon after a Tamil-block character is written as a visarga
        return re.sub(r'([\u0b80-\u0bff]):', '\\1\u0b83', out)
|
714 |
+
|
715 |
+
|
716 |
+
class TeluguNormalizer(BaseNormalizer):
    """
    Telugu script normalizer.

    Runs the base-class normalization first and then:

        * maps the script-specific poorna virama code points to the generic Indic ones
        * canonicalizes the two-part dependent vowel sign
        * turns a colon that directly follows a Telugu-block character into a visarga
    """

    def __init__(self, lang='te', remove_nuktas=False, nasals_mode='do_nothing',
                 do_normalize_chandras=False, do_normalize_vowel_ending=False):
        super().__init__(lang, remove_nuktas, nasals_mode, do_normalize_chandras, do_normalize_vowel_ending)

    def normalize(self, text):
        """Return the normalized form of ``text``."""
        # script-independent normalization comes first
        out = super().normalize(text)

        rules = (
            # script-specific danda / double danda -> generic Indic ones
            ('\u0c64', '\u0964'),
            ('\u0c65', '\u0965'),
            # dependent vowels
            ('\u0c46\u0c56', '\u0c48'),
        )
        for old, new in rules:
            out = out.replace(old, new)

        # colon after a Telugu-block character is written as a visarga
        return re.sub(r'([\u0c00-\u0c7f]):', '\\1\u0c03', out)

    def get_char_stats(self, text):
        """Placeholder: does nothing and returns None."""
|
749 |
+
|
750 |
+
class KannadaNormalizer(BaseNormalizer):
    """
    Kannada script normalizer.

    Runs the base-class normalization first and then:

        * maps the script-specific poorna virama code points to the generic Indic ones
        * canonicalizes multi-part dependent vowel signs
        * turns a colon that directly follows a Kannada-block character into a visarga
    """

    def __init__(self, lang='kn', remove_nuktas=False, nasals_mode='do_nothing',
                 do_normalize_chandras=False, do_normalize_vowel_ending=False):
        super().__init__(lang, remove_nuktas, nasals_mode, do_normalize_chandras, do_normalize_vowel_ending)

    def normalize(self, text):
        """Return the normalized form of ``text``."""
        # script-independent normalization comes first
        out = super().normalize(text)

        # The rules are applied strictly in this order: the last rule consumes
        # the U+0CCA that the rule before it may just have produced.
        rules = (
            # script-specific danda / double danda -> generic Indic ones
            ('\u0ce4', '\u0964'),
            ('\u0ce5', '\u0965'),
            # dependent vowels
            ('\u0cbf\u0cd5', '\u0cc0'),
            ('\u0cc6\u0cd5', '\u0cc7'),
            ('\u0cc6\u0cd6', '\u0cc8'),
            ('\u0cc6\u0cc2', '\u0cca'),
            ('\u0cca\u0cd5', '\u0ccb'),
        )
        for old, new in rules:
            out = out.replace(old, new)

        # colon after a Kannada-block character is written as a visarga
        return re.sub(r'([\u0c80-\u0cff]):', '\\1\u0c83', out)
|
785 |
+
|
786 |
+
|
787 |
+
class MalayalamNormalizer(BaseNormalizer):
    """
    Malayalam script normalizer.

    In addition to the base-class normalization:

        * rewrites the old chillu encoding (consonant + virama + ZWJ, used till Unicode 5.0)
          to the atomic chillu code points
        * optionally canonicalizes chillus to consonant + virama (``do_canonicalize_chillus``)
        * maps the script-specific poorna virama code points to the generic Indic ones
        * canonicalizes two-part dependent vowel signs and the 'au' forms
        * optionally corrects the geminated T (``do_correct_geminated_T``)
        * turns a colon that directly follows a Malayalam-block character into a visarga
    """

    # atomic chillu -> base consonant
    CHILLU_CHAR_MAP = {
        '\u0d7a': '\u0d23',
        '\u0d7b': '\u0d28',
        '\u0d7c': '\u0d30',
        '\u0d7d': '\u0d32',
        '\u0d7e': '\u0d33',
        '\u0d7f': '\u0d15',
    }

    def _canonicalize_chillus(self, text):
        """Replace every atomic chillu by its base consonant followed by the virama."""
        for atomic, base in MalayalamNormalizer.CHILLU_CHAR_MAP.items():
            text = text.replace(atomic, base + '\u0d4d')
        return text

    def _correct_geminated_T(self, text):
        """Replace the sequence U+0D31 U+0D4D U+0D31 by U+0D1F U+0D4D U+0D1F."""
        return text.replace('\u0d31\u0d4d\u0d31', '\u0d1f\u0d4d\u0d1f')

    def __init__(self, lang='ml', remove_nuktas=False, nasals_mode='do_nothing', do_normalize_chandras=False,
                 do_normalize_vowel_ending=False,
                 do_canonicalize_chillus=False, do_correct_geminated_T=False):
        super().__init__(lang, remove_nuktas, nasals_mode, do_normalize_chandras, do_normalize_vowel_ending)
        self.do_canonicalize_chillus = do_canonicalize_chillus
        self.do_correct_geminated_T = do_correct_geminated_T

    def normalize(self, text):
        """Return the normalized form of ``text``."""
        out = text

        # old chillu encoding (base consonant + virama + ZWJ) -> atomic chillu;
        # the six base consonants are exactly the values of CHILLU_CHAR_MAP
        for atomic, base in MalayalamNormalizer.CHILLU_CHAR_MAP.items():
            out = out.replace(base + '\u0d4d\u200d', atomic)

        # optional chillu canonicalization happens before the common normalization
        if self.do_canonicalize_chillus:
            out = self._canonicalize_chillus(out)

        # script-independent normalization
        out = super().normalize(out)

        rules = (
            # script-specific danda / double danda -> generic Indic ones
            ('\u0d64', '\u0964'),
            ('\u0d65', '\u0965'),
            # dependent vowels
            ('\u0d46\u0d3e', '\u0d4a'),
            ('\u0d47\u0d3e', '\u0d4b'),
            # au forms (the two-part form first, then any remaining lone U+0D57)
            ('\u0d46\u0d57', '\u0d4c'),
            ('\u0d57', '\u0d4c'),
        )
        for old, new in rules:
            out = out.replace(old, new)

        if self.do_correct_geminated_T:
            out = self._correct_geminated_T(out)

        # colon after a Malayalam-block character is written as a visarga
        return re.sub(r'([\u0d00-\u0d7f]):', '\\1\u0d03', out)
|
859 |
+
|
860 |
+
class UrduNormalizer(NormalizerI):
    '''Uses UrduHack library.
    https://docs.urduhack.com/en/stable/_modules/urduhack/normalization/character.html#normalize

    normalize() chains punctuation normalization (via self._normalize_punctuations)
    with the urduhack normalization and preprocessing functions imported below.
    '''

    def __init__(self, lang, remove_nuktas=True):
        # lang: language code; stored only, normalize() does not read it
        self.lang = lang
        # remove_nuktas: when true, normalize() also calls remove_diacritics
        self.remove_nuktas = remove_nuktas

    # These imports sit in the class body so that the imported functions become
    # class attributes; normalize() refers to them as UrduNormalizer.<name>.
    # NOTE(review): requires the third-party `urduhack` package to be installed
    # when this class statement is executed - confirm this is acceptable.
    from urduhack.normalization import (
        remove_diacritics,
        normalize_characters,
        normalize_combine_characters
    ) # TODO: Use only required normalizers
    from urduhack.preprocessing import (
        normalize_whitespace,
        digits_space,
        all_punctuations_space,
        english_characters_space
    )

    def normalize(self, text):
        """Return the normalized form of ``text``; the steps run in the order written."""
        # NOTE(review): _normalize_punctuations is not defined in this class;
        # presumably inherited from NormalizerI - verify.
        text = self._normalize_punctuations(text)
        text = UrduNormalizer.normalize_whitespace(text)
        if self.remove_nuktas:
            text = UrduNormalizer.remove_diacritics(text)
        text = UrduNormalizer.normalize_characters(text)
        text = UrduNormalizer.normalize_combine_characters(text)
        text = UrduNormalizer.digits_space(text)
        text = UrduNormalizer.all_punctuations_space(text)
        text = UrduNormalizer.english_characters_space(text)
        return text
|
892 |
+
|
893 |
+
|
894 |
+
class IndicNormalizerFactory(object):
    """
    Creates the normalizer that matches a language code.
    """

    def get_normalizer(self, language, **kwargs):
        """
            Return a normalizer instance for ``language``.

            Parameters:
            |language: language code
            |kwargs: forwarded unchanged to the normalizer constructor
            |        (e.g. remove_nuktas: boolean, should the normalizer remove nukta characters)

            Language codes without a dedicated normalizer get a BaseNormalizer.
        """
        if language in ('hi', 'mr', 'sa', 'kK', 'ne', 'sd'):
            return DevanagariNormalizer(lang=language, **kwargs)
        if language == 'ur':
            return UrduNormalizer(lang=language, **kwargs)
        if language == 'pa':
            return GurmukhiNormalizer(lang=language, **kwargs)
        if language == 'gu':
            return GujaratiNormalizer(lang=language, **kwargs)
        # Bengali and Assamese share one normalizer
        if language in ('bn', 'as'):
            return BengaliNormalizer(lang=language, **kwargs)
        if language == 'or':
            return OriyaNormalizer(lang=language, **kwargs)
        if language == 'ml':
            return MalayalamNormalizer(lang=language, **kwargs)
        if language == 'kn':
            return KannadaNormalizer(lang=language, **kwargs)
        if language == 'ta':
            return TamilNormalizer(lang=language, **kwargs)
        if language == 'te':
            return TeluguNormalizer(lang=language, **kwargs)
        return BaseNormalizer(lang=language, **kwargs)

    def is_language_supported(self, language):
        """
        Return True if ``language`` has a dedicated normalizer, False otherwise.
        """
        supported = (
            'hi', 'mr', 'sa', 'kK', 'ne', 'sd',
            'ur',
            'pa',
            'gu',
            'bn', 'as',
            'or',
            'ml',
            'kn',
            'ta',
            'te',
        )
        return language in supported
|
953 |
+
|
954 |
+
|
955 |
+
if __name__ == '__main__':
    # Command line entry point: normalize <infile> line by line into <outfile>.

    if len(sys.argv)<4:
        print("Usage: python normalize.py <infile> <outfile> <language> [<replace_nukta(True,False)>] [<normalize_nasals(do_nothing|to_anusvaara_strict|to_anusvaara_relaxed|to_nasal_consonants)>]")
        sys.exit(1)

    language=sys.argv[3]
    remove_nuktas=False
    normalize_nasals='do_nothing'
    if len(sys.argv)>=5:
        # Parse the flag textually: bool() of any non-empty string is True,
        # so bool('False') would wrongly enable nukta removal.
        remove_nuktas=sys.argv[4].strip().lower() in ('true','1','yes')
    if len(sys.argv)>=6:
        normalize_nasals=sys.argv[5]

    # create normalizer
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer(language,remove_nuktas=remove_nuktas,nasals_mode=normalize_nasals)

    # DO normalization
    with codecs.open(sys.argv[1],'r','utf-8') as ifile, \
         codecs.open(sys.argv[2],'w','utf-8') as ofile:
        # iterate the file object directly instead of loading every line with readlines()
        for line in ifile:
            ofile.write(normalizer.normalize(line))
|
979 |
+
|
980 |
+
## gather status about normalization
|
981 |
+
#with codecs.open(sys.argv[1],'r','utf-8') as ifile:
|
982 |
+
# normalizer=DevanagariNormalizer()
|
983 |
+
# text=string.join(ifile.readlines(),sep='')
|
984 |
+
# normalizer.get_char_stats(text)
|
indic_nlp_library/indicnlp/script/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/script/english_script.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
from indicnlp import common
|
13 |
+
from indicnlp.common import IndicNlpException
|
14 |
+
|
15 |
+
|
16 |
+
#### Maps from ARPABET to Internal Id
|
17 |
+
# ARPABET phoneme name -> internal id, and the reverse map.
# Both are empty until init() fills them from english_arpabet_list.csv.
ARPABET_ID_MAP={}
ID_ARPABET_MAP={}


###
# Phonetic Information about script characters
###

""" Phonetic data for English """
# set by init() to the DataFrame read from english_script_phonetic_data.csv
ENGLISH_PHONETIC_DATA=None

""" Phonetic vector for English"""
# set by init() to the columns of ENGLISH_PHONETIC_DATA from PHONETIC_VECTOR_START_OFFSET onwards
ENGLISH_PHONETIC_VECTORS=None

""" Length of phonetic vector """
# overwritten by init() with the actual number of vector columns
PHONETIC_VECTOR_LENGTH=38

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET=6

## PHONETIC PROPERTIES in order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP=['basic_type',
    'vowel_length',
    'vowel_strength',
    'vowel_status',
    'consonant_type',
    'articulation_place',
    'aspiration',
    'voicing',
    'nasalization',
    'vowel_horizontal',
    'vowel_vertical',
    'vowel_roundness',
    ]

###
# Bit vector ranges for various properties
###

# property name -> pair of bounds in the phonetic vector;
# consecutive pairs are contiguous and together cover 0..38
PV_PROP_RANGES={
    'basic_type': [0,6],
    'vowel_length': [6,8],
    'vowel_strength': [8,11],
    'vowel_status': [11,13],
    'consonant_type': [13,18],
    'articulation_place': [18,23],
    'aspiration': [23,25],
    'voicing': [25,27],
    'nasalization': [27,29],
    'vowel_horizontal': [29,32],
    'vowel_vertical': [32,36],
    'vowel_roundness': [36,38],
    }


####
# Indexes into the Phonetic Vector
####
# positions inside the 'basic_type' range [0,6]
PVIDX_BT_VOWEL=0
PVIDX_BT_CONSONANT=1
PVIDX_BT_NUKTA=2
PVIDX_BT_HALANT=3
PVIDX_BT_ANUSVAAR=4
PVIDX_BT_MISC=5
# first index and one-past-last index of the basic type positions
PVIDX_BT_S=PVIDX_BT_VOWEL
PVIDX_BT_E=PVIDX_BT_MISC+1

# lies inside the 'vowel_status' range [11,13]
PVIDX_VSTAT_DEP=12

####
# Base code point used by phoneme_to_enc()/enc_to_offset() to encode a phoneme id as a character
SCRIPT_RANGE_START=0x0D00
## TBD
SCRIPT_RANGE_END=0x0D2E
|
91 |
+
|
92 |
+
|
93 |
+
def init():
    """
    Load the English phonetic resources into the module-level variables.

    To be called by library loader, do not call it in your program
    """
    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH

    # phonetic data table and the vector columns taken from it
    data_path = common.get_resources_path() + '/script/english_script_phonetic_data.csv'
    ENGLISH_PHONETIC_DATA = pd.read_csv(data_path, encoding='utf-8')
    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values
    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    # ARPABET phoneme name <-> internal id; the id is the zero-based line number
    arpabet_path = common.get_resources_path() + '/script/english_arpabet_list.csv'
    with open(arpabet_path, 'r', encoding='utf-8') as arpabet_file:
        for line_no, raw_name in enumerate(arpabet_file):
            phoneme = raw_name.strip()
            ARPABET_ID_MAP[phoneme] = line_no
            ID_ARPABET_MAP[line_no] = phoneme
|
114 |
+
|
115 |
+
|
116 |
+
def phoneme_to_offset(ph):
    """Return the internal id stored in ARPABET_ID_MAP for phoneme ``ph`` (KeyError if absent)."""
    ph_id = ARPABET_ID_MAP[ph]
    return ph_id
|
118 |
+
|
119 |
+
def offset_to_phoneme(ph_id):
    """Return the phoneme name stored in ID_ARPABET_MAP for ``ph_id`` (KeyError if absent)."""
    name = ID_ARPABET_MAP[ph_id]
    return name
|
121 |
+
|
122 |
+
def phoneme_to_enc(ph):
    """Encode phoneme ``ph`` as the single character at SCRIPT_RANGE_START + its internal id."""
    codepoint = SCRIPT_RANGE_START + phoneme_to_offset(ph)
    return chr(codepoint)
|
124 |
+
|
125 |
+
def enc_to_phoneme(ph):
    """Decode the encoded character ``ph`` back to its phoneme name."""
    ph_id = enc_to_offset(ph)
    return offset_to_phoneme(ph_id)
|
127 |
+
|
128 |
+
def enc_to_offset(c):
    """Return the distance of character ``c``'s code point from SCRIPT_RANGE_START."""
    codepoint = ord(c)
    return codepoint - SCRIPT_RANGE_START
|
130 |
+
|
131 |
+
def in_range(offset):
    """
    Return True if SCRIPT_RANGE_START <= ``offset`` < SCRIPT_RANGE_END.

    NOTE(review): the bounds are the absolute constants SCRIPT_RANGE_START and
    SCRIPT_RANGE_END although the parameter is called ``offset`` - confirm
    which kind of value callers are expected to pass.
    """
    if offset < SCRIPT_RANGE_START:
        return False
    return offset < SCRIPT_RANGE_END
|
133 |
+
|
134 |
+
def get_phonetic_info(lang):
    """Return the pair (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS); ``lang`` is not used."""
    info = (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
    return info
|
136 |
+
|
137 |
+
def invalid_vector():
    """Return a new all-zero array of PHONETIC_VECTOR_LENGTH entries."""
    ## TODO: check if np datatype is correct?
    zeros = [0 for _ in range(PHONETIC_VECTOR_LENGTH)]
    return np.array(zeros)
|
140 |
+
|
141 |
+
def get_phonetic_feature_vector(p,lang):
    """
    Return the phonetic feature vector for the encoded phoneme character ``p``.

    p: single character as produced by phoneme_to_enc()
    lang: forwarded to get_phonetic_info()

    Returns invalid_vector() when ``p`` is outside the encoding range or when
    its row is flagged with 'Valid Vector Representation' == 0.
    """

    # in_range() compares against the absolute bounds SCRIPT_RANGE_START and
    # SCRIPT_RANGE_END, so it has to be given the code point. Passing the
    # zero-based offset (as was done before) rejected every character inside
    # the encoding range, so valid phonemes always got the invalid vector.
    if not in_range(ord(p)):
        return invalid_vector()

    offset=enc_to_offset(p)

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
        return invalid_vector()

    return phonetic_vectors[offset]
|
154 |
+
|
indic_nlp_library/indicnlp/script/indic_scripts.py
ADDED
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
import os
|
12 |
+
|
13 |
+
from indicnlp import common
|
14 |
+
from indicnlp.common import IndicNlpException
|
15 |
+
from indicnlp import langinfo as li
|
16 |
+
|
17 |
+
###
|
18 |
+
# Phonetic Information about script characters
|
19 |
+
###
|
20 |
+
|
21 |
+
""" Phonetic data about all languages except Tamil """
# set by init() to the DataFrame read from all_script_phonetic_data.csv
ALL_PHONETIC_DATA=None

""" Phonetic data for Tamil """
# set by init() to the DataFrame read from tamil_script_phonetic_data.csv
TAMIL_PHONETIC_DATA=None

""" Phonetic vector for all languages except Tamil """
# set by init() to the columns of ALL_PHONETIC_DATA from PHONETIC_VECTOR_START_OFFSET onwards
ALL_PHONETIC_VECTORS=None

""" Phonetic vector for Tamil """
# set by init() to the columns of TAMIL_PHONETIC_DATA from PHONETIC_VECTOR_START_OFFSET onwards
TAMIL_PHONETIC_VECTORS=None

""" Length of phonetic vector """
# overwritten by init() with the number of columns of ALL_PHONETIC_VECTORS
PHONETIC_VECTOR_LENGTH=38

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET=6

## PHONETIC PROPERTIES in order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP=['basic_type',
    'vowel_length',
    'vowel_strength',
    'vowel_status',
    'consonant_type',
    'articulation_place',
    'aspiration',
    'voicing',
    'nasalization',
    'vowel_horizontal',
    'vowel_vertical',
    'vowel_roundness',
    ]

###
# Bit vector ranges for various properties
###

# property name -> [start, end] slice bounds (end exclusive) used by get_property_vector()
PV_PROP_RANGES={
    'basic_type': [0,6],
    'vowel_length': [6,8],
    'vowel_strength': [8,11],
    'vowel_status': [11,13],
    'consonant_type': [13,18],
    'articulation_place': [18,23],
    'aspiration': [23,25],
    'voicing': [25,27],
    'nasalization': [27,29],
    'vowel_horizontal': [29,32],
    'vowel_vertical': [32,36],
    'vowel_roundness': [36,38],
    }


####
# Indexes into the Phonetic Vector
####
# positions inside the 'basic_type' range, read by is_vowel(), is_consonant(), ...
PVIDX_BT_VOWEL=0
PVIDX_BT_CONSONANT=1
PVIDX_BT_NUKTA=2
PVIDX_BT_HALANT=3
PVIDX_BT_ANUSVAAR=4
PVIDX_BT_MISC=5
# first index and one-past-last index of the basic type positions
PVIDX_BT_S=PVIDX_BT_VOWEL
PVIDX_BT_E=PVIDX_BT_MISC+1

# read by is_dependent_vowel(); lies inside the 'vowel_status' range [11,13]
PVIDX_VSTAT_DEP=12

#####
# Unicode information about characters
#####

# is_indiclang_char() accepts offsets o with SCRIPT_OFFSET_START <= o < SCRIPT_OFFSET_RANGE
SCRIPT_OFFSET_START=0
SCRIPT_OFFSET_RANGE=0x80
|
95 |
+
|
96 |
+
def init():
    """
    Load the phonetic data tables into the module-level variables.

    To be called by library loader, do not call it in your program
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS
    global TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS
    global PHONETIC_VECTOR_LENGTH

    # data tables: one shared by all languages except Tamil, one for Tamil
    all_csv = os.path.join(common.get_resources_path(), 'script', 'all_script_phonetic_data.csv')
    ALL_PHONETIC_DATA = pd.read_csv(all_csv, encoding='utf-8')
    tamil_csv = os.path.join(common.get_resources_path(), 'script', 'tamil_script_phonetic_data.csv')
    TAMIL_PHONETIC_DATA = pd.read_csv(tamil_csv, encoding='utf-8')

    # feature vectors are the columns from PHONETIC_VECTOR_START_OFFSET onwards
    first_col = PHONETIC_VECTOR_START_OFFSET
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[:, first_col:].values
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[:, first_col:].values

    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]
|
110 |
+
|
111 |
+
def is_supported_language(lang):
    """Return True if ``lang`` is a key of li.SCRIPT_RANGES."""
    # membership test directly on the mapping; no need to copy the keys into a
    # list and scan it on every call
    return lang in li.SCRIPT_RANGES
|
113 |
+
|
114 |
+
def get_offset(c,lang):
    """
    Return the code point of ``c`` minus the first entry of li.SCRIPT_RANGES[lang].

    Raises IndicNlpException if ``lang`` is not supported.
    """
    if is_supported_language(lang):
        script_start = li.SCRIPT_RANGES[lang][0]
        return ord(c) - script_start
    raise IndicNlpException('Language {} not supported'.format(lang))
|
118 |
+
|
119 |
+
def offset_to_char(off,lang):
    """
    Return the character at offset ``off`` from the first entry of li.SCRIPT_RANGES[lang].

    Applicable to Brahmi derived Indic scripts.
    Raises IndicNlpException if ``lang`` is not supported.
    """
    if is_supported_language(lang):
        script_start = li.SCRIPT_RANGES[lang][0]
        return chr(off + script_start)
    raise IndicNlpException('Language {} not supported'.format(lang))
|
126 |
+
|
127 |
+
def is_indiclang_char(c,lang):
    """
    Return True if ``c`` lies in the script block of ``lang`` or is DANDA / DOUBLE_DANDA.

    Applicable to Brahmi derived Indic scripts
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts

    Raises IndicNlpException if ``lang`` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    char_offset = get_offset(c, lang)
    if char_offset >= SCRIPT_OFFSET_START and char_offset < SCRIPT_OFFSET_RANGE:
        return True

    # the dandas are shared by all scripts and sit outside the per-script block
    return ord(c) in (li.DANDA, li.DOUBLE_DANDA)
|
137 |
+
|
138 |
+
def in_coordinated_range_offset(c_offset):
    """
    Return True if ``c_offset`` lies between li.COORDINATED_RANGE_START_INCLUSIVE
    and li.COORDINATED_RANGE_END_INCLUSIVE (both bounds included).

    Applicable to Brahmi derived Indic scripts
    """
    if c_offset < li.COORDINATED_RANGE_START_INCLUSIVE:
        return False
    return c_offset <= li.COORDINATED_RANGE_END_INCLUSIVE
|
143 |
+
|
144 |
+
def in_coordinated_range(c,lang):
    """
    Return True if the offset of character ``c`` within the script of ``lang``
    lies in the coordinated range.

    Raises IndicNlpException if ``lang`` is not supported.
    """
    if is_supported_language(lang):
        char_offset = get_offset(c, lang)
        return in_coordinated_range_offset(char_offset)
    raise IndicNlpException('Language {} not supported'.format(lang))
|
148 |
+
|
149 |
+
def get_phonetic_info(lang):
    """
    Return the pair (phonetic data, phonetic vectors) to use for ``lang``.

    The Tamil tables are returned when ``lang`` equals li.LC_TA, the shared
    tables otherwise. Raises IndicNlpException if ``lang`` is not supported.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    if lang != li.LC_TA:
        return (ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS)
    return (TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS)
|
156 |
+
|
157 |
+
def invalid_vector():
    """Return a new all-zero array of PHONETIC_VECTOR_LENGTH entries."""
    ## TODO: check if np datatype is correct?
    zeros = [0 for _ in range(PHONETIC_VECTOR_LENGTH)]
    return np.array(zeros)
|
160 |
+
|
161 |
+
def get_phonetic_feature_vector(c,lang):
    """
    Return the phonetic feature vector of character ``c`` in language ``lang``.

    Returns invalid_vector() when the character's offset is outside the
    coordinated range or its row has 'Valid Vector Representation' == 0.
    Raises IndicNlpException if ``lang`` is not supported.
    """
    # resolve the character to its script offset, then do the same lookup as
    # the offset-based variant
    char_offset = get_offset(c, lang)
    return get_phonetic_feature_vector_offset(char_offset, lang)
|
174 |
+
|
175 |
+
def get_phonetic_feature_vector_offset(offset,lang):
    """
    Return the phonetic feature vector stored at row ``offset`` for ``lang``.

    Returns invalid_vector() when ``offset`` is outside the coordinated range
    or the row has 'Valid Vector Representation' == 0.
    """
    if in_coordinated_range_offset(offset):
        data_table, vector_table = get_phonetic_info(lang)
        row = data_table.iloc[offset]
        if row['Valid Vector Representation'] == 0:
            return invalid_vector()
        return vector_table[offset]

    return invalid_vector()
|
186 |
+
|
187 |
+
### Unary operations on vectors
|
188 |
+
def is_valid(v):
    """Return whether the entries of vector ``v`` sum to more than zero."""
    total = np.sum(v)
    return total > 0
|
190 |
+
|
191 |
+
def is_vowel(v):
    """Return whether the PVIDX_BT_VOWEL entry of vector ``v`` equals 1."""
    flag = v[PVIDX_BT_VOWEL]
    return flag == 1
|
193 |
+
|
194 |
+
def is_consonant(v):
    """Return whether the component of `v` at index PVIDX_BT_CONSONANT equals 1."""
    flag=v[PVIDX_BT_CONSONANT]
    return flag==1
|
196 |
+
|
197 |
+
def is_halant(v):
    """Return whether the component of `v` at index PVIDX_BT_HALANT equals 1."""
    flag=v[PVIDX_BT_HALANT]
    return flag==1
|
199 |
+
|
200 |
+
def is_nukta(v):
    """Return whether the component of `v` at index PVIDX_BT_NUKTA equals 1."""
    flag=v[PVIDX_BT_NUKTA]
    return flag==1
|
202 |
+
|
203 |
+
def is_anusvaar(v):
    """Return whether the component of `v` at index PVIDX_BT_ANUSVAAR equals 1."""
    flag=v[PVIDX_BT_ANUSVAAR]
    return flag==1
|
205 |
+
|
206 |
+
def is_misc(v):
    """Return whether the component of `v` at index PVIDX_BT_MISC equals 1."""
    flag=v[PVIDX_BT_MISC]
    return flag==1
|
208 |
+
|
209 |
+
def is_dependent_vowel(v):
    """
    Return a truthy value when `v` is a vowel (see `is_vowel`) and its
    component at index PVIDX_VSTAT_DEP equals 1.
    """
    # `and` short-circuits: the dependent flag is only read for vowels
    return is_vowel(v) and v[PVIDX_VSTAT_DEP]==1
|
211 |
+
|
212 |
+
def is_plosive(v):
    """
    Return a truthy value when `v` is a consonant (see `is_consonant`) and the
    first component of its 'consonant_type' property vector equals 1.
    """
    # `and` short-circuits: the property vector is only read for consonants
    return is_consonant(v) and get_property_vector(v,'consonant_type')[0]==1
|
214 |
+
|
215 |
+
### Binary operations on phonetic vectors
|
216 |
+
|
217 |
+
def or_vectors(v1,v2):
    """
    Component-wise OR of two vectors.

    Each output component is 1 when the sum of the paired input components
    is at least 1, else 0. Pairing stops at the shorter input (`zip`).
    """
    merged=[]
    for left,right in zip(v1,v2):
        merged.append(int((left+right)>=1))
    return np.array(merged)
|
219 |
+
|
220 |
+
def xor_vectors(v1,v2):
    """
    Component-wise XOR of two vectors.

    Each output component is 1 when the paired input components differ,
    else 0. Pairing stops at the shorter input (`zip`).
    """
    differing=[]
    for left,right in zip(v1,v2):
        differing.append(int(left!=right))
    return np.array(differing)
|
222 |
+
|
223 |
+
### Getting properties from phonetic vectors
|
224 |
+
|
225 |
+
def get_property_vector(v,prop_name):
    """
    Return the slice of `v` holding property `prop_name`.

    The slice bounds are the first two entries of PV_PROP_RANGES[prop_name]
    (start inclusive, end exclusive, as in ordinary Python slicing).
    """
    bounds=PV_PROP_RANGES[prop_name]
    return v[bounds[0]:bounds[1]]
|
227 |
+
|
228 |
+
def get_property_value(v,prop_name):
    """
    Read the property `prop_name` of vector `v` as an integer.

    The property bits (see `get_property_vector`) are interpreted as a binary
    number whose first bit is the most significant one.
    """
    factor_bits=get_property_vector(v,prop_name).tolist()

    # Horner evaluation, most significant bit first
    value=0
    for bit in factor_bits:
        value=value*2+bit

    return int(value)
|
238 |
+
|
239 |
+
def lcsr_indic(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script

    Two characters match when both offsets are in the coordinated range and
    equal, or when neither offset is in the coordinated range and the
    characters themselves are equal.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns a tuple (lcsr, len(srcw), len(tgtw)) where the lengths are floats
    and lcsr is the LCS length divided by the longer of the two lengths.
    """
    score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))

    if len(srcw)>0 and len(tgtw)>0:
        # The offset and its range membership depend on one character only,
        # so compute them once per character rather than once per (src,tgt) pair.
        src_info=[]
        for sc in srcw:
            so=get_offset(sc,slang)
            src_info.append((sc,so,in_coordinated_range_offset(so)))

        tgt_info=[]
        for tc in tgtw:
            to=get_offset(tc,tlang)
            tgt_info.append((tc,to,in_coordinated_range_offset(to)))

        for si,(sc,so,s_in) in enumerate(src_info,1):
            for ti,(tc,to,t_in) in enumerate(tgt_info,1):

                if s_in and t_in and so==to:
                    score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
                elif not (s_in or t_in) and sc==tc:
                    score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
                else:
                    score_mat[si,ti]= max(
                        score_mat[si,ti-1],
                        score_mat[si-1,ti])

    return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))
|
266 |
+
|
267 |
+
def lcsr_any(srcw,tgtw):
    """
    LCSR computation by direct character comparison (no script mapping).

    Returns a tuple (lcsr, len(srcw), len(tgtw)) where the lengths are floats
    and lcsr is the longest-common-subsequence length divided by the longer
    of the two lengths.
    """
    n_src=len(srcw)
    n_tgt=len(tgtw)
    score_mat=np.zeros((n_src+1,n_tgt+1))

    # classic LCS dynamic programme; row/col 0 stand for the empty prefix
    for row in range(1,n_src+1):
        for col in range(1,n_tgt+1):
            if srcw[row-1]==tgtw[col-1]:
                score_mat[row,col]=score_mat[row-1,col-1]+1.0
            else:
                score_mat[row,col]=max(score_mat[row,col-1],score_mat[row-1,col])

    longest=max(n_src,n_tgt)
    return (score_mat[n_src,n_tgt]/float(longest),float(n_src),float(n_tgt))
|
284 |
+
|
285 |
+
def lcsr(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    Plain character comparison (`lcsr_any`) is used when both languages are
    the same or when either language is not supported; otherwise the
    script-aware comparison (`lcsr_indic`) is used.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns the tuple produced by the selected routine:
    (lcsr, len(srcw), len(tgtw)).
    """

    if slang==tlang or not is_supported_language(slang) or not is_supported_language(tlang):
        # lcsr_any takes only the two strings
        return lcsr_any(srcw,tgtw)
    else:
        # lcsr_indic needs the language codes to compute script offsets
        return lcsr_indic(srcw,tgtw,slang,tlang)
|
299 |
+
|
300 |
+
|
301 |
+
|
indic_nlp_library/indicnlp/script/phonetic_sim.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
from indicnlp import loader
|
10 |
+
from indicnlp import langinfo
|
11 |
+
from indicnlp.script.indic_scripts import *
|
12 |
+
import numpy as np
|
13 |
+
import gzip
|
14 |
+
import pandas as pd
|
15 |
+
import sys
|
16 |
+
|
17 |
+
def equal(v1,v2):
    """Return 1.0 when `xor_vectors(v1, v2)` has no set component, else 0.0."""
    if np.sum( xor_vectors(v1, v2)) > 0:
        return 0.0
    return 1.0
|
19 |
+
|
20 |
+
def dice(v1,v2):
    """
    Dice-style score: twice the dot product of `v1` and `v2`, divided by the
    sum of their lengths (number of components, via `len`).
    """
    overlap=float(np.dot( v1, v2.T ))
    size=float(len(v1)+len(v2))
    return (2*overlap)/size
|
23 |
+
|
24 |
+
def jaccard(v1,v2):
    """
    Jaccard-style score: dot product of `v1` and `v2` divided by
    (len(v1) + len(v2) - dot product).
    """
    overlap=float(np.dot( v1, v2.T ))
    denominator=float(len(v1)+len(v2)-overlap)
    return overlap/denominator
|
27 |
+
|
28 |
+
def cosine(v1,v2):
    """
    Cosine similarity of `v1` and `v2`.

    A constant 0.00001 is added to the denominator, so all-zero vectors give
    0 instead of a division by zero.
    """
    inner=float(np.dot( v1, v2.T ))
    sq_norm1=float(np.dot( v1, v1.T ))
    sq_norm2=float(np.dot( v2, v2.T ))
    denominator=np.sqrt(sq_norm1*sq_norm2)+0.00001
    return inner/denominator
|
33 |
+
|
34 |
+
def dotprod(v1,v2):
    """Return the dot product of `v1` and `v2` as a Python float."""
    product=np.dot( v1, v2.T )
    return float(product)
|
36 |
+
|
37 |
+
def sim1(v1,v2,base=5.0):
    """Return `base` raised to the dot product of `v1` and `v2`."""
    exponent=dotprod(v1,v2)
    return np.power(base,exponent)
|
39 |
+
|
40 |
+
def softmax(v1,v2):
    """Return e raised to the dot product of `v1` and `v2` (`sim1` with base e)."""
    return sim1(v1,v2,base=np.e)
|
42 |
+
|
43 |
+
def create_similarity_matrix(sim_func,slang,tlang,normalize=True):
    """
    Build the character similarity matrix between two languages.

    Row i / column j correspond to the i-th / j-th offset of the coordinated
    range (COORDINATED_RANGE_START_INCLUSIVE .. COORDINATED_RANGE_END_INCLUSIVE)
    in `slang` / `tlang`. Each cell is `sim_func` applied to the phonetic
    feature vectors of the two characters.

    sim_func: callable taking two phonetic vectors and returning a number
    slang: source language
    tlang: target language
    normalize: when True, every row is divided by its sum

    Returns a square numpy array with one row and column per offset.
    """

    start=langinfo.COORDINATED_RANGE_START_INCLUSIVE
    end=langinfo.COORDINATED_RANGE_END_INCLUSIVE

    dim=end-start+1
    sim_mat=np.zeros((dim,dim))

    offsets=range(start,end+1)

    # target vectors do not depend on the source offset: compute them once
    tgt_vectors=[get_phonetic_feature_vector(offset_to_char(offset2,tlang),tlang) for offset2 in offsets]

    for offset1 in offsets:
        v1=get_phonetic_feature_vector(offset_to_char(offset1,slang),slang)
        for offset2,v2 in zip(offsets,tgt_vectors):
            # index relative to the range start so the matrix stays in bounds
            # even when the range does not begin at offset 0
            sim_mat[offset1-start,offset2-start]=sim_func(v1,v2)

    if normalize:
        sums=np.sum(sim_mat, axis=1)
        sim_mat=(sim_mat.transpose()/sums).transpose()

    return sim_mat
|
59 |
+
|
indic_nlp_library/indicnlp/syllable/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/syllable/syllabifier.py
ADDED
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import codecs, sys
|
10 |
+
from indicnlp.script import indic_scripts as si
|
11 |
+
import re
|
12 |
+
|
13 |
+
# Malayalam chillu character -> the base consonant it is rewritten to
chillu_char_map= {
    '\u0d7a': '\u0d23',
    '\u0d7b': '\u0d28',
    '\u0d7c': '\u0d30',
    '\u0d7d': '\u0d32',
    '\u0d7e': '\u0d33',
    '\u0d7f': '\u0d15',
    }

# inverse mapping: base consonant -> chillu character
# (a comprehension keeps loop variables out of the module namespace)
char_chillu_map= {char: chillu for chillu,char in chillu_char_map.items()}
|
25 |
+
|
26 |
+
def normalize_malayalam(word):
    """
    Rewrite chillu characters as consonant+halant and build a parallel mask.

    Returns `(word, word_mask)`: in the returned word every key of
    `chillu_char_map` is replaced by its mapped consonant followed by
    '\u0d4d'; the mask has '41' at those positions and '0' everywhere else.
    """

    # digits of the input must not be confused with the mask markers
    mask=re.sub(r'[0-9]','0',word)

    # instead of chillu characters, use consonant+halant
    for chillu,base in chillu_char_map.items():
        mask=mask.replace(chillu,'41')
        word=word.replace(chillu,'{}\u0d4d'.format(base))

    # whatever is not a marker by now is an ordinary character
    mask=re.sub(r'[^0-9]','0',mask)

    return word, mask
|
38 |
+
|
39 |
+
def denormalize_malayalam(word, word_mask):
    """
    Undo `normalize_malayalam` using the mask.

    For every '4' in `word_mask`, the two characters of `word` starting at
    that position are replaced by the `char_chillu_map` entry of the first of
    them, and the two mask entries collapse to a single '0'.

    Returns the resulting string.
    """

    chars=list(word)
    flags=list(word_mask)

    ## pattern 4
    pos=0
    while True:
        try:
            pos=flags.index('4',pos)
        except ValueError:
            # no '4' marker left
            break
        chars[pos:pos+2]=char_chillu_map[chars[pos]]
        flags[pos:pos+2]='0'

    return ''.join(chars)
|
56 |
+
|
57 |
+
def normalize_punjabi(word):
    """
    Rewrite tippi and addak and build a parallel mask.

    Returns `(word, word_mask)`:
    - tippi ('\u0a70') becomes '\u0a02' in the word and '2' in the mask
    - addak ('\u0a71') followed by a character X becomes X + '\u0a4d' + X in
      the word and '311' in the mask
    - every other position of the mask is '0'
    """
    # digits of the input must not be confused with the mask markers
    mask=re.sub(r'[0-9]','0',word)

    ## replace tippi with anusvaar
    mask=mask.replace('\u0a70','2')
    word=word.replace('\u0a70','\u0a02')

    ## replace addak+consonant with consonant+halant+consonant
    mask=re.sub(r'\u0a71(.)','311',mask)
    word=re.sub(r'\u0a71(.)','\\1\u0a4d\\1',word)

    # whatever is not a marker by now is an ordinary character
    mask=re.sub(r'[^0-9]','0',mask)

    return word, mask
|
71 |
+
|
72 |
+
def denormalize_punjabi(word, word_mask):
    """
    Undo `normalize_punjabi` using the mask.

    - every position marked '2' gets the tippi character '\u0a70'
    - at every position marked '3', the three characters starting there are
      replaced by addak ('\u0a71') followed by the first of the three

    Returns the resulting string.
    """

    chars=list(word)
    flags=list(word_mask)

    ## pattern 2
    pos=0
    while True:
        try:
            pos=flags.index('2',pos)
        except ValueError:
            break
        chars[pos]='\u0a70'
        flags[pos]='0'

    ## pattern 3
    pos=0
    while True:
        try:
            pos=flags.index('3',pos)
        except ValueError:
            break
        # three entries shrink to two, in both the word and the mask
        chars[pos:pos+3]='\u0a71{}'.format(chars[pos])
        flags[pos:pos+3]='00'

    return ''.join(chars)
|
100 |
+
|
101 |
+
def char_backoff(syllables_list,vocab):
    """
    Back off to single characters for syllables missing from `vocab`.

    When `vocab` is None the input list itself is returned. Otherwise a new
    list is built in which every syllable found in `vocab` is kept and every
    other syllable is replaced by its individual characters.
    """
    if vocab is None:
        return syllables_list

    syllables_final=[]
    for syllable in syllables_list:
        if syllable in vocab:
            syllables_final.append(syllable)
        else:
            syllables_final.extend(syllable)

    return syllables_final
|
115 |
+
|
116 |
+
|
117 |
+
def orthographic_syllabify_improved(word,lang,vocab=None):
    """
    Split `word` into orthographic syllables, with script-specific
    normalization for Malayalam ('ml') and Punjabi ('pa').

    The word is normalized (see `normalize_malayalam` / `normalize_punjabi`),
    syllable boundaries are marked with spaces using the phonetic feature
    vector of each character, the normalization is undone with the help of a
    mask that is kept parallel to the output, and the result is split on
    spaces.

    word: string to syllabify
    lang: language code of `word`
    vocab: optional container of known syllables; see `char_backoff`

    Returns the list produced by `char_backoff`.
    """

    # language specific normalization; other languages get an all-'0' mask
    if lang=='ml':
        word, word_mask = normalize_malayalam(word)
    elif lang=='pa':
        word, word_mask = normalize_punjabi(word)
    else:
        word_mask=['0']*len(word)

    p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]
    n=len(word)

    syllables=[]
    syllables_mask=[]

    for i,(ch,v) in enumerate(zip(word,p_vectors)):

        syllables.append(ch)
        syllables_mask.append(word_mask[i])

        has_next= i+1<n

        if has_next and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            # next character is not part of the phonetic inventory
            boundary=True

        elif not si.is_valid(v) or si.is_misc(v):
            boundary=True

        elif si.is_vowel(v):
            # an anusvaar stays attached to the vowel when it ends the word
            # or is not followed by a plosive
            anusvaar_next= has_next and si.is_anusvaar(p_vectors[i+1])
            keep_together= anusvaar_next and \
                    (i+2==n or not si.is_plosive(p_vectors[i+2]))
            boundary= not keep_together

        elif has_next and (si.is_consonant(v) or si.is_nukta(v)):
            nxt=p_vectors[i+1]
            if si.is_consonant(nxt):
                boundary=True
            elif si.is_vowel(nxt) and not si.is_dependent_vowel(nxt):
                boundary=True
            elif si.is_anusvaar(nxt):
                keep_together= i+2==n or not si.is_plosive(p_vectors[i+2])
                boundary= not keep_together
            else:
                boundary=False

        else:
            boundary=False

        if boundary:
            syllables.append(' ')
            syllables_mask.append('0')

    syllables_mask=''.join(syllables_mask)
    syllables=''.join(syllables)

    #assert len(syllables_mask) == len(syllables)
    #assert syllables_mask.find('01') == -1
    if syllables_mask.find('01') >= 0:
        print('Warning')

    if lang=='ml':
        syllables = denormalize_malayalam(syllables,syllables_mask)
    elif lang=='pa':
        syllables = denormalize_punjabi(syllables,syllables_mask)

    syllables_list = syllables.strip().split(' ')
    return(char_backoff(syllables_list,vocab))
|
212 |
+
|
213 |
+
def orthographic_syllabify(word,lang,vocab=None):
    """
    Split `word` into orthographic syllables.

    Syllable boundaries are marked with spaces using the phonetic feature
    vector of each character, then the result is split on spaces.

    word: string to syllabify
    lang: language code of `word`
    vocab: optional container of known syllables; see `char_backoff`

    Returns the list produced by `char_backoff`.
    """

    p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]
    n=len(word)

    syllables=[]

    for i,(ch,v) in enumerate(zip(word,p_vectors)):

        syllables.append(ch)

        has_next= i+1<n

        if has_next and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            # next character is not part of the phonetic inventory
            boundary=True

        elif not si.is_valid(v) or si.is_misc(v):
            boundary=True

        elif si.is_vowel(v):
            # an anusvaar stays attached to the vowel when it ends the word
            # or is not followed by a plosive
            anusvaar_next= has_next and si.is_anusvaar(p_vectors[i+1])
            keep_together= anusvaar_next and \
                    (i+2==n or not si.is_plosive(p_vectors[i+2]))
            boundary= not keep_together

        elif has_next and (si.is_consonant(v) or si.is_nukta(v)):
            nxt=p_vectors[i+1]
            if si.is_consonant(nxt):
                boundary=True
            elif si.is_vowel(nxt) and not si.is_dependent_vowel(nxt):
                boundary=True
            elif si.is_anusvaar(nxt):
                keep_together= i+2==n or not si.is_plosive(p_vectors[i+2])
                boundary= not keep_together
            else:
                boundary=False

        else:
            boundary=False

        if boundary:
            syllables.append(' ')

    syllables_list = ''.join(syllables).strip().split(' ')
    return(char_backoff(syllables_list,vocab))
|
276 |
+
|
277 |
+
def orthographic_simple_syllabify(word,lang,vocab=None):
    """
    Split `word` into orthographic syllables with the simplified rule set.

    A boundary is placed after a character when
    - the next character is invalid or miscellaneous, or
    - the character itself is invalid, miscellaneous or a vowel, or
    - the character is a consonant or nukta and the next one is a consonant
      or an anusvaar.

    word: string to syllabify
    lang: language code of `word`
    vocab: optional container of known syllables; see `char_backoff`

    Returns the list produced by `char_backoff`.
    """

    p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]
    n=len(word)

    syllables=[]

    for i,(ch,v) in enumerate(zip(word,p_vectors)):

        syllables.append(ch)

        has_next= i+1<n

        ## simplified syllabification
        next_is_foreign= has_next and \
                (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1]))

        if next_is_foreign \
                or not si.is_valid(v) or si.is_misc(v) or si.is_vowel(v) \
                or (has_next and \
                    (si.is_consonant(v) or si.is_nukta(v)) and \
                    (si.is_consonant(p_vectors[i+1]) or si.is_anusvaar(p_vectors[i+1]))):
            syllables.append(' ')

    syllables_list = ''.join(syllables).strip().split(' ')
    return(char_backoff(syllables_list,vocab))
|
indic_nlp_library/indicnlp/test/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/test/unit/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/tokenize/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/tokenize/indic_detokenize.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
#Program for detokenizing Indian language input
|
10 |
+
#
|
11 |
+
# @author Anoop Kunchukuttan
|
12 |
+
#
|
13 |
+
"""
|
14 |
+
De-tokenizer for Indian languages.
|
15 |
+
"""
|
16 |
+
|
17 |
+
import string, re, sys
|
18 |
+
from indicnlp.common import IndicNlpException
|
19 |
+
|
20 |
+
## detokenizer patterns

# punctuation that attaches to the token on its left: the space before it is dropped
left_attach=r'!%)\]},.:;>?\u0964\u0965'
pat_la=re.compile(r'[ ](['+left_attach+r'])')

# punctuation that attaches to the token on its right: the space after it is dropped
right_attach=r'#$(\[{<@'
pat_ra=re.compile(r'(['+right_attach+r'])[ ]')

# punctuation that attaches on both sides: both surrounding spaces are dropped
lr_attach=r'-/\\'
pat_lra=re.compile(r'[ ](['+lr_attach+r'])[ ]')

#donknow=u'&*+=^_|~'

## date, numbers, section/article numbering
## TODO: handle indic numbers
pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
|
35 |
+
|
36 |
+
### e-mail address
|
37 |
+
#pat_num=re.compile(ur'[a-zA-Z]+[ ]?
|
38 |
+
|
39 |
+
def trivial_detokenize_indic(text):
    """detokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial detokenizer which:

     - removes the spaces inside number sequences (dates, numbers, section numbering)
     - attaches punctuation to the left, to the right or on both sides
     - attaches quote characters alternately to the right and to the left,
       assuming the quotes in the text are well formed

    Args:
        text (str): tokenized text to process

    Returns:
        str: detokenized string
    """

    ### some normalizations

    #numbers and dates
    s=pat_num_seq.sub(lambda m: m.group(0).replace(' ',''),text)

    ### consective single quotes or backslashes become double quotes
    #s=s.replace("' '", "''")
    #s=s.replace("` `", '``')

    s=pat_lra.sub('\\1',s)
    s=pat_la.sub('\\1',s)
    s=pat_ra.sub('\\1',s)

    # assumes well formedness of quotes and alternates between right and left attach

    for punc in '\'"`':
        # 1st, 3rd, ... occurrence opens (right attach), 2nd, 4th, ... closes (left attach)
        pieces=s.split(punc)
        marked=[pieces[0]]
        for seen,piece in enumerate(pieces[1:]):
            marked.append('@RA' if seen%2==0 else '@LA')
            marked.append(piece)

        s=''.join(marked).replace('@RA ',punc).replace(' @LA',punc
                ).replace('@RA',punc).replace('@LA',punc)

    return s
|
100 |
+
|
101 |
+
def trivial_detokenize(text,lang='hi'):
    """detokenize string for languages of the Indian subcontinent

    Delegates to `trivial_detokenize_indic` for every language except Urdu
    ('ur'), for which no detokenizer is available.

    Args:
        text (str): tokenized text to process
        lang (str): language code of the text, 'hi' by default

    Returns:
        str: detokenized string

    Raises:
        IndicNlpException: If `lang` is 'ur'
    """
    if lang!='ur':
        return trivial_detokenize_indic(text)

    raise IndicNlpException('No detokenizer available for Urdu')
|
123 |
+
|
124 |
+
# if __name__ == '__main__':
|
125 |
+
|
126 |
+
# if len(sys.argv)<4:
|
127 |
+
# print("Usage: python indic_detokenize.py <infile> <outfile> <language>")
|
128 |
+
# sys.exit(1)
|
129 |
+
|
130 |
+
# with open(sys.argv[1],'r', encoding='utf-8') as ifile:
|
131 |
+
# with open(sys.argv[2],'w', encoding='utf-8') as ofile:
|
132 |
+
# for line in ifile:
|
133 |
+
# detokenized_line=trivial_detokenize(line,sys.argv[3])
|
134 |
+
# ofile.write(detokenized_line)
|
indic_nlp_library/indicnlp/tokenize/indic_tokenize.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
#Program for tokenizing Indian language input
|
10 |
+
#
|
11 |
+
# @author Anoop Kunchukuttan
|
12 |
+
#
|
13 |
+
"""
|
14 |
+
Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
|
15 |
+
are supported (see `trivial_tokenize`). Major Indian language punctuations are
|
16 |
+
handled.
|
17 |
+
"""
|
18 |
+
import string, re, sys
|
19 |
+
|
20 |
+
from indicnlp.common import IndicNlpException
|
21 |
+
|
22 |
+
### tokenizer patterns

# one punctuation character: ASCII punctuation plus danda and double danda
triv_tokenizer_indic_pat=re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')

# one punctuation character: ASCII punctuation plus Arabic-script punctuation
triv_tokenizer_urdu_pat=re.compile(r'(['+string.punctuation+r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'+r'])')

## date, numbers, section/article numbering
pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
|
28 |
+
|
29 |
+
def trivial_tokenize_indic(text):
    """tokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial tokenizer which splits on punctuation boundaries, including the
    danda and double danda. Tabs are treated as spaces, and number sequences
    such as dates are kept together as a single token.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens

    """
    # surround every punctuation character with spaces, then collapse space runs
    spaced=triv_tokenizer_indic_pat.sub(r' \1 ',text.replace('\t',' '))
    s=re.sub(r'[ ]+',' ',spaced).strip(' ')

    # do not tokenize numbers and dates: glue each number sequence back together
    s=pat_num_seq.sub(lambda m: m.group(0).replace(' ',''),s)

    return s.split(' ')
|
64 |
+
|
65 |
+
def trivial_tokenize_urdu(text):
    """tokenize Urdu string

    A trivial tokenizer which splits on punctuation boundaries, using the
    characters matched by `triv_tokenizer_urdu_pat`. Tabs are treated as
    spaces.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    """
    # surround every punctuation character with spaces, then collapse space runs
    spaced=triv_tokenizer_urdu_pat.sub(r' \1 ',text.replace('\t',' '))
    collapsed=re.sub(r'[ ]+',' ',spaced)
    return collapsed.strip(' ').split(' ')
|
81 |
+
|
82 |
+
def trivial_tokenize(text,lang='hi'):
    """trivial tokenizer for Indian languages

    Splits the text on punctuation boundaries. Urdu ('ur') is handled by
    `trivial_tokenize_urdu`; every other language is handled by
    `trivial_tokenize_indic`.

    Args:
        text (str): text to tokenize
        lang (str): language code of the text, 'hi' by default

    Returns:
        list: list of tokens
    """
    tokenizer=trivial_tokenize_urdu if lang=='ur' else trivial_tokenize_indic
    return tokenizer(text)
|
100 |
+
|
101 |
+
# if __name__ == '__main__':
|
102 |
+
|
103 |
+
# if len(sys.argv)<4:
|
104 |
+
# print("Usage: python indic_tokenize.py <infile> <outfile> <language>")
|
105 |
+
# sys.exit(1)
|
106 |
+
|
107 |
+
# with open(sys.argv[1],'r', encoding='utf-8') as ifile:
|
108 |
+
# with open(sys.argv[2],'w', encoding='utf-8') as ofile:
|
109 |
+
# for line in ifile:
|
110 |
+
# tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
|
111 |
+
# ofile.write(tokenized_line)
|
indic_nlp_library/indicnlp/tokenize/sentence_tokenize.py
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
#Program for sentence splitting of Indian language input
|
10 |
+
#
|
11 |
+
# @author Anoop Kunchukuttan
|
12 |
+
#
|
13 |
+
"""
|
14 |
+
Sentence splitter for Indian languages. Contains a rule-based
|
15 |
+
sentence splitter that can understand common non-breaking phrases
|
16 |
+
in many Indian languages.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import re
|
20 |
+
|
21 |
+
from indicnlp.transliterate import unicode_transliterate
|
22 |
+
from indicnlp import langinfo
|
23 |
+
|
24 |
+
|
25 |
+
## delimiters for languages which use the danda as sentence delimiter:
## the period is not a sentence delimiter here
DELIM_PAT_DANDA=re.compile(r'[\?!\u0964\u0965]')

## delimiters for languages which don't use the danda: the period is included
DELIM_PAT_NO_DANDA=re.compile(r'[\.\?!\u0964\u0965]')

## pattern to check for presence of danda in text
CONTAINS_DANDA=re.compile(r'[\u0964\u0965]')
|
34 |
+
|
35 |
+
# Non-breaking phrases in Devanagari. Built once at import time instead of on
# every call of is_acronym_abbvr.
_ACK_CHARS = frozenset({
     ## acronym for latin characters
      'ए', 'ऎ',
      'बी', 'बि',
      'सी', 'सि',
      'डी', 'डि',
      'ई', 'इ',
      'एफ', 'ऎफ',
      'जी', 'जि',
      'एच','ऎच',
      'आई', 'आइ','ऐ',
      'जे', 'जॆ',
      'के', 'कॆ',
      'एल', 'ऎल',
      'एम','ऎम',
      'एन','ऎन',
      'ओ', 'ऒ',
      'पी', 'पि',
      'क्यू', 'क्यु',
      'आर',
      'एस','ऎस',
      'टी', 'टि',
      'यू', 'यु',
      'वी', 'वि', 'व्ही', 'व्हि',
      'डब्ल्यू', 'डब्ल्यु',
      'एक्स','ऎक्स',
      'वाय',
      'जेड', 'ज़ेड',
    ##  add halant to the previous English character mappings.
     'एफ्',
     'ऎफ्',
     'एच्',
     'ऎच्',
     'एल्',
     'ऎल्',
     'एम्',
     'ऎम्',
     'एन्',
     'ऎन्',
     'आर्',
     'एस्',
     'ऎस्',
     'एक्स्',
     'ऎक्स्',
     'वाय्',
     'जेड्', 'ज़ेड्',

    #Indic vowels
        'ऄ',
        'अ',
        'आ',
        'इ',
        'ई',
        'उ',
        'ऊ',
        'ऋ',
        'ऌ',
        'ऍ',
        'ऎ',
        'ए',
        'ऐ',
        'ऑ',
        'ऒ',
        'ओ',
        'औ',
        'ॠ',
        'ॡ',

    #Indic consonants
        'क',
        'ख',
        'ग',
        'घ',
        'ङ',
        'च',
        'छ',
        'ज',
        'झ',
        'ञ',
        'ट',
        'ठ',
        'ड',
        'ढ',
        'ण',
        'त',
        'थ',
        'द',
        'ध',
        'न',
        'ऩ',
        'प',
        'फ',
        'ब',
        'भ',
        'म',
        'य',
        'र',
        'ऱ',
        'ल',
        'ळ',
        'ऴ',
        'व',
        'श',
        'ष',
        'स',
        'ह',

     ## abbreviation
     'श्री',
     'डॉ',
     'कु',
     'चि',
     'सौ',
    })

def is_acronym_abbvr(text,lang):
    """Is the text a non-breaking phrase

    The text is transliterated from `lang` to Hindi ('hi') with
    `UnicodeIndicTransliterator.transliterate` and looked up in a fixed set of
    Devanagari strings: spelled-out Latin letters (with and without a final
    halant), single Indic vowels and consonants, and a few abbreviations.

    Args:
        text (str): text to check for non-breaking phrase
        lang (str): language code of the text (e.g. 'hi')

    Returns:
        boolean: true if `text` is a non-breaking phrase
    """

    return unicode_transliterate.UnicodeIndicTransliterator.transliterate(text,lang,'hi') in _ACK_CHARS
|
162 |
+
|
163 |
+
def sentence_split(text, lang, delim_pat='auto'):
    """Split the text into sentences.

    A rule-based sentence splitter for Indian languages written in
    Brahmi-derived scripts. The text is split at sentence delimiter
    boundaries. The delimiters can be configured by passing appropriate
    parameters.

    The sentence splitter can identify non-breaking phrases like
    single letters and common abbreviations/honorifics for some Indian
    languages.

    Args:
        text (str): text to split into sentences.
        lang (str): language code; it is handed to
            ``langinfo.is_danda_delim`` and ``is_acronym_abbvr``.
        delim_pat: sentence delimiter pattern. One of:

            * ``'auto'`` (default): the pattern is chosen automatically
              based on the language and the text.
            * a compiled regular expression object.
            * a regular expression given as a string; it is compiled here.

            The split is made immediately after the first character of
            each match, so the pattern is expected to match single
            delimiter characters.

    Returns:
        list: list of sentences identified from the input text.
    """
    # Local import so the function does not depend on the module header.
    import re

    if delim_pat == 'auto':
        # In modern texts a period may be used as the delimiter instead of
        # the danda. Use the danda pattern only if the language is
        # danda-delimited AND the text contains at least one danda.
        if (langinfo.is_danda_delim(lang)
                and CONTAINS_DANDA.search(text) is not None):
            delim_pat = DELIM_PAT_DANDA
        else:
            delim_pat = DELIM_PAT_NO_DANDA
    elif isinstance(delim_pat, str):
        # The caller supplied the pattern as a plain string: compile it so
        # that finditer()/search() below work.
        delim_pat = re.compile(delim_pat)
    # Otherwise, assume the caller passed a compiled delimiter pattern.

    ### Phase 1: break on sentence delimiters.
    text = text.strip()
    cand_sentences = []
    begin = 0
    for mo in delim_pat.finditer(text):
        p1 = mo.start()

        # A delimiter directly preceded by a numeric character (e.g. the
        # period in "3.14") is not treated as a sentence boundary.
        if p1 > 0 and text[p1 - 1].isnumeric():
            continue

        s = text[begin:p1 + 1].strip()
        if s:
            cand_sentences.append(s)
        begin = p1 + 1

    # Whatever follows the last delimiter is the final candidate.
    s = text[begin:].strip()
    if s:
        cand_sentences.append(s)

    if not delim_pat.search('.'):
        # Phase 2 is needed only if the delimiter pattern matches a period.
        return cand_sentences

    ### Phase 2: '.' may not always be a sentence delimiter.
    ### Method: a run of candidates that consist of a single word ending in
    ### '.', or that end in a known acronym/abbreviation, is merged with the
    ### candidate that follows the run (single-word candidates are also
    ### merged with the candidate preceding them).
    final_sentences = []
    sen_buffer = ''
    bad_state = False  # True while the buffer ends in a non-breaking '.'

    for sentence in cand_sentences:
        words = sentence.split(' ')
        ends_with_period = sentence.endswith('.')

        if len(words) == 1 and ends_with_period:
            # Single token followed by '.', e.g. an initial: keep buffering.
            # Avoid a leading space when the buffer is still empty.
            sen_buffer = sen_buffer + ' ' + sentence if sen_buffer else sentence
            bad_state = True
        elif ends_with_period and is_acronym_abbvr(words[-1][:-1], lang):
            if bad_state:
                # Already inside a non-breaking run: extend the buffer
                # rather than overwriting it, so no text is dropped.
                sen_buffer = sen_buffer + ' ' + sentence
            else:
                # The buffered sentence is complete; emit it and start a
                # new buffer with this abbreviation-terminated candidate.
                if sen_buffer:
                    final_sentences.append(sen_buffer)
                sen_buffer = sentence
            bad_state = True
        elif bad_state:
            # This candidate closes the non-breaking run.
            final_sentences.append(sen_buffer + ' ' + sentence)
            sen_buffer = ''
            bad_state = False
        else:
            # Good state: emit the previous sentence, buffer this one.
            if sen_buffer:
                final_sentences.append(sen_buffer)
            sen_buffer = sentence
            bad_state = False

    if sen_buffer:
        final_sentences.append(sen_buffer)

    return final_sentences
|