musfiqdehan
committed on
Commit
·
4a52b88
1
Parent(s):
3f01399
Add word alignment mapping functions
Browse files- helper/__init__.py +0 -0
- helper/alignment_mappers.py +125 -0
- helper/pos_taggers.py +167 -0
- helper/text_preprocess.py +165 -0
- helper/translators.py +141 -0
helper/__init__.py
ADDED
File without changes
|
helper/alignment_mappers.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This module contains the helper functions to get the word alignment mapping between two sentences.
|
3 |
+
"""
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import itertools
|
7 |
+
import transformers
|
8 |
+
from transformers import logging
|
9 |
+
|
10 |
+
# Set the verbosity to error, so that the warning messages are not printed.
# (A preceding set_verbosity_warning() call was redundant: it was overridden
# immediately by the call below.)
logging.set_verbosity_error()
|
13 |
+
|
14 |
+
|
15 |
+
# Display label -> model identifier. Labels that are not listed here are
# passed through unchanged by select_model().
_MODEL_LABEL_TO_ID = {
    "Google-mBERT (Base-Multilingual)": "bert-base-multilingual-cased",
    "Neulab-AwesomeAlign (Bn-En-0.5M)": "musfiqdehan/bn-en-word-aligner",
    "BUET-BanglaBERT (Large)": "csebuetnlp/banglabert_large",
    "SagorSarker-BanglaBERT (Base)": "sagorsarker/bangla-bert-base",
    "SentenceTransformers-LaBSE (Multilingual)": "sentence-transformers/LaBSE",
}


def select_model(model_name):
    """
    Resolve a model display label to its model identifier.

    Args:
        model_name: One of the known display labels, or any other string.

    Returns:
        The identifier mapped to the label, or `model_name` itself when the
        label is not known (so an already-resolved identifier is returned
        unchanged and the function is safe to apply twice).
    """
    return _MODEL_LABEL_TO_ID.get(model_name, model_name)
|
31 |
+
|
32 |
+
|
33 |
+
def get_alignment_mapping(source="", target="", model_name="",
                          align_layer=8, threshold=1e-3):
    """
    Get Aligned Words

    Splits both sentences on whitespace, encodes them with the selected BERT
    model and aligns sub-word tokens whose similarity passes `threshold` in
    both directions (source->target and target->source softmax).

    Args:
        source: Source sentence (whitespace-separated words).
        target: Target sentence (whitespace-separated words).
        model_name: Display label or model identifier; resolved through
            select_model() and then handed to `from_pretrained`.
        align_layer: Index into the model's hidden states used for the
            similarity computation (default 8, the previously fixed value).
        threshold: Minimum softmax probability, required in both directions,
            for a sub-word pair to count as aligned (default 1e-3).

    Returns:
        A tuple `(sent_src, sent_tgt, align_words)` where the first two are
        the whitespace-split word lists and `align_words` is a set of
        `(source_word_index, target_word_index)` pairs of ints.
    """
    model_name = select_model(model_name)

    # NOTE(review): model and tokenizer are loaded on every call; consider
    # caching at the call site if this becomes a bottleneck.
    model = transformers.BertModel.from_pretrained(model_name)
    tokenizer = transformers.BertTokenizer.from_pretrained(model_name)

    # pre-processing
    sent_src, sent_tgt = source.strip().split(), target.strip().split()

    def _encode(words):
        """Return (input_ids tensor, sub-word index -> word index list)."""
        tokens_per_word = [tokenizer.tokenize(word) for word in words]
        ids_per_word = [tokenizer.convert_tokens_to_ids(tokens)
                        for tokens in tokens_per_word]
        input_ids = tokenizer.prepare_for_model(
            list(itertools.chain.from_iterable(ids_per_word)),
            return_tensors='pt',
            model_max_length=tokenizer.model_max_length,
            truncation=True,
        )['input_ids']
        # One entry per sub-word token, holding the index of its word.
        sub2word_map = [word_idx
                        for word_idx, tokens in enumerate(tokens_per_word)
                        for _ in tokens]
        return input_ids, sub2word_map

    ids_src, sub2word_map_src = _encode(sent_src)
    ids_tgt, sub2word_map_tgt = _encode(sent_tgt)

    # alignment
    model.eval()

    with torch.no_grad():
        # Element 2 of the model output is the tuple of hidden states;
        # [0, 1:-1] takes the single batch item and drops the first and last
        # positions (presumably the special tokens added by
        # prepare_for_model -- TODO confirm).
        out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[
            2][align_layer][0, 1:-1]
        out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[
            2][align_layer][0, 1:-1]

        # Rows index source sub-words, columns index target sub-words.
        dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))

        softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
        softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)

        # Keep only pairs that pass the threshold in both directions.
        softmax_inter = (softmax_srctgt > threshold) * \
            (softmax_tgtsrc > threshold)

    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)

    # tolist() yields plain Python ints, so list indexing does not depend on
    # tensor-to-index conversion.
    align_words = {
        (sub2word_map_src[i], sub2word_map_tgt[j])
        for i, j in align_subwords.tolist()
    }

    return sent_src, sent_tgt, align_words
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
def get_word_mapping(source="", target="", model_name=""):
    """
    Get Word Aligned Mapping Words

    Runs get_alignment_mapping() and renders every aligned pair, in sorted
    index order, as the string `bn:(<source word>) -> en:(<target word>)`.

    Returns:
        A list of the rendered strings.
    """
    src_words, tgt_words, pairs = get_alignment_mapping(
        source=source, target=target, model_name=model_name)

    return [
        f'bn:({src_words[src_idx]}) -> en:({tgt_words[tgt_idx]})'
        for src_idx, tgt_idx in sorted(pairs)
    ]
|
107 |
+
|
108 |
+
|
109 |
+
|
110 |
+
def get_word_index_mapping(source="", target="", model_name=""):
    """
    Get Word Aligned Mapping Index

    Runs get_alignment_mapping() and renders every aligned pair, in sorted
    index order, as the string `bn:(<source index>) -> en:(<target index>)`.

    Returns:
        A list of the rendered strings.
    """
    # Only the index pairs are needed here; the word lists are discarded.
    _, _, pairs = get_alignment_mapping(
        source=source, target=target, model_name=model_name)

    return [f'bn:({src_idx}) -> en:({tgt_idx})'
            for src_idx, tgt_idx in sorted(pairs)]
|
123 |
+
|
124 |
+
|
125 |
+
|
helper/pos_taggers.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This module contains the functions to get PoS tags using spaCy, NLTK, Flair or TextBlob and return an HTML table
|
3 |
+
"""
|
4 |
+
|
5 |
+
from .alignment_mappers import get_alignment_mapping, select_model
|
6 |
+
|
7 |
+
from flair.models import SequenceTagger
|
8 |
+
from flair.data import Sentence
|
9 |
+
|
10 |
+
import spacy
|
11 |
+
from spacy.cli import download
|
12 |
+
download("en_core_web_sm")
|
13 |
+
import en_core_web_sm
|
14 |
+
|
15 |
+
import nltk
|
16 |
+
nltk.download('punkt')
|
17 |
+
nltk.download('averaged_perceptron_tagger')
|
18 |
+
|
19 |
+
from textblob import TextBlob
|
20 |
+
|
21 |
+
|
22 |
+
def get_spacy_postag_dict(target=""):
    '''
    Get spacy pos tags

    Loads the `en_core_web_sm` pipeline, runs it over `target` and returns a
    dict mapping each token's text to its `tag_` value. When the same token
    text occurs more than once, the last occurrence wins.
    '''
    doc = en_core_web_sm.load()(target)
    return {tok.text: tok.tag_ for tok in doc}
|
31 |
+
|
32 |
+
def get_nltk_postag_dict(target=""):
    '''
    Get nltk pos tags

    Tokenizes `target` with `nltk.tokenize.word_tokenize`, tags the tokens
    with `nltk.pos_tag` and returns a dict mapping token -> tag. When the
    same token occurs more than once, the last occurrence wins.
    '''
    tokens = nltk.tokenize.word_tokenize(target)
    return dict(nltk.pos_tag(tokens))
|
40 |
+
|
41 |
+
def get_flair_postag_dict(target=""):
    '''
    Get flair pos tags

    Loads the flair "pos" SequenceTagger, runs it over a Sentence built from
    `target` and returns a dict mapping each token's text to its `tag`.
    When the same token text occurs more than once, the last occurrence wins.
    '''
    pos_tagger = SequenceTagger.load("pos")
    sentence = Sentence(target)
    pos_tagger.predict(sentence)  # annotates `sentence` in place
    return {tok.text: tok.tag for tok in sentence}
|
51 |
+
|
52 |
+
def get_textblob_postag_dict(target=""):
    '''
    Get textblob pos tags

    Builds a TextBlob from `target` and returns its `tags` as a dict
    (word -> tag). When the same word occurs more than once, the last
    occurrence wins.
    '''
    return dict(TextBlob(target).tags)
|
59 |
+
|
60 |
+
def get_postag(
        get_postag_dict,
        source="",
        target="",
        model_name="musfiqdehan/bn-en-word-aligner"):
    """
    Align `source` with `target`, tag the target words and return an HTML table.

    Args:
        get_postag_dict: Callable accepting `target=<sentence>` and returning
            a dict mapping target words to PoS tags.
        source: Source (Bangla) sentence.
        target: Target (English) sentence.
        model_name: Model label or identifier forwarded to
            get_alignment_mapping().

    Returns:
        A tuple `(html_table, pos_accuracy)`. `html_table` has one row per
        aligned word pair followed by one `N/A` / `UNK` row per distinct
        source word that was not aligned. `pos_accuracy` is the share of
        source words not counted as unaligned, formatted like `"87.50%"`
        (`"0.00%"` when the source sentence has no words).
    """
    # Local import so the block does not depend on a file-level import.
    from html import escape

    sent_src, sent_tgt, align_words = get_alignment_mapping(
        source=source, target=target, model_name=model_name
    )
    postag_dict = get_postag_dict(target=target)

    # Substring test against this string: a word counts as punctuation when
    # it occurs inside it (same check as before, hoisted out of the loop).
    punc = r"""!()-[]{}।;:'"\,<>./?@#$%^&*_~"""

    def _row(bangla, english, tag):
        """Render one table row; every cell is HTML-escaped."""
        return f'''
            <tr>
                <td> {escape(str(bangla))} </td>
                <td> {escape(str(english))} </td>
                <td> {escape(str(tag))} </td>
            </tr>
        '''

    # <tbody> is opened exactly once here. Previously it was opened inside
    # every punctuation row (and never when there was none) but closed once,
    # which produced malformed markup.
    parts = ['''
    <table>
        <thead>
            <th>Bangla</th>
            <th>English</th>
            <th>PoS Tags</th>
        </thead>
        <tbody>
    ''']

    mapped_sent_src = set()

    for i, j in sorted(align_words):
        src_word, tgt_word = sent_src[i], sent_tgt[j]
        mapped_sent_src.add(src_word)

        if src_word in punc or tgt_word in punc:
            tag = "PUNC"
        else:
            # The tagger may tokenise differently from the whitespace split
            # used for alignment, so the word can be missing from the dict.
            tag = postag_dict.get(tgt_word, "UNK")

        parts.append(_row(src_word, tgt_word, tag))

    # Distinct unaligned source words, in sentence order (deterministic).
    unks = list(dict.fromkeys(
        word for word in sent_src if word not in mapped_sent_src))
    for word in unks:
        parts.append(_row(word, "N/A", "UNK"))

    parts.append('''
        </tbody>
    </table>
    ''')
    html_table = "".join(parts)

    # Guard against an empty source sentence (would divide by zero).
    if sent_src:
        pos_accuracy = (len(sent_src) - len(unks)) / len(sent_src)
    else:
        pos_accuracy = 0.0
    pos_accuracy = f"{pos_accuracy:0.2%}"

    return html_table, pos_accuracy
|
127 |
+
|
128 |
+
|
129 |
+
def select_pos_tagger(src, tgt, model_name, tagger):
    '''
    Select the PoS tagger

    Resolves `model_name` through select_model(), then runs get_postag()
    with the tag-dictionary function that matches `tagger` ("spaCy", "NLTK",
    "Flair" or "TextBlob").

    Returns the `(result, pos_accuracy)` pair from get_postag(), or
    `(None, None)` when `tagger` is none of the names above.
    '''
    model_name = select_model(model_name)

    postag_dict_getters = {
        "spaCy": get_spacy_postag_dict,
        "NLTK": get_nltk_postag_dict,
        "Flair": get_flair_postag_dict,
        "TextBlob": get_textblob_postag_dict,
    }

    # Guard clause: unknown tagger names yield no result.
    if tagger not in postag_dict_getters:
        return None, None

    result, pos_accuracy = get_postag(
        postag_dict_getters[tagger],
        source=src,
        target=tgt,
        model_name=model_name,
    )
    return result, pos_accuracy
|
helper/text_preprocess.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file contains functions for text preprocessing
|
3 |
+
"""
|
4 |
+
|
5 |
+
import re
|
6 |
+
|
7 |
+
|
8 |
+
# Whole-word contractions (lower-case keys) and their expansions.
# Built once at import time instead of on every call.
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Fallback suffix rewrites, applied to the whole sentence in this order after
# the whole-word pass (they catch e.g. words with trailing punctuation).
_CONTRACTION_SUFFIXES = (
    ("'ve", " have"),
    ("n't", " not"),
    ("'re", " are"),
    ("'ll", " will"),
    ("'d", " would"),
    ("'s", " is"),
    ("'m", " am"),
)


def _expand_contraction(word):
    """Expand one whitespace-delimited word, keeping its capitalisation."""
    expansion = _CONTRACTIONS.get(word.lower())
    if expansion is None:
        return word
    if len(word) > 1 and word.isupper():
        return expansion.upper()
    if word[0].isupper():
        return expansion[0].upper() + expansion[1:]
    return expansion


def decontracting_words(sentence):
    """
    Decontracting words (e.g. I'm -> I am, I've -> I have, etc.)
    https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    https://stackoverflow.com/a/19794953

    Each whitespace-separated word is looked up case-insensitively in the
    contraction table, so "Can't" becomes "Can not" and "Won't" becomes
    "Will not" (a case-sensitive lookup used to miss these and the suffix
    fallback then produced "Ca not" / "Wo not"). The words are re-joined
    with single spaces, after which the generic suffix rewrites are applied.

    Note: the "'s" fallback also rewrites possessives ("John's" becomes
    "John is"); that long-standing behaviour is unchanged.

    Args:
        sentence: Input text.

    Returns:
        The text with contractions expanded and runs of whitespace collapsed
        to single spaces.
    """
    sentence = ' '.join(_expand_contraction(word) for word in sentence.split())

    for suffix, replacement in _CONTRACTION_SUFFIXES:
        sentence = sentence.replace(suffix, replacement)

    return sentence
|
152 |
+
|
153 |
+
|
154 |
+
|
155 |
+
def space_punc(line):
    """
    Add a space before and after a punctuation mark
    and remove more than one space

    The handled marks are: . , : ; - । ! ? " ( ) '
    Runs of two or more whitespace characters are then collapsed to a single
    space. The result is not stripped, so text ending in a punctuation mark
    keeps one trailing space.

    Example:
        space_punc('bla. bla? "bla"bla.bla! bla...')
        returns 'bla . bla ? " bla " bla . bla ! bla . . . '
    """
    # Raw strings: the former plain literals relied on the invalid escape
    # sequences "\-" and "\s", which newer Python versions warn about.
    line = re.sub(r"""([.,:;\-।!?"()'])""", r" \1 ", line)
    line = re.sub(r"\s{2,}", " ", line)
    return line
|
helper/translators.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file contains the functions to translate the text from one language to another.
|
3 |
+
"""
|
4 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
5 |
+
from deep_translator import GoogleTranslator, MyMemoryTranslator, MicrosoftTranslator, YandexTranslator, ChatGptTranslator
|
6 |
+
from .text_preprocess import decontracting_words, space_punc
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
import os
|
9 |
+
|
10 |
+
|
11 |
+
# Load the environment variables from the .env file
|
12 |
+
load_dotenv()
|
13 |
+
|
14 |
+
# Translators API Keys
|
15 |
+
MICROSOFT_API_KEY = os.getenv("MICROSOFT_TRANSLATOR_KEY")
|
16 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
17 |
+
YANDEX_API_KEY = os.getenv("YANDEX_API_KEY")
|
18 |
+
|
19 |
+
# Digit Translation
|
20 |
+
digit_converter = {
    '০': '0',
    '১': '1',
    '২': '2',
    '৩': '3',
    '৪': '4',
    '৫': '5',
    '৬': '6',
    '৭': '7',
    '৮': '8',
    '৯': '9',
}

# Translation table derived once from digit_converter for str.translate().
_DIGIT_TRANSLATION_TABLE = str.maketrans(digit_converter)


def get_translated_digit(sentence):
    """
    Translate the digits from Bengali to English

    Every character that is a key of `digit_converter` is replaced by its
    mapped ASCII digit; all other characters are kept unchanged.

    Args:
        sentence: Input string.

    Returns:
        The string with Bengali digits replaced.
    """
    return sentence.translate(_DIGIT_TRANSLATION_TABLE)
|
48 |
+
|
49 |
+
# Bangla to English Translation (BUET BanglaNMT)
|
50 |
+
translation_model_bn_en = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
|
51 |
+
translation_tokenizer_bn_en = AutoTokenizer.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
|
52 |
+
|
53 |
+
def banglanmt_translation(input_text):
    """
    Translate a sentence from Bengali to English using BUET BanglaNMT

    Tokenizes `input_text` with the module-level tokenizer, generates with
    the module-level seq2seq model and decodes the first generated sequence
    with special tokens skipped.
    """
    encoded = translation_tokenizer_bn_en(input_text, return_tensors="pt")
    generated = translation_model_bn_en.generate(**encoded)
    return translation_tokenizer_bn_en.decode(
        generated[0], skip_special_tokens=True)
|
61 |
+
|
62 |
+
def google_translation(sentence: str, source="bn", target="en") -> str:
    """
    Translate a sentence from one language to another using Google Translator.

    Requires the `deep-translator` package (`pip install -U deep-translator`).

    Args:
        sentence: Text to translate.
        source: Source language code.
        target: Target language code.

    Returns:
        The translated text.
    """
    # The language pair is configured on the translator object; passing
    # source/target to translate() (as was done before) left the constructor
    # defaults in effect.
    translator = GoogleTranslator(source=source, target=target)
    return translator.translate(sentence)
|
72 |
+
|
73 |
+
def microsoft_translation(sentence: str, source="bn", target="en") -> str:
    """
    Translate a sentence from one language to another using Microsoft Translator.

    Requires the `deep-translator` package (`pip install -U deep-translator`)
    and the module-level MICROSOFT_API_KEY.

    Args:
        sentence: Text to translate.
        source: Source language code.
        target: Target language code.

    Returns:
        The translated text.
    """
    # Honour the source/target arguments; previously `source` was ignored
    # and the target was hard-coded to 'en'.
    translator = MicrosoftTranslator(
        api_key=MICROSOFT_API_KEY, source=source, target=target)
    return translator.translate(sentence)
|
82 |
+
|
83 |
+
def chatgpt_translation(sentence: str, source="bn", target="en") -> str:
    """
    Translate a sentence into `target` using ChatGPT Translator.

    Requires the `deep-translator` package (`pip install -U deep-translator`)
    and the module-level OPENAI_API_KEY. The `source` argument is accepted
    but not forwarded to the translator.
    """
    return ChatGptTranslator(
        api_key=OPENAI_API_KEY, target=target).translate(sentence)
|
92 |
+
|
93 |
+
def yandex_translation(sentence: str, source="bn", target="en") -> str:
    """
    Translate a sentence from one language to another using Yandex Translator.

    Requires the `deep-translator` package (`pip install -U deep-translator`)
    and the module-level YANDEX_API_KEY.

    Args:
        sentence: Text to translate.
        source: Source language code.
        target: Target language code.

    Returns:
        The translated text.
    """
    # The language pair is configured on the translator object; passing
    # source/target to translate() (as was done before) left the constructor
    # defaults in effect.
    translator = YandexTranslator(
        source=source, target=target, api_key=YANDEX_API_KEY)
    return translator.translate(sentence)
|
103 |
+
|
104 |
+
def mymemory_translation(sentence: str, source="bn-IN", target="en-US") -> str:
    """
    Translate a sentence from one language to another using MyMemory Translator.

    Requires the `deep-translator` package (`pip install -U deep-translator`).
    The translator is constructed with the given `source` and `target`
    language codes.
    """
    return MyMemoryTranslator(source=source, target=target).translate(sentence)
|
113 |
+
|
114 |
+
def get_better_translation(translator_func, src=""):
    """
    Translate `src` with `translator_func` and post-process the result.

    Steps: convert Bengali digits in the source, translate, expand English
    contractions, then replace the substrings 'rupees' and 'Rs' with 'takas'.
    """
    translated = translator_func(get_translated_digit(src))
    expanded = decontracting_words(translated)
    return expanded.replace('rupees', 'takas').replace('Rs', 'takas')
|
120 |
+
|
121 |
+
def select_translator(src, translator):
    """
    Select the translator

    Supported names are "Google", "BanglaNMT" and "MyMemory".

    Returns a tuple `(tgt_base, tgt)`: `tgt_base` is the raw translation of
    `src`, and `tgt` is the post-processed translation
    (get_better_translation followed by space_punc). Both are None when
    `translator` is not a supported name.
    """
    translation_functions = {
        "Google": google_translation,
        "BanglaNMT": banglanmt_translation,
        "MyMemory": mymemory_translation,
    }

    if translator not in translation_functions:
        return None, None

    translate = translation_functions[translator]

    # Same call order as before: post-processed translation first, then the
    # raw one (the translator is therefore invoked twice).
    tgt = space_punc(get_better_translation(translate, src))
    tgt_base = translate(src)

    return tgt_base, tgt
|