Spaces: Runtime error
musfiqdehan committed
Commit • 407b426
1 Parent(s): d9b70f3
Syncing huggingface space and github
Browse files
- .gitattributes +35 -0
- .gitignore +2 -0
- README.md +24 -0
- app.py +97 -0
- init.py → helper/__init__.py +0 -0
- helper/alignment_mappers.py +100 -0
- helper/pos_taggers.py +165 -0
- helper/text_preprocess.py +165 -0
- helper/translators.py +141 -0
- requirements.txt +11 -0
- styles.css +29 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.env
+.venv/
README.md
ADDED
@@ -0,0 +1,24 @@
+---
+title: Bangla PoS Taggers
+emoji: 🌼
+colorFrom: red
+colorTo: purple
+sdk: gradio
+sdk_version: 4.19.2
+app_file: app.py
+pinned: true
+license: mit
+short_description: Parts of Speech Tagging of Bangla Sentence
+---
+
+
+This demo accompanies the paper "[Word Alignment by Fine-tuning Embeddings on Parallel Corpora](https://arxiv.org/abs/2101.08231)":
+```
+@inproceedings{dou2021word,
+    title={Word Alignment by Fine-tuning Embeddings on Parallel Corpora},
+    author={Dou, Zi-Yi and Neubig, Graham},
+    booktitle={Conference of the European Chapter of the Association for Computational Linguistics (EACL)},
+    year={2021}
+}
+```
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,97 @@
+import gradio as gr
+from gradio_rich_textbox import RichTextbox
+
+from helper.text_preprocess import space_punc
+from helper.pos_taggers import select_pos_tagger
+from helper.translators import select_translator
+
+
+def bn_postagger(src, translator, tagger):
+    """
+    Bangla PoS Tagger
+    """
+    src = space_punc(src)
+    tgt_base, tgt = select_translator(src, translator)
+    result, pos_accuracy = select_pos_tagger(src, tgt, tagger)
+    return tgt_base, result, pos_accuracy
+
+
+# Earlier gr.Interface-based UI, kept for reference:
+# demo = gr.Interface(
+#     fn=bn_postagger,
+#     inputs=[
+#         gr.Textbox(label="Enter Bangla Sentence", placeholder="বাংলা বাক্য লিখুন"),
+#         gr.Dropdown(["Google", "BanglaNMT", "MyMemory"], label="Select a Translator"),
+#         gr.Dropdown(["spaCy", "NLTK", "Flair", "TextBlob"], label="Select a PoS Tagger")
+#     ],
+#     outputs=[
+#         gr.Textbox(label="English Translation"),
+#         RichTextbox(label="PoS Tags"),
+#         gr.Textbox(label="Overall PoS Tagging Accuracy")
+#     ],
+#     live=False,
+#     title="Bangla PoS Taggers",
+#     theme='',
+#     examples=[
+#         ["বাংলাদেশ দক্ষিণ এশিয়ার একটি সার্বভৌম রাষ্ট্র।"],
+#         ["বাংলাদেশের সংবিধানিক নাম কি?"],
+#         ["বাংলাদেশের সাংবিধানিক নাম গণপ্রজাতন্ত্রী বাংলাদেশ।"],
+#         ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।"],
+#         ["বিশ্বের আরও একটি সেরা ক্লাব।"]
+#     ]
+# )
+
+with gr.Blocks(css="styles.css") as demo:
+    gr.HTML("<h1>Bangla PoS Taggers</h1>")
+    gr.HTML("<p>Parts of Speech (PoS) Tagging of Bangla Sentence using Bangla-English <strong>Word Alignment</strong></p>")
+
+    with gr.Row():
+        with gr.Column():
+            inputs = [
+                gr.Textbox(
+                    label="Enter Bangla Sentence",
+                    placeholder="বাংলা বাক্য লিখুন"
+                ),
+                gr.Dropdown(
+                    choices=["Google", "BanglaNMT", "MyMemory"],
+                    label="Select a Translator"
+                ),
+                gr.Dropdown(
+                    choices=["spaCy", "NLTK", "Flair", "TextBlob"],
+                    label="Select a PoS Tagger"
+                )
+            ]
+
+            btn = gr.Button(value="Submit", elem_classes="mybtn")
+            gr.ClearButton(inputs)
+
+        with gr.Column():
+            outputs = [
+                gr.Textbox(label="English Translation"),
+                RichTextbox(label="PoS Tags"),
+                gr.Textbox(label="Overall PoS Tagging Accuracy")
+            ]
+
+    btn.click(bn_postagger, inputs, outputs)
+
+    gr.Examples([
+        ["বাংলাদেশ দক্ষিণ এশিয়ার একটি সার্বভৌম রাষ্ট্র।", "Google", "NLTK"],
+        ["বাংলাদেশের সংবিধানিক নাম কি?", "Google", "spaCy"],
+        ["বাংলাদেশের সাংবিধানিক নাম গণপ্রজাতন্ত্রী বাংলাদেশ।", "Google", "TextBlob"],
+        ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।", "Google", "spaCy"],
+        ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।", "BanglaNMT", "spaCy"],
+        ["তিনজনের কেউই বাবার পথ ধরে প্রযুক্তি দুনিয়ায় হাঁটেননি।", "MyMemory", "spaCy"],
+        ["বিশ্বের আরও একটি সেরা ক্লাব।", "Google", "Flair"]
+    ], inputs)
+
+
+# Launch the Gradio app
+if __name__ == "__main__":
+    demo.launch()
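The UI above is thin glue around `bn_postagger`, so the pipeline can be smoke-tested without launching Gradio. A minimal sketch, assuming the helper modules can fetch their models (importing `app` triggers those downloads); the sentence is illustrative:

```python
# Minimal smoke test of the pipeline, bypassing the Gradio UI.
# Importing app pulls in the helpers, which download their models on first use.
from app import bn_postagger

base_translation, table_html, accuracy = bn_postagger(
    "বাংলাদেশ দক্ষিণ এশিয়ার একটি সার্বভৌম রাষ্ট্র।",  # example Bangla input
    "Google",  # translator, as in the first dropdown
    "spaCy",   # PoS tagger, as in the second dropdown
)
print(base_translation)
print(accuracy)  # e.g. "100.00%" when every source word is aligned
```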
init.py → helper/__init__.py
RENAMED
File without changes
helper/alignment_mappers.py
ADDED
@@ -0,0 +1,100 @@
+"""
+This module contains the helper functions to get the word alignment mapping between two sentences.
+"""
+
+import itertools
+
+import torch
+import transformers
+from transformers import logging
+
+# Set the verbosity to error, so that warning messages are not printed
+logging.set_verbosity_error()
+
+
+def get_alignment_mapping(source="", target="", model_path="musfiqdehan/bn-en-word-aligner"):
+    """
+    Get Aligned Words
+    """
+    model = transformers.BertModel.from_pretrained(model_path)
+    tokenizer = transformers.BertTokenizer.from_pretrained(model_path)
+
+    # pre-processing: split into words, then into subword tokens and ids
+    sent_src, sent_tgt = source.strip().split(), target.strip().split()
+    token_src = [tokenizer.tokenize(word) for word in sent_src]
+    token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
+    wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
+    wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
+    ids_src = tokenizer.prepare_for_model(
+        list(itertools.chain(*wid_src)), return_tensors='pt',
+        model_max_length=tokenizer.model_max_length, truncation=True)['input_ids']
+    ids_tgt = tokenizer.prepare_for_model(
+        list(itertools.chain(*wid_tgt)), return_tensors='pt',
+        model_max_length=tokenizer.model_max_length, truncation=True)['input_ids']
+
+    # map each subword position back to the index of its source word
+    sub2word_map_src = []
+    for i, word_list in enumerate(token_src):
+        sub2word_map_src += [i] * len(word_list)
+
+    sub2word_map_tgt = []
+    for i, word_list in enumerate(token_tgt):
+        sub2word_map_tgt += [i] * len(word_list)
+
+    # alignment
+    align_layer = 8
+    threshold = 1e-3
+
+    model.eval()
+    with torch.no_grad():
+        out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
+        out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
+
+        dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))
+
+        softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
+        softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)
+
+        # keep subword pairs that pass the threshold in both directions
+        softmax_inter = (softmax_srctgt > threshold) * (softmax_tgtsrc > threshold)
+
+    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
+
+    align_words = set()
+    for i, j in align_subwords:
+        align_words.add((sub2word_map_src[i], sub2word_map_tgt[j]))
+
+    return sent_src, sent_tgt, align_words
+
+
+def get_word_mapping(source="", target="", model_path="musfiqdehan/bn-en-word-aligner"):
+    """
+    Get Word Aligned Mapping Words
+    """
+    sent_src, sent_tgt, align_words = get_alignment_mapping(
+        source=source, target=target, model_path=model_path)
+
+    result = []
+    for i, j in sorted(align_words):
+        result.append(f'bn:({sent_src[i]}) -> en:({sent_tgt[j]})')
+
+    return result
+
+
+def get_word_index_mapping(source="", target="", model_path="musfiqdehan/bn-en-word-aligner"):
+    """
+    Get Word Aligned Mapping Index
+    """
+    sent_src, sent_tgt, align_words = get_alignment_mapping(
+        source=source, target=target, model_path=model_path)
+
+    result = []
+    for i, j in sorted(align_words):
+        result.append(f'bn:({i}) -> en:({j})')
+
+    return result
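Note that `get_alignment_mapping` reloads the BERT model and tokenizer on every call, so repeated requests pay the full loading cost each time (see the caching sketch after `helper/pos_taggers.py` below). A minimal usage sketch, assuming the aligner checkpoint downloads from the Hub; the sentence pair is illustrative:

```python
# Sketch: align each Bangla word with its English counterpart.
from helper.alignment_mappers import get_word_mapping

pairs = get_word_mapping(
    source="আমি ভাত খাই ।",  # illustrative Bangla sentence
    target="I eat rice .",    # its English translation
)
for pair in pairs:
    print(pair)  # e.g. bn:(আমি) -> en:(I)
```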
helper/pos_taggers.py
ADDED
@@ -0,0 +1,165 @@
+"""
+This module contains the functions to get PoS tags using spaCy, NLTK, Flair,
+and TextBlob, and to render the result as an HTML table.
+"""
+
+from .alignment_mappers import get_alignment_mapping
+
+from flair.models import SequenceTagger
+from flair.data import Sentence
+
+import spacy
+from spacy.cli import download
+download("en_core_web_sm")
+import en_core_web_sm
+
+import nltk
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+
+from textblob import TextBlob
+
+
+def get_spacy_postag_dict(target=""):
+    '''
+    Get spaCy pos tags
+    '''
+    nlp = en_core_web_sm.load()
+    target_tokenized = nlp(target)
+    spacy_postag_dict = dict((token.text, token.tag_) for token in target_tokenized)
+    return spacy_postag_dict
+
+
+def get_nltk_postag_dict(target=""):
+    '''
+    Get NLTK pos tags
+    '''
+    target_tokenized = nltk.tokenize.word_tokenize(target)
+    nltk_postag_dict = dict(nltk.pos_tag(target_tokenized))
+    return nltk_postag_dict
+
+
+def get_flair_postag_dict(target=""):
+    '''
+    Get Flair pos tags
+    '''
+    tagger = SequenceTagger.load("pos")
+    target_tokenized = Sentence(target)
+    tagger.predict(target_tokenized)
+    flair_postag_dict = dict((token.text, token.tag) for token in target_tokenized)
+    return flair_postag_dict
+
+
+def get_textblob_postag_dict(target=""):
+    '''
+    Get TextBlob pos tags
+    '''
+    blob = TextBlob(target)
+    textblob_postag_dict = dict(blob.tags)
+    return textblob_postag_dict
+
+
+def get_postag(
+        get_postag_dict,
+        source="",
+        target="",
+        model_path="musfiqdehan/bn-en-word-aligner"):
+    """Get PoS tags from the selected backend and return an HTML table"""
+
+    sent_src, sent_tgt, align_words = get_alignment_mapping(
+        source=source, target=target, model_path=model_path
+    )
+    postag_dict = get_postag_dict(target=target)
+
+    mapped_sent_src = []
+
+    html_table = '''
+    <table>
+        <thead>
+            <th>Bangla</th>
+            <th>English</th>
+            <th>PoS Tags</th>
+        </thead>
+        <tbody>
+    '''
+
+    punc = r"""!()-[]{}।;:'"\,<>./?@#$%^&*_~"""
+
+    for i, j in sorted(align_words):
+        mapped_sent_src.append(sent_src[i])
+        if sent_src[i] in punc or sent_tgt[j] in punc:
+            tag = "PUNC"
+        else:
+            # fall back to UNK if the tagger tokenized the word differently
+            tag = postag_dict.get(sent_tgt[j], "UNK")
+        html_table += f'''
+        <tr>
+            <td> {sent_src[i]} </td>
+            <td> {sent_tgt[j]} </td>
+            <td> {tag} </td>
+        </tr>
+        '''
+
+    # source words with no alignment are reported as unknown
+    unks = list(set(sent_src).difference(set(mapped_sent_src)))
+    for word in unks:
+        html_table += f'''
+        <tr>
+            <td> {word} </td>
+            <td> N/A </td>
+            <td> UNK </td>
+        </tr>
+        '''
+
+    html_table += '''
+        </tbody>
+    </table>
+    '''
+
+    pos_accuracy = (len(sent_src) - len(unks)) / len(sent_src)
+    pos_accuracy = f"{pos_accuracy:0.2%}"
+
+    return html_table, pos_accuracy
+
+
+def select_pos_tagger(src, tgt, tagger):
+    '''
+    Select the PoS tagger
+    '''
+    taggers = {
+        "spaCy": get_spacy_postag_dict,
+        "NLTK": get_nltk_postag_dict,
+        "Flair": get_flair_postag_dict,
+        "TextBlob": get_textblob_postag_dict,
+    }
+
+    result = None
+    pos_accuracy = None
+
+    if tagger in taggers:
+        result, pos_accuracy = get_postag(
+            taggers[tagger],
+            source=src,
+            target=tgt,
+            model_path="musfiqdehan/bn-en-word-aligner",
+        )
+    return result, pos_accuracy
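One optimization this commit does not make: `get_flair_postag_dict` calls `SequenceTagger.load("pos")` on every request, and `get_alignment_mapping` similarly reloads BERT. A sketch of one way to cache such loads, assuming a single-process Space (the `_flair_tagger` helper is hypothetical, not part of this commit):

```python
# Sketch: load expensive models once per process instead of once per request.
from functools import lru_cache

from flair.models import SequenceTagger


@lru_cache(maxsize=1)
def _flair_tagger():
    # Loaded on first call, reused by every subsequent call.
    return SequenceTagger.load("pos")
```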
helper/text_preprocess.py
ADDED
@@ -0,0 +1,165 @@
+"""
+This file contains functions for text preprocessing
+"""
+
+import re
+
+
+def decontracting_words(sentence):
+    """
+    Decontract words (e.g. I'm -> I am, I've -> I have, etc.)
+    https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
+    https://stackoverflow.com/a/19794953
+    """
+    contractions = {
+        "ain't": "am not",
+        "aren't": "are not",
+        "can't": "can not",
+        "can't've": "can not have",
+        "'cause": "because",
+        "could've": "could have",
+        "couldn't": "could not",
+        "couldn't've": "could not have",
+        "didn't": "did not",
+        "doesn't": "does not",
+        "don't": "do not",
+        "hadn't": "had not",
+        "hadn't've": "had not have",
+        "hasn't": "has not",
+        "haven't": "have not",
+        "he'd": "he would",
+        "he'd've": "he would have",
+        "he'll": "he will",
+        "he'll've": "he will have",
+        "he's": "he is",
+        "how'd": "how did",
+        "how'd'y": "how do you",
+        "how'll": "how will",
+        "how's": "how is",
+        "i'd": "i would",
+        "i'd've": "i would have",
+        "i'll": "i will",
+        "i'll've": "i will have",
+        "i'm": "i am",
+        "i've": "i have",
+        "isn't": "is not",
+        "it'd": "it would",
+        "it'd've": "it would have",
+        "it'll": "it will",
+        "it'll've": "it will have",
+        "it's": "it is",
+        "let's": "let us",
+        "ma'am": "madam",
+        "mayn't": "may not",
+        "might've": "might have",
+        "mightn't": "might not",
+        "mightn't've": "might not have",
+        "must've": "must have",
+        "mustn't": "must not",
+        "mustn't've": "must not have",
+        "needn't": "need not",
+        "needn't've": "need not have",
+        "o'clock": "of the clock",
+        "oughtn't": "ought not",
+        "oughtn't've": "ought not have",
+        "shan't": "shall not",
+        "sha'n't": "shall not",
+        "shan't've": "shall not have",
+        "she'd": "she would",
+        "she'd've": "she would have",
+        "she'll": "she will",
+        "she'll've": "she will have",
+        "she's": "she is",
+        "should've": "should have",
+        "shouldn't": "should not",
+        "shouldn't've": "should not have",
+        "so've": "so have",
+        "so's": "so as",
+        "that'd": "that would",
+        "that'd've": "that would have",
+        "that's": "that is",
+        "there'd": "there would",
+        "there'd've": "there would have",
+        "there's": "there is",
+        "they'd": "they would",
+        "they'd've": "they would have",
+        "they'll": "they will",
+        "they'll've": "they will have",
+        "they're": "they are",
+        "they've": "they have",
+        "to've": "to have",
+        "wasn't": "was not",
+        "we'd": "we would",
+        "we'd've": "we would have",
+        "we'll": "we will",
+        "we'll've": "we will have",
+        "we're": "we are",
+        "we've": "we have",
+        "weren't": "were not",
+        "what'll": "what will",
+        "what'll've": "what will have",
+        "what're": "what are",
+        "what's": "what is",
+        "what've": "what have",
+        "when's": "when is",
+        "when've": "when have",
+        "where'd": "where did",
+        "where's": "where is",
+        "where've": "where have",
+        "who'll": "who will",
+        "who'll've": "who will have",
+        "who's": "who is",
+        "who've": "who have",
+        "why's": "why is",
+        "why've": "why have",
+        "will've": "will have",
+        "won't": "will not",
+        "won't've": "will not have",
+        "would've": "would have",
+        "wouldn't": "would not",
+        "wouldn't've": "would not have",
+        "y'all": "you all",
+        "y'all'd": "you all would",
+        "y'all'd've": "you all would have",
+        "y'all're": "you all are",
+        "y'all've": "you all have",
+        "you'd": "you would",
+        "you'd've": "you would have",
+        "you'll": "you will",
+        "you'll've": "you will have",
+        "you're": "you are",
+        "you've": "you have"
+    }
+
+    sentence_decontracted = []
+
+    for word in sentence.split():
+        if word in contractions:
+            word = contractions[word]
+        sentence_decontracted.append(word)
+
+    sentence = ' '.join(sentence_decontracted)
+
+    # catch any remaining suffixes the dictionary missed
+    sentence = sentence.replace("'ve", " have")
+    sentence = sentence.replace("n't", " not")
+    sentence = sentence.replace("'re", " are")
+    sentence = sentence.replace("'ll", " will")
+    sentence = sentence.replace("'d", " would")
+    sentence = sentence.replace("'s", " is")
+    sentence = sentence.replace("'m", " am")
+
+    return sentence
+
+
+def space_punc(line):
+    """
+    Add a space before and after a punctuation mark
+    and collapse runs of whitespace into one space.
+    print(space_punc('bla. bla? "bla"bla.bla! bla...'))
+    >> bla . bla ? " bla " bla . bla ! bla . . .
+    """
+    # raw strings avoid invalid-escape warnings in the patterns
+    line = re.sub(r'([.,:;\-।!?"()\'])', r" \1 ", line)
+    line = re.sub(r"\s{2,}", " ", line)
+    return line
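A quick check of the two helpers, reusing the docstring's own example; the expected outputs follow from the contraction table and the regexes above:

```python
from helper.text_preprocess import decontracting_words, space_punc

print(decontracting_words("i'm sure they've left"))
# i am sure they have left
print(space_punc('bla. bla? "bla"bla.bla! bla...'))
# bla . bla ? " bla " bla . bla ! bla . . .
```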
helper/translators.py
ADDED
@@ -0,0 +1,141 @@
+"""
+This file contains the functions to translate the text from one language to another.
+"""
+import os
+
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from deep_translator import GoogleTranslator, MyMemoryTranslator, MicrosoftTranslator, YandexTranslator, ChatGptTranslator
+from dotenv import load_dotenv
+
+from .text_preprocess import decontracting_words, space_punc
+
+
+# Load the environment variables from the .env file
+load_dotenv()
+
+# Translators API Keys
+MICROSOFT_API_KEY = os.getenv("MICROSOFT_TRANSLATOR_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+YANDEX_API_KEY = os.getenv("YANDEX_API_KEY")
+
+# Bengali-to-English digit mapping
+digit_converter = {
+    '০': '0',
+    '১': '1',
+    '২': '2',
+    '৩': '3',
+    '৪': '4',
+    '৫': '5',
+    '৬': '6',
+    '৭': '7',
+    '৮': '8',
+    '৯': '9'
+}
+
+
+def get_translated_digit(sentence):
+    """
+    Translate the digits from Bengali to English
+    """
+    translated_sentence = []
+    for each_letter in sentence:
+        if each_letter in digit_converter:
+            translated_sentence.append(digit_converter[each_letter])
+        else:
+            translated_sentence.append(each_letter)
+    return "".join(translated_sentence)
+
+
+# Bangla to English Translation (BUET BanglaNMT), loaded once at import time
+translation_model_bn_en = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
+translation_tokenizer_bn_en = AutoTokenizer.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
+
+
+def banglanmt_translation(input_text):
+    """
+    Translate a sentence from Bengali to English using BUET BanglaNMT
+    """
+    inputs = translation_tokenizer_bn_en(input_text, return_tensors="pt")
+    outputs = translation_model_bn_en.generate(**inputs)
+    translated_text = translation_tokenizer_bn_en.decode(outputs[0], skip_special_tokens=True)
+    return translated_text
+
+
+def google_translation(sentence: str, source="bn", target="en") -> str:
+    """
+    Translate a sentence from one language to another using Google Translator.
+    Install dependencies first: `pip install -U deep-translator`
+    """
+    # deep-translator takes source/target in the constructor, not in translate()
+    translator = GoogleTranslator(source=source, target=target)
+    translated_sentence = translator.translate(sentence)
+    return translated_sentence
+
+
+def microsoft_translation(sentence: str, source="bn", target="en") -> str:
+    """
+    Translate a sentence from one language to another using Microsoft Translator.
+    Install dependencies first: `pip install -U deep-translator`
+    """
+    translator = MicrosoftTranslator(api_key=MICROSOFT_API_KEY, target='en')
+    translated_sentence = translator.translate(sentence)
+    return translated_sentence
+
+
+def chatgpt_translation(sentence: str, source="bn", target="en") -> str:
+    """
+    Translate a sentence from one language to another using ChatGPT Translator.
+    Install dependencies first: `pip install -U deep-translator`
+    """
+    translator = ChatGptTranslator(api_key=OPENAI_API_KEY, target=target)
+    translated_sentence = translator.translate(sentence)
+    return translated_sentence
+
+
+def yandex_translation(sentence: str, source="bn", target="en") -> str:
+    """
+    Translate a sentence from one language to another using Yandex Translator.
+    Install dependencies first: `pip install -U deep-translator`
+    """
+    translator = YandexTranslator(api_key=YANDEX_API_KEY)
+    translated_sentence = translator.translate(
+        sentence, source=source, target=target)
+    return translated_sentence
+
+
+def mymemory_translation(sentence: str, source="bn-IN", target="en-US") -> str:
+    """
+    Translate a sentence from one language to another using MyMemory Translator.
+    Install dependencies first: `pip install -U deep-translator`
+    """
+    translator = MyMemoryTranslator(source=source, target=target)
+    translated_sentence = translator.translate(sentence)
+    return translated_sentence
+
+
+def get_better_translation(translator_func, src=""):
+    """
+    Normalize Bengali digits, translate, then clean up the translation
+    """
+    src_mod = get_translated_digit(src)
+    tgt = translator_func(src_mod)
+    tgt = decontracting_words(tgt)
+    tgt = tgt.replace('rupees', 'takas').replace('Rs', 'takas')
+    return tgt
+
+
+def select_translator(src, translator):
+    """
+    Select the translator
+    """
+    tgt = None
+    tgt_base = None
+
+    if translator == "Google":
+        tgt = get_better_translation(google_translation, src)
+        tgt = space_punc(tgt)
+        tgt_base = google_translation(src)
+    elif translator == "BanglaNMT":
+        tgt = get_better_translation(banglanmt_translation, src)
+        tgt = space_punc(tgt)
+        tgt_base = banglanmt_translation(src)
+    elif translator == "MyMemory":
+        tgt = get_better_translation(mymemory_translation, src)
+        tgt = space_punc(tgt)
+        tgt_base = mymemory_translation(src)
+
+    return tgt_base, tgt
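A minimal sketch of the selector, assuming network access for the Google backend; the sentence is illustrative, and importing the module also loads the BanglaNMT weights. `select_translator` returns the raw translation first and the post-processed one second:

```python
from helper.translators import select_translator

base, processed = select_translator("আমার বয়স ২৫ বছর।", "Google")
print(base)       # translation of the original sentence
print(processed)  # Bengali digits normalized, contractions expanded, punctuation spaced
```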
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+torch
+sentencepiece
+transformers
+spacy
+flair
+nltk
+textblob
+deep-translator
+pandas
+gradio_rich_textbox
+python-dotenv
styles.css
ADDED
@@ -0,0 +1,29 @@
+@import url("https://fonts.googleapis.com/css2?family=Merriweather:wght@400;700;900&display=swap");
+
+h1 {
+    font-family: "Merriweather", serif;
+    text-align: center;
+    font-weight: 700;
+}
+
+p {
+    text-align: center;
+}
+
+.mybtn {
+    background-color: rgb(240, 98, 16) !important;
+}
+
+table {
+    border: 1px solid gray;
+    border-collapse: collapse;
+    text-align: center;
+    width: 100%;
+}
+
+th,
+td {
+    border: 1px solid gray;
+    border-collapse: collapse;
+    padding: 5px;
+}