musfiqdehan committed on
Commit
4a52b88
1 Parent(s): 3f01399

Add word alignment mapping functions

helper/__init__.py ADDED
(empty file)
helper/alignment_mappers.py ADDED
@@ -0,0 +1,125 @@
+ """
+ This module contains the helper functions to get the word-alignment mapping between two sentences.
+ """
+
+ import itertools
+
+ import torch
+ import transformers
+ from transformers import logging
+
+ # Set the verbosity to error, so that warning messages are not printed
+ logging.set_verbosity_error()
+
+
+ def select_model(model_name):
+     """
+     Map a display name to its Hugging Face model id.
+     """
+     if model_name == "Google-mBERT (Base-Multilingual)":
+         model_name = "bert-base-multilingual-cased"
+     elif model_name == "Neulab-AwesomeAlign (Bn-En-0.5M)":
+         model_name = "musfiqdehan/bn-en-word-aligner"
+     elif model_name == "BUET-BanglaBERT (Large)":
+         model_name = "csebuetnlp/banglabert_large"
+     elif model_name == "SagorSarker-BanglaBERT (Base)":
+         model_name = "sagorsarker/bangla-bert-base"
+     elif model_name == "SentenceTransformers-LaBSE (Multilingual)":
+         model_name = "sentence-transformers/LaBSE"
+
+     return model_name
+
+
+ def get_alignment_mapping(source="", target="", model_name=""):
+     """
+     Get the aligned word pairs between the source and target sentences.
+     """
+     model_name = select_model(model_name)
+
+     model = transformers.BertModel.from_pretrained(model_name)
+     tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
+
+     # Pre-processing: whitespace-split into words, then sub-tokenize each word
+     sent_src, sent_tgt = source.strip().split(), target.strip().split()
+
+     token_src = [tokenizer.tokenize(word) for word in sent_src]
+     token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
+
+     wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
+     wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
+
+     ids_src = tokenizer.prepare_for_model(
+         list(itertools.chain(*wid_src)), return_tensors='pt',
+         max_length=tokenizer.model_max_length, truncation=True)['input_ids']
+     ids_tgt = tokenizer.prepare_for_model(
+         list(itertools.chain(*wid_tgt)), return_tensors='pt',
+         max_length=tokenizer.model_max_length, truncation=True)['input_ids']
+
+     # Map every sub-word position back to the index of the word it came from
+     sub2word_map_src = []
+     for i, word_list in enumerate(token_src):
+         sub2word_map_src += [i] * len(word_list)
+
+     sub2word_map_tgt = []
+     for i, word_list in enumerate(token_tgt):
+         sub2word_map_tgt += [i] * len(word_list)
+
+     # Alignment: compare the hidden states of layer 8 (the awesome-align default)
+     align_layer = 8
+     threshold = 1e-3
+
+     model.eval()
+     with torch.no_grad():
+         out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
+         out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
+
+         dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))
+
+         softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
+         softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)
+
+         # Keep only sub-word pairs that clear the threshold in both directions
+         softmax_inter = (softmax_srctgt > threshold) * (softmax_tgtsrc > threshold)
+
+     align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
+
+     align_words = set()
+     for i, j in align_subwords:
+         align_words.add((sub2word_map_src[i], sub2word_map_tgt[j]))
+
+     return sent_src, sent_tgt, align_words
+
+
+ def get_word_mapping(source="", target="", model_name=""):
+     """
+     Get the word-aligned mapping as word pairs.
+     """
+     sent_src, sent_tgt, align_words = get_alignment_mapping(
+         source=source, target=target, model_name=model_name)
+
+     result = []
+     for i, j in sorted(align_words):
+         result.append(f'bn:({sent_src[i]}) -> en:({sent_tgt[j]})')
+
+     return result
+
+
+ def get_word_index_mapping(source="", target="", model_name=""):
+     """
+     Get the word-aligned mapping as word indices.
+     """
+     sent_src, sent_tgt, align_words = get_alignment_mapping(
+         source=source, target=target, model_name=model_name)
+
+     result = []
+     for i, j in sorted(align_words):
+         result.append(f'bn:({i}) -> en:({j})')
+
+     return result
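
A minimal usage sketch for this module. The Bengali/English pair and the printed results are illustrative only, and it assumes the package is importable as `helper` and the model weights can be downloaded:

from helper.alignment_mappers import get_word_mapping, get_word_index_mapping

source = "আমি ভাত খাই ।"   # hypothetical Bengali input
target = "I eat rice ."     # hypothetical English translation

# Word-level pairs, e.g. ['bn:(আমি) -> en:(I)', ...]
print(get_word_mapping(source=source, target=target,
                       model_name="Neulab-AwesomeAlign (Bn-En-0.5M)"))

# The same alignment expressed as word indices, e.g. ['bn:(0) -> en:(0)', ...]
print(get_word_index_mapping(source=source, target=target,
                             model_name="Neulab-AwesomeAlign (Bn-En-0.5M)"))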
helper/pos_taggers.py ADDED
@@ -0,0 +1,167 @@
+ """
+ This module contains functions to get PoS tags (via spaCy, NLTK, Flair, or TextBlob)
+ and return an HTML table.
+ """
+
+ from .alignment_mappers import get_alignment_mapping, select_model
+
+ from flair.models import SequenceTagger
+ from flair.data import Sentence
+
+ from spacy.cli import download
+ download("en_core_web_sm")
+ import en_core_web_sm
+
+ import nltk
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+
+ from textblob import TextBlob
+
+
+ def get_spacy_postag_dict(target=""):
+     '''
+     Get spaCy PoS tags
+     '''
+     nlp = en_core_web_sm.load()
+     target_tokenized = nlp(target)
+     spacy_postag_dict = dict((token.text, token.tag_)
+                              for token in target_tokenized)
+     return spacy_postag_dict
+
+
+ def get_nltk_postag_dict(target=""):
+     '''
+     Get NLTK PoS tags
+     '''
+     target_tokenized = nltk.tokenize.word_tokenize(target)
+     nltk_postag_dict = dict(nltk.pos_tag(target_tokenized))
+     return nltk_postag_dict
+
+
+ def get_flair_postag_dict(target=""):
+     '''
+     Get Flair PoS tags
+     '''
+     tagger = SequenceTagger.load("pos")
+     target_tokenized = Sentence(target)
+     tagger.predict(target_tokenized)
+     flair_postag_dict = dict((token.text, token.tag)
+                              for token in target_tokenized)
+     return flair_postag_dict
+
+
+ def get_textblob_postag_dict(target=""):
+     '''
+     Get TextBlob PoS tags
+     '''
+     blob = TextBlob(target)
+     textblob_postag_dict = dict(blob.tags)
+     return textblob_postag_dict
+
+
+ def get_postag(
+         get_postag_dict,
+         source="",
+         target="",
+         model_name="musfiqdehan/bn-en-word-aligner"):
+     """Get PoS tags from the chosen tagger and return an HTML table with an alignment-coverage score"""
+
+     sent_src, sent_tgt, align_words = get_alignment_mapping(
+         source=source, target=target, model_name=model_name
+     )
+     postag_dict = get_postag_dict(target=target)
+
+     mapped_sent_src = []
+
+     html_table = '''
+     <table>
+         <thead>
+             <th>Bangla</th>
+             <th>English</th>
+             <th>PoS Tags</th>
+         </thead>
+         <tbody>
+     '''
+
+     punc = r"""!()-[]{}।;:'"\,<>./?@#$%^&*_~"""
+
+     for i, j in sorted(align_words):
+         mapped_sent_src.append(sent_src[i])
+
+         if sent_src[i] in punc or sent_tgt[j] in punc:
+             html_table += f'''
+             <tr>
+                 <td> {sent_src[i]} </td>
+                 <td> {sent_tgt[j]} </td>
+                 <td> PUNC </td>
+             </tr>
+             '''
+         else:
+             # The taggers tokenize on their own, so fall back to UNK when an
+             # aligned target word is missing from the tag dictionary
+             html_table += f'''
+             <tr>
+                 <td> {sent_src[i]} </td>
+                 <td> {sent_tgt[j]} </td>
+                 <td> {postag_dict.get(sent_tgt[j], "UNK")} </td>
+             </tr>
+             '''
+
+     # Source words that were never aligned to any target word
+     unks = list(set(sent_src).difference(set(mapped_sent_src)))
+     for word in unks:
+         html_table += f'''
+             <tr>
+                 <td> {word} </td>
+                 <td> N/A </td>
+                 <td> UNK </td>
+             </tr>
+         '''
+
+     html_table += '''
+         </tbody>
+     </table>
+     '''
+
+     # Share of source words that received an alignment
+     pos_accuracy = (len(sent_src) - len(unks)) / len(sent_src)
+     pos_accuracy = f"{pos_accuracy:0.2%}"
+
+     return html_table, pos_accuracy
+
+
+ def select_pos_tagger(src, tgt, model_name, tagger):
+     '''
+     Select the PoS tagger
+     '''
+     result = None
+     pos_accuracy = None
+
+     model_name = select_model(model_name)
+
+     if tagger == "spaCy":
+         result, pos_accuracy = get_postag(
+             get_spacy_postag_dict,
+             source=src,
+             target=tgt,
+             model_name=model_name,
+         )
+     elif tagger == "NLTK":
+         result, pos_accuracy = get_postag(
+             get_nltk_postag_dict,
+             source=src,
+             target=tgt,
+             model_name=model_name,
+         )
+     elif tagger == "Flair":
+         result, pos_accuracy = get_postag(
+             get_flair_postag_dict,
+             source=src,
+             target=tgt,
+             model_name=model_name,
+         )
+     elif tagger == "TextBlob":
+         result, pos_accuracy = get_postag(
+             get_textblob_postag_dict,
+             source=src,
+             target=tgt,
+             model_name=model_name,
+         )
+
+     return result, pos_accuracy
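
A usage sketch, assuming the same illustrative sentence pair as above (tagger models are downloaded on first run):

from helper.pos_taggers import select_pos_tagger

html_table, pos_accuracy = select_pos_tagger(
    src="আমি ভাত খাই ।",    # hypothetical Bengali source
    tgt="I eat rice .",      # hypothetical English target
    model_name="Neulab-AwesomeAlign (Bn-En-0.5M)",
    tagger="NLTK",
)
print(pos_accuracy)  # e.g. "100.00%" if every source word was aligned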
helper/text_preprocess.py ADDED
@@ -0,0 +1,165 @@
+ """
+ This file contains functions for text preprocessing
+ """
+
+ import re
+
+
+ def decontracting_words(sentence):
+     """
+     Expand contractions (e.g. I'm -> I am, I've -> I have, etc.)
+     https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
+     https://stackoverflow.com/a/19794953
+     """
+     contractions = {
+         "ain't": "am not",
+         "aren't": "are not",
+         "can't": "can not",
+         "can't've": "can not have",
+         "'cause": "because",
+         "could've": "could have",
+         "couldn't": "could not",
+         "couldn't've": "could not have",
+         "didn't": "did not",
+         "doesn't": "does not",
+         "don't": "do not",
+         "hadn't": "had not",
+         "hadn't've": "had not have",
+         "hasn't": "has not",
+         "haven't": "have not",
+         "he'd": "he would",
+         "he'd've": "he would have",
+         "he'll": "he will",
+         "he'll've": "he will have",
+         "he's": "he is",
+         "how'd": "how did",
+         "how'd'y": "how do you",
+         "how'll": "how will",
+         "how's": "how is",
+         "i'd": "i would",
+         "i'd've": "i would have",
+         "i'll": "i will",
+         "i'll've": "i will have",
+         "i'm": "i am",
+         "i've": "i have",
+         "isn't": "is not",
+         "it'd": "it would",
+         "it'd've": "it would have",
+         "it'll": "it will",
+         "it'll've": "it will have",
+         "it's": "it is",
+         "let's": "let us",
+         "ma'am": "madam",
+         "mayn't": "may not",
+         "might've": "might have",
+         "mightn't": "might not",
+         "mightn't've": "might not have",
+         "must've": "must have",
+         "mustn't": "must not",
+         "mustn't've": "must not have",
+         "needn't": "need not",
+         "needn't've": "need not have",
+         "o'clock": "of the clock",
+         "oughtn't": "ought not",
+         "oughtn't've": "ought not have",
+         "shan't": "shall not",
+         "sha'n't": "shall not",
+         "shan't've": "shall not have",
+         "she'd": "she would",
+         "she'd've": "she would have",
+         "she'll": "she will",
+         "she'll've": "she will have",
+         "she's": "she is",
+         "should've": "should have",
+         "shouldn't": "should not",
+         "shouldn't've": "should not have",
+         "so've": "so have",
+         "so's": "so as",
+         "that'd": "that would",
+         "that'd've": "that would have",
+         "that's": "that is",
+         "there'd": "there would",
+         "there'd've": "there would have",
+         "there's": "there is",
+         "they'd": "they would",
+         "they'd've": "they would have",
+         "they'll": "they will",
+         "they'll've": "they will have",
+         "they're": "they are",
+         "they've": "they have",
+         "to've": "to have",
+         "wasn't": "was not",
+         "we'd": "we would",
+         "we'd've": "we would have",
+         "we'll": "we will",
+         "we'll've": "we will have",
+         "we're": "we are",
+         "we've": "we have",
+         "weren't": "were not",
+         "what'll": "what will",
+         "what'll've": "what will have",
+         "what're": "what are",
+         "what's": "what is",
+         "what've": "what have",
+         "when's": "when is",
+         "when've": "when have",
+         "where'd": "where did",
+         "where's": "where is",
+         "where've": "where have",
+         "who'll": "who will",
+         "who'll've": "who will have",
+         "who's": "who is",
+         "who've": "who have",
+         "why's": "why is",
+         "why've": "why have",
+         "will've": "will have",
+         "won't": "will not",
+         "won't've": "will not have",
+         "would've": "would have",
+         "wouldn't": "would not",
+         "wouldn't've": "would not have",
+         "y'all": "you all",
+         "y'all'd": "you all would",
+         "y'all'd've": "you all would have",
+         "y'all're": "you all are",
+         "y'all've": "you all have",
+         "you'd": "you would",
+         "you'd've": "you would have",
+         "you'll": "you will",
+         "you'll've": "you will have",
+         "you're": "you are",
+         "you've": "you have"
+     }
+
+     sentence_decontracted = []
+
+     for word in sentence.split():
+         if word in contractions:
+             word = contractions[word]
+
+         sentence_decontracted.append(word)
+
+     sentence = ' '.join(sentence_decontracted)
+
+     # Fallbacks for contractions the dictionary lookup missed
+     sentence = sentence.replace("'ve", " have")
+     sentence = sentence.replace("n't", " not")
+     sentence = sentence.replace("'re", " are")
+     sentence = sentence.replace("'ll", " will")
+     sentence = sentence.replace("'d", " would")
+     sentence = sentence.replace("'s", " is")
+     sentence = sentence.replace("'m", " am")
+
+     return sentence
+
+
+ def space_punc(line):
+     """
+     Add a space before and after every punctuation mark
+     and collapse runs of spaces, e.g.:
+     space_punc('bla. bla? "bla"bla.bla! bla...')
+     >> bla . bla ? " bla " bla . bla ! bla . . .
+     """
+     line = re.sub(r'([.,:;\-।!?"()\'])', r" \1 ", line)
+     line = re.sub(r"\s{2,}", " ", line)
+     return line
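
A quick check of both helpers (pure standard library, outputs shown as comments):

from helper.text_preprocess import decontracting_words, space_punc

print(decontracting_words("i'm sure they've left"))
# -> i am sure they have left

print(space_punc("Hello, world!"))
# -> Hello , world !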
helper/translators.py ADDED
@@ -0,0 +1,141 @@
+ """
+ This file contains the functions to translate text from one language to another.
+ """
+ import os
+
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from deep_translator import GoogleTranslator, MyMemoryTranslator, MicrosoftTranslator, YandexTranslator, ChatGptTranslator
+ from dotenv import load_dotenv
+
+ from .text_preprocess import decontracting_words, space_punc
+
+
+ # Load the environment variables from the .env file
+ load_dotenv()
+
+ # Translator API keys
+ MICROSOFT_API_KEY = os.getenv("MICROSOFT_TRANSLATOR_KEY")
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ YANDEX_API_KEY = os.getenv("YANDEX_API_KEY")
+
+ # Digit translation (Bengali -> Western Arabic numerals)
+ digit_converter = {
+     '০': '0',
+     '১': '1',
+     '২': '2',
+     '৩': '3',
+     '৪': '4',
+     '৫': '5',
+     '৬': '6',
+     '৭': '7',
+     '৮': '8',
+     '৯': '9'
+ }
+
+
+ def get_translated_digit(sentence):
+     """
+     Translate the digits from Bengali to English
+     """
+     translated_sentence = []
+     for each_letter in sentence:
+         if each_letter in digit_converter:
+             translated_sentence.append(digit_converter[each_letter])
+         else:
+             translated_sentence.append(each_letter)
+
+     return "".join(translated_sentence)
+
+
+ # Bangla to English translation (BUET BanglaNMT), loaded once at import time
+ translation_model_bn_en = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
+ translation_tokenizer_bn_en = AutoTokenizer.from_pretrained("csebuetnlp/banglat5_nmt_bn_en")
+
+
+ def banglanmt_translation(input_text):
+     """
+     Translate a sentence from Bengali to English using BUET BanglaNMT
+     """
+     inputs = translation_tokenizer_bn_en(input_text, return_tensors="pt")
+     outputs = translation_model_bn_en.generate(**inputs)
+     translated_text = translation_tokenizer_bn_en.decode(outputs[0], skip_special_tokens=True)
+     return translated_text
+
+
+ def google_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using Google Translator.
+     Install the dependency first:
+     `!pip install -U deep-translator`
+     """
+     translator = GoogleTranslator(source=source, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def microsoft_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using Microsoft Translator.
+     Install the dependency first:
+     `!pip install -U deep-translator`
+     """
+     translator = MicrosoftTranslator(
+         api_key=MICROSOFT_API_KEY, source=source, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def chatgpt_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using ChatGPT Translator.
+     Install the dependency first:
+     `!pip install -U deep-translator`
+     """
+     translator = ChatGptTranslator(api_key=OPENAI_API_KEY, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def yandex_translation(sentence: str, source="bn", target="en") -> str:
+     """
+     Translate a sentence from one language to another using Yandex Translator.
+     Install the dependency first:
+     `!pip install -U deep-translator`
+     """
+     translator = YandexTranslator(api_key=YANDEX_API_KEY)
+     translated_sentence = translator.translate(
+         sentence, source=source, target=target)
+     return translated_sentence
+
+
+ def mymemory_translation(sentence: str, source="bn-IN", target="en-US") -> str:
+     """
+     Translate a sentence from one language to another using MyMemory Translator.
+     Install the dependency first:
+     `!pip install -U deep-translator`
+     """
+     translator = MyMemoryTranslator(source=source, target=target)
+     translated_sentence = translator.translate(sentence)
+     return translated_sentence
+
+
+ def get_better_translation(translator_func, src=""):
+     """
+     Convert Bengali digits, translate, expand contractions,
+     and localize currency terms in the output.
+     """
+     src_mod = get_translated_digit(src)
+     tgt = translator_func(src_mod)
+     tgt = decontracting_words(tgt)
+     tgt = tgt.replace('rupees', 'takas').replace('Rs', 'takas')
+     return tgt
+
+
+ def select_translator(src, translator):
+     """
+     Select the translator and return both the raw and the post-processed translation
+     """
+     tgt = None
+     tgt_base = None
+
+     if translator == "Google":
+         tgt = get_better_translation(google_translation, src)
+         tgt = space_punc(tgt)
+         tgt_base = google_translation(src)
+     elif translator == "BanglaNMT":
+         tgt = get_better_translation(banglanmt_translation, src)
+         tgt = space_punc(tgt)
+         tgt_base = banglanmt_translation(src)
+     elif translator == "MyMemory":
+         tgt = get_better_translation(mymemory_translation, src)
+         tgt = space_punc(tgt)
+         tgt_base = mymemory_translation(src)
+
+     return tgt_base, tgt
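
A usage sketch. The Bengali inputs are illustrative, and the Google backend needs network access:

from helper.translators import get_translated_digit, select_translator

# Bengali digits are converted before translation: '২০২৩' -> '2023'
print(get_translated_digit("২০২৩"))

# Returns the raw translation and the post-processed one
tgt_base, tgt = select_translator("আমি ভাত খাই।", translator="Google")
print(tgt_base)  # raw Google output
print(tgt)       # digits converted, contractions expanded, punctuation spaced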