|
from docx import Document |
|
import os |
|
import sys |
|
import transformers |
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
import torch |
|
from mosestokenizer import * |
|
from indicnlp.tokenize import sentence_tokenize |
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): hard-coded absolute Windows path — the script only runs for
# this one user/machine; consider an environment variable or CLI argument.
os.chdir(r"C:\Users\Prince Raj\Desktop\BOT\transformers")

# Load the multilingual NLLB-200 (distilled, 600M-parameter) checkpoint.
# Weights are downloaded and cached by the Hugging Face hub on first use.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

# Prefer GPU when available; inputs must later be moved to the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)
|
|
|
# Human-readable language names -> NLLB-200 / FLORES-200 language codes.
# Codes are script-qualified (e.g. Deva = Devanagari, Beng = Bengali script).
lang_dict = {
    "english": "eng_Latn",
    "assamese": "asm_Beng",
    "awadhi": "awa_Deva",
    "bengali": "ben_Beng",
    "bhojpuri": "bho_Deva",
    "gujarati": "guj_Gujr",
    "hindi": "hin_Deva",
    "kannada": "kan_Knda",
    "kashmiri": "kas_Deva",
    "maithili": "mai_Deva",
    "malayalam": "mal_Mlym",
    "marathi": "mar_Deva",
    "odia": "ory_Orya",
    "punjabi": "pan_Guru",
    "sanskrit": "san_Deva",
    "sindhi": "snd_Arab",
    "tamil": "tam_Taml",
    "telugu": "tel_Telu",
    "urdu": "urd_Arab",
}
|
|
|
def translate_sentence(article, target, max_length=100):
    """Translate one piece of text into the *target* language with NLLB-200.

    Parameters
    ----------
    article : str
        Text to translate. NOTE(review): the source language is whatever the
        tokenizer's default ``src_lang`` is (``eng_Latn`` for NLLB unless
        configured) — the caller's source language is never passed in here.
    target : str
        Key into ``lang_dict`` (e.g. ``'hindi'``) naming the output language.
    max_length : int, optional
        Upper bound on generated tokens (default 100, as before).

    Returns
    -------
    str
        The decoded translation (special tokens stripped).
    """
    inputs = tokenizer(article, return_tensors="pt").to(device)

    # `convert_tokens_to_ids` replaces the `lang_code_to_id` mapping, which
    # was removed from the NLLB tokenizer API in transformers >= 4.34; the
    # FLORES-200 codes are ordinary special tokens in the vocabulary.
    target_lang_id = tokenizer.convert_tokens_to_ids(lang_dict[target])

    # Inference only — skip building the autograd graph.
    with torch.no_grad():
        translated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_lang_id,
            max_length=max_length,
        )

    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
|
|
|
|
|
# Languages supported by the Indic NLP Library's sentence splitter, mapped
# to the two-letter ISO 639-1 codes that `sentence_tokenize` expects.
INDIC_DICT = {
    "assamese": "as",
    "bengali": "bn",
    "gujarati": "gu",
    "hindi": "hi",
    "kannada": "kn",
    "malayalam": "ml",
    "marathi": "mr",
    "odia": "or",
    "punjabi": "pa",
    "tamil": "ta",
    "telugu": "te",
}
|
|
|
def split_sentences(paragraph, language):
    """Split *paragraph* into a list of sentences for *language*.

    Uses the Indic NLP Library for the Indic languages it supports, the Moses
    sentence splitter for English, and a naive period split otherwise.

    Parameters
    ----------
    paragraph : str
        Text to split.
    language : str
        Full lowercase language name (e.g. ``'hindi'``, ``'english'``), as
        used by ``lang_dict`` / ``INDIC_DICT`` elsewhere in this file.

    Returns
    -------
    list[str]
        The sentences of *paragraph*.
    """
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])

    # BUG FIX: this branch previously matched only the bare code 'en', but the
    # rest of the file identifies languages by full names ('english'), so
    # English always fell through to the naive splitter. Accept both spellings.
    if language in ("english", "en"):
        with MosesSentenceSplitter("en") as splitter:
            return splitter([paragraph])

    # Fallback: naive split on '.'. Drop empty/whitespace-only fragments so a
    # trailing period doesn't produce an empty "sentence" to be translated.
    return [part.strip() for part in paragraph.split(".") if part.strip()]
|
|
|
def languages(paragraph, source, target):
    """Translate *paragraph* from *source* into *target*.

    Short texts (fewer than 100 whitespace-separated words) are translated in
    a single call; longer texts are split into sentences first, translated
    one at a time, and re-joined with single spaces.

    Parameters
    ----------
    paragraph : str
        Text to translate.
    source : str
        Source language name — only used to pick the sentence splitter.
    target : str
        Target language name (a ``lang_dict`` key).

    Returns
    -------
    str
        The translated text.
    """
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)

    translated = [
        translate_sentence(sentence, target)
        for sentence in split_sentences(paragraph, source)
    ]
    return " ".join(translated)
|
|
|
# HACK: replace this module's entry in sys.modules with the `languages`
# function itself, so callers can do `import <thismodule>` and then call the
# imported name directly as a function. Side effect: the module object (and
# its other globals) is no longer reachable by name after import.
sys.modules[__name__] = languages
|
|
|
|
|
|
|
|
|
|