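"""Translate paragraphs between English and Indian languages with Meta's
NLLB-200 distilled 600M model, splitting long paragraphs into sentences
(IndicNLP for Indic scripts, Moses for English) before translation."""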
import os
import sys

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from mosestokenizer import MosesSentenceSplitter
from indicnlp.tokenize import sentence_tokenize
# import zipfile
# with zipfile.ZipFile(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training\data.zip") as zip_ref:
#     zip_ref.extractall(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training")
os.chdir(r"C:\Users\Prince Raj\Desktop\BOT\transformers")
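# Load the distilled 600M NLLB-200 checkpoint once at import time and move it
# to the GPU when one is available; all translations reuse this single model.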
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
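# Human-readable language names mapped to NLLB / FLORES-200 language codes.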
lang_dict = {
    'english': 'eng_Latn',
    'assamese': 'asm_Beng',
    'awadhi': 'awa_Deva',
    'bengali': 'ben_Beng',
    'bhojpuri': 'bho_Deva',
    'gujarati': 'guj_Gujr',
    'hindi': 'hin_Deva',
    'kannada': 'kan_Knda',
    'kashmiri': 'kas_Deva',
    'maithili': 'mai_Deva',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'sanskrit': 'san_Deva',
    'sindhi': 'snd_Arab',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab',
}
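# Translate one sentence (or a short passage) into `target`, a key of
# lang_dict. Note: the NLLB tokenizer encodes with src_lang "eng_Latn" by
# default, so non-English sources may need tokenizer.src_lang set before
# encoding; on recent transformers releases,
# tokenizer.convert_tokens_to_ids(code) may be needed in place of the
# deprecated lang_code_to_id mapping.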
def translate_sentence(article, target):
    inputs = tokenizer(article, return_tensors="pt").to(device)
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[lang_dict[target]],
        max_length=100,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
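# ISO 639-1 codes for the languages the IndicNLP sentence splitter handles;
# languages absent here (e.g. sanskrit, urdu) fall back to a naive "." split.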
INDIC_DICT = {
    'assamese': 'as',
    'bengali': 'bn',
    'gujarati': 'gu',
    'hindi': 'hi',
    'kannada': 'kn',
    'malayalam': 'ml',
    'marathi': 'mr',
    'odia': 'or',
    'punjabi': 'pa',
    'tamil': 'ta',
    'telugu': 'te',
}
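# Split a paragraph into sentences: IndicNLP for supported Indic languages,
# Moses for English, and a naive period split otherwise.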
def split_sentences(paragraph, language):
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    elif language == 'english':  # callers pass full names, not 'en'
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        return paragraph.split(".")
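# Entry point: paragraphs under 100 words go through the model in one call;
# longer ones are split by source language and translated sentence by
# sentence so generation stays within the max_length limit.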
def languages(paragraph, source, target):
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)
    else:
        sentences = split_sentences(paragraph, source)
        outputs = []
        for each_sentence in sentences:
            outputs.append(translate_sentence(each_sentence, target))
        return " ".join(outputs)
sys.modules[__name__] = languages
# sent = "I am hungry now"
# src = "english"
# trg = "hindi"
# print(languages(sent, src, trg))