File size: 2,647 Bytes
12f6a20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from docx import Document
import os
import sys
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch 
from mosestokenizer import *
from indicnlp.tokenize import sentence_tokenize


# import zipfile
# with zipfile.ZipFile(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training\data.zip") as zip_ref:
#     zip_ref.extractall(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training")
# NOTE(review): hard-coded Windows user path — breaks on any other machine;
# TODO confirm whether the chdir is still needed at all (nothing below uses
# relative paths).
os.chdir(r"C:\Users\Prince Raj\Desktop\BOT\transformers")
# Load the NLLB-200 distilled 600M multilingual translation model and its
# tokenizer once at import time; both are shared by every function below.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
# Prefer GPU when available and move the model weights there up front.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Human-readable language name -> NLLB-200 / FLORES-200 language code,
# as expected by the "facebook/nllb-200-distilled-600M" tokenizer.
lang_dict = {
    'english': 'eng_Latn',
    'assamese': 'asm_Beng',
    'awadhi': 'awa_Deva',
    'bengali': 'ben_Beng',
    'bhojpuri': 'bho_Deva',
    'gujarati': 'guj_Gujr',
    'hindi': 'hin_Deva',
    'kannada': 'kan_Knda',
    'kashmiri': 'kas_Deva',
    'maithili': 'mai_Deva',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'sanskrit': 'san_Deva',
    'sindhi': 'snd_Arab',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab',
}

def translate_sentence(article, target, max_length=100):
    """Translate *article* into the *target* language with NLLB-200.

    Parameters
    ----------
    article : str
        Source-language text (a sentence or short paragraph).
    target : str
        Human-readable language name; must be a key of ``lang_dict``.
    max_length : int, optional
        Upper bound on generated token count (default 100, the original
        hard-coded limit).

    Returns
    -------
    str
        The translated text.

    Raises
    ------
    KeyError
        If *target* is not a key of ``lang_dict``.
    """
    inputs = tokenizer(article, return_tensors="pt").to(device)

    # FIX: ``tokenizer.lang_code_to_id`` was removed from the NLLB tokenizer
    # in recent transformers releases; ``convert_tokens_to_ids`` on the
    # language-code token is the supported equivalent.
    target_lang_id = tokenizer.convert_tokens_to_ids(lang_dict[target])

    # Forcing the BOS token steers generation into the target language.
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=target_lang_id, max_length=max_length)

    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]


# Human-readable language name -> two-letter code understood by the
# Indic NLP Library sentence splitter.
INDIC_DICT = {
    'assamese': 'as',
    'bengali': 'bn',
    'gujarati': 'gu',
    'hindi': 'hi',
    'kannada': 'kn',
    'malayalam': 'ml',
    'marathi': 'mr',
    'odia': 'or',
    'punjabi': 'pa',
    'tamil': 'ta',
    'telugu': 'te',
}


def split_sentences(paragraph, language):
    """Split *paragraph* into a list of sentences.

    Indic languages go through the Indic NLP Library splitter, English
    through the Moses splitter, and anything else falls back to a naive
    split on '.'.
    """
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    # BUG FIX: callers pass full language names ('english'), but the original
    # only matched the bare code 'en', so English text always fell through to
    # the naive '.' split. Accept both spellings for backward compatibility.
    elif language in ('english', 'en'):
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        return paragraph.split(".")

def languages(paragraph, source, target):
    """Translate *paragraph* from the *source* language into *target*.

    Paragraphs shorter than 100 words are translated in a single model
    call; longer ones are split into sentences first and the individual
    translations joined with spaces.
    """
    # Short text: one model call is enough.
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)

    # Long text: translate sentence by sentence, then stitch back together.
    pieces = split_sentences(paragraph, source)
    translated = [translate_sentence(piece, target) for piece in pieces]
    return " ".join(translated)

sys.modules[__name__] = languages

# sent = "I am hungry now"
# src = "english"
# trg = "hindi"
# print(languages(sent, src, trg))