ambreshrc committed on
Commit
d0d448d
1 Parent(s): 5a65e56

Create new file

Browse files
Files changed (1) hide show
  1. app.py +123 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from io import BytesIO
3
+ # import gradio as gr
4
+ # Def_04 Docx file to translated_Docx file
5
+ #from transformers import MarianMTModel, MarianTokenizer
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+ import nltk
8
+ from nltk.tokenize import sent_tokenize
9
+ from nltk.tokenize import LineTokenizer
10
+ nltk.download('punkt')
11
+ import math
12
+ import torch
13
+ from docx import Document
14
+ from time import sleep
15
+ from stqdm import stqdm
16
+
17
+ import docx
18
def getText(filename):
    """Return the text of a .docx file with one line per paragraph.

    Args:
        filename: Whatever ``docx.Document`` accepts as its source
            (the caller in this file passes the uploaded file object).

    Returns:
        The ``text`` of every paragraph in the document, joined by newlines.
    """
    document = docx.Document(filename)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
24
+
25
+
26
+
27
+
28
+ # mname = 'Helsinki-NLP/opus-mt-en-hi'
29
+ # tokenizer = MarianTokenizer.from_pretrained(mname)
30
+ # model = MarianMTModel.from_pretrained(mname)
31
+ # model.to(device)
32
+
33
+ #@st.cache
34
# Default chain of translation models applied in order: en -> ru -> fr -> en.
_DEFAULT_MODEL_CHAIN = (
    "Helsinki-NLP/opus-mt-en-ru",
    "Helsinki-NLP/opus-mt-ru-fr",
    "Helsinki-NLP/opus-mt-fr-en",
)


def _translate_paragraph(paragraph, tokenizer, model, device, batch_size):
    """Translate one paragraph sentence by sentence and re-join with spaces."""
    sentences = sent_tokenize(paragraph)
    translated = []
    # Feed the model at most `batch_size` sentences at a time.
    for start in range(0, len(sentences), batch_size):
        sent_batch = sentences[start:start + batch_size]
        model_inputs = tokenizer(
            sent_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=500,
        ).to(device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated = model.generate(**model_inputs)
        translated.extend(
            tokenizer.decode(tokens, skip_special_tokens=True)
            for tokens in generated
        )
    return " ".join(translated)


def btTranslator(docxfile, model_names=_DEFAULT_MODEL_CHAIN, batch_size=64):
    """Run the text of a .docx file through a chain of translation models.

    The document text is split into non-blank lines (paragraphs), each
    paragraph is split into sentences, and the sentences are translated in
    batches. The output of one model becomes the input of the next, so with
    the default chain the text goes en -> ru -> fr -> en (presumably a
    back-translation / paraphrasing round trip).

    Args:
        docxfile: Source document, passed straight to ``getText``.
        model_names: Hugging Face model identifiers applied in order.
            Defaults to the en -> ru -> fr -> en chain.
        batch_size: Maximum number of sentences sent to the model per call.

    Returns:
        A new ``Document`` holding one paragraph per translated paragraph.
        If the source has no non-blank text, an empty document is returned
        without loading any model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    line_tokenizer = LineTokenizer()
    translated_doc = Document()

    text = getText(docxfile)
    # LineTokenizer drops blank lines, so an empty result means there is
    # nothing to translate; skip the expensive model loading in that case.
    if not line_tokenizer.tokenize(text):
        return translated_doc

    for model_name in stqdm(model_names):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model.to(device)

        paragraphs = line_tokenizer.tokenize(text)
        translated_paragraphs = [
            _translate_paragraph(paragraph, tokenizer, model, device, batch_size)
            for paragraph in stqdm(paragraphs)
        ]
        # The translated text feeds the next model in the chain.
        text = "\n".join(translated_paragraphs)

    # Write each translated paragraph as its own docx paragraph instead of
    # collapsing the whole text into a single paragraph with line breaks.
    for paragraph in line_tokenizer.tokenize(text):
        translated_doc.add_paragraph(paragraph)
    return translated_doc
94
+
95
+
96
+ #return translated_text
97
# --- Streamlit page: upload a .docx, translate it, offer the result ---
st.title('Translator App')
st.markdown("Translate from Docx file")
st.subheader("File Upload")

# Restrict the uploader to .docx, the only format the pipeline reads.
datas = st.file_uploader("Original File", type=["docx"])
name = st.text_input('Enter New File Name: ')

# In-memory buffer that receives the translated document.
binary_output = BytesIO()
if st.button(label='Translate'):
    if datas is None:
        # Nothing uploaded yet: tell the user instead of running the
        # translation pipeline on a missing file.
        st.warning('Upload File and Start the process')
    else:
        # st.spinner only shows while used as a context manager.
        with st.spinner('Waiting...'):
            btTranslator(datas).save(binary_output)
        st.success("Translated")

        # Fall back to a generic stem so the download is never named
        # just "_Translated.docx" when the name field is left empty.
        file_stem = name.strip() or "Document"
        # Offer the download only once the buffer holds a translated file.
        st.download_button(
            label='Download Translated File',
            file_name=f"{file_stem}_Translated.docx",
            data=binary_output.getvalue(),
        )
114
+ #files.save(f"{name}_Translated.docx")
115
+ #else:
116
+ # st.text('Upload File and Start the process')
117
+
118
+
119
+ #f4=binary_output(f3)
120
+
121
+ #st.sidebar.download_button(label='Download Translated File',file_name='Translated.docx', data=binary_output.getvalue())
122
+ # st.text_area(label="",value=btTranslator(datas),height=100)
123
+ # Footer