Create new file
Browse files
    	
        app.py
    ADDED
    
    | 
         @@ -0,0 +1,123 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import streamlit as st
         
     | 
| 2 | 
         
            +
            from io import BytesIO
         
     | 
| 3 | 
         
            +
            # import gradio as gr
         
     | 
| 4 | 
         
            +
            # Def_04 Docx file to translated_Docx file
         
     | 
| 5 | 
         
            +
            #from transformers import MarianMTModel, MarianTokenizer
         
     | 
| 6 | 
         
            +
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
         
     | 
| 7 | 
         
            +
            import nltk
         
     | 
| 8 | 
         
            +
            from nltk.tokenize import sent_tokenize
         
     | 
| 9 | 
         
            +
            from nltk.tokenize import LineTokenizer
         
     | 
| 10 | 
         
            +
            nltk.download('punkt')
         
     | 
| 11 | 
         
            +
            import math
         
     | 
| 12 | 
         
            +
            import torch
         
     | 
| 13 | 
         
            +
            from docx import Document
         
     | 
| 14 | 
         
            +
            from time import sleep
         
     | 
| 15 | 
         
            +
            from stqdm import stqdm
         
     | 
| 16 | 
         
            +
             
     | 
| 17 | 
         
            +
            import docx
         
     | 
| 18 | 
         
            +
            def getText(filename):
                """Return the text of every paragraph in a .docx document.

                Args:
                    filename: Passed unchanged to ``docx.Document``.

                Returns:
                    The paragraph texts joined with a newline between
                    consecutive paragraphs.
                """
                document = docx.Document(filename)
                return '\n'.join(paragraph.text for paragraph in document.paragraphs)
         
     | 
| 24 | 
         
            +
                
         
     | 
| 25 | 
         
            +
             
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
             
         
     | 
| 28 | 
         
            +
            # mname = 'Helsinki-NLP/opus-mt-en-hi'
         
     | 
| 29 | 
         
            +
            # tokenizer = MarianTokenizer.from_pretrained(mname)
         
     | 
| 30 | 
         
            +
            # model = MarianMTModel.from_pretrained(mname)
         
     | 
| 31 | 
         
            +
            # model.to(device)
         
     | 
| 32 | 
         
            +
             
     | 
| 33 | 
         
            +
            #@st.cache
         
     | 
| 34 | 
         
            +
            def _translate_sentences(sentences, tokenizer, model, device, batch_size=64):
                """Translate a list of sentences in batches and join them into one string.

                Args:
                    sentences: Non-empty list of sentence strings from one paragraph.
                    tokenizer: Tokenizer loaded for the same checkpoint as ``model``.
                    model: Seq2seq model already moved to ``device``.
                    device: ``torch.device`` the encoded batch is moved to.
                    batch_size: Maximum number of sentences per ``generate`` call.

                Returns:
                    The translated sentences joined with single spaces.
                """
                translated = []
                for start in range(0, len(sentences), batch_size):
                    sent_batch = sentences[start:start + batch_size]
                    model_inputs = tokenizer(
                        sent_batch,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=500,
                    ).to(device)
                    with torch.no_grad():
                        generated = model.generate(**model_inputs)
                    # Decode each batch right away so ``translated`` only ever holds
                    # strings, however many batches the paragraph needs.
                    translated.extend(
                        tokenizer.decode(tokens, skip_special_tokens=True)
                        for tokens in generated
                    )
                return " ".join(translated)


            def btTranslator(docxfile):
                """Back-translate a .docx file through en->ru, ru->fr and fr->en models.

                The document text is read with ``getText``, split into non-blank
                lines, and each line is translated sentence-by-sentence by every
                model in turn; the output of one pass is the input of the next.
                Progress is shown with ``stqdm`` bars (one over the models, one over
                the paragraphs of each pass).

                Args:
                    docxfile: Whatever ``getText`` accepts (forwarded unchanged).

                Returns:
                    A new ``Document`` containing a single paragraph with the text
                    produced by the last pass; translated lines are separated by
                    newlines.
                """
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

                # Checkpoints applied in order: English -> Russian -> French -> English.
                model_names = (
                    "Helsinki-NLP/opus-mt-en-ru",
                    "Helsinki-NLP/opus-mt-ru-fr",
                    "Helsinki-NLP/opus-mt-fr-en",
                )

                line_tokenizer = LineTokenizer()
                bigtext = getText(docxfile)

                for model_name in stqdm(model_names):
                    tokenizer = AutoTokenizer.from_pretrained(model_name)
                    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
                    model.to(device)

                    translated_paragraphs = []
                    for paragraph in stqdm(line_tokenizer.tokenize(bigtext)):
                        sentences = sent_tokenize(paragraph)
                        if not sentences:
                            # Nothing to translate in this line; it is left out.
                            continue
                        translated_paragraphs.append(
                            _translate_sentences(sentences, tokenizer, model, device)
                        )

                    # The next pass translates what this pass produced.
                    bigtext = "\n".join(translated_paragraphs)

                files = Document()
                files.add_paragraph(bigtext)
                return files
         
     | 
| 94 | 
         
            +
             
     | 
| 95 | 
         
            +
             
     | 
| 96 | 
         
            +
              #return translated_text
         
     | 
| 97 | 
         
            +
            # ---- Streamlit page: upload a .docx, back-translate it, offer a download ----
            st.title('Translator App')
            st.markdown("Translate from Docx file")
            st.subheader("File Upload")

            datas = st.file_uploader("Original File")
            name = st.text_input('Enter New File Name: ')
            binary_output = BytesIO()

            if st.button(label='Translate'):
                if datas is None:
                    # Nothing has been uploaded yet, so there is nothing to translate.
                    st.warning("Please upload a .docx file before translating.")
                else:
                    # st.spinner only shows while used as a context manager.
                    with st.spinner('Waiting...'):
                        btTranslator(datas).save(binary_output)
                    # Streamlit re-runs this script on every widget interaction, so the
                    # result is kept in session state to survive the next rerun.
                    st.session_state["translated_docx"] = binary_output.getvalue()
                    st.success("Translated")

            # Only offer the download once a translated document actually exists.
            if "translated_docx" in st.session_state:
                st.download_button(
                    label='Download Translated File',
                    file_name=(f"{name}_Translated.docx"),
                    data=st.session_state["translated_docx"],
                )
         
     | 
| 114 | 
         
            +
            #files.save(f"{name}_Translated.docx")
         
     | 
| 115 | 
         
            +
            #else:
         
     | 
| 116 | 
         
            +
             #   st.text('Upload File and Start the process')
         
     | 
| 117 | 
         
            +
                    
         
     | 
| 118 | 
         
            +
             
     | 
| 119 | 
         
            +
            #f4=binary_output(f3)
         
     | 
| 120 | 
         
            +
             
     | 
| 121 | 
         
            +
            #st.sidebar.download_button(label='Download Translated File',file_name='Translated.docx', data=binary_output.getvalue()) 
         
     | 
| 122 | 
         
            +
            # st.text_area(label="",value=btTranslator(datas),height=100)
         
     | 
| 123 | 
         
            +
            # Footer
         
     |