gauravchand11 committed on
Commit
22cd54b
·
verified ·
1 Parent(s): 5c70463

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -0
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import MarianTokenizer, MarianMTModel
2
+ import streamlit as st
3
+ from PyPDF2 import PdfReader
4
+ import docx
5
+ import os
6
+
7
+ # Load translation models and tokenizers
8
def load_translation_model(src_lang, tgt_lang):
    """Load the Marian tokenizer and model for one translation direction.

    Args:
        src_lang: Source language code inserted into the checkpoint name.
        tgt_lang: Target language code inserted into the checkpoint name.

    Returns:
        A ``(tokenizer, model)`` tuple, both obtained with
        ``from_pretrained`` on ``Helsinki-NLP/opus-mt-<src>-<tgt>``.
    """
    checkpoint = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    # Tokenizer first, then model, so the loading order is unchanged.
    return (
        MarianTokenizer.from_pretrained(checkpoint),
        MarianMTModel.from_pretrained(checkpoint),
    )
13
+
14
+ # Initialize models for supported language pairs
15
@st.cache_resource
def initialize_models():
    """Load every supported translation direction.

    Returns:
        A dict keyed by ``"<src>_<tgt>"`` (``en_hi``, ``en_mr``, ``hi_en``,
        ``mr_en``) whose values are the ``(tokenizer, model)`` tuples
        returned by ``load_translation_model``.
    """
    # There is no hi<->mr entry; only directions to or from English are loaded.
    directions = (("en", "hi"), ("en", "mr"), ("hi", "en"), ("mr", "en"))
    return {
        f"{src}_{tgt}": load_translation_model(src, tgt)
        for src, tgt in directions
    }
23
+
24
+ # Function to extract text from different file types
25
def extract_text(file):
    """Return the plain text contained in an uploaded PDF, DOCX, or TXT file.

    The format is chosen from the (case-insensitive) extension of
    ``file.name``.

    Args:
        file: A binary file-like object exposing a ``name`` attribute and,
            for TXT input, a ``read()`` method that returns bytes.

    Returns:
        The extracted text. For PDF and DOCX input every page/paragraph is
        followed by a newline; TXT input is returned as decoded.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.txt``.
        UnicodeDecodeError: If a TXT file is not valid UTF-8.
    """
    ext = os.path.splitext(file.name)[1].lower()

    if ext == ".pdf":
        reader = PdfReader(file)
        # Defensive: treat a page that yields no text (None) as empty instead
        # of failing on ``None + "\n"``.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )

    if ext == ".docx":
        document = docx.Document(file)
        return "".join(f"{para.text}\n" for para in document.paragraphs)

    if ext == ".txt":
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise reach the translator as a stray U+FEFF character.
        return file.read().decode("utf-8-sig")

    raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
47
+
48
+ # Translation function
49
def translate_text(text, src_lang, tgt_lang, models):
    """Translate ``text`` line by line with the model for the given direction.

    Args:
        text: Text to translate; it is split on ``"\\n"``.
        src_lang: Source language code.
        tgt_lang: Target language code.
        models: Mapping from ``"<src>_<tgt>"`` to a ``(tokenizer, model)``
            tuple.

    Returns:
        ``text`` unchanged when both languages are equal; an ``"Error: ..."``
        message when no model exists for the direction; otherwise the
        translated lines, each followed by a newline. Blank or
        whitespace-only lines are dropped from the output.
    """
    if src_lang == tgt_lang:
        return text

    direction = f"{src_lang}_{tgt_lang}"
    if direction not in models:
        return "Error: Direct translation between Hindi and Marathi is not supported. Please use English as an intermediate language."

    tokenizer, model = models[direction]

    def _translate_line(line):
        # NOTE: a line longer than 512 tokens is truncated by the tokenizer
        # call below, not split into further chunks.
        encoded = tokenizer(
            line,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        generated = model.generate(**encoded)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return "".join(
        _translate_line(line) + "\n"
        for line in text.split("\n")
        if line.strip()
    )
71
+
72
+ # Function to save text as a file
73
def save_text_to_file(text, original_filename, prefix="translated"):
    """Write ``text`` to a UTF-8 ``.txt`` file in the current working directory.

    Args:
        text: Content to write.
        original_filename: Name (or path) of the source document; only its
            base name is used.
        prefix: Label placed before the base name in the output file name.

    Returns:
        The output file name, ``"<prefix>_<basename>.txt"``.
    """
    base_name = os.path.basename(original_filename)
    target = f"{prefix}_{base_name}.txt"
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(text)
    return target
78
+
79
+ # Main processing function
80
def process_document(file, source_lang, target_lang, models):
    """Extract, translate and save an uploaded document.

    Args:
        file: Uploaded file object; its ``name`` is used for the output name.
        source_lang: Source language code.
        target_lang: Target language code.
        models: Mapping of translation directions to ``(tokenizer, model)``.

    Returns:
        A ``(output_file, text)`` tuple. ``text`` is the translation, or an
        ``"Error: ..."`` message, in which case the file is saved with the
        ``error`` prefix instead of the default one.
    """
    try:
        source_text = extract_text(file)
        outcome = translate_text(source_text, source_lang, target_lang, models)

        # translate_text reports failures as a string beginning with "Error:".
        extra = {"prefix": "error"} if outcome.startswith("Error:") else {}
        return save_text_to_file(outcome, file.name, **extra), outcome
    except Exception as exc:
        # Any failure above is turned into an error message and saved too.
        failure = "Error: " + str(exc)
        return save_text_to_file(failure, file.name, prefix="error"), failure
100
+
101
+ # Streamlit interface
102
def main():
    """Render the Streamlit page: upload, language pickers, translate, download."""
    st.title("Document Translator")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")

    models = initialize_models()

    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])

    # Source and target pickers sit side by side and offer the same codes.
    language_codes = ["en", "hi", "mr"]
    source_column, target_column = st.columns(2)
    with source_column:
        source_lang = st.selectbox("Source Language", language_codes, index=0)
    with target_column:
        target_lang = st.selectbox("Target Language", language_codes, index=1)

    # The Translate button is only created once a file has been uploaded.
    if uploaded_file is None or not st.button("Translate"):
        return

    with st.spinner("Translating..."):
        output_file, result_text = process_document(
            uploaded_file, source_lang, target_lang, models
        )

    st.text_area("Translated Text", result_text, height=300)

    # Offer the saved result (translation or error message) for download.
    with open(output_file, "rb") as download_handle:
        st.download_button(
            label="Download Translated Document",
            data=download_handle,
            file_name=os.path.basename(output_file),
            mime="text/plain",
        )
134
+
135
# Script entry point: build the UI only when this module is run as __main__.
if __name__ == "__main__":
    main()