import json
import os
import time
from typing import Dict, List, Optional

import streamlit as st
from dotenv import load_dotenv
from groq import Groq

# Load settings from a local .env file (python-dotenv) into the process
# environment so the API key below does not have to be exported by hand.
load_dotenv()

# Module-level Groq client shared by TranslationManager.translate_chunk.
# os.getenv returns None when GROQ_API_KEY is not set.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))


class TranslationManager:
    """Split English text into chunks and translate them to Tamil via Groq.

    The instance keeps a short rolling window of previous translations that
    is fed back into the prompt when translating in "contextual" mode.
    Attributes are plain fields so callers can reset them directly.
    """

    # Attempts made per chunk before the last API error is re-raised.
    MAX_RETRIES = 3
    # Pause, in seconds, between a failed attempt and the next one.
    RETRY_DELAY_SECONDS = 2
    # Number of recent translations remembered for contextual prompts.
    CONTEXT_WINDOW_SIZE = 3

    def __init__(self):
        # Character budget per chunk; every word costs len(word) + 1.
        self.chunk_size = 1500
        # Number of look-ahead *words* exposed as a chunk's 'overlap_text'.
        self.overlap_size = 200
        # Most recent contextual translations, oldest first.
        self.context_window = []

    def chunk_text_with_context(self, text: str) -> List[Dict]:
        """Split ``text`` into consecutive, non-overlapping chunks.

        Words are accumulated until their combined length (plus one
        separator per word) reaches ``self.chunk_size``.

        Args:
            text: The text to split; any whitespace separates words.

        Returns:
            A list of dicts, in order, each with the keys:
              * 'main_text'    - the words of this chunk joined by spaces;
              * 'overlap_text' - up to ``self.overlap_size`` words that
                follow the chunk, offered as look-ahead context only
                ('' for the last chunk);
              * 'position'     - the zero-based index of the chunk.
            Joining every 'main_text' with spaces reproduces the input words
            exactly once, so translations can be concatenated safely.
            Empty or whitespace-only input yields an empty list.
        """
        words = text.split()
        chunks: List[Dict] = []
        start = 0
        current_length = 0

        for index, word in enumerate(words):
            current_length += len(word) + 1
            if current_length < self.chunk_size:
                continue

            end = index + 1
            chunks.append({
                'main_text': ' '.join(words[start:end]),
                # Slicing past the end of the list simply yields fewer words.
                'overlap_text': ' '.join(words[end:end + self.overlap_size]),
                'position': len(chunks),
            })
            # Start the next chunk right after this one: carrying words over
            # into 'main_text' would make them be translated twice.
            start = end
            current_length = 0

        if start < len(words):
            chunks.append({
                'main_text': ' '.join(words[start:]),
                'overlap_text': '',
                'position': len(chunks),
            })

        return chunks

    def create_translation_prompt(self, chunk: Dict, mode: str, domain: Optional[str] = None) -> str:
        """Build the prompt sent to the model for one chunk.

        Args:
            chunk: A dict as produced by ``chunk_text_with_context``;
                'main_text' is always read, 'overlap_text' only for
                non-"normal" modes.
            mode: "normal" for a plain translation prompt; any other value
                selects the contextual prompt.
            domain: Optional subject area mentioned in the contextual prompt.

        Returns:
            The prompt text.
        """
        if mode == "normal":
            return (
                "Translate the following English text to Tamil.\n"
                "Provide only the Tamil translation without any other text.\n"
                "\n"
                f"English text: {chunk['main_text']}"
            )

        context = f"Domain: {domain}\n" if domain else ""
        # Only the most recent translation is quoted back to the model.
        previous_context = self.context_window[-1] if self.context_window else ""

        return (
            "Perform a contextual translation from English to Tamil.\n"
            "Consider the following aspects:\n"
            f"{context}\n"
            f"Previous context: {previous_context}\n"
            "\n"
            "Maintain the following in your translation:\n"
            "- Preserve domain-specific terminology\n"
            "- Maintain consistent style and tone\n"
            "- Ensure contextual coherence with previous translations\n"
            "- Adapt idiomatic expressions appropriately\n"
            "\n"
            f"Text to translate: {chunk['main_text']}\n"
            "\n"
            f"Overlap context: {chunk['overlap_text']}\n"
            "\n"
            "Provide only the Tamil translation without any explanations."
        )

    def translate_chunk(self, chunk: Dict, mode: str, domain: Optional[str] = None) -> str:
        """Translate a single chunk, retrying on failure.

        Args:
            chunk: A dict as produced by ``chunk_text_with_context``.
            mode: "normal" or "contextual".
            domain: Optional subject area forwarded to the prompt builder.

        Returns:
            The translation assembled from the streamed response.

        Raises:
            Exception: Whatever the final attempt raised, once
                ``MAX_RETRIES`` attempts have failed.
        """
        prompt = self.create_translation_prompt(chunk, mode, domain)

        for attempt in range(self.MAX_RETRIES):
            try:
                completion = client.chat.completions.create(
                    model="Gemma2-9b-It",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.3 if mode == "normal" else 0.4,
                    max_tokens=2048,
                    top_p=1,
                    stream=True,
                    stop=None,
                )
                # Consume the stream inside the try block so that an error in
                # the middle of a response also triggers a retry.
                translation = "".join(
                    part.choices[0].delta.content or "" for part in completion
                )
            except Exception:
                if attempt == self.MAX_RETRIES - 1:
                    raise
                time.sleep(self.RETRY_DELAY_SECONDS)
                continue

            if mode == "contextual":
                self.context_window.append(translation)
                if len(self.context_window) > self.CONTEXT_WINDOW_SIZE:
                    self.context_window.pop(0)

            return translation

        # Only reachable when MAX_RETRIES is configured as zero.
        return ""


def _preview(text: str, limit: int = 100) -> str:
    """Return ``text`` cut to ``limit`` characters, adding "..." when cut."""
    return (text[:limit] + "...") if len(text) > limit else text


def _render_history(history: List[Dict]) -> None:
    """Show the five most recent history entries, newest first."""
    if not history:
        return

    total = len(history)
    with st.expander("Translation History"):
        for offset, entry in enumerate(reversed(history[-5:])):
            # Number entries by their position in the full history.
            st.write(f"Translation {total - offset}")
            st.write(f"Mode: {entry['mode']}")
            if entry['domain'] != "N/A":
                st.write(f"Domain: {entry['domain']}")
            st.write(f"Timestamp: {entry['timestamp']}")
            st.write("English:", _preview(entry['english']))
            st.write("Tamil:", _preview(entry['tamil']))
            st.markdown("---")


def _translate_and_display(english_input: str, translation_mode: str,
                           domain: Optional[str]) -> None:
    """Translate the input chunk by chunk, show it and offer downloads.

    Any exception is reported through ``st.error`` instead of propagating.
    """
    manager = st.session_state.translation_manager

    # Created before the try block so the finally clause can always clear them.
    progress_bar = st.progress(0)
    status_text = st.empty()

    try:
        # Do not let context from an earlier, unrelated text leak into this run.
        manager.context_window = []

        chunks = manager.chunk_text_with_context(english_input)
        translated_chunks = []
        for number, chunk in enumerate(chunks, start=1):
            status_text.text(f"Translating part {number} of {len(chunks)}...")
            translated_chunks.append(
                manager.translate_chunk(
                    chunk,
                    mode=translation_mode.lower(),
                    domain=domain,
                )
            )
            progress_bar.progress(number / len(chunks))

        final_translation = ' '.join(translated_chunks)
        domain_label = domain if domain is not None else "N/A"
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Original Text")
            st.write(english_input)
            st.info(f"Word count: {len(english_input.split())}")
        with col2:
            st.subheader("Tamil Translation")
            st.write(final_translation)

        st.session_state.translation_history.append({
            'english': english_input,
            'tamil': final_translation,
            'mode': translation_mode,
            'domain': domain_label,
            'timestamp': timestamp,
        })

        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                "Download Translation",
                final_translation,
                file_name=f"tamil_translation_{translation_mode.lower()}.txt",
                mime="text/plain",
            )
        with col2:
            export_data = {
                'original': english_input,
                'translation': final_translation,
                'mode': translation_mode,
                'domain': domain_label,
                'timestamp': timestamp,
            }
            st.download_button(
                "Export with Metadata",
                # ensure_ascii=False keeps the Tamil text readable in the
                # exported file instead of \uXXXX escape sequences.
                json.dumps(export_data, indent=2, ensure_ascii=False),
                file_name="translation_with_metadata.json",
                mime="application/json",
            )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
    finally:
        progress_bar.empty()
        status_text.empty()


def main():
    """Render the Streamlit page: settings, text input, translation, history."""
    st.set_page_config(page_title="Advanced Tamil Translator", layout="wide")

    # Session state survives Streamlit reruns, so create these only once.
    if 'translation_manager' not in st.session_state:
        st.session_state.translation_manager = TranslationManager()
    if 'translation_history' not in st.session_state:
        st.session_state.translation_history = []

    st.title("Advanced English to Tamil Translator")

    # Stays None in Normal mode, where no domain selector is shown.
    domain = None
    with st.expander("Translation Settings", expanded=True):
        col1, col2 = st.columns(2)
        with col1:
            translation_mode = st.radio(
                "Translation Mode",
                ["Normal", "Contextual"],
                help="Normal: Direct translation\nContextual: Context-aware translation with domain specificity"
            )
        with col2:
            if translation_mode == "Contextual":
                domain = st.selectbox(
                    "Select Domain",
                    ["General", "Technical", "Medical", "Legal", "Literary", "Business", "Academic"],
                    help="Select the domain to improve translation accuracy"
                )

    st.subheader("Enter Text")
    english_input = st.text_area("Enter English text of any length:", height=200)

    if st.button("Translate"):
        # Whitespace-only input contains nothing to translate either.
        if not english_input or not english_input.strip():
            st.error("Please enter some text to translate.")
        else:
            _translate_and_display(english_input, translation_mode, domain)

    _render_history(st.session_state.translation_history)


# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()