Spaces:

arsath-sm
/

ENG-TAM_with_Gemma_model

Running

App Files Files Community

ENG-TAM_with_Gemma_model / app.py

arsath-sm

Rename translator2.py to app.py

f658876 verified 3 months ago

raw

history blame contribute delete

10.4 kB

	import os
	import streamlit as st
	from dotenv import load_dotenv
	from groq import Groq
	import json
	from typing import List, Dict
	import time

	# Load environment variables from .env file
	load_dotenv()

	# Initialize the Groq client
	client = Groq(api_key=os.getenv("GROQ_API_KEY"))

	class TranslationManager:
	def __init__(self):
	self.chunk_size = 1500
	self.overlap_size = 200
	self.context_window = []

	def chunk_text_with_context(self, text: str) -> List[Dict]:
	"""Split text into chunks while maintaining context"""
	words = text.split()
	chunks = []
	current_chunk = []
	current_length = 0

	for i, word in enumerate(words):
	current_chunk.append(word)
	current_length += len(word) + 1

	# Check if chunk size is reached
	if current_length >= self.chunk_size:
	# Add overlap from next words if available
	overlap_words = words[i+1:i+1+self.overlap_size] if i+1 < len(words) else []

	chunks.append({
	'main_text': ' '.join(current_chunk),
	'overlap_text': ' '.join(overlap_words),
	'position': len(chunks)
	})

	# Start new chunk with some overlap
	current_chunk = words[max(0, i-50):i+1]
	current_length = sum(len(w) + 1 for w in current_chunk)

	# Add remaining text as last chunk
	if current_chunk:
	chunks.append({
	'main_text': ' '.join(current_chunk),
	'overlap_text': '',
	'position': len(chunks)
	})

	return chunks

	def create_translation_prompt(self, chunk: Dict, mode: str, domain: str = None) -> str:
	"""Create appropriate prompt based on translation mode"""
	if mode == "normal":
	prompt = f"""Translate the following English text to Tamil.
	Provide only the Tamil translation without any other text.

	English text: {chunk['main_text']}"""
	else: # contextual
	context = f"Domain: {domain}\n" if domain else ""
	previous_context = self.context_window[-1] if self.context_window else ""

	prompt = f"""Perform a contextual translation from English to Tamil.
	Consider the following aspects:
	{context}
	Previous context: {previous_context}

	Maintain the following in your translation:
	- Preserve domain-specific terminology
	- Maintain consistent style and tone
	- Ensure contextual coherence with previous translations
	- Adapt idiomatic expressions appropriately

	Text to translate: {chunk['main_text']}

	Overlap context: {chunk['overlap_text']}

	Provide only the Tamil translation without any explanations."""

	return prompt

	def translate_chunk(self, chunk: Dict, mode: str, domain: str = None) -> str:
	"""Translate a single chunk of text"""
	prompt = self.create_translation_prompt(chunk, mode, domain)

	max_retries = 3
	for attempt in range(max_retries):
	try:
	completion = client.chat.completions.create(
	model="Gemma2-9b-It",
	messages=[
	{
	"role": "user",
	"content": prompt
	}
	],
	temperature=0.3 if mode == "normal" else 0.4,
	max_tokens=2048,
	top_p=1,
	stream=True,
	stop=None,
	)

	translation = ""
	for chunk_response in completion:
	translation += chunk_response.choices[0].delta.content or ""

	# Update context window for contextual translation
	if mode == "contextual":
	self.context_window.append(translation)
	if len(self.context_window) > 3:
	self.context_window.pop(0)

	return translation

	except Exception as e:
	if attempt == max_retries - 1:
	raise e
	time.sleep(2) # Wait before retry

	return ""

	def main():
	st.set_page_config(page_title="Advanced Tamil Translator", layout="wide")

	# Initialize translation manager
	if 'translation_manager' not in st.session_state:
	st.session_state.translation_manager = TranslationManager()

	if 'translation_history' not in st.session_state:
	st.session_state.translation_history = []

	st.title("Advanced English to Tamil Translator")

	# Translation settings
	with st.expander("Translation Settings", expanded=True):
	col1, col2 = st.columns(2)
	with col1:
	translation_mode = st.radio(
	"Translation Mode",
	["Normal", "Contextual"],
	help="Normal: Direct translation\nContextual: Context-aware translation with domain specificity"
	)

	with col2:
	if translation_mode == "Contextual":
	domain = st.selectbox(
	"Select Domain",
	["General", "Technical", "Medical", "Legal", "Literary", "Business", "Academic"],
	help="Select the domain to improve translation accuracy"
	)

	# Input area
	st.subheader("Enter Text")
	english_input = st.text_area("Enter English text of any length:", height=200)

	# Translation button
	if st.button("Translate"):
	if not english_input:
	st.error("Please enter some text to translate.")
	return

	try:
	# Initialize progress tracking
	progress_bar = st.progress(0)
	status_text = st.empty()

	# Reset context window for new translation
	st.session_state.translation_manager.context_window = []

	# Chunk the input text
	chunks = st.session_state.translation_manager.chunk_text_with_context(english_input)
	translated_chunks = []

	# Translate each chunk
	for i, chunk in enumerate(chunks):
	status_text.text(f"Translating part {i+1} of {len(chunks)}...")

	translation = st.session_state.translation_manager.translate_chunk(
	chunk,
	mode=translation_mode.lower(),
	domain=domain if translation_mode == "Contextual" else None
	)

	translated_chunks.append(translation)
	progress_bar.progress((i + 1) / len(chunks))

	# Combine translations
	final_translation = ' '.join(translated_chunks)

	# Display results
	col1, col2 = st.columns(2)

	with col1:
	st.subheader("Original Text")
	st.write(english_input)
	st.info(f"Word count: {len(english_input.split())}")

	with col2:
	st.subheader("Tamil Translation")
	st.write(final_translation)

	# Add to history
	st.session_state.translation_history.append({
	'english': english_input,
	'tamil': final_translation,
	'mode': translation_mode,
	'domain': domain if translation_mode == "Contextual" else "N/A",
	'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
	})

	# Download options
	col1, col2 = st.columns(2)
	with col1:
	st.download_button(
	"Download Translation",
	final_translation,
	file_name=f"tamil_translation_{translation_mode.lower()}.txt",
	mime="text/plain"
	)

	with col2:
	# Export translation with metadata
	export_data = {
	'original': english_input,
	'translation': final_translation,
	'mode': translation_mode,
	'domain': domain if translation_mode == "Contextual" else "N/A",
	'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
	}
	st.download_button(
	"Export with Metadata",
	json.dumps(export_data, indent=2),
	file_name="translation_with_metadata.json",
	mime="application/json"
	)

	except Exception as e:
	st.error(f"An error occurred: {str(e)}")

	finally:
	progress_bar.empty()
	status_text.empty()

	# Translation History
	if st.session_state.translation_history:
	with st.expander("Translation History"):
	for i, entry in enumerate(reversed(st.session_state.translation_history[-5:])):
	st.write(f"Translation {len(st.session_state.translation_history)-i}")
	st.write(f"Mode: {entry['mode']}")
	if entry['domain'] != "N/A":
	st.write(f"Domain: {entry['domain']}")
	st.write(f"Timestamp: {entry['timestamp']}")
	st.write("English:", entry['english'][:100] + "..." if len(entry['english']) > 100 else entry['english'])
	st.write("Tamil:", entry['tamil'][:100] + "..." if len(entry['tamil']) > 100 else entry['tamil'])
	st.markdown("---")

	if __name__ == "__main__":
	main()