Spaces:

cloghost
/

nllb-hind2kangri

Sleeping

App Files Community

cloghost committed 3 days ago

Commit

64ce29d

•

1 Parent(s): fd1f73a

Update app.py

Browse files

Files changed (1) hide show

app.py +210 -62

app.py CHANGED Viewed

@@ -1,6 +1,10 @@
 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 # Page configuration
 st.set_page_config(
@@ -9,92 +13,236 @@ st.set_page_config(
     layout="wide"
 )
 @st.cache_resource
 def load_model():
     """Load and cache the model and tokenizer"""
-    model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"
-    # Load model and tokenizer
-    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Set device dynamically
-    device = 0 if torch.cuda.is_available() else -1
-    # Initialize translation pipeline
-    translator = pipeline(
-        "translation",
-        model=model,
-        tokenizer=tokenizer,
-        src_lang="hin_Deva",
-        tgt_lang="kang_Deva",
-        device=device
-    )
-    return translator
 def translate_text(translator, text):
-    """Translate the input text"""
     try:
-        translation = translator(text)
         return translation[0]['translation_text']
     except Exception as e:
         st.error(f"Translation Error: {str(e)}")
         return None
 def main():
-    # App title and description
     st.title("🗣️ Hindi to Kangri Translator")
     st.markdown("""
-    This application translates Hindi (Devanagari) text to Kangri language using a fine-tuned NLLB-200 model.
-    Simply enter your Hindi text in the input box below and click 'Translate'.
     """)
-    # Model loading with spinner
-    with st.spinner("Loading translation model..."):
-        translator = load_model()
-    # Create two columns for input and output
-    col1, col2 = st.columns(2)
-    # Input text area
-    with col1:
-        st.subheader("Hindi Text (हिंदी)")
-        input_text = st.text_area(
-            "Enter Hindi text",
-            height=200,
-            help="Enter the Hindi text you want to translate to Kangri",
-            placeholder="यहाँ हिंदी में टेक्स्ट लिखें..."
-        )
-    # Add translation button
-    if st.button("Translate to Kangri"):
-        if input_text:
-            with st.spinner("Translating..."):
-                translated_text = translate_text(translator, input_text)
-                if translated_text:
-                    with col2:
-                        st.subheader("Kangri Translation (कांगड़ी)")
-                        st.text_area(
-                            "Kangri translation",
-                            value=translated_text,
-                            height=200,
-                            disabled=True
-                        )
-        else:
-            st.warning("Please enter some Hindi text to translate.")
-    # Add information about the model
-    st.markdown("---")
-    st.markdown("""
-    ### About the Model
-    This translator uses the `cloghost/nllb-200-distilled-600M-hin-kang-v1` model, which is a distilled version
-    of the NLLB-200 model specifically fine-tuned for Hindi to Kangri translation. The model supports:
-    - Source Language: Hindi (Devanagari script)
-    - Target Language: Kangri (Devanagari script)
-    - Maximum input length: 512 tokens
-    """)
 if __name__ == "__main__":
     main()

 import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+import re
+import pandas as pd
+from io import StringIO
+import time
 # Page configuration
 st.set_page_config(
     layout="wide"
 )
+# Custom CSS for better styling
+st.markdown("""
+    <style>
+    .stAlert {
+        padding: 10px;
+        margin: 10px 0;
+    }
+    .example-text {
+        padding: 10px;
+        background-color: #f0f2f6;
+        border-radius: 5px;
+        margin: 5px 0;
+        cursor: pointer;
+    }
+    </style>
+""", unsafe_allow_html=True)
+# Example texts
+EXAMPLE_TEXTS = {
+    "General Conversation": "मैं आज बाजार जा रहा हूं। क्या आप मेरे साथ चलना चाहेंगे?",
+    "Cultural": "दिवाली का त्योहार रोशनी और खुशियों का त्योहार है।",
+    "Literature": "साहित्य मानव जीवन का दर्पण है। इसमें समाज की हर छवि दिखाई देती है।",
+    "Tourism": "हिमाचल प्रदेश की सुंदर पहाड़ियां और हरी-भरी वादियां पर्यटकों को आकर्षित करती हैं।"
+}
 @st.cache_resource
 def load_model():
     """Load and cache the model and tokenizer"""
+    try:
+        model_name = "cloghost/nllb-200-distilled-600M-hin-kang-v1"
+        with st.spinner("Loading model and tokenizer..."):
+            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            device = 0 if torch.cuda.is_available() else -1
+            translator = pipeline(
+                "translation",
+                model=model,
+                tokenizer=tokenizer,
+                src_lang="hin_Deva",
+                tgt_lang="kang_Deva",
+                device=device
+            )
+            return translator
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        return None
+def preprocess_text(text):
+    """Preprocess the input text"""
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text.strip())
+    # Remove special characters except Devanagari and basic punctuation
+    text = re.sub(r'[^\u0900-\u097F\s।,.?!]', '', text)
+    # Normalize common variations of Hindi characters
+    text = text.replace('॰', '.')
+    return text
+def batch_translate(translator, texts):
+    """Translate a batch of texts"""
+    results = []
+    for text in texts:
+        try:
+            if text.strip():  # Only translate non-empty texts
+                translation = translator(text)
+                results.append({
+                    'Source': text.strip(),
+                    'Translation': translation[0]['translation_text']
+                })
+            else:
+                results.append({
+                    'Source': '',
+                    'Translation': ''
+                })
+        except Exception as e:
+            results.append({
+                'Source': text.strip(),
+                'Translation': f'Error: {str(e)}'
+            })
+    return pd.DataFrame(results)
 def translate_text(translator, text):
+    """Translate single text with error handling"""
     try:
+        preprocessed_text = preprocess_text(text)
+        if not preprocessed_text:
+            return None
+        translation = translator(preprocessed_text)
         return translation[0]['translation_text']
     except Exception as e:
         st.error(f"Translation Error: {str(e)}")
         return None
 def main():
     st.title("🗣️ Hindi to Kangri Translator")
     st.markdown("""
+    An advanced translation tool for converting Hindi text to Kangri language.
+    Features include single text translation, batch processing, and text preprocessing.
     """)
+    # Load model
+    translator = load_model()
+    if not translator:
+        st.stop()
+    # Create tabs for different features
+    tabs = st.tabs(["Single Translation", "Batch Translation", "Examples", "About"])
+    # Single Translation Tab
+    with tabs[0]:
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("Hindi Text (हिंदी)")
+            input_text = st.text_area(
+                "Enter Hindi text",
+                height=200,
+                help="Enter the Hindi text you want to translate to Kangri",
+                placeholder="यहाँ हिंदी में टेक्स्ट लिखें..."
+            )
+            # Preprocessing options
+            with st.expander("Preprocessing Options"):
+                remove_special = st.checkbox("Remove special characters", value=True)
+                normalize_chars = st.checkbox("Normalize Hindi characters", value=True)
+            if st.button("Translate to Kangri"):
+                if input_text:
+                    with st.spinner("Translating..."):
+                        # Show preprocessing steps
+                        if remove_special or normalize_chars:
+                            st.info("Preprocessing text...")
+                            processed_text = preprocess_text(input_text)
+                            st.code(processed_text, language="text")
+                        translated_text = translate_text(translator, input_text)
+                        if translated_text:
+                            with col2:
+                                st.subheader("Kangri Translation (कांगड़ी)")
+                                st.text_area(
+                                    "Kangri translation",
+                                    value=translated_text,
+                                    height=200,
+                                    disabled=True
+                                )
+                else:
+                    st.warning("Please enter some Hindi text to translate.")
+    # Batch Translation Tab
+    with tabs[1]:
+        st.subheader("Batch Translation")
+        st.markdown("""
+        Upload a CSV or TXT file containing Hindi texts to translate in bulk.
+        - For CSV: Include a column named 'text' containing Hindi texts
+        - For TXT: Each line should contain one Hindi text to translate
+        """)
+        uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'])
+        if uploaded_file:
+            try:
+                if uploaded_file.type == 'text/csv':
+                    df = pd.read_csv(uploaded_file)
+                    texts = df['text'].tolist()
+                else:  # txt file
+                    content = uploaded_file.read().decode()
+                    texts = content.split('\n')
+                if st.button("Translate Batch"):
+                    progress_bar = st.progress(0)
+                    with st.spinner("Processing batch translation..."):
+                        results_df = batch_translate(translator, texts)
+                        progress_bar.progress(100)
+                    st.success("Translation completed!")
+                    st.dataframe(results_df)
+                    # Download button for results
+                    csv = results_df.to_csv(index=False)
+                    st.download_button(
+                        "Download Results",
+                        csv,
+                        "translation_results.csv",
+                        "text/csv",
+                        key='download-csv'
+                    )
+            except Exception as e:
+                st.error(f"Error processing file: {str(e)}")
+    # Examples Tab
+    with tabs[2]:
+        st.subheader("Example Texts")
+        st.markdown("Click on any example to load it into the translator:")
+        for category, text in EXAMPLE_TEXTS.items():
+            st.markdown(f"**{category}:**")
+            if st.button(text, key=f"example_{category}"):
+                tabs[0].button = True  # Switch to translation tab
+                st.session_state.input_text = text
+                st.experimental_rerun()
+    # About Tab
+    with tabs[3]:
+        st.subheader("About the Model")
+        st.markdown("""
+        ### Model Information
+        - **Base Model**: NLLB-200 Distilled (600M parameters)
+        - **Fine-tuned for**: Hindi (hin_Deva) to Kangri (kang_Deva) translation
+        - **Maximum input length**: 512 tokens
+        - **Model ID**: `cloghost/nllb-200-distilled-600M-hin-kang-v1`
+        ### Preprocessing Features
+        - Remove special characters while preserving Devanagari script
+        - Normalize Hindi character variations
+        - Clean extra whitespace and formatting
+        ### Usage Tips
+        1. For best results, input clean Hindi text in Devanagari script
+        2. Use batch translation for processing multiple texts efficiently
+        3. Check preprocessing options for better translation quality
+        4. Refer to example texts for optimal input format
+        """)
 if __name__ == "__main__":
     main()