Mgolo committed
Commit eafa517 · verified · 1 Parent(s): bbd3488

Update app.py

Files changed (1):
app.py (+158 -117)
app.py CHANGED
@@ -10,7 +10,7 @@ import os
 import re
 import logging
 import tempfile
-from typing import Optional, Dict, Tuple, Any, Union
+from typing import Optional, Dict, Tuple, Any, Union, List
 from pathlib import Path
 from dataclasses import dataclass
 from enum import Enum
@@ -131,26 +131,33 @@ class ModelManager:
 
         # Authenticate with Hugging Face if token provided
         if hf_token := os.getenv("hffff"):
-            login(token=hf_token)
-
-            model = AutoModelForSeq2SeqLM.from_pretrained(
-                config.model_name,
-                token=hf_token
-            ).to(self._get_device())
-
-            tokenizer = MarianTokenizer.from_pretrained(
-                config.model_name,
-                token=hf_token
-            )
-
-            self._translation_pipeline = pipeline(
-                "translation",
-                model=model,
-                tokenizer=tokenizer,
-                device=0 if self._get_device().type == "cuda" else -1
-            )
-
-            self._current_model_name = config.model_name
+            try:
+                login(token=hf_token)
+            except Exception as e:
+                logger.warning(f"HF login failed: {e}")
+
+        try:
+            model = AutoModelForSeq2SeqLM.from_pretrained(
+                config.model_name,
+                token=hf_token if hf_token else None
+            ).to(self._get_device())
+
+            tokenizer = MarianTokenizer.from_pretrained(
+                config.model_name,
+                token=hf_token if hf_token else None
+            )
+
+            self._translation_pipeline = pipeline(
+                "translation",
+                model=model,
+                tokenizer=tokenizer,
+                device=0 if self._get_device().type == "cuda" else -1
+            )
+
+            self._current_model_name = config.model_name
+        except Exception as e:
+            logger.error(f"Failed to load model {config.model_name}: {e}")
+            raise
 
         return self._translation_pipeline, config.language_tag
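Note: the loading block now runs outside the token check, so the model also loads when no HF token is set. For reference, a minimal sketch of how such a pipeline is driven once built — the model name and `>>fra<<` tag below are illustrative assumptions, not values from this app (the real tag comes from `config.language_tag`):

```python
# Sketch only: driving a multilingual MarianMT translation pipeline the same
# way self._translation_pipeline is used above. Model name and target-language
# tag are illustrative assumptions.
from transformers import pipeline

translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-en-roa",  # assumed example model
    device=-1,  # -1 = CPU, 0 = first CUDA device (mirrors the diff's device logic)
)
print(translator(">>fra<< How are you?")[0]["translation_text"])
```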
 
@@ -163,7 +170,11 @@ class ModelManager:
         """
         if self._whisper_model is None:
             logger.info("Loading Whisper base model...")
-            self._whisper_model = whisper.load_model("base")
+            try:
+                self._whisper_model = whisper.load_model("base")
+            except Exception as e:
+                logger.error(f"Failed to load Whisper model: {e}")
+                raise
         return self._whisper_model
 
     def _get_device(self) -> torch.device:
@@ -196,20 +207,18 @@ class ContentProcessor:
         extension = file_path.suffix.lower()
 
         try:
-            content = file_path.read_bytes()
-
             if extension == ".pdf":
-                return ContentProcessor._extract_pdf_text(content)
+                return ContentProcessor._extract_pdf_text(file_path)
             elif extension == ".docx":
                 return ContentProcessor._extract_docx_text(file_path)
             elif extension in (".html", ".htm"):
-                return ContentProcessor._extract_html_text(content)
+                return ContentProcessor._extract_html_text(file_path)
             elif extension == ".md":
-                return ContentProcessor._extract_markdown_text(content)
+                return ContentProcessor._extract_markdown_text(file_path)
            elif extension == ".srt":
-                return ContentProcessor._extract_srt_text(content)
+                return ContentProcessor._extract_srt_text(file_path)
            elif extension in (".txt", ".text"):
-                return ContentProcessor._extract_plain_text(content)
+                return ContentProcessor._extract_plain_text(file_path)
            else:
                raise ValueError(f"Unsupported file type: {extension}")
@@ -218,28 +227,30 @@ class ContentProcessor:
             raise
 
     @staticmethod
-    def _extract_pdf_text(content: bytes) -> str:
+    def _extract_pdf_text(file_path: Path) -> str:
         """Extract text from PDF file."""
-        with fitz.open(stream=content, filetype="pdf") as doc:
+        with fitz.open(file_path) as doc:
             return "\n".join(page.get_text() for page in doc)
 
     @staticmethod
     def _extract_docx_text(file_path: Path) -> str:
         """Extract text from DOCX file."""
-        doc = docx.Document(str(file_path))
+        doc = docx.Document(file_path)
         return "\n".join(paragraph.text for paragraph in doc.paragraphs)
 
     @staticmethod
-    def _extract_html_text(content: bytes) -> str:
+    def _extract_html_text(file_path: Path) -> str:
         """Extract text from HTML file."""
+        content = file_path.read_bytes()
         encoding = chardet.detect(content)["encoding"] or "utf-8"
         text = content.decode(encoding, errors="ignore")
         soup = BeautifulSoup(text, "html.parser")
         return soup.get_text()
 
     @staticmethod
-    def _extract_markdown_text(content: bytes) -> str:
+    def _extract_markdown_text(file_path: Path) -> str:
         """Extract text from Markdown file."""
+        content = file_path.read_bytes()
         encoding = chardet.detect(content)["encoding"] or "utf-8"
         text = content.decode(encoding, errors="ignore")
         html = markdown(text)
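Note: each byte-based extractor now reads the file itself and shares the same detect-then-decode pattern. In isolation, that pattern looks like this (a minimal sketch; the sample bytes are invented):

```python
# Sketch of the detect-then-decode pattern shared by the extractors above.
import chardet

raw = "café, señor".encode("latin-1")                    # invented non-UTF-8 sample
encoding = chardet.detect(raw)["encoding"] or "utf-8"    # fall back if detection fails
text = raw.decode(encoding, errors="ignore")             # skip undecodable bytes instead of raising
print(encoding, text)
```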
@@ -247,16 +258,18 @@ class ContentProcessor:
         return soup.get_text()
 
     @staticmethod
-    def _extract_srt_text(content: bytes) -> str:
+    def _extract_srt_text(file_path: Path) -> str:
         """Extract text from SRT subtitle file."""
+        content = file_path.read_bytes()
         encoding = chardet.detect(content)["encoding"] or "utf-8"
         text = content.decode(encoding, errors="ignore")
         # Remove timestamp lines
         return re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", text)
 
     @staticmethod
-    def _extract_plain_text(content: bytes) -> str:
+    def _extract_plain_text(file_path: Path) -> str:
         """Extract text from plain text file."""
+        content = file_path.read_bytes()
         encoding = chardet.detect(content)["encoding"] or "utf-8"
         return content.decode(encoding, errors="ignore")
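Note: a quick sanity check of the timestamp-stripping regex in `_extract_srt_text` (the subtitle text is invented):

```python
# Sanity check for the SRT cue-stripping regex: it removes each cue's counter
# line and timing line in one pass, leaving only the subtitle text.
import re

sample = (
    "1\n00:00:01,000 --> 00:00:04,000\nHello there\n\n"
    "2\n00:00:05,000 --> 00:00:08,000\nGeneral greeting\n"
)
cleaned = re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*?\n", "", sample)
print(cleaned)  # -> "Hello there\n\nGeneral greeting\n"
```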
 
@@ -304,11 +317,15 @@ class TranslationService:
         target_lang: Language
     ) -> str:
         """Perform direct translation using available model."""
-        pipeline_obj, lang_tag = self.model_manager.get_translation_pipeline(
-            source_lang, target_lang
-        )
-
-        return self._process_text_with_pipeline(text, pipeline_obj, lang_tag)
+        try:
+            pipeline_obj, lang_tag = self.model_manager.get_translation_pipeline(
+                source_lang, target_lang
+            )
+
+            return self._process_text_with_pipeline(text, pipeline_obj, lang_tag)
+        except Exception as e:
+            logger.error(f"Direct translation error: {e}")
+            return f"Translation error: {str(e)}"
 
     def _chained_translate(
         self,
@@ -327,17 +344,21 @@ class TranslationService:
         Returns:
             Translated text through chaining
         """
-        # First: source_lang -> English
-        intermediate_text = self._direct_translate(
-            text, source_lang, Language.ENGLISH
-        )
-
-        # Second: English -> target_lang
-        final_text = self._direct_translate(
-            intermediate_text, Language.ENGLISH, target_lang
-        )
-
-        return final_text
+        try:
+            # First: source_lang -> English
+            intermediate_text = self._direct_translate(
+                text, source_lang, Language.ENGLISH
+            )
+
+            # Second: English -> target_lang
+            final_text = self._direct_translate(
+                intermediate_text, Language.ENGLISH, target_lang
+            )
+
+            return final_text
+        except Exception as e:
+            logger.error(f"Chained translation error: {e}")
+            return f"Chained translation error: {str(e)}"
 
     def _process_text_with_pipeline(
         self,
@@ -362,30 +383,38 @@ class TranslationService:
                 if s.strip()
             ]
 
+            if not sentences:
+                translated_paragraphs.append("")
+                continue
+
             # Add language tag to each sentence
             formatted_sentences = [
                 f"{lang_tag} {sentence}"
                 for sentence in sentences
             ]
 
-            # Perform translation
-            results = pipeline_obj(
-                formatted_sentences,
-                max_length=5000,
-                num_beams=5,
-                early_stopping=True,
-                no_repeat_ngram_size=3,
-                repetition_penalty=1.5,
-                length_penalty=1.2
-            )
-
-            # Process results
-            translated_sentences = [
-                result["translation_text"].capitalize()
-                for result in results
-            ]
-
-            translated_paragraphs.append(". ".join(translated_sentences))
+            try:
+                # Perform translation
+                results = pipeline_obj(
+                    formatted_sentences,
+                    max_length=5000,
+                    num_beams=5,
+                    early_stopping=True,
+                    no_repeat_ngram_size=3,
+                    repetition_penalty=1.5,
+                    length_penalty=1.2
+                )
+
+                # Process results
+                translated_sentences = [
+                    result["translation_text"].capitalize()
+                    for result in results
+                ]
+
+                translated_paragraphs.append(". ".join(translated_sentences))
+            except Exception as e:
+                logger.error(f"Pipeline processing error: {e}")
+                translated_paragraphs.append(f"[Translation Error: {str(e)}]")
 
         return "\n".join(translated_paragraphs)
@@ -409,9 +438,13 @@ class AudioProcessor:
         Returns:
             Transcribed text
         """
-        model = self.model_manager.get_whisper_model()
-        result = model.transcribe(audio_file_path)
-        return result["text"]
+        try:
+            model = self.model_manager.get_whisper_model()
+            result = model.transcribe(audio_file_path)
+            return result["text"]
+        except Exception as e:
+            logger.error(f"Transcription error: {e}")
+            return f"Transcription error: {str(e)}"
 
 # ================================
 # Main Application
@@ -432,7 +465,7 @@ class TranslationApp:
         source_lang: Language,
         text_input: str,
         audio_file: Optional[str],
-        file_obj: Optional[gr.FileData]
+        file_obj: Optional[Any]
     ) -> str:
         """
         Process input based on selected mode.
@@ -447,22 +480,29 @@ class TranslationApp:
         Returns:
             Processed text content
         """
-        if mode == InputMode.TEXT:
-            return text_input
-
-        elif mode == InputMode.AUDIO:
-            if source_lang != Language.ENGLISH:
-                raise ValueError("Audio input must be in English.")
-            if not audio_file:
-                raise ValueError("No audio file provided.")
-            return self.audio_processor.transcribe(audio_file)
-
-        elif mode == InputMode.FILE:
-            if not file_obj:
-                raise ValueError("No file uploaded.")
-            return self.content_processor.extract_text_from_file(file_obj.name)
-
-        return ""
+        try:
+            if mode == InputMode.TEXT:
+                return text_input
+
+            elif mode == InputMode.AUDIO:
+                if source_lang != Language.ENGLISH:
+                    return "Audio input must be in English."
+                if not audio_file:
+                    return "No audio file provided."
+                return self.audio_processor.transcribe(audio_file)
+
+            elif mode == InputMode.FILE:
+                if not file_obj:
+                    return "No file uploaded."
+
+                # Handle Gradio file object (could be a string path or a file-like object)
+                file_path = file_obj.name if hasattr(file_obj, 'name') else file_obj
+                return self.content_processor.extract_text_from_file(file_path)
+
+            return ""
+        except Exception as e:
+            logger.error(f"Input processing error: {e}")
+            return f"Input processing error: {str(e)}"
 
     def create_interface(self) -> gr.Blocks:
         """Create and return the Gradio interface."""
@@ -471,6 +511,22 @@ class TranslationApp:
             title="LocaleNLP Translation Service",
             theme=gr.themes.Monochrome()
         ) as interface:
+            # Custom CSS for black button
+            gr.HTML("""
+                <style>
+                .gr-button-secondary {
+                    background-color: #000000 !important;
+                    border-color: #000000 !important;
+                    color: white !important;
+                }
+                .gr-button-secondary:hover {
+                    background-color: #333333 !important;
+                    border-color: #333333 !important;
+                    color: white !important;
+                }
+                </style>
+            """)
+
             # Header
             gr.Markdown("""
             # 🌍 LocaleNLP Translation Service
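Note: with the `interface.load(..., _js=...)` hook removed below, the styles now ship inside the layout itself. An equivalent alternative, assuming a Gradio version that supports the `css=` argument on `gr.Blocks`, would look like this (a sketch, not what this commit does):

```python
# Alternative sketch: the same rules via gr.Blocks(css=...) instead of gr.HTML.
import gradio as gr

custom_css = """
.gr-button-secondary { background-color: #000 !important; border-color: #000 !important; color: white !important; }
.gr-button-secondary:hover { background-color: #333 !important; border-color: #333 !important; }
"""

with gr.Blocks(css=custom_css, title="LocaleNLP Translation Service") as interface:
    gr.Markdown("demo")
```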
@@ -536,22 +592,26 @@ class TranslationApp:
             )
 
             # Event handlers
-            def update_visibility(mode: str) -> Dict[str, Any]:
+            def update_visibility(mode: str) -> List[Dict[str, Any]]:
                 """Update component visibility based on input mode."""
-                return {
-                    input_text: gr.update(visible=(mode == InputMode.TEXT.value)),
-                    audio_input: gr.update(visible=(mode == InputMode.AUDIO.value)),
-                    file_input: gr.update(visible=(mode == InputMode.FILE.value)),
-                    extracted_text: gr.update(value="", visible=True),
-                    output_text: gr.update(value="")
-                }
+                visibility_text = mode == InputMode.TEXT.value
+                visibility_audio = mode == InputMode.AUDIO.value
+                visibility_file = mode == InputMode.FILE.value
+
+                return [
+                    gr.update(visible=visibility_text),
+                    gr.update(visible=visibility_audio),
+                    gr.update(visible=visibility_file),
+                    gr.update(value="", visible=True),
+                    gr.update(value="")
+                ]
 
             def handle_process(
                 mode: str,
                 source_lang: str,
                 text_input: str,
                 audio_file: Optional[str],
-                file_obj: Optional[gr.FileData]
+                file_obj: Optional[Any]
             ) -> Tuple[str, str]:
                 """Handle initial input processing."""
                 try:
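Note: `update_visibility` now returns a positional list instead of a component-keyed dict, so the event binding must list its five outputs in the same order. The binding itself sits outside this hunk; a sketch of the expected wiring, where `input_mode` is a guessed name for the mode-selector component:

```python
# Assumed wiring for the positional update_visibility handler. The actual
# .change() call is outside the hunks shown; "input_mode" is hypothetical.
input_mode.change(
    fn=update_visibility,
    inputs=input_mode,
    outputs=[input_text, audio_input, file_input, extracted_text, output_text],
)
```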
@@ -601,25 +661,6 @@ class TranslationApp:
                 inputs=[extracted_text, input_lang, output_lang],
                 outputs=output_text
             )
-
-            # Custom CSS for black button (applied after interface creation)
-            interface.load(lambda: None, None, None, _js="""
-                () => {
-                    const style = document.createElement('style');
-                    style.textContent = `
-                        .gr-button-secondary {
-                            background-color: #000000 !important;
-                            border-color: #000000 !important;
-                            color: white !important;
-                        }
-                        .gr-button-secondary:hover {
-                            background-color: #333333 !important;
-                            border-color: #333333 !important;
-                        }
-                    `;
-                    document.head.appendChild(style);
-                }
-            """)
 
         return interface
 
 
666