Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Community

Yilin0601 committed on Mar 25

Commit

c7f56a8

verified ·

1 Parent(s): 4f0da2b

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -28

app.py CHANGED Viewed

@@ -28,51 +28,94 @@ translation_models = {
     "Korean": "Helsinki-NLP/opus-mt-en-ko"
 }
 tts_models = {
     "Spanish": "tts_models/es/tacotron2-DDC",
     "French": "tts_models/fr/tacotron2",
     "German": "tts_models/de/tacotron2",
-    "Chinese": "tts_models/zh/tacotron2",
-    "Russian": "tts_models/ru/tacotron2",
-    "Arabic": "tts_models/ar/tacotron2",
-    "Portuguese": "tts_models/pt/tacotron2",
-    "Japanese": "tts_models/ja/tacotron2",
-    "Italian": "tts_models/it/tacotron2",
-    "Korean": "tts_models/ko/tacotron2"
 }
 # Caches for translator and TTS pipelines
 translator_cache = {}
 tts_cache = {}
 def get_translator(target_language):
     if target_language in translator_cache:
         return translator_cache[target_language]
     model_name = translation_models[target_language]
-    # Pipeline task naming is case sensitive; here we assume task "translation_en_to_<lang>"
-    translator = pipeline("translation_en_to_" + target_language.lower(), model=model_name)
     translator_cache[target_language] = translator
     return translator
 def get_tts(target_language):
     if target_language in tts_cache:
         return tts_cache[target_language]
-    model_name = tts_models[target_language]
-    tts = pipeline("text-to-speech", model=model_name)
-    tts_cache[target_language] = tts
-    return tts
 # --------------------------------------------------
 # Prediction Function
 # --------------------------------------------------
 def predict(audio, text, target_language):
-    # Use text input if provided; otherwise, use ASR on audio
-    if text.strip() != "":
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
-        # Ensure the audio is floating-point for librosa
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
@@ -90,16 +133,24 @@ def predict(audio, text, target_language):
     else:
         return "No input provided.", "", None
-    # Translation step
     translator = get_translator(target_language)
-    translation_result = translator(english_text)
-    translated_text = translation_result[0]["translation_text"]
-    # TTS step: synthesize speech from the translated text
-    tts = get_tts(target_language)
-    tts_result = tts(translated_text)
-    # The TTS pipeline returns a dict with "wav" and "sample_rate"
-    synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
     return english_text, translated_text, synthesized_audio
@@ -122,10 +173,11 @@ iface = gr.Interface(
     description=(
         "This app helps language learners by providing three outputs:\n"
         "1. English transcription (from ASR or text input),\n"
-        "2. Translation to a target language, and\n"
         "3. Synthetic speech in the target language.\n\n"
-        "Choose one of the top 10 commonly used languages from the dropdown.\n"
-        "You can either record/upload an English audio sample or enter English text directly."
     ),
     allow_flagging="never"
 )

     "Korean": "Helsinki-NLP/opus-mt-en-ko"
 }
+# Each language often requires a specific pipeline task name
+# (e.g., "translation_en_to_zh" rather than "translation_en_to_chinese")
+translation_tasks = {
+    "Spanish": "translation_en_to_es",
+    "French": "translation_en_to_fr",
+    "German": "translation_en_to_de",
+    "Chinese": "translation_en_to_zh",
+    "Russian": "translation_en_to_ru",
+    "Arabic": "translation_en_to_ar",
+    "Portuguese": "translation_en_to_pt",
+    "Japanese": "translation_en_to_ja",
+    "Italian": "translation_en_to_it",
+    "Korean": "translation_en_to_ko"
+}
+# TTS models (some may not exist or may be unofficial)
 tts_models = {
     "Spanish": "tts_models/es/tacotron2-DDC",
     "French": "tts_models/fr/tacotron2",
     "German": "tts_models/de/tacotron2",
+    "Chinese": "tts_models/zh/tacotron2",     # Verify if this actually exists on Hugging Face
+    "Russian": "tts_models/ru/tacotron2",     # Same note
+    "Arabic": "tts_models/ar/tacotron2",      # Same note
+    "Portuguese": "tts_models/pt/tacotron2",  # Same note
+    "Japanese": "tts_models/ja/tacotron2",    # Same note
+    "Italian": "tts_models/it/tacotron2",     # Same note
+    "Korean": "tts_models/ko/tacotron2"       # Same note
 }
+# --------------------------------------------------
 # Caches for translator and TTS pipelines
+# --------------------------------------------------
 translator_cache = {}
 tts_cache = {}
 def get_translator(target_language):
+    """
+    Retrieve or create a translation pipeline for the specified language.
+    """
     if target_language in translator_cache:
         return translator_cache[target_language]
     model_name = translation_models[target_language]
+    task_name = translation_tasks[target_language]
+    translator = pipeline(task_name, model=model_name)
     translator_cache[target_language] = translator
     return translator
 def get_tts(target_language):
+    """
+    Retrieve or create a TTS pipeline for the specified language, if available.
+    """
     if target_language in tts_cache:
         return tts_cache[target_language]
+    model_name = tts_models.get(target_language)
+    if model_name is None:
+        # If no TTS model is mapped, raise an error or handle gracefully
+        raise ValueError(f"No TTS model available for {target_language}.")
+    try:
+        tts_pipeline = pipeline("text-to-speech", model=model_name)
+    except Exception as e:
+        raise ValueError(
+            f"Failed to load TTS model for {target_language}. "
+            f"Make sure '{model_name}' exists on Hugging Face.\nError: {e}"
+        )
+    tts_cache[target_language] = tts_pipeline
+    return tts_pipeline
 # --------------------------------------------------
 # Prediction Function
 # --------------------------------------------------
 def predict(audio, text, target_language):
+    """
+    1. Obtain English text (from text input or ASR).
+    2. Translate English -> target_language.
+    3. Synthesize speech in target_language.
+    """
+    # 1. English text from text input (if provided), else from audio via ASR
+    if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
+        # Ensure the audio is float32 for librosa
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
     else:
         return "No input provided.", "", None
+    # 2. Translation step
     translator = get_translator(target_language)
+    try:
+        translation_result = translator(english_text)
+        translated_text = translation_result[0]["translation_text"]
+    except Exception as e:
+        # If there's an error in translation, return partial results
+        return english_text, f"Translation error: {e}", None
+    # 3. TTS step: synthesize speech from the translated text
+    try:
+        tts_pipeline = get_tts(target_language)
+        tts_result = tts_pipeline(translated_text)
+        # The TTS pipeline returns a dict with "wav" and "sample_rate"
+        synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
+    except Exception as e:
+        # If TTS fails, return partial results
+        return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, synthesized_audio
     description=(
         "This app helps language learners by providing three outputs:\n"
         "1. English transcription (from ASR or text input),\n"
+        "2. Translation to a target language (using Helsinki-NLP models), and\n"
         "3. Synthetic speech in the target language.\n\n"
+        "Select one of the top 10 commonly used languages from the dropdown.\n"
+        "Either record/upload an English audio sample or enter English text directly.\n\n"
+        "Note: Some TTS models may not exist or be unstable for certain languages."
     ),
     allow_flagging="never"
 )