Spaces:

kingabzpro
/

Urdu-STT-with-GPT-OSS

Running

App Files Community

kingabzpro committed on Sep 7

Commit

d2cd3d4

verified ·

1 Parent(s): 25f7b6e

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -123

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py – Faster Urdu ASR + LLM Polisher (right-side output, unified audio, Soft theme)
 import os
 import json
@@ -63,14 +63,10 @@ def get_groq_client(api_key: Optional[str] = None):
         from groq import Groq  # type: ignore
         return Groq(api_key=key), None
     except Exception as e:
-        return None, f"Groq client import/init failed: {e}"
-def enhance_text_with_llm(
-    text: str,
-    api_key: Optional[str],
-    temperature: float = 0.2,
-    system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
-) -> str:
     client, err = get_groq_client(api_key)
     if not client:
         if err:
@@ -90,25 +86,16 @@ def enhance_text_with_llm(
         print(f"[LLM] Full-text enhance failed: {e}")
         return basic_urdu_cleanup(text)
-def enhance_lines_with_llm(
-    lines: List[str],
-    api_key: Optional[str],
-    temperature: float = 0.2,
-    system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
-) -> List[str]:
     if not lines:
         return lines
     client, err = get_groq_client(api_key)
     if not client:
-        if err:
-            print(f"[LLM] {err} (line mode fallback)")
         return [basic_urdu_cleanup(x) for x in lines]
     numbered = "\n".join(f"{i+1}. {ln}" for i, ln in enumerate(lines))
-    user_msg = (
-        "ان جملوں کی اردو بہتر کریں۔ بالکل اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:"
-        "\n\n" + numbered
-    )
     try:
         resp = client.chat.completions.create(
             model=GROQ_MODEL,
@@ -125,8 +112,7 @@ def enhance_lines_with_llm(
             if not s or "." not in s:
                 continue
             num, rest = s.split(".", 1)
-            num = num.strip()
-            if num.isdigit():
                 improved_map[int(num) - 1] = rest.strip()
         return [improved_map.get(i, basic_urdu_cleanup(lines[i])) for i in range(len(lines))]
     except Exception as e:
@@ -147,9 +133,7 @@ def test_groq(api_key: Optional[str], temperature: float, system_prompt: str) ->
             ],
         )
         txt = (resp.choices[0].message.content or "").strip()
-        if txt:
-            return f"✅ LLM OK · Sample: {txt}"
-        return "⚠️ LLM responded but empty content."
     except Exception as e:
         return f"❌ LLM call failed: {e}"
@@ -158,12 +142,6 @@ def test_groq(api_key: Optional[str], temperature: float, system_prompt: str) ->
 # ────────────────────────────────────────────────────────────────────────────────
 print(f"CUDA available: {torch.cuda.is_available()}")
-if torch.cuda.is_available():
-    try:
-        print(f"GPU: {torch.cuda.get_device_name(0)}")
-    except Exception:
-        pass
 print("Loading model... this may take a minute the first time.")
 model = faster_whisper.WhisperModel(
     MODEL_ID_CT2,
@@ -189,11 +167,8 @@ def transcribe_audio(
         raise gr.Error("Please upload or record an audio clip.")
     seg_iter, info = model.transcribe(
-        audio_path,
-        language="ur",
-        beam_size=int(beam_size),
-        word_timestamps=False,
-        vad_filter=False,
     )
     segments, raw_lines = [], []
@@ -202,148 +177,93 @@ def transcribe_audio(
         segments.append({"start": seg.start, "end": seg.end, "text": text})
         raw_lines.append(text)
-    # Enhance / clean
     if llm_enhance:
         if output_format == "text":
-            cleaned = enhance_text_with_llm(
-                " ".join(raw_lines),
-                api_key=llm_api_key,
-                temperature=llm_temperature,
-                system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
-            )
-            cleaned_lines = [cleaned]
         else:
-            cleaned_lines = enhance_lines_with_llm(
-                raw_lines,
-                api_key=llm_api_key,
-                temperature=llm_temperature,
-                system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
-            )
     else:
         cleaned_lines = (
             [basic_urdu_cleanup(" ".join(raw_lines))] if output_format == "text"
             else [basic_urdu_cleanup(x) for x in raw_lines]
         )
-    # Render
     if output_format == "text":
         return cleaned_lines[0]
     if output_format == "srt":
         lines = []
         for i, s in enumerate(segments, 1):
-            txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
-            lines += [
-                str(i),
-                f"{format_timestamp(s['start'], 'srt')} --> {format_timestamp(s['end'], 'srt')}",
-                txt,
-                "",
-            ]
         return "\n".join(lines)
     if output_format == "vtt":
         lines = ["WEBVTT", ""]
         for i, s in enumerate(segments, 1):
-            txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
-            lines += [
-                f"{format_timestamp(s['start'], 'vtt')} --> {format_timestamp(s['end'], 'vtt')}",
-                txt,
-                "",
-            ]
         return "\n".join(lines)
     if output_format == "json":
         segs_out = []
         for i, s in enumerate(segments):
             txt = cleaned_lines[i] if len(cleaned_lines) == len(segments) else s["text"]
             segs_out.append({"start": s["start"], "end": s["end"], "text": txt})
-        return json.dumps(
-            {
-                "text": cleaned_lines[0] if len(cleaned_lines) == 1 else " ".join(cleaned_lines),
-                "segments": segs_out,
-                "language": info.language,
-                "language_probability": info.language_probability,
-                "duration": info.duration,
-                "duration_after_vad": getattr(info, "duration_after_vad", None),
-            },
-            ensure_ascii=False,
-            indent=2,
-        )
     raise gr.Error(f"Unsupported format: {output_format}")
 # ────────────────────────────────────────────────────────────────────────────────
-# UI (right-side output, Soft theme, single audio widget, trimmed controls)
 # ────────────────────────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="rose", secondary_hue="violet", neutral_hue="slate")
-with gr.Blocks(
-    title="Urdu ASR Studio — Faster-Whisper + LLM Polishing",
-    theme=theme,
-) as iface:
-    # ↓↓↓ add this block right after opening Blocks ↓↓↓
     gr.HTML("""
     <style>
-      /* Reduce the large bottom padding Gradio adds for the HF footer */
       .gradio-container { padding-bottom: 16px !important; }
-      /* Tighten vertical gaps between blocks/rows */
-      .gradio-container .gr-row, .gradio-container .gradio-row,
-      .gradio-container .gr-block, .gradio-container .block {
-        margin-bottom: 8px !important;
-      }
-      /* Keep right-side output compact; scroll when long */
       #result_box textarea {
         min-height: 260px !important;
         max-height: 360px !important;
         overflow-y: auto !important;
       }
-      /* Optional: trim footer’s own top spacing a bit */
-      footer { margin-top: 8px !important; padding-top: 4px !important; }
     </style>
     """)
     gr.Markdown(
-        "## **Urdu STT with GPT-OSS**  \n"
         "High-quality Urdu transcription with Faster-Whisper (CT2) and optional Groq LLM polishing."
     )
     with gr.Row():
         with gr.Column(scale=5):
             audio = gr.Audio(
-                sources=["upload", "microphone"],
-                type="filepath",
                 label="Upload or Record Audio",
                 waveform_options={"show_controls": False},
                 autoplay=False, streaming=False,
             )
-            # … your accordions + buttons …
-        with gr.Column(scale=7):
-            out = gr.Textbox(
-                label="Result",
-                lines=14, max_lines=30, show_copy_button=True,
-                elem_id="result_box"  # matches CSS above
-            )
-    # Wiring
-    btn.click(
-        fn=transcribe_audio,
-        inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys],
-        outputs=out,
-        api_name="predict",
-    )
-    def _test_llm(api_key, temp, sys_prompt):
-        return test_groq(api_key, temp, sys_prompt)
-    test_btn.click(
-        fn=_test_llm,
-        inputs=[llm_key, llm_temp, llm_sys],
-        outputs=[test_status],
-    )
 if __name__ == "__main__":
     iface.launch()

+# app.py – Urdu ASR Studio with Faster-Whisper + optional LLM Polishing
 import os
 import json
         from groq import Groq  # type: ignore
         return Groq(api_key=key), None
     except Exception as e:
+        return None, f"Groq client init failed: {e}"
+def enhance_text_with_llm(text: str, api_key: Optional[str], temperature: float = 0.2,
+                          system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> str:
     client, err = get_groq_client(api_key)
     if not client:
         if err:
         print(f"[LLM] Full-text enhance failed: {e}")
         return basic_urdu_cleanup(text)
+def enhance_lines_with_llm(lines: List[str], api_key: Optional[str], temperature: float = 0.2,
+                           system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> List[str]:
     if not lines:
         return lines
     client, err = get_groq_client(api_key)
     if not client:
         return [basic_urdu_cleanup(x) for x in lines]
     numbered = "\n".join(f"{i+1}. {ln}" for i, ln in enumerate(lines))
+    user_msg = "ان جملوں کی اردو بہتر کریں۔ اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:\n\n" + numbered
     try:
         resp = client.chat.completions.create(
             model=GROQ_MODEL,
             if not s or "." not in s:
                 continue
             num, rest = s.split(".", 1)
+            if num.strip().isdigit():
                 improved_map[int(num) - 1] = rest.strip()
         return [improved_map.get(i, basic_urdu_cleanup(lines[i])) for i in range(len(lines))]
     except Exception as e:
             ],
         )
         txt = (resp.choices[0].message.content or "").strip()
+        return f"✅ LLM OK · Sample: {txt}" if txt else "⚠️ LLM responded but empty content."
     except Exception as e:
         return f"❌ LLM call failed: {e}"
 # ────────────────────────────────────────────────────────────────────────────────
 print(f"CUDA available: {torch.cuda.is_available()}")
 print("Loading model... this may take a minute the first time.")
 model = faster_whisper.WhisperModel(
     MODEL_ID_CT2,
         raise gr.Error("Please upload or record an audio clip.")
     seg_iter, info = model.transcribe(
+        audio_path, language="ur", beam_size=int(beam_size),
+        word_timestamps=False, vad_filter=False
     )
     segments, raw_lines = [], []
         segments.append({"start": seg.start, "end": seg.end, "text": text})
         raw_lines.append(text)
     if llm_enhance:
         if output_format == "text":
+            cleaned_lines = [enhance_text_with_llm(" ".join(raw_lines), llm_api_key, llm_temperature, llm_system_prompt)]
         else:
+            cleaned_lines = enhance_lines_with_llm(raw_lines, llm_api_key, llm_temperature, llm_system_prompt)
     else:
         cleaned_lines = (
             [basic_urdu_cleanup(" ".join(raw_lines))] if output_format == "text"
             else [basic_urdu_cleanup(x) for x in raw_lines]
         )
     if output_format == "text":
         return cleaned_lines[0]
     if output_format == "srt":
         lines = []
         for i, s in enumerate(segments, 1):
+            txt = cleaned_lines[i-1] if len(cleaned_lines) == len(segments) else s["text"]
+            lines += [str(i), f"{format_timestamp(s['start'],'srt')} --> {format_timestamp(s['end'],'srt')}", txt, ""]
         return "\n".join(lines)
     if output_format == "vtt":
         lines = ["WEBVTT", ""]
         for i, s in enumerate(segments, 1):
+            txt = cleaned_lines[i-1] if len(cleaned_lines) == len(segments) else s["text"]
+            lines += [f"{format_timestamp(s['start'],'vtt')} --> {format_timestamp(s['end'],'vtt')}", txt, ""]
         return "\n".join(lines)
     if output_format == "json":
         segs_out = []
         for i, s in enumerate(segments):
             txt = cleaned_lines[i] if len(cleaned_lines) == len(segments) else s["text"]
             segs_out.append({"start": s["start"], "end": s["end"], "text": txt})
+        return json.dumps({"text": " ".join(cleaned_lines), "segments": segs_out}, ensure_ascii=False, indent=2)
     raise gr.Error(f"Unsupported format: {output_format}")
 # ────────────────────────────────────────────────────────────────────────────────
+# UI
 # ────────────────────────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="rose", secondary_hue="violet", neutral_hue="slate")
+with gr.Blocks(title="Urdu ASR Studio — Faster-Whisper + LLM Polishing", theme=theme) as iface:
+    # Custom CSS to fix spacing + output height
     gr.HTML("""
     <style>
       .gradio-container { padding-bottom: 16px !important; }
       #result_box textarea {
         min-height: 260px !important;
         max-height: 360px !important;
         overflow-y: auto !important;
       }
     </style>
     """)
     gr.Markdown(
+        "## **Urdu STT with LLM**  \n"
         "High-quality Urdu transcription with Faster-Whisper (CT2) and optional Groq LLM polishing."
     )
     with gr.Row():
         with gr.Column(scale=5):
             audio = gr.Audio(
+                sources=["upload","microphone"], type="filepath",
                 label="Upload or Record Audio",
                 waveform_options={"show_controls": False},
                 autoplay=False, streaming=False,
             )
+            with gr.Accordion("Transcription Settings", open=False):
+                with gr.Row():
+                    fmt = gr.Radio(choices=["text","srt","vtt","json"], value="text", label="Output Format")
+                    beam = gr.Slider(1,10,5,step=1,label="Beam Size")
+            with gr.Accordion("LLM Polishing (Optional)", open=False):
+                llm_toggle = gr.Checkbox(value=False,label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)")
+                with gr.Row():
+                    llm_temp = gr.Slider(0.0,1.0,0.2,step=0.05,label="LLM Temperature")
+                    llm_key = gr.Textbox(label="GROQ_API_KEY (optional if set in environment)", type="password", value="")
+                llm_sys = gr.Textbox(label="LLM System Prompt (Urdu)", value=DEFAULT_SYSTEM_PROMPT_UR, lines=3)
+                with gr.Row():
+                    test_btn = gr.Button("Test LLM", variant="secondary")
+                    test_status = gr.Markdown("")
+            with gr.Row():
+                btn = gr.Button("Transcribe", variant="primary")
+        with gr.Column(scale=7):
+            out = gr.Textbox(label="Result", lines=14, max_lines=30, show_copy_button=True, elem_id="result_box")
+    btn.click(fn=transcribe_audio, inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys], outputs=out)
+    test_btn.click(fn=test_groq, inputs=[llm_key,llm_temp,llm_sys], outputs=[test_status])
 if __name__ == "__main__":
     iface.launch()