kingabzpro commited on
Commit
d2cd3d4
·
verified ·
1 Parent(s): 25f7b6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -123
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py – Faster Urdu ASR + LLM Polisher (right-side output, unified audio, Soft theme)
2
 
3
  import os
4
  import json
@@ -63,14 +63,10 @@ def get_groq_client(api_key: Optional[str] = None):
63
  from groq import Groq # type: ignore
64
  return Groq(api_key=key), None
65
  except Exception as e:
66
- return None, f"Groq client import/init failed: {e}"
67
-
68
- def enhance_text_with_llm(
69
- text: str,
70
- api_key: Optional[str],
71
- temperature: float = 0.2,
72
- system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
73
- ) -> str:
74
  client, err = get_groq_client(api_key)
75
  if not client:
76
  if err:
@@ -90,25 +86,16 @@ def enhance_text_with_llm(
90
  print(f"[LLM] Full-text enhance failed: {e}")
91
  return basic_urdu_cleanup(text)
92
 
93
- def enhance_lines_with_llm(
94
- lines: List[str],
95
- api_key: Optional[str],
96
- temperature: float = 0.2,
97
- system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
98
- ) -> List[str]:
99
  if not lines:
100
  return lines
101
  client, err = get_groq_client(api_key)
102
  if not client:
103
- if err:
104
- print(f"[LLM] {err} (line mode fallback)")
105
  return [basic_urdu_cleanup(x) for x in lines]
106
 
107
  numbered = "\n".join(f"{i+1}. {ln}" for i, ln in enumerate(lines))
108
- user_msg = (
109
- "ان جملوں کی اردو بہتر کریں۔ بالکل اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:"
110
- "\n\n" + numbered
111
- )
112
  try:
113
  resp = client.chat.completions.create(
114
  model=GROQ_MODEL,
@@ -125,8 +112,7 @@ def enhance_lines_with_llm(
125
  if not s or "." not in s:
126
  continue
127
  num, rest = s.split(".", 1)
128
- num = num.strip()
129
- if num.isdigit():
130
  improved_map[int(num) - 1] = rest.strip()
131
  return [improved_map.get(i, basic_urdu_cleanup(lines[i])) for i in range(len(lines))]
132
  except Exception as e:
@@ -147,9 +133,7 @@ def test_groq(api_key: Optional[str], temperature: float, system_prompt: str) ->
147
  ],
148
  )
149
  txt = (resp.choices[0].message.content or "").strip()
150
- if txt:
151
- return f"✅ LLM OK · Sample: {txt}"
152
- return "⚠️ LLM responded but empty content."
153
  except Exception as e:
154
  return f"❌ LLM call failed: {e}"
155
 
@@ -158,12 +142,6 @@ def test_groq(api_key: Optional[str], temperature: float, system_prompt: str) ->
158
  # ────────────────────────────────────────────────────────────────────────────────
159
 
160
  print(f"CUDA available: {torch.cuda.is_available()}")
161
- if torch.cuda.is_available():
162
- try:
163
- print(f"GPU: {torch.cuda.get_device_name(0)}")
164
- except Exception:
165
- pass
166
-
167
  print("Loading model... this may take a minute the first time.")
168
  model = faster_whisper.WhisperModel(
169
  MODEL_ID_CT2,
@@ -189,11 +167,8 @@ def transcribe_audio(
189
  raise gr.Error("Please upload or record an audio clip.")
190
 
191
  seg_iter, info = model.transcribe(
192
- audio_path,
193
- language="ur",
194
- beam_size=int(beam_size),
195
- word_timestamps=False,
196
- vad_filter=False,
197
  )
198
 
199
  segments, raw_lines = [], []
@@ -202,148 +177,93 @@ def transcribe_audio(
202
  segments.append({"start": seg.start, "end": seg.end, "text": text})
203
  raw_lines.append(text)
204
 
205
- # Enhance / clean
206
  if llm_enhance:
207
  if output_format == "text":
208
- cleaned = enhance_text_with_llm(
209
- " ".join(raw_lines),
210
- api_key=llm_api_key,
211
- temperature=llm_temperature,
212
- system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
213
- )
214
- cleaned_lines = [cleaned]
215
  else:
216
- cleaned_lines = enhance_lines_with_llm(
217
- raw_lines,
218
- api_key=llm_api_key,
219
- temperature=llm_temperature,
220
- system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
221
- )
222
  else:
223
  cleaned_lines = (
224
  [basic_urdu_cleanup(" ".join(raw_lines))] if output_format == "text"
225
  else [basic_urdu_cleanup(x) for x in raw_lines]
226
  )
227
 
228
- # Render
229
  if output_format == "text":
230
  return cleaned_lines[0]
231
-
232
  if output_format == "srt":
233
  lines = []
234
  for i, s in enumerate(segments, 1):
235
- txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
236
- lines += [
237
- str(i),
238
- f"{format_timestamp(s['start'], 'srt')} --> {format_timestamp(s['end'], 'srt')}",
239
- txt,
240
- "",
241
- ]
242
  return "\n".join(lines)
243
-
244
  if output_format == "vtt":
245
  lines = ["WEBVTT", ""]
246
  for i, s in enumerate(segments, 1):
247
- txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
248
- lines += [
249
- f"{format_timestamp(s['start'], 'vtt')} --> {format_timestamp(s['end'], 'vtt')}",
250
- txt,
251
- "",
252
- ]
253
  return "\n".join(lines)
254
-
255
  if output_format == "json":
256
  segs_out = []
257
  for i, s in enumerate(segments):
258
  txt = cleaned_lines[i] if len(cleaned_lines) == len(segments) else s["text"]
259
  segs_out.append({"start": s["start"], "end": s["end"], "text": txt})
260
- return json.dumps(
261
- {
262
- "text": cleaned_lines[0] if len(cleaned_lines) == 1 else " ".join(cleaned_lines),
263
- "segments": segs_out,
264
- "language": info.language,
265
- "language_probability": info.language_probability,
266
- "duration": info.duration,
267
- "duration_after_vad": getattr(info, "duration_after_vad", None),
268
- },
269
- ensure_ascii=False,
270
- indent=2,
271
- )
272
 
273
  raise gr.Error(f"Unsupported format: {output_format}")
274
 
275
  # ────────────────────────────────────────────────────────────────────────────────
276
- # UI (right-side output, Soft theme, single audio widget, trimmed controls)
277
  # ────────────────────────────────────────────────────────────────────────────────
278
 
279
  theme = gr.themes.Soft(primary_hue="rose", secondary_hue="violet", neutral_hue="slate")
280
 
281
- with gr.Blocks(
282
- title="Urdu ASR Studio — Faster-Whisper + LLM Polishing",
283
- theme=theme,
284
- ) as iface:
285
- # ↓↓↓ add this block right after opening Blocks ↓↓↓
286
  gr.HTML("""
287
  <style>
288
- /* Reduce the large bottom padding Gradio adds for the HF footer */
289
  .gradio-container { padding-bottom: 16px !important; }
290
-
291
- /* Tighten vertical gaps between blocks/rows */
292
- .gradio-container .gr-row, .gradio-container .gradio-row,
293
- .gradio-container .gr-block, .gradio-container .block {
294
- margin-bottom: 8px !important;
295
- }
296
-
297
- /* Keep right-side output compact; scroll when long */
298
  #result_box textarea {
299
  min-height: 260px !important;
300
  max-height: 360px !important;
301
  overflow-y: auto !important;
302
  }
303
-
304
- /* Optional: trim footer's own top spacing a bit */
305
- footer { margin-top: 8px !important; padding-top: 4px !important; }
306
  </style>
307
  """)
308
 
309
  gr.Markdown(
310
- "## **Urdu STT with GPT-OSS** \n"
311
  "High-quality Urdu transcription with Faster-Whisper (CT2) and optional Groq LLM polishing."
312
  )
313
 
314
  with gr.Row():
315
  with gr.Column(scale=5):
316
  audio = gr.Audio(
317
- sources=["upload", "microphone"],
318
- type="filepath",
319
  label="Upload or Record Audio",
320
  waveform_options={"show_controls": False},
321
  autoplay=False, streaming=False,
322
  )
323
- # … your accordions + buttons …
324
- with gr.Column(scale=7):
325
- out = gr.Textbox(
326
- label="Result",
327
- lines=14, max_lines=30, show_copy_button=True,
328
- elem_id="result_box" # matches CSS above
329
- )
330
-
331
- # Wiring
332
- btn.click(
333
- fn=transcribe_audio,
334
- inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys],
335
- outputs=out,
336
- api_name="predict",
337
- )
338
 
339
- def _test_llm(api_key, temp, sys_prompt):
340
- return test_groq(api_key, temp, sys_prompt)
341
 
342
- test_btn.click(
343
- fn=_test_llm,
344
- inputs=[llm_key, llm_temp, llm_sys],
345
- outputs=[test_status],
346
- )
347
 
348
  if __name__ == "__main__":
349
  iface.launch()
 
1
+ # app.py – Urdu ASR Studio with Faster-Whisper + optional LLM Polishing
2
 
3
  import os
4
  import json
 
63
  from groq import Groq # type: ignore
64
  return Groq(api_key=key), None
65
  except Exception as e:
66
+ return None, f"Groq client init failed: {e}"
67
+
68
+ def enhance_text_with_llm(text: str, api_key: Optional[str], temperature: float = 0.2,
69
+ system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> str:
 
 
 
 
70
  client, err = get_groq_client(api_key)
71
  if not client:
72
  if err:
 
86
  print(f"[LLM] Full-text enhance failed: {e}")
87
  return basic_urdu_cleanup(text)
88
 
89
+ def enhance_lines_with_llm(lines: List[str], api_key: Optional[str], temperature: float = 0.2,
90
+ system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> List[str]:
 
 
 
 
91
  if not lines:
92
  return lines
93
  client, err = get_groq_client(api_key)
94
  if not client:
 
 
95
  return [basic_urdu_cleanup(x) for x in lines]
96
 
97
  numbered = "\n".join(f"{i+1}. {ln}" for i, ln in enumerate(lines))
98
+ user_msg = "ان جملوں کی اردو بہتر کریں۔ اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:\n\n" + numbered
 
 
 
99
  try:
100
  resp = client.chat.completions.create(
101
  model=GROQ_MODEL,
 
112
  if not s or "." not in s:
113
  continue
114
  num, rest = s.split(".", 1)
115
+ if num.strip().isdigit():
 
116
  improved_map[int(num) - 1] = rest.strip()
117
  return [improved_map.get(i, basic_urdu_cleanup(lines[i])) for i in range(len(lines))]
118
  except Exception as e:
 
133
  ],
134
  )
135
  txt = (resp.choices[0].message.content or "").strip()
136
+ return f"✅ LLM OK · Sample: {txt}" if txt else "⚠️ LLM responded but empty content."
 
 
137
  except Exception as e:
138
  return f"❌ LLM call failed: {e}"
139
 
 
142
  # ────────────────────────────────────────────────────────────────────────────────
143
 
144
  print(f"CUDA available: {torch.cuda.is_available()}")
 
 
 
 
 
 
145
  print("Loading model... this may take a minute the first time.")
146
  model = faster_whisper.WhisperModel(
147
  MODEL_ID_CT2,
 
167
  raise gr.Error("Please upload or record an audio clip.")
168
 
169
  seg_iter, info = model.transcribe(
170
+ audio_path, language="ur", beam_size=int(beam_size),
171
+ word_timestamps=False, vad_filter=False
 
 
 
172
  )
173
 
174
  segments, raw_lines = [], []
 
177
  segments.append({"start": seg.start, "end": seg.end, "text": text})
178
  raw_lines.append(text)
179
 
 
180
  if llm_enhance:
181
  if output_format == "text":
182
+ cleaned_lines = [enhance_text_with_llm(" ".join(raw_lines), llm_api_key, llm_temperature, llm_system_prompt)]
 
 
 
 
 
 
183
  else:
184
+ cleaned_lines = enhance_lines_with_llm(raw_lines, llm_api_key, llm_temperature, llm_system_prompt)
 
 
 
 
 
185
  else:
186
  cleaned_lines = (
187
  [basic_urdu_cleanup(" ".join(raw_lines))] if output_format == "text"
188
  else [basic_urdu_cleanup(x) for x in raw_lines]
189
  )
190
 
 
191
  if output_format == "text":
192
  return cleaned_lines[0]
 
193
  if output_format == "srt":
194
  lines = []
195
  for i, s in enumerate(segments, 1):
196
+ txt = cleaned_lines[i-1] if len(cleaned_lines) == len(segments) else s["text"]
197
+ lines += [str(i), f"{format_timestamp(s['start'],'srt')} --> {format_timestamp(s['end'],'srt')}", txt, ""]
 
 
 
 
 
198
  return "\n".join(lines)
 
199
  if output_format == "vtt":
200
  lines = ["WEBVTT", ""]
201
  for i, s in enumerate(segments, 1):
202
+ txt = cleaned_lines[i-1] if len(cleaned_lines) == len(segments) else s["text"]
203
+ lines += [f"{format_timestamp(s['start'],'vtt')} --> {format_timestamp(s['end'],'vtt')}", txt, ""]
 
 
 
 
204
  return "\n".join(lines)
 
205
  if output_format == "json":
206
  segs_out = []
207
  for i, s in enumerate(segments):
208
  txt = cleaned_lines[i] if len(cleaned_lines) == len(segments) else s["text"]
209
  segs_out.append({"start": s["start"], "end": s["end"], "text": txt})
210
+ return json.dumps({"text": " ".join(cleaned_lines), "segments": segs_out}, ensure_ascii=False, indent=2)
 
 
 
 
 
 
 
 
 
 
 
211
 
212
  raise gr.Error(f"Unsupported format: {output_format}")
213
 
214
  # ────────────────────────────────────────────────────────────────────────────────
215
+ # UI
216
  # ────────────────────────────────────────────────────────────────────────────────
217
 
218
  theme = gr.themes.Soft(primary_hue="rose", secondary_hue="violet", neutral_hue="slate")
219
 
220
+ with gr.Blocks(title="Urdu ASR Studio — Faster-Whisper + LLM Polishing", theme=theme) as iface:
221
+ # Custom CSS to fix spacing + output height
 
 
 
222
  gr.HTML("""
223
  <style>
 
224
  .gradio-container { padding-bottom: 16px !important; }
 
 
 
 
 
 
 
 
225
  #result_box textarea {
226
  min-height: 260px !important;
227
  max-height: 360px !important;
228
  overflow-y: auto !important;
229
  }
 
 
 
230
  </style>
231
  """)
232
 
233
  gr.Markdown(
234
+ "## **Urdu STT with LLM** \n"
235
  "High-quality Urdu transcription with Faster-Whisper (CT2) and optional Groq LLM polishing."
236
  )
237
 
238
  with gr.Row():
239
  with gr.Column(scale=5):
240
  audio = gr.Audio(
241
+ sources=["upload","microphone"], type="filepath",
 
242
  label="Upload or Record Audio",
243
  waveform_options={"show_controls": False},
244
  autoplay=False, streaming=False,
245
  )
246
+ with gr.Accordion("Transcription Settings", open=False):
247
+ with gr.Row():
248
+ fmt = gr.Radio(choices=["text","srt","vtt","json"], value="text", label="Output Format")
249
+ beam = gr.Slider(1,10,5,step=1,label="Beam Size")
250
+ with gr.Accordion("LLM Polishing (Optional)", open=False):
251
+ llm_toggle = gr.Checkbox(value=False,label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)")
252
+ with gr.Row():
253
+ llm_temp = gr.Slider(0.0,1.0,0.2,step=0.05,label="LLM Temperature")
254
+ llm_key = gr.Textbox(label="GROQ_API_KEY (optional if set in environment)", type="password", value="")
255
+ llm_sys = gr.Textbox(label="LLM System Prompt (Urdu)", value=DEFAULT_SYSTEM_PROMPT_UR, lines=3)
256
+ with gr.Row():
257
+ test_btn = gr.Button("Test LLM", variant="secondary")
258
+ test_status = gr.Markdown("")
259
+ with gr.Row():
260
+ btn = gr.Button("Transcribe", variant="primary")
261
 
262
+ with gr.Column(scale=7):
263
+ out = gr.Textbox(label="Result", lines=14, max_lines=30, show_copy_button=True, elem_id="result_box")
264
 
265
+ btn.click(fn=transcribe_audio, inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys], outputs=out)
266
+ test_btn.click(fn=test_groq, inputs=[llm_key,llm_temp,llm_sys], outputs=[test_status])
 
 
 
267
 
268
  if __name__ == "__main__":
269
  iface.launch()