niks1419 committed
Commit 9ad2a14 · verified · 1 Parent(s): 2eb43a7

Create app.py

Files changed (1)
  1. app.py +613 -0
app.py ADDED
@@ -0,0 +1,613 @@
"""
Gradio app wrapping your diarization + separation + enhancement + transcription pipeline.
"""

import os
import tempfile
import math
import json
import shutil
import time
from datetime import timedelta
from pathlib import Path
from typing import List, Tuple

import re
import numpy as np
import soundfile as sf
import librosa
import noisereduce as nr
import gradio as gr
# Lazy imports (heavy models) will be done inside the worker function
# to keep the app responsive on startup.

# -----------------------
# Configuration defaults
# -----------------------
SAMPLE_RATE = 16000
CHUNK_DURATION = 8.0
KEYWORDS = ["red", "yellow", "green"]
HF_TOKEN_E = os.environ.get("HF_TOKEN")

# -----------------------
# Helper utilities
# -----------------------
def time_to_samples(t: float, sr: int) -> int:
    return int(round(t * sr))


def save_wav(path: str, data: np.ndarray, sr: int = SAMPLE_RATE):
    sf.write(path, data.astype(np.float32), sr)

# -----------------------
# Transcription helpers
# -----------------------

def transcribe_audio_array_with_whisper(audio: np.ndarray, sr: int, whisper_model) -> dict:
    """Whisper expects a file path; write to a temp wav, then transcribe."""
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        sf.write(tmp.name, audio.astype(np.float32), sr)
        res = whisper_model.transcribe(tmp.name, task="transcribe", fp16=False, language=None)
        return res
    except Exception:
        return {"text": "", "segments": []}
    finally:
        try:
            tmp.close()
            os.unlink(tmp.name)
        except Exception:
            pass


def transcribe_file_with_whisper(wav_path: str, whisper_model) -> dict:
    try:
        res = whisper_model.transcribe(wav_path, task="transcribe", fp16=False, language=None)
        return res
    except Exception:
        return {"text": "", "segments": []}

# -----------------------
# Keyword finder
# -----------------------

def find_keywords_in_text(text: str, keywords: List[str]) -> List[Tuple[str, int]]:
    found = []
    for kw in keywords:
        for match in re.finditer(rf"\b{re.escape(kw)}\b", text, flags=re.IGNORECASE):
            found.append((kw, match.start()))
    return found

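# Example (illustrative):
#   find_keywords_in_text("The red light turned green", ["red", "green"])
#   -> [("red", 4), ("green", 21)]   # (keyword, character offset) for each whole-word match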

# -----------------------
# Main pipeline (wrapped for Gradio streaming)
# -----------------------

def pipeline_worker(video_file_path: str, keywords: List[str]):
    """
    Generator that yields progress logs and finally yields (log, file_list, keyword_log, transcripts_json_path).
    The Gradio interface will call this function and stream the logs.
    """
    # Prepare temporary output directory per run
    run_dir = tempfile.mkdtemp(prefix="diarize_run_")
    out_dir = os.path.join(run_dir, "out")
    os.makedirs(out_dir, exist_ok=True)

    logs = []

    def emit(message: str):
        nonlocal logs
        logs.append(message)
        yield "\n".join(logs), "", "", ""
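
    # Streaming note: emit() is a small generator that appends the message to `logs`
    # and yields one (log_text, file_list, keyword_log, transcript_path) tuple, with
    # empty placeholders for the last three fields. Every `yield from emit(...)` below
    # therefore pushes the updated progress log to the Gradio UI mid-run.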

    # 1) Convert mp4 to wav (use moviepy)
    yield from emit(f"Starting run — saving outputs to: {out_dir}")

    try:
        from moviepy.editor import VideoFileClip
    except Exception as e:
        yield from emit(f"ERROR: moviepy import failed: {e}")
        return

    wav_path = os.path.join(run_dir, "input_audio.wav")
    try:
        yield from emit("Extracting audio from video...")
        clip = VideoFileClip(video_file_path)
        clip.audio.write_audiofile(wav_path, codec="pcm_s16le")
        clip.close()
        yield from emit(f"Saved extracted audio: {wav_path}")
    except Exception as e:
        yield from emit(f"ERROR extracting audio: {e}")
        return

    # 2) Load audio (librosa)
    try:
        y, sr = librosa.load(wav_path, sr=SAMPLE_RATE, mono=True)
        duration = len(y) / sr
        yield from emit(f"Loaded audio: {duration:.1f}s @ {sr}Hz")
    except Exception as e:
        yield from emit(f"ERROR loading audio: {e}")
        return
    # Lazy-load heavy models
    yield from emit("Loading diarization & embedding models (this can take a while)...")
    HF_TOKEN = os.environ.get("HF_TOKEN_1")

    try:
        from pyannote.audio import Pipeline, Model
        # diarize_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2022.07", use_auth_token=HF_TOKEN)
        diarize_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
        embedding_model = Model.from_pretrained("pyannote/embedding")

        yield from emit("pyannote models loaded.")
    except Exception as e:
        yield from emit(f"WARNING: pyannote models failed to load: {e}\nDiarization may not work.")
        diarize_pipeline = None
        embedding_model = None
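
    # Note: "pyannote/speaker-diarization" is typically a gated model on the Hugging Face
    # Hub; if loading fails here, passing use_auth_token (as in the commented-out line
    # above) is usually the missing piece.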
    # Load separation & enhancement (speechbrain) lazily
    try:
        from speechbrain.pretrained import SepformerSeparation as Sepformer
        from speechbrain.pretrained import SpectralMaskEnhancement as Enhancer
        sepformer = Sepformer.from_hparams(source="speechbrain/sepformer-whamr", savedir=os.path.join(run_dir, "tmp_speechbrain_sepformer"))
        enhancer = Enhancer.from_hparams(source="speechbrain/metricgan-plus-voicebank", savedir=os.path.join(run_dir, "tmp_speechbrain_enh"))
        yield from emit("Speechbrain sepformer + enhancer loaded.")
    except Exception as e:
        yield from emit(f"WARNING: speechbrain models failed to load: {e}\nSeparation/enhancement fallbacks will be used.")
        sepformer = None
        enhancer = None

    # Load whisper model lazily
    try:
        import whisper
        whisper_model = whisper.load_model("large-v3", device="cpu")
        yield from emit("Whisper loaded (large-v3) on CPU.")
    except Exception as e:
        yield from emit(f"ERROR loading Whisper model: {e}")
        whisper_model = None
    # Run diarization
    if diarize_pipeline is None:
        yield from emit("Skipping diarization (pipeline unavailable). Creating a single 'SPEAKER_0' segment covering the full audio.")
        diarization = None
        speakers = ["SPEAKER_0"]
        segments = [(0.0, duration, "SPEAKER_0")]
    else:
        yield from emit("Running diarization... This may take a while.")
        try:
            diarization = diarize_pipeline({"audio": wav_path})
            speakers = sorted({label for segment, track, label in diarization.itertracks(yield_label=True)})
            yield from emit(f"Detected speakers: {speakers}")
        except Exception as e:
            yield from emit(f"ERROR during diarization: {e}")
            diarization = None
            speakers = ["SPEAKER_0"]

    # Prepare speaker buffers
    speaker_buffers = {sp: [] for sp in speakers}
    transcriptions = []

    # Helper to compute embedding from numpy audio (if model available)
    def embedding_from_audio(audio_np: np.ndarray):
        if embedding_model is None:
            return np.zeros((1, 256))
        waveform = audio_np.reshape(1, -1)
        try:
            emb = embedding_model({'waveform': waveform, 'sample_rate': SAMPLE_RATE})
            return emb.data.numpy().reshape(1, -1)
        except Exception:
            # Any failure (e.g. dtype/shape mismatch) falls back to a zero embedding.
            return np.zeros((1, 256))

    # Iterate through diarized segments (or single fallback)
    yield from emit("Processing diarized segments (separation/enhancement/transcription)...")

    if diarization is None:
        segments_iter = [(0.0, duration, "SPEAKER_0")]
    else:
        segments_iter = [(seg.start, seg.end, lbl) for seg, _, lbl in diarization.itertracks(yield_label=True)]

    for idx, (start, end, label) in enumerate(segments_iter):
        seg_dur = end - start
        a_samp = time_to_samples(start, sr)
        b_samp = time_to_samples(end, sr)
        seg_audio = y[a_samp:b_samp]

        yield from emit(f"Segment {idx+1}/{len(segments_iter)}: {label} [{start:.2f}-{end:.2f}] ({seg_dur:.2f}s)")

        # Detect overlaps (simple check)
        is_overlap = False
        if diarization is not None:
            overlapped_labels = [lbl for s2, _, lbl in diarization.itertracks(yield_label=True) if s2.start < end and s2.end > start and lbl != label]
            is_overlap = len(overlapped_labels) > 0

        # Non-overlap & short => enhance and append
        if not is_overlap and seg_dur <= CHUNK_DURATION:
            # attempt enhancer
            try:
                if enhancer is not None:
                    import torch
                    wav_tensor = torch.tensor(seg_audio).float().unsqueeze(0)
                    enhanced = enhancer.enhance_batch(wav_tensor).squeeze(0).numpy()
                else:
                    raise Exception("enhancer unavailable")
            except Exception:
                enhanced = nr.reduce_noise(y=seg_audio, sr=sr)

            speaker_buffers[label].append(enhanced.flatten())

            # transcribe
            if whisper_model is not None:
                try:
                    res = transcribe_audio_array_with_whisper(enhanced, sr, whisper_model)
                    transcript_text = res.get("text", "").strip()
                except Exception:
                    transcript_text = "[Transcription failed]"
            else:
                transcript_text = "[Whisper unavailable]"

            transcriptions.append({
                "speaker": label,
                "start": float(start),
                "end": float(end),
                "duration": float(seg_dur),
                "text": transcript_text,
            })
        else:
            # Overlapped or long: chunk, separate, embed, match to prototypes
            samples = seg_audio
            n_chunks = max(1, math.ceil(len(samples) / int(CHUNK_DURATION * sr)))
            chunk_size = int(len(samples) / n_chunks)

            for i in range(n_chunks):
                a = i * chunk_size
                b = min(len(samples), (i + 1) * chunk_size)
                chunk = samples[a:b]
                if len(chunk) < 100:
                    continue

                # Try sepformer separation
                est_sources = None
                try:
                    if sepformer is not None:
                        # try a chunk-wise API if this SepFormer build exposes one,
                        # otherwise fall back to separate_file on a temporary wav
                        try:
                            est_sources = sepformer.separate_file_chunkwise(batch_audio=chunk, sample_rate=sr)
                        except Exception:
                            tmpf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
                            sf.write(tmpf.name, chunk, sr)
                            est = sepformer.separate_file(tmpf.name)
                            tmpf.close()
                            os.unlink(tmpf.name)
                            est_sources = est
                except Exception:
                    est_sources = None

                if est_sources is None:
                    # fallback: attempt simple split into two channels (if mono, duplicate) — conservative fallback
                    est_sources = [chunk, chunk]

                # Compute embeddings
                embeddings = []
                for src in est_sources:
                    try:
                        emb = embedding_from_audio(np.asarray(src).flatten())
                    except Exception:
                        emb = np.zeros((1, 256))
                    embeddings.append(emb)
                # Speaker prototypes
                speaker_protos = {}
                for sp in speakers:
                    if len(speaker_buffers[sp]) > 0:
                        ex = np.concatenate([np.asarray(p).flatten() for p in speaker_buffers[sp][:1]])
                        speaker_protos[sp] = embedding_from_audio(ex)
                    else:
                        speaker_protos[sp] = None

                for src_idx, emb in enumerate(embeddings):
                    best_sp, best_sim = None, -1
                    for sp in speakers:
                        proto = speaker_protos[sp]
                        if proto is None:
                            continue
                        try:
                            from sklearn.metrics.pairwise import cosine_similarity
                            sim = cosine_similarity(emb, proto)[0, 0]
                        except Exception:
                            sim = -1
                        if sim > best_sim:
                            best_sim = sim
                            best_sp = sp

                    assign_to = best_sp if best_sp is not None else speakers[src_idx % len(speakers)]
                    speaker_buffers[assign_to].append(np.asarray(est_sources[src_idx]).flatten())

                    # Transcribe separated chunk
                    if whisper_model is not None:
                        try:
                            res = transcribe_audio_array_with_whisper(np.asarray(est_sources[src_idx]).flatten(), sr, whisper_model)
                            transcript_text = res.get("text", "").strip()
                        except Exception:
                            transcript_text = "[Transcription failed]"
                    else:
                        transcript_text = "[Whisper unavailable]"

                    transcriptions.append({
                        "speaker": assign_to,
                        "start": float(start + a / sr),
                        "end": float(start + b / sr),
                        "duration": float((b - a) / sr),
                        "text": transcript_text,
                    })

        # Emit progress after each segment
        yield from emit(f"Processed segment {idx+1}/{len(segments_iter)}")

    # After processing all segments: write per-speaker concatenated wavs
    yield from emit("Concatenating speaker buffers and saving speaker wav files...")
    generated_files = []
    for sp, pieces in speaker_buffers.items():
        if len(pieces) == 0:
            continue
        out = np.concatenate([np.asarray(p).flatten() for p in pieces])
        out_path = os.path.join(out_dir, f"{sp}.wav")
        save_wav(out_path, out, sr)
        generated_files.append(out_path)
        yield from emit(f"Saved speaker file: {out_path}")
    # Build residual noise track (simple reconstruction)
    yield from emit("Building residual noise track...")
    recon = np.zeros_like(y)
    cursor = 0
    for sp, pieces in speaker_buffers.items():
        if len(pieces) == 0:
            continue
        recon_piece = np.concatenate([np.asarray(p).flatten() for p in pieces])
        length = min(len(recon_piece), len(recon) - cursor)
        if length <= 0:
            continue
        recon[cursor:cursor+length] += recon_piece[:length]
        cursor += length

    residual = y - recon
    residual_path = os.path.join(out_dir, "noise_residual.wav")
    save_wav(residual_path, residual, sr)
    generated_files.append(residual_path)
    yield from emit(f"Saved residual: {residual_path}")

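    # Note: the reconstruction above places the concatenated speaker pieces back-to-back
    # from the start of the file rather than at their original segment times, so
    # `residual` is only a rough estimate of the non-speech content.
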
    # Save timestamped transcriptions (from the `transcriptions` built earlier)
    transcript_file = os.path.join(out_dir, "timestamped_transcriptions.json")
    with open(transcript_file, "w", encoding="utf-8") as f:
        json.dump(transcriptions, f, indent=2, ensure_ascii=False)
    generated_files.append(transcript_file)
    yield from emit(f"Saved timestamped transcriptions: {transcript_file}")
    # Run a second pass: run whisper on each speaker file for segments (detailed JSON)
    yield from emit("Running final Whisper pass on each speaker file to produce detailed transcripts...")
    detailed_paths = []
    for sp in speakers:
        sp_wav_path = os.path.join(out_dir, f"{sp}.wav")
        if not os.path.exists(sp_wav_path):
            continue
        if whisper_model is not None:
            res = transcribe_file_with_whisper(sp_wav_path, whisper_model)
            text = res.get("text", "").strip()
            segments = res.get("segments", [])
        else:
            text = ""
            segments = []

        json_path = os.path.join(out_dir, f"{sp}_transcript.json")
        with open(json_path, "w", encoding="utf-8") as fj:
            json.dump({"speaker": sp, "text": text, "segments": segments}, fj, indent=2, ensure_ascii=False)
        detailed_paths.append(json_path)
        generated_files.append(json_path)
        yield from emit(f"Saved detailed JSON: {json_path}")
    # Keyword scanning
    yield from emit("Scanning transcripts for keywords...")
    keyword_log_lines = []
    for sp in speakers:
        json_path = os.path.join(out_dir, f"{sp}_transcript.json")
        if not os.path.exists(json_path):
            continue
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        text = data.get("text", "")
        segments = data.get("segments", [])

        if segments:
            for seg in segments:
                seg_text = seg.get("text", "")
                seg_start = seg.get("start", 0)
                seg_end = seg.get("end", 0)
                hits = find_keywords_in_text(seg_text, keywords)
                if hits:
                    s_td = str(timedelta(seconds=float(seg_start)))
                    e_td = str(timedelta(seconds=float(seg_end)))
                    line = f"Speaker: {sp} [{s_td} --> {e_td}] Text: {seg_text.strip()}"
                    keyword_log_lines.append(line)
        else:
            hits = find_keywords_in_text(text, keywords)
            if hits:
                line = f"Speaker: {sp} [No segment timestamps available] Excerpt: {text.strip()[:200]}"
                keyword_log_lines.append(line)

    if len(keyword_log_lines) == 0:
        keyword_log = "No keyword matches found."
    else:
        keyword_log = "\n".join(keyword_log_lines)

    yield from emit("Keyword scan complete.")

    # Final yield: logs, list of generated files (as newline list), keyword matches, path to timestamped JSON
    file_list_text = "\n".join(generated_files)

    yield "\n".join(logs), file_list_text, keyword_log, transcript_file

# -----------------------
# Gradio UI
# -----------------------

def build_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Voice Analysis (Diarisation and Signal Identification)\nUpload an MP4 and click Run to start analysis.")

        with gr.Row():
            video_in = gr.Video(label="Input video (.mp4)")
            keywords_in = gr.Textbox(value=",".join(KEYWORDS), label="Keywords (comma separated)")

        run_btn = gr.Button("Run")

        with gr.Row():
            logs_out = gr.Textbox(label="Progress logs", lines=20)
            files_out = gr.Textbox(label="Generated files (saved in temp run folder)", lines=20)

        with gr.Row():
            keywords_out = gr.Textbox(label="Keyword matches (console-style)", lines=5)
            transcript_json_out = gr.Textbox(label="Timestamped transcript JSON path")

        # Add a JSON viewer for transcript preview
        with gr.Accordion("📜 View Detailed Transcript JSON", open=False):
            transcript_view = gr.JSON(label="Transcript Data (Timestamps + Text)")

        # Function to open and display the transcript JSON file
        def open_transcript_json(json_path):
            if not os.path.exists(json_path):
                return {"error": "File not found"}
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                return data
            except Exception as e:
                return {"error": str(e)}

        # Button to view JSON file content
        view_btn = gr.Button("Open Transcript JSON")
        view_btn.click(fn=open_transcript_json, inputs=transcript_json_out, outputs=transcript_view)

        def run_and_stream(video_path, keywords_text):
            keys = [k.strip() for k in keywords_text.split(",") if k.strip()]
            gen = pipeline_worker(video_path, keys)
            for out in gen:
                yield out

        run_btn.click(fn=run_and_stream, inputs=[video_in, keywords_in], outputs=[logs_out, files_out, keywords_out, transcript_json_out])
        # run_btn.click(fn=run_and_stream, inputs=[video_in, keywords_in], outputs=[keywords_out, transcript_json_out])

    return demo

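
# Note: depending on the installed Gradio version, streaming generator outputs may
# require enabling the queue (e.g. calling app.queue() before app.launch()).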
app = build_interface()

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)