jmisak committed on
Commit
faacab1
Β·
verified Β·
1 Parent(s): 695f66d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -15
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import os
 
3
  from typing import List, Dict, Tuple
4
  from extractors import extract_docx, extract_pdf, validate_extraction
5
  from tagging import tag_speakers_advanced
@@ -8,8 +9,36 @@ from llm import query_llm, extract_structured_data
8
  from reporting import generate_enhanced_csv, generate_enhanced_pdf
9
  from dashboard import generate_comprehensive_dashboard
10
  from validation import validate_transcript_quality, check_data_completeness
11
- #from audio_transcriber import transcribe_with_diarization_streaming
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
@@ -173,16 +202,44 @@ Additional Instructions:
173
  print(f"[File {i+1}] βœ“ Processing complete")
174
 
175
  except Exception as e:
176
- error_msg = f"[Error] {file_name} failed: {str(e)}"
 
 
 
 
 
 
177
  print(error_msg)
178
- processing_errors.append(error_msg)
 
 
 
 
 
 
 
 
 
179
  all_results.append({
180
  "transcript_id": f"Transcript {i+1}",
181
  "file_name": file_name,
182
  "full_text": error_msg,
183
  "structured_data": {},
184
  "quality_score": 0.0,
185
- "word_count": 0
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  })
187
 
188
  # Generate cross-transcript summary
@@ -256,16 +313,96 @@ Additional Instructions:
256
  Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
257
  """
258
 
259
- summary, summary_data = query_llm(
260
- summary_prompt,
261
- user_context,
262
- interviewee_type,
263
- extract_structured=False,
264
- is_summary=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  )
266
-
267
- print("[Summary] βœ“ Generated")
268
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  # Generate enhanced reports
270
  csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
271
  print(f"[CSV] βœ“ Saved to {csv_path}")
@@ -373,7 +510,39 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
373
 
374
  with gr.Tabs():
375
 
376
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
 
379
 
@@ -526,4 +695,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
526
  """)
527
 
528
  if __name__ == "__main__":
529
- demo.launch()
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import os
3
+ from datetime import datetime
4
  from typing import List, Dict, Tuple
5
  from extractors import extract_docx, extract_pdf, validate_extraction
6
  from tagging import tag_speakers_advanced
 
9
  from reporting import generate_enhanced_csv, generate_enhanced_pdf
10
  from dashboard import generate_comprehensive_dashboard
11
  from validation import validate_transcript_quality, check_data_completeness
12
+ from audio_transcriber import transcribe_with_diarization_streaming
13
 
14
# HuggingFace Spaces Configuration
# NOTE: `os` is already imported at the top of this file; the duplicate
# `import os` that used to live here has been removed.
# These environment variables tune the LLM layer for the constrained
# Spaces runtime: HF inference-API backend, short timeout, small responses.
os.environ["LLM_BACKEND"] = "hf_api"
os.environ["LLM_TIMEOUT"] = "25"
os.environ["MAX_TOKENS_PER_REQUEST"] = "100"
print("πŸš€ Running on HuggingFace Spaces - Optimized Configuration Loaded")
20
+
21
def preprocess_audio(audio_files, num_speakers):
    """Transcribe uploaded audio files into speaker-tagged transcript files.

    Args:
        audio_files: List of uploaded audio files (Gradio file objects with a
            ``.name`` path attribute, or plain path strings/objects). May be
            ``None`` or empty.
        num_speakers: Expected number of speakers, forwarded to diarization.

    Returns:
        Tuple ``(transcript_paths, status)`` where ``transcript_paths`` is a
        list of generated transcript file paths (or ``None`` if none were
        produced) and ``status`` is a human-readable per-file status log.
    """
    if not audio_files:
        return None, "No audio files provided"

    transcript_paths = []
    status = ""

    for audio in audio_files:
        # Resolve the filesystem path up front (outside the try) so the
        # except handler can always reference it; the original assigned it
        # inside the try, risking an unbound name in the error message.
        audio_path = audio.name if hasattr(audio, 'name') else str(audio)
        try:
            # Bug fix: this module imports transcribe_with_diarization_streaming,
            # but the original called the undefined name transcribe_with_diarization,
            # which raised NameError on every file.
            transcript_path = transcribe_with_diarization_streaming(audio_path, num_speakers)
            transcript_paths.append(transcript_path)
            status += f"βœ“ {os.path.basename(audio_path)} β†’ {transcript_path}\n"
        except Exception as e:
            # Best-effort per-file handling: record the failure and continue.
            status += f"βœ— {os.path.basename(audio_path)}: {str(e)}\n"

    # Return list of paths for the gr.File output component (None when empty).
    return transcript_paths if transcript_paths else None, status
42
 
43
 
44
  def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
 
202
  print(f"[File {i+1}] βœ“ Processing complete")
203
 
204
  except Exception as e:
205
+ # Enhanced error tracking with type and traceback
206
+ import traceback
207
+ error_type = type(e).__name__
208
+ error_details = str(e)
209
+ error_traceback = traceback.format_exc()
210
+
211
+ error_msg = f"[{error_type}] {file_name}: {error_details}"
212
  print(error_msg)
213
+
214
+ # Store comprehensive error information
215
+ processing_errors.append({
216
+ "transcript_id": f"Transcript {i+1}",
217
+ "file_name": file_name,
218
+ "error_type": error_type,
219
+ "error_message": error_details[:200], # Truncate long messages
220
+ "timestamp": datetime.now().isoformat()
221
+ })
222
+
223
  all_results.append({
224
  "transcript_id": f"Transcript {i+1}",
225
  "file_name": file_name,
226
  "full_text": error_msg,
227
  "structured_data": {},
228
  "quality_score": 0.0,
229
+ "word_count": 0,
230
+ "processing_status": "FAILED",
231
+ "error_type": error_type
232
+ })
233
+
234
+ # Add to CSV with error metadata
235
+ csv_rows.append({
236
+ "Transcript ID": f"Transcript {i+1}",
237
+ "File Name": file_name,
238
+ "Quality Score": 0.0,
239
+ "Word Count": 0,
240
+ "Processing Status": "FAILED",
241
+ "Error Type": error_type,
242
+ "Error Message": error_details[:100]
243
  })
244
 
245
  # Generate cross-transcript summary
 
313
  Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
314
  """
315
 
316
+ # Use robust LLM with aggressive timeout protection
317
+ print("[Summary] Generating cross-transcript summary...")
318
+ print("[Summary] Note: This may take 30-60 seconds for large datasets")
319
+
320
+ try:
321
+ from llm_robust import query_llm_with_timeout
322
+
323
+ summary, summary_data = query_llm_with_timeout(
324
+ summary_prompt,
325
+ user_context,
326
+ interviewee_type,
327
+ extract_structured=False,
328
+ is_summary=True,
329
+ max_timeout=60 # 60 second hard timeout
330
+ )
331
+ except Exception as e:
332
+ # Ultimate fallback
333
+ print(f"[Summary] Critical error: {e}")
334
+ print("[Summary] Using emergency fallback...")
335
+ from llm_robust import generate_emergency_summary
336
+ summary, summary_data = generate_emergency_summary(interviewee_type)
337
+
338
+ # Validate summary quality and retry if needed
339
+ from validation import validate_summary_quality
340
+ summary_score, summary_issues = validate_summary_quality(
341
+ summary,
342
+ len(valid_results)
343
  )
344
+
345
+ if summary_score < 0.7: # Quality threshold
346
+ print(f"[Warning] Summary quality issues (score: {summary_score:.2f}): {summary_issues}")
347
+ print("[Summary] Retrying with stricter validation...")
348
+
349
+ # Retry with enhanced prompt emphasizing validation failures
350
+ retry_prompt = summary_prompt + f"""
351
+
352
+ CRITICAL: Previous attempt failed validation with these issues:
353
+ {chr(10).join('- ' + issue for issue in summary_issues)}
354
+
355
+ MANDATORY CORRECTIONS:
356
+ - Use ONLY specific numbers (e.g., "8 out of {len(valid_results)}" not "most")
357
+ - Include percentages in parentheses
358
+ - Cite transcript numbers for every claim
359
+ - Minimum length: 500 words
360
+ - No absolute terms (all/everyone/never/always) without 100% evidence
361
+ """
362
+
363
+ try:
364
+ summary, summary_data = query_llm_with_timeout(
365
+ retry_prompt,
366
+ user_context,
367
+ interviewee_type,
368
+ extract_structured=False,
369
+ is_summary=True,
370
+ max_timeout=60 # 60 second hard timeout for retry
371
+ )
372
+ except Exception as e:
373
+ print(f"[Summary] Retry also failed: {e}")
374
+ print("[Summary] Using emergency fallback for retry...")
375
+ summary, summary_data = generate_emergency_summary(interviewee_type)
376
+
377
+ # Re-validate
378
+ summary_score, summary_issues = validate_summary_quality(summary, len(valid_results))
379
+
380
+ if summary_score < 0.7:
381
+ # Add quality warning to summary header
382
+ warning_header = f"""[QUALITY WARNING - Score: {summary_score:.2f}]
383
+ Validation issues detected: {'; '.join(summary_issues)}
384
+ Please review findings carefully and verify against source data.
385
+
386
+ {'='*60}
387
+
388
+ """
389
+ summary = warning_header + summary
390
+ print(f"[Warning] Summary still has issues after retry (score: {summary_score:.2f})")
391
+ else:
392
+ print(f"[Summary] βœ“ Validation passed after retry (score: {summary_score:.2f})")
393
+ else:
394
+ print(f"[Summary] βœ“ Validation passed (score: {summary_score:.2f})")
395
+
396
+ # Verify consensus claims against actual data
397
+ from validation import verify_consensus_claims
398
+ consensus_warnings = verify_consensus_claims(summary, valid_results)
399
+ if consensus_warnings:
400
+ print(f"[Warning] Consensus verification issues: {len(consensus_warnings)} found")
401
+ consensus_note = "\n\n[CONSENSUS VERIFICATION NOTES]:\n" + "\n".join(f"- {w}" for w in consensus_warnings) + "\n\n"
402
+ summary = summary + consensus_note
403
+ else:
404
+ print("[Summary] βœ“ Consensus claims verified")
405
+
406
  # Generate enhanced reports
407
  csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
408
  print(f"[CSV] βœ“ Saved to {csv_path}")
 
510
 
511
  with gr.Tabs():
512
 
513
+ with gr.TabItem("🎀 Audio Preprocessing"):
514
+ gr.Markdown("""
515
+ Upload audio interviews to auto-transcribe with speaker identification.
516
+ Outputs DOCX files ready for analysis.
517
+ """)
518
+
519
+ with gr.Row():
520
+ audio_input = gr.File(
521
+ label="Upload Audio Files",
522
+ file_types=[".mp3", ".wav", ".m4a", ".flac"],
523
+ file_count="multiple"
524
+ )
525
+ num_speakers_input = gr.Slider(
526
+ minimum=1,
527
+ maximum=5,
528
+ value=2,
529
+ step=1,
530
+ label="Number of Speakers"
531
+ )
532
+
533
+ transcribe_btn = gr.Button("πŸŽ™οΈ Transcribe Audio", variant="primary")
534
+ transcribe_status = gr.Textbox(label="Status", lines=10)
535
+ transcript_files = gr.File(label="Download Transcripts", file_count="multiple")
536
+
537
+ transcribe_btn.click(
538
+ fn=preprocess_audio,
539
+ inputs=[audio_input, num_speakers_input],
540
+ outputs=[transcript_files, transcribe_status]
541
+ )
542
+
543
+ gr.Markdown("""
544
+ **Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them.
545
+ """)
546
 
547
 
548
 
 
695
  """)
696
 
697
if __name__ == "__main__":
    # Serialize requests (one at a time) with a bounded wait queue, and
    # keep the programmatic API closed on the public Space.
    # NOTE(review): `concurrency_count` was renamed `default_concurrency_limit`
    # in Gradio 4 — confirm the pinned gradio version before upgrading.
    queued = demo.queue(concurrency_count=1, max_size=10, api_open=False)
    # Bind on all interfaces at the port HuggingFace Spaces expects, and
    # surface errors in the UI for easier debugging.
    queued.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
+ )