Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
|
|
|
| 3 |
from typing import List, Dict, Tuple
|
| 4 |
from extractors import extract_docx, extract_pdf, validate_extraction
|
| 5 |
from tagging import tag_speakers_advanced
|
|
@@ -8,8 +9,36 @@ from llm import query_llm, extract_structured_data
|
|
| 8 |
from reporting import generate_enhanced_csv, generate_enhanced_pdf
|
| 9 |
from dashboard import generate_comprehensive_dashboard
|
| 10 |
from validation import validate_transcript_quality, check_data_completeness
|
| 11 |
-
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
|
|
@@ -173,16 +202,44 @@ Additional Instructions:
|
|
| 173 |
print(f"[File {i+1}] β Processing complete")
|
| 174 |
|
| 175 |
except Exception as e:
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
print(error_msg)
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
all_results.append({
|
| 180 |
"transcript_id": f"Transcript {i+1}",
|
| 181 |
"file_name": file_name,
|
| 182 |
"full_text": error_msg,
|
| 183 |
"structured_data": {},
|
| 184 |
"quality_score": 0.0,
|
| 185 |
-
"word_count": 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
})
|
| 187 |
|
| 188 |
# Generate cross-transcript summary
|
|
@@ -256,16 +313,96 @@ Additional Instructions:
|
|
| 256 |
Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
|
| 257 |
"""
|
| 258 |
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
)
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
# Generate enhanced reports
|
| 270 |
csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
|
| 271 |
print(f"[CSV] β Saved to {csv_path}")
|
|
@@ -373,7 +510,39 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 373 |
|
| 374 |
with gr.Tabs():
|
| 375 |
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
|
| 379 |
|
|
@@ -526,4 +695,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 526 |
""")
|
| 527 |
|
| 528 |
if __name__ == "__main__":
|
| 529 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
+
from datetime import datetime
|
| 4 |
from typing import List, Dict, Tuple
|
| 5 |
from extractors import extract_docx, extract_pdf, validate_extraction
|
| 6 |
from tagging import tag_speakers_advanced
|
|
|
|
| 9 |
from reporting import generate_enhanced_csv, generate_enhanced_pdf
|
| 10 |
from dashboard import generate_comprehensive_dashboard
|
| 11 |
from validation import validate_transcript_quality, check_data_completeness
|
| 12 |
+
from audio_transcriber import transcribe_with_diarization_streaming
|
| 13 |
|
| 14 |
+
# HuggingFace Spaces configuration.
# These settings are written into the process environment at import time;
# how they are consumed is not visible in this file (presumably by the LLM
# helper modules -- confirm against their code).
import os

# os.environ only stores text, so the numeric settings stay quoted.
os.environ.update(
    {
        "LLM_BACKEND": "hf_api",
        "LLM_TIMEOUT": "25",
        "MAX_TOKENS_PER_REQUEST": "100",
    }
)
print("π Running on HuggingFace Spaces - Optimized Configuration Loaded")
|
| 20 |
+
|
| 21 |
+
def preprocess_audio(audio_files, num_speakers):
    """Transcribe uploaded audio files and build a per-file status report.

    Args:
        audio_files: Uploaded items. Each item is either an object exposing
            a ``name`` attribute (used as the file path) or a value whose
            ``str()`` is the path. ``None`` or an empty list is accepted.
        num_speakers: Passed through unchanged to the transcriber.

    Returns:
        A ``(transcript_paths, status)`` tuple. ``transcript_paths`` is the
        list of values returned by the transcriber, or ``None`` when nothing
        was transcribed. ``status`` contains one newline-terminated line per
        input file (or a fixed message when no files were given).
    """
    if not audio_files:
        return None, "No audio files provided"

    transcript_paths = []
    status_lines = []

    for audio in audio_files:
        # Resolve the path before entering the try block so the error branch
        # always names the file that actually failed (never an unbound or
        # stale value from a previous iteration).
        audio_path = audio.name if hasattr(audio, "name") else str(audio)
        display_name = os.path.basename(audio_path)

        try:
            # NOTE(review): the imports visible in this file only bring in
            # `transcribe_with_diarization_streaming`. Confirm that
            # `transcribe_with_diarization` is imported elsewhere; if it is
            # not, every file ends up in the except branch with a NameError.
            transcript_path = transcribe_with_diarization(audio_path, num_speakers)
        except Exception as e:
            # Best-effort batch: one bad file must not abort the others.
            status_lines.append(f"✗ {display_name}: {e}\n")
        else:
            transcript_paths.append(transcript_path)
            status_lines.append(f"✓ {display_name} → {transcript_path}\n")

    # The file output component receives None (not an empty list) when no
    # transcript was produced.
    return (transcript_paths or None), "".join(status_lines)
|
| 42 |
|
| 43 |
|
| 44 |
def analyze(files, file_type, user_comments, role_hint, debug_mode, interviewee_type, progress=gr.Progress()):
|
|
|
|
| 202 |
print(f"[File {i+1}] β Processing complete")
|
| 203 |
|
| 204 |
except Exception as e:
|
| 205 |
+
# Enhanced error tracking with type and traceback
|
| 206 |
+
import traceback
|
| 207 |
+
error_type = type(e).__name__
|
| 208 |
+
error_details = str(e)
|
| 209 |
+
error_traceback = traceback.format_exc()
|
| 210 |
+
|
| 211 |
+
error_msg = f"[{error_type}] {file_name}: {error_details}"
|
| 212 |
print(error_msg)
|
| 213 |
+
|
| 214 |
+
# Store comprehensive error information
|
| 215 |
+
processing_errors.append({
|
| 216 |
+
"transcript_id": f"Transcript {i+1}",
|
| 217 |
+
"file_name": file_name,
|
| 218 |
+
"error_type": error_type,
|
| 219 |
+
"error_message": error_details[:200], # Truncate long messages
|
| 220 |
+
"timestamp": datetime.now().isoformat()
|
| 221 |
+
})
|
| 222 |
+
|
| 223 |
all_results.append({
|
| 224 |
"transcript_id": f"Transcript {i+1}",
|
| 225 |
"file_name": file_name,
|
| 226 |
"full_text": error_msg,
|
| 227 |
"structured_data": {},
|
| 228 |
"quality_score": 0.0,
|
| 229 |
+
"word_count": 0,
|
| 230 |
+
"processing_status": "FAILED",
|
| 231 |
+
"error_type": error_type
|
| 232 |
+
})
|
| 233 |
+
|
| 234 |
+
# Add to CSV with error metadata
|
| 235 |
+
csv_rows.append({
|
| 236 |
+
"Transcript ID": f"Transcript {i+1}",
|
| 237 |
+
"File Name": file_name,
|
| 238 |
+
"Quality Score": 0.0,
|
| 239 |
+
"Word Count": 0,
|
| 240 |
+
"Processing Status": "FAILED",
|
| 241 |
+
"Error Type": error_type,
|
| 242 |
+
"Error Message": error_details[:100]
|
| 243 |
})
|
| 244 |
|
| 245 |
# Generate cross-transcript summary
|
|
|
|
| 313 |
Be specific. Use numbers. Cite transcript IDs. Flag weak evidence.
|
| 314 |
"""
|
| 315 |
|
| 316 |
+
# Use robust LLM with aggressive timeout protection
|
| 317 |
+
print("[Summary] Generating cross-transcript summary...")
|
| 318 |
+
print("[Summary] Note: This may take 30-60 seconds for large datasets")
|
| 319 |
+
|
| 320 |
+
try:
|
| 321 |
+
from llm_robust import query_llm_with_timeout
|
| 322 |
+
|
| 323 |
+
summary, summary_data = query_llm_with_timeout(
|
| 324 |
+
summary_prompt,
|
| 325 |
+
user_context,
|
| 326 |
+
interviewee_type,
|
| 327 |
+
extract_structured=False,
|
| 328 |
+
is_summary=True,
|
| 329 |
+
max_timeout=60 # 60 second hard timeout
|
| 330 |
+
)
|
| 331 |
+
except Exception as e:
|
| 332 |
+
# Ultimate fallback
|
| 333 |
+
print(f"[Summary] Critical error: {e}")
|
| 334 |
+
print("[Summary] Using emergency fallback...")
|
| 335 |
+
from llm_robust import generate_emergency_summary
|
| 336 |
+
summary, summary_data = generate_emergency_summary(interviewee_type)
|
| 337 |
+
|
| 338 |
+
# Validate summary quality and retry if needed
|
| 339 |
+
from validation import validate_summary_quality
|
| 340 |
+
summary_score, summary_issues = validate_summary_quality(
|
| 341 |
+
summary,
|
| 342 |
+
len(valid_results)
|
| 343 |
)
|
| 344 |
+
|
| 345 |
+
if summary_score < 0.7: # Quality threshold
|
| 346 |
+
print(f"[Warning] Summary quality issues (score: {summary_score:.2f}): {summary_issues}")
|
| 347 |
+
print("[Summary] Retrying with stricter validation...")
|
| 348 |
+
|
| 349 |
+
# Retry with enhanced prompt emphasizing validation failures
|
| 350 |
+
retry_prompt = summary_prompt + f"""
|
| 351 |
+
|
| 352 |
+
CRITICAL: Previous attempt failed validation with these issues:
|
| 353 |
+
{chr(10).join('- ' + issue for issue in summary_issues)}
|
| 354 |
+
|
| 355 |
+
MANDATORY CORRECTIONS:
|
| 356 |
+
- Use ONLY specific numbers (e.g., "8 out of {len(valid_results)}" not "most")
|
| 357 |
+
- Include percentages in parentheses
|
| 358 |
+
- Cite transcript numbers for every claim
|
| 359 |
+
- Minimum length: 500 words
|
| 360 |
+
- No absolute terms (all/everyone/never/always) without 100% evidence
|
| 361 |
+
"""
|
| 362 |
+
|
| 363 |
+
try:
|
| 364 |
+
summary, summary_data = query_llm_with_timeout(
|
| 365 |
+
retry_prompt,
|
| 366 |
+
user_context,
|
| 367 |
+
interviewee_type,
|
| 368 |
+
extract_structured=False,
|
| 369 |
+
is_summary=True,
|
| 370 |
+
max_timeout=60 # 60 second hard timeout for retry
|
| 371 |
+
)
|
| 372 |
+
except Exception as e:
|
| 373 |
+
print(f"[Summary] Retry also failed: {e}")
|
| 374 |
+
print("[Summary] Using emergency fallback for retry...")
|
| 375 |
+
summary, summary_data = generate_emergency_summary(interviewee_type)
|
| 376 |
+
|
| 377 |
+
# Re-validate
|
| 378 |
+
summary_score, summary_issues = validate_summary_quality(summary, len(valid_results))
|
| 379 |
+
|
| 380 |
+
if summary_score < 0.7:
|
| 381 |
+
# Add quality warning to summary header
|
| 382 |
+
warning_header = f"""[QUALITY WARNING - Score: {summary_score:.2f}]
|
| 383 |
+
Validation issues detected: {'; '.join(summary_issues)}
|
| 384 |
+
Please review findings carefully and verify against source data.
|
| 385 |
+
|
| 386 |
+
{'='*60}
|
| 387 |
+
|
| 388 |
+
"""
|
| 389 |
+
summary = warning_header + summary
|
| 390 |
+
print(f"[Warning] Summary still has issues after retry (score: {summary_score:.2f})")
|
| 391 |
+
else:
|
| 392 |
+
print(f"[Summary] β Validation passed after retry (score: {summary_score:.2f})")
|
| 393 |
+
else:
|
| 394 |
+
print(f"[Summary] β Validation passed (score: {summary_score:.2f})")
|
| 395 |
+
|
| 396 |
+
# Verify consensus claims against actual data
|
| 397 |
+
from validation import verify_consensus_claims
|
| 398 |
+
consensus_warnings = verify_consensus_claims(summary, valid_results)
|
| 399 |
+
if consensus_warnings:
|
| 400 |
+
print(f"[Warning] Consensus verification issues: {len(consensus_warnings)} found")
|
| 401 |
+
consensus_note = "\n\n[CONSENSUS VERIFICATION NOTES]:\n" + "\n".join(f"- {w}" for w in consensus_warnings) + "\n\n"
|
| 402 |
+
summary = summary + consensus_note
|
| 403 |
+
else:
|
| 404 |
+
print("[Summary] β Consensus claims verified")
|
| 405 |
+
|
| 406 |
# Generate enhanced reports
|
| 407 |
csv_path = generate_enhanced_csv(csv_rows, interviewee_type)
|
| 408 |
print(f"[CSV] β Saved to {csv_path}")
|
|
|
|
| 510 |
|
| 511 |
with gr.Tabs():
|
| 512 |
|
| 513 |
+
with gr.TabItem("π€ Audio Preprocessing"):
|
| 514 |
+
gr.Markdown("""
|
| 515 |
+
Upload audio interviews to auto-transcribe with speaker identification.
|
| 516 |
+
Outputs DOCX files ready for analysis.
|
| 517 |
+
""")
|
| 518 |
+
|
| 519 |
+
with gr.Row():
|
| 520 |
+
audio_input = gr.File(
|
| 521 |
+
label="Upload Audio Files",
|
| 522 |
+
file_types=[".mp3", ".wav", ".m4a", ".flac"],
|
| 523 |
+
file_count="multiple"
|
| 524 |
+
)
|
| 525 |
+
num_speakers_input = gr.Slider(
|
| 526 |
+
minimum=1,
|
| 527 |
+
maximum=5,
|
| 528 |
+
value=2,
|
| 529 |
+
step=1,
|
| 530 |
+
label="Number of Speakers"
|
| 531 |
+
)
|
| 532 |
+
|
| 533 |
+
transcribe_btn = gr.Button("ποΈ Transcribe Audio", variant="primary")
|
| 534 |
+
transcribe_status = gr.Textbox(label="Status", lines=10)
|
| 535 |
+
transcript_files = gr.File(label="Download Transcripts", file_count="multiple")
|
| 536 |
+
|
| 537 |
+
transcribe_btn.click(
|
| 538 |
+
fn=preprocess_audio,
|
| 539 |
+
inputs=[audio_input, num_speakers_input],
|
| 540 |
+
outputs=[transcript_files, transcribe_status]
|
| 541 |
+
)
|
| 542 |
+
|
| 543 |
+
gr.Markdown("""
|
| 544 |
+
**Next:** Download transcripts, then go to "Transcript Analysis" tab to analyze them.
|
| 545 |
+
""")
|
| 546 |
|
| 547 |
|
| 548 |
|
|
|
|
| 695 |
""")
|
| 696 |
|
| 697 |
if __name__ == "__main__":
    # Entry point when the file is run as a script: enable the request queue
    # on the `demo` app, then start the web server.
    # NOTE(review): `concurrency_count` is a Gradio 3.x queue argument;
    # confirm the pinned Gradio version still accepts it.
    queue_options = dict(concurrency_count=1, max_size=10, api_open=False)

    # 0.0.0.0 listens on every network interface, on port 7860.
    launch_options = dict(server_name="0.0.0.0", server_port=7860, show_error=True)

    demo.queue(**queue_options).launch(**launch_options)
|