Rajor78 committed
Commit 1b4137e · verified · 1 Parent(s): 10c0b1a

Update app.py

Files changed (1)
1. app.py  +41 -662
app.py CHANGED
@@ -1,676 +1,55 @@
import os
- import subprocess
- import time
- import json
- import argparse
- from pathlib import Path
- import numpy as np
- import torch
- import pandas as pd
- import matplotlib.pyplot as plt
- import re
from docx import Document
- from docx.shared import RGBColor, Pt
- from docx.enum.text import WD_ALIGN_PARAGRAPH
- from langdetect import detect

- # Import Hugging Face components
- from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
- from pyannote.audio import Pipeline
- from datasets import Dataset
-
- # Constants
- SPACY_MODELS = {
-     'es': 'es_core_news_sm',  # Spanish
-     'en': 'en_core_web_sm',   # English
-     'fr': 'fr_core_news_sm',  # French
-     'it': 'it_core_news_sm',  # Italian
-     'de': 'de_core_news_sm',  # German
-     'pt': 'pt_core_news_sm',  # Portuguese
-     'nl': 'nl_core_news_sm',  # Dutch
-     'ca': 'ca_core_news_sm',  # Catalan
- }
-
- # Function to load Spacy model based on language
- def load_spacy_model(language):
-     import spacy
-     from spacy.cli import download as spacy_download
-
-     model_name = SPACY_MODELS.get(language, 'es_core_news_sm')
-
-     try:
-         print(f"Attempting to load Spacy model for language: {language} ({model_name})...")
-         nlp = spacy.load(model_name)
-         return nlp
-     except OSError:
-         print(f"Model {model_name} not found. Installing...")
-         spacy_download(model_name)
-         nlp = spacy.load(model_name)
-         return nlp
-     except Exception as e:
-         print(f"Could not load Spacy model for language {language}: {str(e)}")
-         print("Trying to load default English model...")
-         try:
-             spacy_download('en_core_web_sm')
-             return spacy.load('en_core_web_sm')
-         except Exception as e2:
-             print(f"Could not load English model either: {str(e2)}")
-             print("Using a minimal model...")
-             return spacy.blank('en')
-
- # Function to extract audio from a video
def extract_audio(video_path, audio_path):
    try:
-         command = f"ffmpeg -i '{video_path}' -ar 16000 -ac 1 -c:a pcm_s16le '{audio_path}' -y"
-         subprocess.run(command, shell=True, check=True)
-         print(f"Audio extracted and saved to: {audio_path}")
        return True
-     except subprocess.CalledProcessError as e:
-         print(f"Error extracting audio: {e}")
-         return False
-
- # Function to detect language of the audio
- def detect_language(transcribed_text):
-     try:
-         language = detect(transcribed_text)
-         print(f"Detected language: {language}")
-         return language
-     except Exception as e:
-         print(f"Error detecting language: {e}")
-         return "es"  # Spanish by default
-
- # Function to perform speaker diarization with pyannote.audio
- def diarize_speakers(audio_path, huggingface_token=None):
-     try:
-         print("Initializing speaker diarization...")
-
-         # Use pyannote.audio for diarization
-         use_auth = True if huggingface_token else False
-
-         # If Hugging Face token is provided, use it
-         if huggingface_token:
-             diarization_pipeline = Pipeline.from_pretrained(
-                 "pyannote/speaker-diarization-3.1",
-                 use_auth_token=huggingface_token
-             )
-         else:
-             # Try to load without token (will only work if license has been accepted)
-             try:
-                 diarization_pipeline = Pipeline.from_pretrained(
-                     "pyannote/speaker-diarization-3.1",
-                     use_auth_token=False
-                 )
-             except Exception as e:
-                 print(f"Error loading diarization model without token: {e}")
-                 print("It's recommended to create a Hugging Face account, accept the model license, and provide a token.")
-                 return {}
-
-         print("Running diarization...")
-         diarization = diarization_pipeline(audio_path)
-
-         # Store speaker information and turns
-         speakers = {}
-         for turn, _, speaker in diarization.itertracks(yield_label=True):
-             if speaker not in speakers:
-                 speakers[speaker] = []
-             speakers[speaker].append({
-                 'start': turn.start,
-                 'end': turn.end
-             })
-
-         # Rename speakers to be more user-friendly
-         renamed_speakers = {}
-         for i, (speaker, turns) in enumerate(speakers.items(), 1):
-             renamed_speakers[f"Speaker {i}"] = turns
-
-         print(f"Diarization completed. {len(renamed_speakers)} speakers identified.")
-         return renamed_speakers
    except Exception as e:
- print(f"Error in speaker diarization: {e}")
130
- print("Continuing without diarization...")
131
- return {}
132
-
133
- # Function to transcribe audio with Whisper and get timestamps
134
- def transcribe_audio_with_timing(audio_path, model_name="openai/whisper-base", language=None):
135
- try:
136
- print(f"Loading Whisper model ({model_name})...")
137
-
138
- # Use Transformers pipeline for transcription
139
- transcription_pipeline = pipeline(
140
- "automatic-speech-recognition",
141
- model=model_name,
142
- chunk_length_s=30,
143
- device=0 if torch.cuda.is_available() else -1,
144
- return_timestamps="word"
145
- )
146
-
147
- print("Transcribing audio with timestamps...")
148
-
149
- # If language is provided, use it; otherwise, let Whisper detect it
150
- if language:
151
- result = transcription_pipeline(audio_path, language=language)
152
- else:
153
- result = transcription_pipeline(audio_path)
154
-
155
- # Process the result to match the expected format
156
- transcribed_text = result.get("text", "")
157
-
158
- # Create segments from chunks with timestamps
159
- segments = []
160
- chunk_words = result.get("chunks", [])
161
-
162
- # Group words into sentences/segments
163
- current_segment = {
164
- "start": 0,
165
- "end": 0,
166
- "text": "",
167
- "words": []
168
- }
169
-
170
- for word_data in chunk_words:
171
- word = word_data.get("text", "")
172
- start_time = word_data.get("timestamp", (0, 0))[0]
173
- end_time = word_data.get("timestamp", (0, 0))[1]
174
-
175
- # Initialize first segment
176
- if not current_segment["text"]:
177
- current_segment["start"] = start_time
178
-
179
- current_segment["text"] += " " + word
180
- current_segment["words"].append(word_data)
181
- current_segment["end"] = end_time
182
-
183
- # Start a new segment at sentence end
184
- if word.endswith((".", "!", "?")):
185
- segments.append(current_segment)
186
- current_segment = {
187
- "start": end_time,
188
- "end": end_time,
189
- "text": "",
190
- "words": []
191
- }
192
-
193
- # Add the last segment if not empty
194
- if current_segment["text"]:
195
- segments.append(current_segment)
196
-
197
-         detected_language = result.get("language", "unknown")
-
-         print(f"Transcription completed in language: {detected_language}")
-         return transcribed_text, segments, detected_language
-     except Exception as e:
-         print(f"Error in transcription: {e}")
-         return "", [], "unknown"
-
- # Function to assign speakers to transcribed segments
- def assign_speakers_to_segments(segments, speakers):
-     if not speakers:
-         # If no speaker information, assign "Unknown Speaker" to all segments
-         for segment in segments:
-             segment['speaker'] = "Unknown Speaker"
-         return segments
-
-     for segment in segments:
-         start_time = segment['start']
-         end_time = segment['end']
-
-         # Find the speaker with the most overlap for this segment
-         best_speaker = None
-         max_overlap = 0
-
-         for speaker, turns in speakers.items():
-             for turn in turns:
-                 turn_start = turn['start']
-                 turn_end = turn['end']
-
-                 # Calculate overlap time
-                 overlap_start = max(start_time, turn_start)
-                 overlap_end = min(end_time, turn_end)
-                 overlap = max(0, overlap_end - overlap_start)
-
-                 if overlap > max_overlap:
-                     max_overlap = overlap
-                     best_speaker = speaker
-
-         # Assign the best speaker found or "Unknown" if no match
-         segment['speaker'] = best_speaker if best_speaker else "Unknown Speaker"
-
-     return segments
-
- # Function to extract speaker information (how much each one speaks)
- def analyze_speaker_stats(segments):
-     speaker_stats = {}
-     total_duration = 0
-
-     for segment in segments:
-         speaker = segment.get('speaker', 'Unknown Speaker')
-         duration = segment['end'] - segment['start']
-         total_duration += duration
-
-         if speaker not in speaker_stats:
-             speaker_stats[speaker] = {
-                 'total_time': 0,
-                 'word_count': 0,
-                 'segments': 0
-             }
-
-         speaker_stats[speaker]['total_time'] += duration
-         speaker_stats[speaker]['word_count'] += len(segment['text'].split())
-         speaker_stats[speaker]['segments'] += 1
-
-     # Calculate percentages
-     for speaker in speaker_stats:
-         speaker_stats[speaker]['percentage'] = (speaker_stats[speaker]['total_time'] / total_duration) * 100
-
-     return speaker_stats, total_duration
-
- # Function to generate speaker analysis charts
- def generate_speaker_analysis_charts(speaker_stats, output_path):
-     try:
-         # Create DataFrame for easier visualization
-         speakers = list(speaker_stats.keys())
-         percentages = [speaker_stats[speaker]['percentage'] for speaker in speakers]
-         word_counts = [speaker_stats[speaker]['word_count'] for speaker in speakers]
-
-         # Create figure with two subplots
-         fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
-
-         # Chart 1: Speaking time by speaker (pie)
-         ax1.pie(percentages, labels=speakers, autopct='%1.1f%%', startangle=90)
-         ax1.set_title('Speaking Time Distribution')
-
-         # Chart 2: Number of words by speaker (bars)
-         ax2.bar(speakers, word_counts)
-         ax2.set_title('Word Count by Speaker')
-         ax2.set_ylabel('Word Count')
-         ax2.tick_params(axis='x', rotation=45)
-
-         plt.tight_layout()
-         plt.savefig(output_path)
-         print(f"Analysis charts saved to: {output_path}")
-         return True
-     except Exception as e:
-         print(f"Error generating analysis charts: {e}")
        return False

- # Function to choose organization mode: chronological or by speakers
- def organize_segments(segments, mode="chronological"):
-     if mode == "by_speaker":
-         # Organize by speakers
-         speakers_content = {}
-         for segment in segments:
-             speaker = segment.get('speaker', 'Unknown Speaker')
-             if speaker not in speakers_content:
-                 speakers_content[speaker] = []
-             speakers_content[speaker].append(segment)
-
-         # Sort segments by time within each speaker
-         for speaker in speakers_content:
-             speakers_content[speaker].sort(key=lambda x: x['start'])
-
-         return speakers_content
-     else:
-         # Organize chronologically (already sorted by time)
-         return segments
-
- # Function to divide text into paragraphs based on organization mode
- def process_segments_for_document(segments, mode="chronological"):
-     if mode == "by_speaker":
-         # Organize by speakers
-         speakers_content = organize_segments(segments, "by_speaker")
-         paragraphs = []
-
-         for speaker, speaker_segments in speakers_content.items():
-             speaker_text = ""
-             for segment in speaker_segments:
-                 speaker_text += segment['text'] + " "

-             paragraphs.append({
-                 'speaker': speaker,
-                 'text': speaker_text
-             })

-         return paragraphs
    else:
-         # Organize chronologically
-         chronological_paragraphs = []
-         current_paragraph = []
-         current_speaker = None
-         current_timestamp = None
-
-         for segment in segments:
-             speaker = segment.get('speaker', 'Unknown Speaker')
-             text = segment['text']
-             start_time = segment['start']
-             end_time = segment['end']
-
-             # Format time as HH:MM:SS
-             time_str = format_timestamp(start_time)
-
-             # If speaker changes, start a new paragraph
-             if current_speaker and current_speaker != speaker and current_paragraph:
-                 chronological_paragraphs.append({
-                     'speaker': current_speaker,
-                     'text': ' '.join(current_paragraph),
-                     'timestamp': current_timestamp
-                 })
-                 current_paragraph = []
-
-             # Update current speaker and add text
-             current_speaker = speaker
-             current_timestamp = time_str
-             current_paragraph.append(text)
-
-         # Add the last paragraph if there's content
-         if current_paragraph:
-             chronological_paragraphs.append({
-                 'speaker': current_speaker,
-                 'text': ' '.join(current_paragraph),
-                 'timestamp': current_timestamp
-             })
-
-         return chronological_paragraphs
-
- # Function to format time in HH:MM:SS format
- def format_timestamp(seconds):
-     m, s = divmod(seconds, 60)
-     h, m = divmod(m, 60)
-     return f"{int(h):02d}:{int(m):02d}:{int(s):02d}"
-
- # Function to improve text style and grammar before saving
- def correct_text(text, language="es"):
-     try:
-         import language_tool_python
-
-         language_code = language[:2].lower()  # Get only the 2-letter language code
-         supported_languages = ["es", "en", "fr", "de", "pt", "nl"]
-
-         if language_code not in supported_languages:
-             print(f"Grammar correction not available for language {language_code}, using Spanish by default.")
-             language_code = "es"
-
-         tool = language_tool_python.LanguageTool(language_code)
-         matches = tool.check(text)
-         corrected_text = language_tool_python.utils.correct(text, matches)
-         return corrected_text
-     except Exception as e:
-         print(f"Error correcting text: {e}")
-         return text  # Return original text if there's an error
-
- # Function to create Word document with organized transcription
- def create_word_document(paragraphs, output_path, include_timestamps=True, stats=None, chart_path=None):
-     try:
-         doc = Document()
-
-         # Configure document style
-         style = doc.styles['Normal']
-         style.font.name = 'Arial'
-         style.font.size = Pt(11)
-
-         # Main title
-         title = doc.add_heading('Transcription with Speaker Identification', 0)
-         title.alignment = WD_ALIGN_PARAGRAPH.CENTER
-
-         # Add statistics information if available
-         if stats:
-             doc.add_heading('Participation Summary', level=1)
-             stats_table = doc.add_table(rows=1, cols=5)
-             stats_table.style = 'Table Grid'
-
-             # Table headers
-             hdr_cells = stats_table.rows[0].cells
-             hdr_cells[0].text = 'Speaker'
-             hdr_cells[1].text = 'Time (s)'
-             hdr_cells[2].text = 'Percentage (%)'
-             hdr_cells[3].text = 'Words'
-             hdr_cells[4].text = 'Interventions'
-
-             # Add data for each speaker
-             for speaker, data in stats.items():
-                 row_cells = stats_table.add_row().cells
-                 row_cells[0].text = speaker
-                 row_cells[1].text = f"{data['total_time']:.2f}"
-                 row_cells[2].text = f"{data['percentage']:.2f}"
-                 row_cells[3].text = f"{data['word_count']}"
-                 row_cells[4].text = f"{data['segments']}"
-
-             doc.add_paragraph()
-
-         # Add chart if available
-         if chart_path and os.path.exists(chart_path):
-             doc.add_heading('Graphical Analysis', level=1)
-             doc.add_picture(chart_path, width=Pt(450))
-             doc.add_paragraph()
-
-         # Transcription title
-         doc.add_heading('Complete Transcription', level=1)
-
-         # Add paragraphs to document
-         for paragraph in paragraphs:
-             speaker = paragraph['speaker']
-             text = paragraph['text']
-
-             # Create paragraph with appropriate formatting
-             p = doc.add_paragraph()

-             # Add timestamp if available and option is enabled
-             if include_timestamps and 'timestamp' in paragraph:
-                 timestamp_run = p.add_run(f"[{paragraph['timestamp']}] ")
-                 timestamp_run.bold = True
-                 timestamp_run.font.color.rgb = RGBColor(128, 128, 128)
-
-             # Add speaker
-             speaker_run = p.add_run(f"{speaker}: ")
-             speaker_run.bold = True
-
-             # Text color according to speaker for easier reading
- if "Speaker 1" in speaker:
468
- speaker_run.font.color.rgb = RGBColor(0, 0, 200) # Blue
469
- elif "Speaker 2" in speaker:
470
- speaker_run.font.color.rgb = RGBColor(200, 0, 0) # Red
471
- elif "Speaker 3" in speaker:
472
- speaker_run.font.color.rgb = RGBColor(0, 150, 0) # Green
473
- elif "Speaker 4" in speaker:
474
- speaker_run.font.color.rgb = RGBColor(128, 0, 128) # Purple
475
-
476
- # Add paragraph text
477
- text_run = p.add_run(text)
478
-
479
- # Add separator for better readability
480
- doc.add_paragraph()
481
-
482
- # Save document
483
- doc.save(output_path)
484
- print(f"Word document saved to: {output_path}")
485
- return True
486
- except Exception as e:
487
- print(f"Error creating Word document: {str(e)}")
488
- return False
489
-
490
- # Function to save results as JSON for later processing
491
- def save_json_results(segments, output_path):
492
- try:
493
- # Convert segments to serializable format
494
- serializable_segments = []
495
- for segment in segments:
496
- serializable_segment = {
497
- 'start': segment['start'],
498
- 'end': segment['end'],
499
- 'text': segment['text'],
500
- 'speaker': segment.get('speaker', 'Unknown Speaker')
501
- }
502
- serializable_segments.append(serializable_segment)
503
-
504
- # Save to JSON file
505
- with open(output_path, 'w', encoding='utf-8') as f:
506
- json.dump(serializable_segments, f, ensure_ascii=False, indent=2)
507
-
508
- print(f"Results saved in JSON format: {output_path}")
509
- return True
510
- except Exception as e:
511
- print(f"Error saving results to JSON: {e}")
512
- return False
513
-
514
- # Function to save results to Hugging Face Dataset
515
- def save_to_huggingface_dataset(segments, output_path=None, push_to_hub=False, repo_id=None, token=None):
516
- try:
517
- # Prepare data for Dataset format
518
- data = {
519
- "segment_id": [],
520
- "start_time": [],
521
- "end_time": [],
522
- "speaker": [],
523
- "text": []
524
- }
525
-
526
- for i, segment in enumerate(segments):
527
- data["segment_id"].append(i)
528
- data["start_time"].append(segment["start"])
529
- data["end_time"].append(segment["end"])
530
- data["speaker"].append(segment.get("speaker", "Unknown Speaker"))
531
- data["text"].append(segment["text"])
532
-
533
- # Create Dataset
534
-         dataset = Dataset.from_dict(data)
-
-         # Save locally if path provided
-         if output_path:
-             dataset.save_to_disk(output_path)
-             print(f"Dataset saved locally to: {output_path}")
-
-         # Push to Hugging Face Hub if requested
-         if push_to_hub and repo_id:
-             dataset.push_to_hub(repo_id, token=token)
-             print(f"Dataset pushed to Hugging Face Hub: {repo_id}")
-
-         return dataset
-     except Exception as e:
-         print(f"Error saving to Hugging Face dataset: {e}")
-         return None
-
- # Main function
- def main():
-     parser = argparse.ArgumentParser(description="Audio transcription with speaker diarization using Hugging Face models")
-     parser.add_argument("--video", type=str, help="Path to video file")
-     parser.add_argument("--audio", type=str, help="Path to audio file (if already extracted)")
-     parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save output files")
-     parser.add_argument("--model", type=str, default="openai/whisper-base",
-                         help="Whisper model to use: openai/whisper-tiny, openai/whisper-base, openai/whisper-small, openai/whisper-medium, openai/whisper-large")
-     parser.add_argument("--language", type=str, help="Language code (e.g., 'es' for Spanish)")
-     parser.add_argument("--hf_token", type=str, help="Hugging Face API token for speaker diarization")
-     parser.add_argument("--organization", type=str, default="chronological",
-                         choices=["chronological", "by_speaker"], help="Transcription organization mode")
-     parser.add_argument("--push_to_hub", action="store_true", help="Push results to Hugging Face Hub")
-     parser.add_argument("--repo_id", type=str, help="Hugging Face repository ID for pushing dataset")
-
-     args = parser.parse_args()
-
-     # Create output directory if it doesn't exist
-     os.makedirs(args.output_dir, exist_ok=True)
-
-     # Timestamp for output files
-     timestamp = time.strftime("%Y%m%d_%H%M%S")
-
-     try:
-         print("=== TRANSCRIPTION WITH SPEAKER DETECTION ===")
-
-         # Check input file
-         if args.audio:
-             audio_path = args.audio
-             base_filename = os.path.splitext(os.path.basename(audio_path))[0]
-         elif args.video:
-             video_path = args.video
-             base_filename = os.path.splitext(os.path.basename(video_path))[0]
-             audio_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}.wav")
-
-             # Extract audio from video
-             if not extract_audio(video_path, audio_path):
-                 print("Could not extract audio. Process canceled.")
-                 return
-         else:
-             print("Error: You must provide either a video file or an audio file.")
-             return
-
-         # Output file paths
-         word_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_transcription.docx")
-         json_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_data.json")
-         chart_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_analysis.png")
-         dataset_output_path = os.path.join(args.output_dir, f"{base_filename}_{timestamp}_dataset")
-
-         print(f"\nProcessing audio: {audio_path}")
-         start_time = time.time()
-
-         # Transcribe with Whisper
-         print(f"\nStarting transcription with Whisper model {args.model}...")
-         transcribed_text, segments, detected_language = transcribe_audio_with_timing(
-             audio_path,
-             model_name=args.model,
-             language=args.language
-         )
-
-         if not transcribed_text:
-             print("Transcription failed. Process canceled.")
-             return
-
-         print(f"Transcription completed: {transcribed_text[:100]}...\n")
-
-         # If no language specified, use the detected one
-         if not args.language:
-             detected_language = detect_language(transcribed_text) if detected_language == "unknown" else detected_language
-         else:
-             detected_language = args.language
-
-         # Speaker diarization
-         print("Starting speaker detection...")
-         speakers = diarize_speakers(audio_path, args.hf_token)
-
-         # Assign speakers to segments
-         segments_with_speakers = assign_speakers_to_segments(segments, speakers)
-
-         # Analyze speaker statistics
-         speaker_stats, total_duration = analyze_speaker_stats(segments_with_speakers)
-         print("\n=== PARTICIPATION STATISTICS ===")
-         for speaker, stats in speaker_stats.items():
-             print(f"{speaker}: {stats['percentage']:.2f}% of time, {stats['word_count']} words, {stats['segments']} interventions")
-
-         # Generate analysis charts
-         generate_speaker_analysis_charts(speaker_stats, chart_output_path)
-
-         # Process segments according to selected organization mode
-         paragraphs = process_segments_for_document(segments_with_speakers, args.organization)
-
-         # Save results as JSON
-         save_json_results(segments_with_speakers, json_output_path)
-
-         # Create Word document with transcription
-         create_word_document(
-             paragraphs,
-             word_output_path,
-             include_timestamps=True,
-             stats=speaker_stats,
-             chart_path=chart_output_path
-         )
-
-         # Save to Hugging Face Dataset
-         if args.push_to_hub or os.path.exists(dataset_output_path):
-             save_to_huggingface_dataset(
-                 segments_with_speakers,
-                 output_path=dataset_output_path,
-                 push_to_hub=args.push_to_hub,
-                 repo_id=args.repo_id,
-                 token=args.hf_token
-             )
-
-         # Total processing time
-         end_time = time.time()
-         elapsed_time = end_time - start_time
-         print(f"\nTotal processing time: {elapsed_time:.2f} seconds")
-
-         print("\nProcess completed successfully!")
-
-     except Exception as e:
-         print(f"Unexpected error during the process: {str(e)}")

- # Run the script
- if __name__ == "__main__":
-     main()
 
import os
+ import whisper
+ import spacy
+ import language_tool_python
+ import gradio as gr
+ from moviepy.editor import VideoFileClip
from docx import Document

def extract_audio(video_path, audio_path):
    try:
+         video = VideoFileClip(video_path)
+         audio = video.audio
+         audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000)
        return True
    except Exception as e:
+         print(f"Error extracting audio: {e}")
        return False

+ def transcribe_audio(audio_path):
+     model = whisper.load_model("base")
+     result = model.transcribe(audio_path, word_timestamps=True)
+     return result

+ def correct_text(text):
+     tool = language_tool_python.LanguageTool('es')
+     matches = tool.check(text)
+     return language_tool_python.utils.correct(text, matches)

+ def create_word_doc(segments, output_path):
+     doc = Document()
+     for segment in segments:
+         corrected_text = correct_text(segment['text'])
+         doc.add_paragraph(corrected_text)
+     doc.save(output_path)
+     return output_path
+
+ def process_video(video_file):
+     audio_path = video_file.replace(".mp4", ".wav")
+     word_output = video_file.replace(".mp4", "_transcription.docx")
+
+     if extract_audio(video_file, audio_path):
+         result = transcribe_audio(audio_path)
+         segments = result['segments']
+         doc_path = create_word_doc(segments, word_output)
+         return "Transcription completed.", doc_path
    else:
+         return "Error processing the file.", None

+ demo = gr.Interface(
+     fn=process_video,
+     inputs=gr.File(label="Upload a video file"),
+     outputs=["text", gr.File(label="Download transcription")]
+ )

+ demo.launch()
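
As a quick sanity check of the new pipeline, the snippet below calls process_video directly instead of going through the Gradio UI. It is only a sketch, not part of the commit: it assumes a local sample.mp4 exists, that ffmpeg is available for moviepy, and that process_video receives a plain filepath string (which is what gr.File passes in recent Gradio versions).

    # Sketch: exercise the new pipeline without the Gradio UI.
    # Assumes ./sample.mp4 exists and that the functions above are already defined
    # (run this in the same module or an interactive session, before demo.launch()).
    status, doc_path = process_video("sample.mp4")
    print(status)     # "Transcription completed." on success
    print(doc_path)   # "sample_transcription.docx", or None if audio extraction failed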