sharmaarush committed on
Commit
fa1fe99
1 Parent(s): 2c7aad7

initial upload

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +123 -0
  3. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ myenv
2
+ *.wav
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import pipeline
3
+ import os
4
+ import gradio as gr
5
+ from pydub import AudioSegment
6
+ from pytube import YouTube
7
+ import timeit
8
+ import math
9
+
10
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Speech-recognition pipeline shared by both transcription entry points.
pipe = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-medium.en",
    device=device,
)
12
+
13
def transcribe_speech_local(filepath):
    """Transcribe an audio file in fixed 15-second chunks.

    The audio at *filepath* is loaded with pydub, cut into consecutive
    15-second slices (the last slice may be shorter) and each slice is
    written to a temporary WAV file that is passed to the module-level
    ``pipe`` speech-recognition pipeline.

    Args:
        filepath: Path to the audio file, or ``None`` when no audio was
            supplied.

    Returns:
        A list with one dict per chunk, in order, with the keys
        ``chunk_id``, ``chunk_length`` (seconds), ``text``, ``start_time``
        and ``end_time`` (seconds from the start of the audio) and
        ``transcription_time`` (seconds spent exporting and transcribing
        the chunk).  When *filepath* is ``None`` the list holds a single
        ``{"error": ...}`` dict instead.
    """
    # Function-scope import so this block does not rely on the file header.
    import tempfile

    if filepath is None:
        return [{"error": "No audio found, please retry."}]

    chunk_length_ms = 15000  # 15 seconds in milliseconds
    chunk_length_sec = chunk_length_ms // 1000

    # Split audio into 15-second chunks
    audio = AudioSegment.from_file(filepath)
    chunks = [
        audio[offset:offset + chunk_length_ms]
        for offset in range(0, len(audio), chunk_length_ms)
    ]

    aligned_chunks = []

    # A private temporary directory keeps concurrent requests from
    # overwriting each other's chunk file and guarantees the file is removed
    # even when transcription raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        chunk_path = os.path.join(tmp_dir, "temp_chunk.wav")

        # Transcribe each chunk and measure time
        for chunk_id, chunk in enumerate(chunks):
            start_time = timeit.default_timer()
            # export() returns the open output file; close it explicitly.
            chunk.export(chunk_path, format="wav").close()
            output = pipe(chunk_path)
            transcription_time = timeit.default_timer() - start_time

            # Calculate start and end times in seconds
            chunk_seconds = len(chunk) / 1000.0
            start_time_sec = chunk_id * chunk_length_sec
            end_time_sec = start_time_sec + chunk_seconds

            aligned_chunks.append({
                "chunk_id": chunk_id,
                "chunk_length": chunk_seconds,
                "text": output["text"],
                "start_time": start_time_sec,
                "end_time": end_time_sec,
                "transcription_time": transcription_time,
            })

    return aligned_chunks
47
+
48
def download_audio_from_youtube(youtube_url):
    """Download the first audio-only stream of a YouTube video.

    The downloaded file is renamed so that its extension is ``.mp3``.  The
    file is only renamed, not transcoded, so its contents keep whatever
    container/codec the selected stream uses.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        Path of the renamed ``.mp3`` file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(youtube_url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # first() yields None when the filter matches nothing; fail with a
        # clear message instead of an AttributeError on the next line.
        raise ValueError(f"No audio-only stream available for {youtube_url!r}")

    # No output directory is given, so pytube picks the download location.
    output_path = stream.download()
    base, _ = os.path.splitext(output_path)
    audio_file = base + '.mp3'
    # os.replace overwrites a stale file from an earlier run on every
    # platform, whereas os.rename fails on Windows if the target exists.
    os.replace(output_path, audio_file)
    return audio_file
56
+
57
def transcribe_speech_from_youtube(youtube_url):
    """Download a YouTube video's audio and transcribe it in 15-second chunks.

    The audio is fetched with ``download_audio_from_youtube``, converted to
    16 kHz mono, cut into consecutive 15-second slices (the last slice may be
    shorter) and each slice is written to a temporary WAV file that is passed
    to the module-level ``pipe`` speech-recognition pipeline.  The downloaded
    file and all temporary files are removed before returning, including when
    an error occurs.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        A list with one dict per chunk, in order, with the keys
        ``chunk_id``, ``chunk_length`` (seconds), ``text``, ``start_time``
        and ``end_time`` (seconds from the start of the audio) and
        ``transcription_time`` (seconds spent exporting and transcribing
        the chunk).  When *youtube_url* is empty or blank the list holds a
        single ``{"error": ...}`` dict instead.
    """
    # Function-scope import so this block does not rely on the file header.
    import tempfile

    # An empty textbox would otherwise surface as an opaque download error.
    if not youtube_url or not youtube_url.strip():
        return [{"error": "No YouTube URL found, please retry."}]

    audio_filepath = download_audio_from_youtube(youtube_url.strip())

    chunk_length_ms = 15000  # 15 seconds in milliseconds
    chunk_length_sec = chunk_length_ms // 1000
    aligned_chunks = []

    try:
        # Convert to 16 kHz mono in memory; every chunk exported below
        # inherits this format, so no intermediate converted file is needed.
        audio = AudioSegment.from_file(audio_filepath)
        audio = audio.set_frame_rate(16000).set_channels(1)

        # Split audio into 15-second chunks
        chunks = [
            audio[offset:offset + chunk_length_ms]
            for offset in range(0, len(audio), chunk_length_ms)
        ]

        # A private temporary directory keeps concurrent requests from
        # overwriting each other's chunk file and is removed automatically.
        with tempfile.TemporaryDirectory() as tmp_dir:
            chunk_path = os.path.join(tmp_dir, "temp_chunk.wav")

            # Transcribe each chunk and measure time
            for chunk_id, chunk in enumerate(chunks):
                start_time = timeit.default_timer()
                # export() returns the open output file; close it explicitly.
                chunk.export(chunk_path, format="wav").close()
                output = pipe(chunk_path)
                transcription_time = timeit.default_timer() - start_time

                # Calculate start and end times in seconds
                chunk_seconds = len(chunk) / 1000.0
                start_time_sec = chunk_id * chunk_length_sec
                end_time_sec = start_time_sec + chunk_seconds

                aligned_chunks.append({
                    "chunk_id": chunk_id,
                    "chunk_length": chunk_seconds,
                    "text": output["text"],
                    "start_time": start_time_sec,
                    "end_time": end_time_sec,
                    "transcription_time": transcription_time,
                })
    finally:
        # Clean up the downloaded file even when decoding or transcription
        # fails part-way through.
        if os.path.exists(audio_filepath):
            os.remove(audio_filepath)

    return aligned_chunks
103
+
104
def _alignment_output():
    """Build a fresh JSON output component for one transcription tab."""
    return gr.JSON(label="Transcription with Time Alignment")


# Tab 1: transcribe an uploaded audio file.
file_transcribe = gr.Interface(
    fn=transcribe_speech_local,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=_alignment_output(),
    allow_flagging="never",
)

# Tab 2: transcribe the audio track of a YouTube video given its URL.
link_transcribe = gr.Interface(
    fn=transcribe_speech_from_youtube,
    inputs=gr.Textbox(
        lines=1,
        placeholder="Enter YouTube URL here...",
        label="YouTube URL",
    ),
    outputs=_alignment_output(),
    allow_flagging="never",
)

# Pair each tab title with its interface, then hand both lists to Gradio.
_tabs = [
    ("Local files(mp3/mp4/wav)", file_transcribe),
    ("Links", link_transcribe),
]
demo = gr.TabbedInterface(
    [interface for _, interface in _tabs],
    [title for title, _ in _tabs],
)

demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers
2
+ gradio
3
+ torch
4
+ torchaudio
5
+ pydub
6
+ pytube