MohamedRashad committed on
Commit
b2b8bd5
•
1 Parent(s): bd91e9f

chore: Add app.py and requirements.txt

Files changed (2)
  1. app.py +142 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,142 @@
import torch
import spaces
import gradio as gr
from pytube import YouTube
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from transformers.pipelines.audio_utils import ffmpeg_read

import tempfile
import os

MODEL_NAME = "MohamedRashad/Arabic-Whisper-CodeSwitching-Edition"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000 * 3
YT_LENGTH_LIMIT_S = 60 * 60 * 3  # limit YouTube videos to 3 hours

device = 0 if torch.cuda.is_available() else "cpu"

# Load the checkpoint once and wrap it in a chunked ASR pipeline.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    device=device,
)


@spaces.GPU(120)
def transcribe(inputs):
    """Transcribe an uploaded or recorded audio file and return the plain text."""
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "arabic"}, return_timestamps=True)["text"]
    return text


def _return_yt_html_embed(yt_url):
    """Build an HTML iframe embed so the YouTube video can be previewed in the UI."""
    video_id = YouTube(yt_url).video_id
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def download_yt_audio(yt_url, filename):
    """Download the audio-only stream of a YouTube video to `filename`."""
    yt = YouTube(yt_url)

    if yt.length > YT_LENGTH_LIMIT_S:
        raise gr.Error("YouTube video is too long! Please submit a video that is less than 3 hours long.")

    stream = yt.streams.filter(only_audio=True).first()
    stream.download(filename=filename)


def seconds_to_timestamp(seconds):
    """Convert a float number of seconds into an HH:MM:SS.mmm timestamp."""
    total_seconds = int(seconds)
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    remaining_seconds = seconds % 60
    return f"{hours:02d}:{minutes:02d}:{remaining_seconds:06.3f}"


def chunks_to_subtitle(chunks):
    """Render pipeline chunks (text plus start/end timestamps) as subtitle-style blocks."""
    subtitle = ""
    for chunk in chunks:
        start = seconds_to_timestamp(chunk["timestamp"][0])
        end = seconds_to_timestamp(chunk["timestamp"][1])
        text = chunk["text"]
        subtitle += f"{start} --> {end}\n{text}\n\n"
    return subtitle


@spaces.GPU(120)
def yt_transcribe(yt_url):
    """Download a YouTube video's audio, transcribe it, and return an embed plus subtitles."""
    html_embed_str = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()

    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

    output = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe", "language": "arabic"}, return_timestamps=True)
    subtitle = chunks_to_subtitle(output["chunks"])

    return html_embed_str, subtitle


demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

yt_transcribe_demo = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
    ],
    outputs=["html", "text"],
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe_demo], ["Microphone", "Audio file", "YouTube"])

demo.queue().launch(share=True)
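
For reference, a minimal sketch of the subtitle text that chunks_to_subtitle produces. The chunk values below are made up for illustration and assume the helpers defined in app.py above are in scope; real chunks come from pipe(..., return_timestamps=True)["chunks"].

# Hypothetical example chunks, for illustration only.
chunks = [
    {"timestamp": (0.0, 4.2), "text": " مرحبا، this is a code-switching test"},
    {"timestamp": (4.2, 9.75), "text": " the model switches between Arabic and English"},
]

print(chunks_to_subtitle(chunks))
# 00:00:00.000 --> 00:00:04.200
#  مرحبا، this is a code-switching test
#
# 00:00:04.200 --> 00:00:09.750
#  the model switches between Arabic and English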
requirements.txt ADDED
@@ -0,0 +1,4 @@
torch
pytube
transformers
spaces