Kazuki Nakayashiki committed on
Commit
e21b25c
1 Parent(s): 342a478

Add initial commit

Browse files
Files changed (3) hide show
  1. README.md +3 -3
  2. app.py +66 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Youtube Whisper
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.16.2
8
  app_file: app.py
 
1
  ---
2
  title: Youtube Whisper
3
+ emoji:
4
+ colorFrom: green
5
+ colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.16.2
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import whisper
3
+ from pytube import YouTube
4
+
5
def get_audio(url):
    """Download the first audio-only stream of a YouTube video.

    Args:
        url: URL of the YouTube video to fetch.

    Returns:
        The value returned by the stream's ``download`` call, which is
        passed directly to ``model.transcribe`` by the caller. The file is
        always saved as ``tmp.mp4``, so each call overwrites the previous
        download.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(url)
    # ``first()`` yields None on an empty query instead of the bare
    # IndexError that ``[0]`` would raise, so we can report a clear error.
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise ValueError(f"No audio-only stream found for {url!r}")
    return audio_stream.download(filename="tmp.mp4")
8
+
9
def get_transcript(url, model_size, lang, format):
    """Transcribe the audio of a YouTube video with Whisper.

    Args:
        url: URL of the YouTube video.
        model_size: Name of the Whisper model passed to ``whisper.load_model``.
        lang: Language passed to ``transcribe``; the literal string ``"None"``
            is converted to ``None``.
        format: Output format selector: ``"None"`` returns the plain
            transcript text, ``".srt"`` returns SRT-formatted subtitles.

    Returns:
        The transcript as a string, either plain text or SRT.

    Raises:
        ValueError: If ``format`` is neither ``"None"`` nor ``".srt"``.
    """
    # Validate before loading the model or downloading anything; previously an
    # unknown format silently produced ``None`` after all the expensive work.
    if format not in ("None", ".srt"):
        raise ValueError(
            f"Unsupported format {format!r}; expected 'None' or '.srt'"
        )

    model = whisper.load_model(model_size)

    # The dropdown uses the string "None" as its "no language" choice.
    language = None if lang == "None" else lang

    result = model.transcribe(get_audio(url), fp16=False, language=language)

    if format == ".srt":
        return format_to_srt(result["segments"])
    return result["text"]
22
+
23
def format_to_srt(segments):
    """Render transcription segments as SRT subtitle text.

    Args:
        segments: Iterable of mappings, each providing ``'start'`` and
            ``'end'`` (passed to ``format_timestamp``) and ``'text'``
            (a string).

    Returns:
        A string with one SRT cue per segment: a 1-based index line, a
        ``start --> end`` timing line, the text, and a blank separator line.
        An empty ``segments`` yields an empty string.
    """
    # Build every cue first and join once rather than growing a string
    # with repeated ``+=``. Text is stripped so cues do not begin or end
    # with stray whitespace.
    cues = [
        f"{index}\n"
        f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
        f"{segment['text'].strip()}\n\n"
        for index, segment in enumerate(segments, start=1)
    ]
    return "".join(cues)
30
+
31
def format_timestamp(t):
    """Format a time offset in seconds as an SRT timestamp.

    Args:
        t: Non-negative offset in seconds (int or float).

    Returns:
        A string of the form ``HH:MM:SS,mmm``.

    Raises:
        ValueError: If ``t`` is negative.
    """
    if t < 0:
        raise ValueError(f"Timestamp must be non-negative, got {t!r}")

    # Round to whole milliseconds once, then split using integer arithmetic.
    # Truncating the fractional part separately loses a millisecond to float
    # error (e.g. 2.3 -> ",299") and can never carry into the seconds field.
    total_ms = int(round(t * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
37
+
38
+
39
# Dropdown choices. "None" is the sentinel that get_transcript converts to
# ``language=None`` before calling Whisper.
langs = ["None"] + sorted(whisper.tokenizer.LANGUAGES.values())
# Use the public helper instead of reaching into the private ``_MODELS`` dict.
model_sizes = list(whisper.available_models())

# UI layout: inputs and the button in the left column, the transcription
# output in the right column.
with gr.Blocks() as demo:

    with gr.Row():

        with gr.Column():

            with gr.Row():
                url = gr.Textbox(placeholder='Youtube video URL', label='URL')

            with gr.Row():
                # Component names are kept distinct from the choice lists and
                # from the ``format`` builtin.
                model_dropdown = gr.Dropdown(choices=model_sizes, value='tiny', label="Model")
                lang_dropdown = gr.Dropdown(choices=langs, value="None", label="Language (Optional)")
                format_dropdown = gr.Dropdown(choices=["None", ".srt"], value="None", label="Timestamps? (Optional)")

            with gr.Row():
                gr.Markdown("Larger models are more accurate, but slower. For 1min video, it'll take ~30s (tiny), ~1min (base), ~3min (small), ~5min (medium), etc.")
                transcribe_btn = gr.Button('Transcribe')

        with gr.Column():
            outputs = gr.Textbox(placeholder='Transcription of the video', label='Transcription')

    # Inputs are passed positionally to get_transcript(url, model_size, lang, format).
    transcribe_btn.click(
        get_transcript,
        inputs=[url, model_dropdown, lang_dropdown, format_dropdown],
        outputs=outputs,
    )

demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers
2
+ pytube
3
+ git+https://github.com/openai/whisper.git