Files changed (2) hide show
  1. app.py +20 -77
  2. requirements.txt +1 -3
app.py CHANGED
@@ -1,85 +1,37 @@
1
- import os
2
- import time
3
- import tempfile
4
- from math import floor
5
- from typing import Optional, List, Dict, Any
6
-
7
  import torch
 
8
  import gradio as gr
9
  import yt_dlp as youtube_dl
10
  from transformers import pipeline
11
  from transformers.pipelines.audio_utils import ffmpeg_read
12
 
 
 
13
 
# configuration
MODEL_NAME = "kotoba-tech/kotoba-whisper-v1.1"
BATCH_SIZE = 16
CHUNK_LENGTH_S = 15
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# device setting: bfloat16 with SDPA attention on the first CUDA device,
# float32 with default model kwargs on CPU.
if not torch.cuda.is_available():
    torch_dtype, device, model_kwargs = torch.float32, "cpu", {}
else:
    torch_dtype, device, model_kwargs = torch.bfloat16, "cuda:0", {'attn_implementation': 'sdpa'}

# define the pipeline (built once at import time and shared by every handler)
pipe = pipeline(
    model=MODEL_NAME,
    chunk_length_s=CHUNK_LENGTH_S,
    batch_size=BATCH_SIZE,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs,
    trust_remote_code=True,
)
39
 
40
 
def format_time(start: Optional[float], end: Optional[float]):
    """Render a chunk's time span as ``[HH:MM:SS.mmm-> HH:MM:SS.mmm]:``.

    Args:
        start: Offset of the chunk start in seconds, or ``None``.
        end: Offset of the chunk end in seconds, or ``None``.

    Returns:
        The bracketed label. A ``None`` bound is rendered as ``"complete "``.
    """

    def _format_time(seconds: Optional[float]):
        if seconds is None:
            return "complete "
        # Work in whole milliseconds so float noise can neither drop a
        # millisecond nor produce a four-digit fraction such as ".1000".
        total_ms = int(round(seconds * 1000))
        whole_seconds, m_seconds = divmod(total_ms, 1000)
        # divmod keeps minutes within 0-59. Subtracting the *total* minutes
        # together with the hours removed the hour twice, which produced
        # negative seconds for any timestamp of one hour or more.
        total_minutes, secs = divmod(whole_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return f'{hours:02}:{minutes:02}:{secs:02}.{m_seconds:03}'

    return f"[{_format_time(start)}-> {_format_time(end)}]:"
54
-
55
-
def get_prediction(inputs, prompt: Optional[str]):
    """Run the module-level ``pipe`` on ``inputs`` and format its chunks.

    Args:
        inputs: Audio input forwarded unchanged to ``pipe``.
        prompt: Optional text prompt. When truthy it is converted with
            ``pipe.tokenizer.get_prompt_ids`` and moved to ``device``.

    Returns:
        A ``(text, text_timestamped)`` tuple: the chunk texts concatenated,
        and one ``format_time``-prefixed line per chunk joined by newlines.
    """
    decoding_options = {"language": "japanese", "task": "transcribe"}
    if prompt:
        prompt_ids = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt')
        decoding_options['prompt_ids'] = prompt_ids.to(device)

    result = pipe(inputs, return_timestamps=True, generate_kwargs=decoding_options)

    plain_text = "".join(segment['text'] for segment in result['chunks'])
    stamped_lines = []
    for segment in result['chunks']:
        label = format_time(*segment['timestamp'])
        stamped_lines.append(f"{label} {segment['text']}")
    return plain_text, "\n".join(stamped_lines)
66
-
67
-
def transcribe(inputs: str, prompt):
    """Transcribe the audio file at path ``inputs``.

    Args:
        inputs: Path of the audio file to read, or ``None`` when nothing
            was submitted.
        prompt: Optional prompt passed through to ``get_prediction``.

    Returns:
        Whatever ``get_prediction`` returns for the decoded audio.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    with open(inputs, "rb") as audio_file:
        raw_bytes = audio_file.read()

    # Decode at the rate the pipeline's feature extractor reports.
    sampling_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)
    return get_prediction({"array": waveform, "sampling_rate": sampling_rate}, prompt)
76
 
77
 
78
  def _return_yt_html_embed(yt_url):
79
  video_id = yt_url.split("?v=")[-1]
80
  return f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe> </center>'
81
 
82
-
83
  def download_yt_audio(yt_url, filename):
84
  info_loader = youtube_dl.YoutubeDL()
85
  try:
@@ -106,7 +58,7 @@ def download_yt_audio(yt_url, filename):
106
  raise gr.Error(str(err))
107
 
108
 
109
- def yt_transcribe(yt_url, prompt):
110
  html_embed_str = _return_yt_html_embed(yt_url)
111
  with tempfile.TemporaryDirectory() as tmpdirname:
112
  filepath = os.path.join(tmpdirname, "video.mp4")
@@ -115,18 +67,15 @@ def yt_transcribe(yt_url, prompt):
115
  inputs = f.read()
116
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
117
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
118
- text, text_timestamped = get_prediction(inputs, prompt)
119
- return html_embed_str, text, text_timestamped
120
 
121
 
122
  demo = gr.Blocks()
123
  mf_transcribe = gr.Interface(
124
  fn=transcribe,
125
- inputs=[
126
- gr.inputs.Audio(source="microphone", type="filepath", optional=True),
127
- gr.inputs.Textbox(lines=1, placeholder="Prompt", optional=True),
128
- ],
129
- outputs=["text", "text"],
130
  layout="horizontal",
131
  theme="huggingface",
132
  title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",
@@ -136,11 +85,8 @@ mf_transcribe = gr.Interface(
136
 
137
  file_transcribe = gr.Interface(
138
  fn=transcribe,
139
- inputs=[
140
- gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
141
- gr.inputs.Textbox(lines=1, placeholder="Prompt", optional=True),
142
- ],
143
- outputs=["text", "text"],
144
  layout="horizontal",
145
  theme="huggingface",
146
  title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",
@@ -149,11 +95,8 @@ file_transcribe = gr.Interface(
149
  )
150
  yt_transcribe = gr.Interface(
151
  fn=yt_transcribe,
152
- inputs=[
153
- gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
154
- gr.inputs.Textbox(lines=1, placeholder="Prompt", optional=True),
155
- ],
156
- outputs=["html", "text", "text"],
157
  layout="horizontal",
158
  theme="huggingface",
159
  title=f"Transcribe YouTube with {os.path.basename(MODEL_NAME)}",
 
 
 
 
 
 
 
1
  import torch
2
+
3
  import gradio as gr
4
  import yt_dlp as youtube_dl
5
  from transformers import pipeline
6
  from transformers.pipelines.audio_utils import ffmpeg_read
7
 
8
+ import tempfile
9
+ import os
10
 
# Checkpoint served by this demo and the batch size passed on each pipeline call.
MODEL_NAME = "kotoba-tech/kotoba-whisper-v1.0"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use CUDA device 0 when available, otherwise run on CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Built once at import time and shared by every handler below.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
23
 
24
 
def transcribe(inputs):
    """Transcribe ``inputs`` with the module-level ``pipe``.

    Args:
        inputs: Audio input forwarded unchanged to ``pipe``, or ``None``
            when nothing was submitted.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is not None:
        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=True,
        )
        return result["text"]
    raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
 
 
 
29
 
30
 
31
  def _return_yt_html_embed(yt_url):
32
  video_id = yt_url.split("?v=")[-1]
33
  return f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe> </center>'
34
 
 
35
  def download_yt_audio(yt_url, filename):
36
  info_loader = youtube_dl.YoutubeDL()
37
  try:
 
58
  raise gr.Error(str(err))
59
 
60
 
61
+ def yt_transcribe(yt_url, max_filesize=75.0):
62
  html_embed_str = _return_yt_html_embed(yt_url)
63
  with tempfile.TemporaryDirectory() as tmpdirname:
64
  filepath = os.path.join(tmpdirname, "video.mp4")
 
67
  inputs = f.read()
68
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
69
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
70
+ text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
71
+ return html_embed_str, text
72
 
73
 
74
  demo = gr.Blocks()
75
  mf_transcribe = gr.Interface(
76
  fn=transcribe,
77
+ inputs=[gr.inputs.Audio(source="microphone", type="filepath", optional=True)],
78
+ outputs="text",
 
 
 
79
  layout="horizontal",
80
  theme="huggingface",
81
  title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",
 
85
 
86
  file_transcribe = gr.Interface(
87
  fn=transcribe,
88
+ inputs=[gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file")],
89
+ outputs="text",
 
 
 
90
  layout="horizontal",
91
  theme="huggingface",
92
  title=f"Transcribe Audio with {os.path.basename(MODEL_NAME)}",
 
95
  )
96
  yt_transcribe = gr.Interface(
97
  fn=yt_transcribe,
98
+ inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
99
+ outputs=["html", "text"],
 
 
 
100
  layout="horizontal",
101
  theme="huggingface",
102
  title=f"Transcribe YouTube with {os.path.basename(MODEL_NAME)}",
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
  git+https://github.com/huggingface/transformers
2
  torch
3
- yt-dlp
4
- punctuators==0.0.5
5
- stable-ts==2.16.0
 
1
  git+https://github.com/huggingface/transformers
2
  torch
3
+ yt-dlp