Spaces:
Running
on
Zero
Running
on
Zero
Charlie Amalet
committed on
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import whisper
|
2 |
+
from whisper.transcribe import LANGUAGES, TO_LANGUAGE_CODE
|
3 |
+
from whisper.utils import get_writer
|
4 |
+
import torch
|
5 |
+
import gradio as gr
|
6 |
+
import pathlib
|
7 |
+
import random
|
8 |
+
from datetime import datetime
|
9 |
+
|
10 |
+
# Directory that contains this script; generated files are stored beneath it.
APP_DIR = pathlib.Path(__file__).parent.absolute()

# Working directories, created at import time when they do not exist yet.
# LOCAL_DIR must be created first because SAVE_DIR is nested inside it.
LOCAL_DIR = APP_DIR / "files"
SAVE_DIR = LOCAL_DIR / "transcripts"
LOCAL_DIR.mkdir(exist_ok=True)
SAVE_DIR.mkdir(exist_ok=True)

# Capitalized Whisper language names, used as the dropdown choices.
LANGS = [name.capitalize() for name in LANGUAGES.values()]

# Run on the GPU when torch reports that CUDA is available, otherwise on CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# The "medium" Whisper model is loaded once, when the module is imported.
loaded_model = whisper.load_model("medium", device=DEVICE)

# NOTE(review): not read anywhere in the visible code — confirm before removing.
current_size = "None"
|
20 |
+
|
21 |
+
def generate_random_filename():
    """Build a base name (no extension) for one set of transcript files.

    Returns:
        A string of the form ``transcript_<HH_MM_SS_dd_mm_YYYY>_<suffix>``,
        where the timestamp comes from ``datetime.now()`` and ``<suffix>`` is
        8 characters drawn at random from ASCII letters and digits.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    stamp = datetime.now().strftime("%H_%M_%S_%d_%m_%Y")
    # The random suffix keeps names distinct when two requests share a second.
    suffix = ''.join(random.choices(alphabet, k=8))
    return "_".join(("transcript", stamp, suffix))
|
27 |
+
|
28 |
+
def get_transcript(audio_path, task_selection:str, language:str, max_line_width=0, max_line_count=0, max_words_per_line=0):
    """Run Whisper on an audio file and write the transcript files.

    The module-level ``loaded_model`` processes ``audio_path`` with word-level
    timestamps enabled. The result is written into ``SAVE_DIR`` with the
    "all" output format, under a freshly generated base name.

    Args:
        audio_path: Path of the audio file to process.
        task_selection: Task name such as "Transcribe" or "Translate";
            it is lower-cased before being passed to Whisper.
        language: Language name; its lower-cased form must be a key of
            ``TO_LANGUAGE_CODE``.
        max_line_width: Maximum number of characters in a line before
            breaking the line. 0 (or None) disables the limit.
        max_line_count: Maximum number of lines in a segment. 0 (or None)
            disables the limit.
        max_words_per_line: Maximum number of words in a segment. 0 (or
            None) disables the limit.

    Returns:
        A tuple of three path strings pointing at the ``.txt``, ``.srt`` and
        ``.vtt`` files for this run, in that order.

    Raises:
        gr.Error: If no audio file was provided or the language is unknown.
    """
    # Fail early with a message the UI can display instead of an opaque
    # error raised from inside Whisper.
    if not audio_path:
        raise gr.Error("Please upload an audio file before transcribing.")

    language_code = TO_LANGUAGE_CODE.get((language or "").lower())
    if language_code is None:
        raise gr.Error(f"Unsupported language: {language!r}")

    def _positive_or_none(value):
        # A cleared numeric field may arrive as None; treat it like 0
        # ("limit disabled") instead of failing on `None > 0`.
        return value if value and value > 0 else None

    output_format = "all"
    writer = get_writer(output_format, SAVE_DIR)
    writer_args = {
        "max_line_width": _positive_or_none(max_line_width),
        "max_line_count": _positive_or_none(max_line_count),
        "max_words_per_line": _positive_or_none(max_words_per_line),
    }

    options = dict(task=task_selection.lower(), best_of=5, language=language_code)
    results = loaded_model.transcribe(audio_path, verbose=True, word_timestamps=True, **options)

    # The writer derives the output file names from this base name.
    filename = generate_random_filename()
    writer(results, filename, **writer_args)

    return str(SAVE_DIR / f"{filename}.txt"), str(SAVE_DIR / f"{filename}.srt"), str(SAVE_DIR / f"{filename}.vtt")
|
45 |
+
|
46 |
+
# input_audio = gr.Audio(label="Upload an audio file", type="file")
|
47 |
+
# task_selection = gr.Radio(["Transcribe", "Translate"], label="Select Task")
|
48 |
+
# output_transcript = gr.Textbox(label="Transcript")
|
49 |
+
|
50 |
+
# block = gr.Blocks(
|
51 |
+
# inputs=input_audio,
|
52 |
+
# outputs=output_transcript,
|
53 |
+
# title="Auto Transcriber",
|
54 |
+
# description="Input an audio file and get a transcript.",
|
55 |
+
# update_fn=transcriber,
|
56 |
+
# inputs_layout="vertical",
|
57 |
+
# outputs_layout="vertical",
|
58 |
+
# input_component_labels=["Audio File", "Task"],
|
59 |
+
# output_component_labels=["Transcript"]
|
60 |
+
# )
|
61 |
+
|
62 |
+
# block.launch()
|
63 |
+
|
64 |
+
title="""
|
65 |
+
<div style="text-align: center; max-width: 500px; margin: 0 auto;">
|
66 |
+
<div
|
67 |
+
style="
|
68 |
+
display: inline-flex;
|
69 |
+
align-items: center;
|
70 |
+
gap: 0.8rem;
|
71 |
+
font-size: 1.75rem;
|
72 |
+
margin-bottom: 10px;
|
73 |
+
"
|
74 |
+
>
|
75 |
+
<h1 style="font-weight: 600; margin-bottom: 7px;">
|
76 |
+
Auto Transcriber 🔊
|
77 |
+
</h1>
|
78 |
+
</div>
|
79 |
+
|
80 |
+
</div>
|
81 |
+
"""
|
82 |
+
|
83 |
+
# Gradio UI: inputs and options on one side, the three output files on the
# other, and a button that runs get_transcript.
# NOTE(review): the nesting below was reconstructed from a copy that lost its
# indentation — confirm it against the intended layout.
with gr.Blocks() as monapp:
    with gr.Column():
        # Page header.
        gr.HTML(title)
        with gr.Row():
            with gr.Column():
                # type="filepath" hands get_transcript a path string.
                audio_input = gr.Audio(label="Audiofile to transcribe", sources=["upload"], type="filepath")
                with gr.Accordion("Transcribe options", open=True):
                    task_selection = gr.Radio(["Transcribe", "Translate"], value="Transcribe", label="Select a Task")
                    language = gr.Dropdown(choices=LANGS, value="English", label="Language spoken in the audio")
                    with gr.Column():
                        gr.HTML("<p>keep at 0 to <strong>don't use</strong></p>\n<p>max_words_per_line has no effect with max_line_width activated\nWord-level timestamps on translations may not be reliable.</p>")
                        # gr.HTML("<p>max_words_per_line has no effect with max_line_width activated</p>")
                        # Subtitle line-breaking limits; get_transcript turns 0 into None (no limit).
                        max_line_width = gr.Number(label="Maximum number of characters in a line before breaking the line", minimum=0, precision=0, value=0, step=1)
                        max_line_count = gr.Number(label="Maximum number of lines in a segment", minimum=0, precision=0, value=0, step=1)
                        max_words_per_line = gr.Number(label="Maximum number of words in a segment", minimum=0, precision=0, value=0, step=1)
                # with gr.Group():
                #     active_img_bg= gr.Checkbox(False, label="Enable Background image")
                #     img_bg = gr.Textbox(None, label="Background image", placeholder="Background image path", show_label=False)
                submit_btn = gr.Button("Transcribe")

            with gr.Column():
                # One file component per path returned by get_transcript.
                transcript_txt = gr.File(height=50)
                transcript_srt = gr.File(height=50)
                transcript_vtt = gr.File(height=50)

    # Input order matches get_transcript's parameters; output order matches
    # the (.txt, .srt, .vtt) tuple it returns.
    submit_btn.click(fn=get_transcript, inputs=[audio_input, task_selection, language, max_line_width, max_line_count, max_words_per_line], outputs=[transcript_txt, transcript_srt, transcript_vtt])

# Starts the app as soon as the module runs.
monapp.launch(debug=True, show_error=True)
|
111 |
+
|
112 |
+
# loaded_model = whisper.load_model("base", DEVICE)
|
113 |
+
# loaded_model.transcribe
|
114 |
+
|
115 |
+
# parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
|
116 |
+
# parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
|
117 |
+
# parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
|
118 |
+
# parser.add_argument("--max_words_per_line", type=optional_int, default=None, help="(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segment")
|