Charlie Amalet committed
Commit 2e802f8 · verified · 1 Parent(s): 7a0c320

Upload app.py

Files changed (1)
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
+ import whisper
+ from whisper.transcribe import LANGUAGES, TO_LANGUAGE_CODE
+ from whisper.utils import get_writer
+ import torch
+ import gradio as gr
+ import pathlib
+ import random
+ from datetime import datetime
+
+ APP_DIR = pathlib.Path(__file__).parent.absolute()
+
+ LOCAL_DIR = APP_DIR / "files"
+ LOCAL_DIR.mkdir(exist_ok=True)
+ SAVE_DIR = LOCAL_DIR / "transcripts"
+ SAVE_DIR.mkdir(exist_ok=True)
+ LANGS = [lang.capitalize() for lang in list(LANGUAGES.values())]
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+ loaded_model = whisper.load_model("medium", DEVICE)
+ current_size = "None"
+
+ def generate_random_filename():
+     now = datetime.now()
+     timestamp = now.strftime("%H_%M_%S_%d_%m_%Y")
+     random_suffix = ''.join(random.choices('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', k=8))
+     filename = f"transcript_{timestamp}_{random_suffix}"
+     return filename
+
+ def get_transcript(audio_path, task_selection: str, language: str, max_line_width=0, max_line_count=0, max_words_per_line=0):
+     output_format = "all"
+     writer = get_writer(output_format, SAVE_DIR)
+     writer_args = {
+         "max_line_width": max_line_width if max_line_width > 0 else None,
+         "max_line_count": max_line_count if max_line_count > 0 else None,
+         "max_words_per_line": max_words_per_line if max_words_per_line > 0 else None
+     }
+
+     options = dict(task=task_selection.lower(), best_of=5, language=TO_LANGUAGE_CODE[language.lower()])
+     results = loaded_model.transcribe(audio_path, verbose=True, word_timestamps=True, **options)
+     # sample_rate, audio = audiodata
+     # results = loaded_model.transcribe(audio, verbose=True, word_timestamps=True, **options)
+     filename = generate_random_filename()
+     writer(results, filename, **writer_args)
+
+     return str(SAVE_DIR / f"{filename}.txt"), str(SAVE_DIR / f"{filename}.srt"), str(SAVE_DIR / f"{filename}.vtt")
+
+ # input_audio = gr.Audio(label="Upload an audio file", type="file")
+ # task_selection = gr.Radio(["Transcribe", "Translate"], label="Select Task")
+ # output_transcript = gr.Textbox(label="Transcript")
+
+ # block = gr.Blocks(
+ #     inputs=input_audio,
+ #     outputs=output_transcript,
+ #     title="Auto Transcriber",
+ #     description="Input an audio file and get a transcript.",
+ #     update_fn=transcriber,
+ #     inputs_layout="vertical",
+ #     outputs_layout="vertical",
+ #     input_component_labels=["Audio File", "Task"],
+ #     output_component_labels=["Transcript"]
+ # )
+
+ # block.launch()
+
+ title = """
+ <div style="text-align: center; max-width: 500px; margin: 0 auto;">
+     <div
+         style="
+             display: inline-flex;
+             align-items: center;
+             gap: 0.8rem;
+             font-size: 1.75rem;
+             margin-bottom: 10px;
+         "
+     >
+         <h1 style="font-weight: 600; margin-bottom: 7px;">
+             Auto Transcriber 🔊
+         </h1>
+     </div>
+
+ </div>
+ """
+
+ with gr.Blocks() as monapp:
+     with gr.Column():
+         gr.HTML(title)
+         with gr.Row():
+             with gr.Column():
+                 audio_input = gr.Audio(label="Audio file to transcribe", sources=["upload"], type="filepath")
+                 with gr.Accordion("Transcribe options", open=True):
+                     task_selection = gr.Radio(["Transcribe", "Translate"], value="Transcribe", label="Select a Task")
+                     language = gr.Dropdown(choices=LANGS, value="English", label="Language spoken in the audio")
+                     with gr.Column():
+                         gr.HTML("<p>Keep a value at 0 to <strong>disable</strong> that option.</p>\n<p>max_words_per_line has no effect when max_line_width is set.</p>\n<p>Word-level timestamps on translations may not be reliable.</p>")
+                         # gr.HTML("<p>max_words_per_line has no effect with max_line_width activated</p>")
+                         max_line_width = gr.Number(label="Maximum number of characters in a line before breaking the line", minimum=0, precision=0, value=0, step=1)
+                         max_line_count = gr.Number(label="Maximum number of lines in a segment", minimum=0, precision=0, value=0, step=1)
+                         max_words_per_line = gr.Number(label="Maximum number of words in a segment", minimum=0, precision=0, value=0, step=1)
+                 # with gr.Group():
+                 #     active_img_bg = gr.Checkbox(False, label="Enable Background image")
+                 #     img_bg = gr.Textbox(None, label="Background image", placeholder="Background image path", show_label=False)
+                 submit_btn = gr.Button("Transcribe")
+
+             with gr.Column():
+                 transcript_txt = gr.File(height=50)
+                 transcript_srt = gr.File(height=50)
+                 transcript_vtt = gr.File(height=50)
+
+     submit_btn.click(fn=get_transcript, inputs=[audio_input, task_selection, language, max_line_width, max_line_count, max_words_per_line], outputs=[transcript_txt, transcript_srt, transcript_vtt])
+
+ monapp.launch(debug=True, show_error=True)
+
+ # loaded_model = whisper.load_model("base", DEVICE)
+ # loaded_model.transcribe
+
+ # parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
+ # parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
+ # parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
+ # parser.add_argument("--max_words_per_line", type=optional_int, default=None, help="(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segment")
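
For context, the three max_* values collected by the UI are forwarded unchanged to Whisper's subtitle writer, mirroring the CLI flags quoted in the trailing comments. Below is a minimal sketch of the same writer call outside Gradio, assuming a recent openai-whisper release; the audio path and output names are placeholders, not part of this commit.

import pathlib
import whisper
from whisper.utils import get_writer

out_dir = pathlib.Path("transcripts")           # placeholder output folder
out_dir.mkdir(exist_ok=True)

model = whisper.load_model("medium")            # same checkpoint the app loads
result = model.transcribe("samples/audio.wav",  # placeholder path, replace with a real file
                          task="transcribe", language="en",
                          word_timestamps=True) # needed for the max_* line-breaking options

writer = get_writer("srt", str(out_dir))        # one format instead of the app's "all"
writer(result, "example",                       # "example" becomes the output basename
       max_line_width=42, max_line_count=2, max_words_per_line=None)

The app maps a 0 entered in each gr.Number to None, which leaves Whisper's default one-subtitle-per-segment output untouched.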