aadnk committed on
Commit
05a2178
1 Parent(s): 23c3153

Initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +4 -0
  2. app.py +141 -0
  3. requirements.txt +2 -0
  4. utils.py +54 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import StringIO
2
+ import gradio as gr
3
+
4
+ from utils import write_vtt
5
+ import whisper
6
+
7
+ #import os
8
+ #os.system("pip install git+https://github.com/openai/whisper.git")
9
+
10
# Display names of the languages offered in the "Language" dropdown.
# The order below is the declaration order; the UI sorts the names before
# showing them, and greet() lower-cases the selected name before use.
LANGUAGES = [
    "English", "Chinese", "German", "Spanish", "Russian",
    "Korean", "French", "Japanese", "Portuguese", "Turkish",
    "Polish", "Catalan", "Dutch", "Arabic", "Swedish",
    "Italian", "Indonesian", "Hindi", "Finnish", "Vietnamese",
    "Hebrew", "Ukrainian", "Greek", "Malay", "Czech",
    "Romanian", "Danish", "Hungarian", "Tamil", "Norwegian",
    "Thai", "Urdu", "Croatian", "Bulgarian", "Lithuanian",
    "Latin", "Maori", "Malayalam", "Welsh", "Slovak",
    "Telugu", "Persian", "Latvian", "Bengali", "Serbian",
    "Azerbaijani", "Slovenian", "Kannada", "Estonian", "Macedonian",
    "Breton", "Basque", "Icelandic", "Armenian", "Nepali",
    "Mongolian", "Bosnian", "Kazakh", "Albanian", "Swahili",
    "Galician", "Marathi", "Punjabi", "Sinhala", "Khmer",
    "Shona", "Yoruba", "Somali", "Afrikaans", "Occitan",
    "Georgian", "Belarusian", "Tajik", "Sindhi", "Gujarati",
    "Amharic", "Yiddish", "Lao", "Uzbek", "Faroese",
    "Haitian Creole", "Pashto", "Turkmen", "Nynorsk", "Maltese",
    "Sanskrit", "Luxembourgish", "Myanmar", "Tibetan", "Tagalog",
    "Malagasy", "Assamese", "Tatar", "Hawaiian", "Lingala",
    "Hausa", "Bashkir", "Javanese", "Sundanese",
]

# Loaded models keyed by model name, so each model is loaded at most once
# per process.
model_cache = {}
113
+
114
def greet(modelName, languageName, uploadFile, microphoneData, task):
    """Run Whisper on one audio clip and return its text and WebVTT cues.

    Args:
        modelName: Name of the Whisper model to load via
            ``whisper.load_model``. ``None`` falls back to ``"base"``.
        languageName: Display name of the spoken language (for example
            ``"English"``). ``None`` or an empty string means no language
            is passed on (``language=None``); presumably Whisper then
            detects the language itself.
        uploadFile: Audio from the "Upload Audio" input. Used in
            preference to ``microphoneData`` whenever it is not ``None``.
        microphoneData: Audio from the "Microphone Input" input; only
            used when ``uploadFile`` is ``None``.
        task: Forwarded unchanged to ``model.transcribe`` (the UI offers
            ``"transcribe"`` and ``"translate"``).

    Returns:
        A ``(text, vtt)`` tuple: the full transcription text and the
        segments rendered as a WebVTT document.

    Raises:
        ValueError: If neither ``uploadFile`` nor ``microphoneData`` was
            provided.
    """
    source = uploadFile if uploadFile is not None else microphoneData
    if source is None:
        # Fail early with a readable message instead of handing None to
        # the model.
        raise ValueError(
            "No audio provided: upload a file or record from the microphone."
        )

    # A dropdown with nothing selected yields None, so test truthiness
    # rather than calling len() on the value.
    selectedLanguage = languageName.lower() if languageName else None
    selectedModel = modelName if modelName is not None else "base"

    # Load each model only once and reuse it for later requests.
    model = model_cache.get(selectedModel)
    if model is None:
        model = whisper.load_model(selectedModel)
        model_cache[selectedModel] = model

    result = model.transcribe(source, language=selectedLanguage, task=task)

    # Render the segments into an in-memory WebVTT document.
    segmentStream = StringIO()
    write_vtt(result["segments"], file=segmentStream)

    return result["text"], segmentStream.getvalue()
132
+
133
# Build the web UI. The inputs are listed in the same order as greet()'s
# parameters: model, language, uploaded audio, microphone audio, task.
demo = gr.Interface(
    fn=greet,
    description=(
        "Whisper is a general-purpose speech recognition model. "
        "It is trained on a large dataset of diverse audio and is also a multi-task model "
        "that can perform multilingual speech recognition as well as speech translation "
        "and language identification."
    ),
    inputs=[
        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
        gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
        gr.Audio(source="upload", type="filepath", label="Upload Audio"),
        gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
        gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
    ],
    outputs=[
        gr.Text(label="Transcription"),
        gr.Text(label="Segments"),
    ],
)

# Start serving the interface.
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ git+https://github.com/openai/whisper.git
2
+ transformers
utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import zlib
2
+ from typing import Iterator, TextIO
3
+
4
+
5
def exact_div(x, y):
    """Return ``x // y``, asserting that ``y`` divides ``x`` with no remainder."""
    remainder = x % y
    assert remainder == 0
    return x // y
8
+
9
+
10
def str2bool(string):
    """Convert the exact strings ``"True"`` / ``"False"`` to a bool.

    Raises:
        ValueError: If ``string`` is any other value.
    """
    str2val = {"True": True, "False": False}
    if string not in str2val:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
    return str2val[string]
16
+
17
+
18
def optional_int(string):
    """Parse ``string`` as an int, mapping the literal ``"None"`` to ``None``."""
    if string == "None":
        return None
    return int(string)
20
+
21
+
22
def optional_float(string):
    """Parse ``string`` as a float, mapping the literal ``"None"`` to ``None``."""
    if string == "None":
        return None
    return float(string)
24
+
25
+
26
def compression_ratio(text) -> float:
    """Return the character count of ``text`` divided by the size in bytes
    of its zlib-compressed UTF-8 encoding."""
    compressed = zlib.compress(text.encode("utf-8"))
    char_count = len(text)
    return char_count / len(compressed)
28
+
29
+
30
def format_timestamp(seconds: float):
    """Format a duration in seconds as a WebVTT timestamp.

    The value is rounded to the nearest millisecond. The result is
    ``MM:SS.mmm`` for durations under one hour and ``HH:MM:SS.mmm``
    otherwise. The hours field is zero-padded to at least two digits,
    because WebVTT does not accept a single-digit hours field.

    Args:
        seconds: Non-negative offset in seconds.

    Returns:
        The formatted timestamp string.

    Raises:
        AssertionError: If ``seconds`` is negative.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    remaining_ms = round(seconds * 1000.0)

    # Peel off hours, minutes and seconds; what is left is milliseconds.
    hours, remaining_ms = divmod(remaining_ms, 3_600_000)
    minutes, remaining_ms = divmod(remaining_ms, 60_000)
    whole_seconds, milliseconds = divmod(remaining_ms, 1_000)

    # The hours component is omitted entirely when it is zero.
    hours_prefix = f"{hours:02d}:" if hours > 0 else ""
    return f"{hours_prefix}{minutes:02d}:{whole_seconds:02d}.{milliseconds:03d}"
44
+
45
+
46
def write_vtt(transcript: Iterator[dict], file: TextIO):
    """Write ``transcript`` to ``file`` as a WebVTT document.

    Args:
        transcript: Iterable of segment dicts; each must provide
            ``'start'``, ``'end'`` and ``'text'``.
        file: Text stream that receives the output.
    """
    # Header line followed by a blank line.
    print("WEBVTT\n", file=file)
    for cue in transcript:
        begin = format_timestamp(cue['start'])
        finish = format_timestamp(cue['end'])
        # "-->" is the cue timing separator, so it is replaced inside the
        # caption text.
        caption = cue['text'].replace('-->', '->')
        print(f"{begin} --> {finish}\n{caption}\n", file=file, flush=True)