Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,14 @@
|
|
1 |
import os
|
2 |
|
|
|
|
|
3 |
|
4 |
os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
|
5 |
os.system('make -C ./whisper.cpp')
|
|
|
|
|
|
|
|
|
6 |
os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
|
7 |
os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
|
8 |
os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
|
@@ -15,17 +21,14 @@ os.system('bash ./whisper.cpp/models/download-ggml-model.sh base.en')
|
|
15 |
|
16 |
|
17 |
|
18 |
-
import os
|
19 |
-
|
20 |
-
|
21 |
import gradio as gr
|
22 |
-
import os
|
23 |
from pathlib import Path
|
24 |
import pysrt
|
25 |
import pandas as pd
|
26 |
import re
|
27 |
import time
|
28 |
import os
|
|
|
29 |
|
30 |
from pytube import YouTube
|
31 |
from transformers import MarianMTModel, MarianTokenizer
|
@@ -33,32 +36,7 @@ from transformers import MarianMTModel, MarianTokenizer
|
|
33 |
import psutil
|
34 |
num_cores = psutil.cpu_count()
|
35 |
os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
|
36 |
-
|
37 |
-
|
38 |
-
import torch
|
39 |
-
|
40 |
-
|
41 |
-
finnish_marian_nmt_model = "Helsinki-NLP/opus-mt-tc-big-en-fi"
|
42 |
-
finnish_tokenizer_marian = MarianTokenizer.from_pretrained(finnish_marian_nmt_model, max_length=40)
|
43 |
-
finnish_tokenizer_marian.max_new_tokens = 30
|
44 |
-
finnish_translation_model = MarianMTModel.from_pretrained(finnish_marian_nmt_model)
|
45 |
-
|
46 |
-
swedish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-sv"
|
47 |
-
swedish_tokenizer_marian = MarianTokenizer.from_pretrained(swedish_marian_nmt_model, max_length=40)
|
48 |
-
swedish_tokenizer_marian.max_new_tokens = 30
|
49 |
-
swedish_translation_model = MarianMTModel.from_pretrained(swedish_marian_nmt_model)
|
50 |
-
|
51 |
-
danish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-da"
|
52 |
-
danish_tokenizer_marian = MarianTokenizer.from_pretrained(danish_marian_nmt_model, max_length=40)
|
53 |
-
danish_tokenizer_marian.max_new_tokens = 30
|
54 |
-
danish_translation_model = MarianMTModel.from_pretrained(danish_marian_nmt_model)
|
55 |
-
|
56 |
-
|
57 |
-
translation_models = {
|
58 |
-
"Finnish": [finnish_tokenizer_marian, finnish_translation_model],
|
59 |
-
"Swedish": [swedish_tokenizer_marian, swedish_translation_model],
|
60 |
-
"Danish": [danish_tokenizer_marian, danish_translation_model]
|
61 |
-
}
|
62 |
|
63 |
whisper_models = ["base", "small", "medium", "base.en"]
|
64 |
|
@@ -80,8 +58,34 @@ source_languages = {
|
|
80 |
"Let the model analyze": "Let the model analyze"
|
81 |
}
|
82 |
|
83 |
-
|
84 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
}
|
86 |
|
87 |
|
@@ -90,7 +94,7 @@ transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
|
|
90 |
|
91 |
|
92 |
source_language_list = [key[0] for key in source_languages.items()]
|
93 |
-
source_language_list_2 = [key[0] for key in
|
94 |
translation_models_list = [key[0] for key in translation_models.items()]
|
95 |
|
96 |
|
@@ -190,27 +194,32 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
|
|
190 |
|
191 |
|
192 |
def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
|
193 |
-
print("IN TRANSLATE")
|
194 |
-
|
195 |
if selected_translation_lang_2 is None:
|
196 |
-
selected_translation_lang_2 = '
|
197 |
df.reset_index(inplace=True)
|
198 |
|
199 |
-
print("Getting models")
|
200 |
-
|
201 |
-
tokenizer_marian = translation_models.get(selected_translation_lang_2)[0]
|
202 |
-
translation_model = translation_models.get(selected_translation_lang_2)[1]
|
203 |
-
|
204 |
print("start_translation")
|
205 |
translations = []
|
206 |
-
|
207 |
if selected_translation_lang_2 != selected_source_lang_2:
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
else:
|
215 |
df['translation'] = df['text']
|
216 |
print("translations done")
|
|
|
import os

# Download and build ggerganov/whisper.cpp. Kudos to this man for wonderful
# whisper implementation! This means speed!
os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
os.system('make -C ./whisper.cpp')

# Download models, add finetuned languages later once whisper finetuning event
# is ready. Models are downloaded on the fly so we can get quite many models :)
for _ggml_model in ("small", "base", "medium"):
    os.system(f"bash ./whisper.cpp/models/download-ggml-model.sh {_ggml_model}")
|
|
|
21 |
|
22 |
|
23 |
|
|
|
|
|
|
|
24 |
import gradio as gr
|
|
|
25 |
from pathlib import Path
|
26 |
import pysrt
|
27 |
import pandas as pd
|
28 |
import re
|
29 |
import time
|
30 |
import os
|
31 |
+
import json
|
32 |
|
33 |
from pytube import YouTube
|
34 |
from transformers import MarianMTModel, MarianTokenizer
|
|
|
36 |
import psutil
|
37 |
num_cores = psutil.cpu_count()
|
38 |
os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
|
39 |
+
headers = {'Authorization': os.environ['DeepL_API_KEY']}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Whisper ggml checkpoints the user can pick from in the UI
# (fetched by the download step at the top of the file).
whisper_models = "base small medium base.en".split()
|
|
58 |
"Let the model analyze": "Let the model analyze"
|
59 |
}
|
60 |
|
# Human-readable language name -> DeepL target_lang code,
# used to populate the translation dropdown and build API requests.
DeepL_language_codes_for_translation = {
    "Bulgarian": "BG",
    "Czech": "CS",
    "Danish": "DA",
    "German": "DE",
    "Greek": "EL",
    "English": "EN",
    "Spanish": "ES",
    "Estonian": "ET",
    "Finnish": "FI",
    "French": "FR",
    "Hungarian": "HU",
    "Indonesian": "ID",
    "Italian": "IT",
    "Japanese": "JA",
    "Lithuanian": "LT",
    "Latvian": "LV",
    "Dutch": "NL",
    "Polish": "PL",
    "Portuguese": "PT",
    "Romanian": "RO",
    "Russian": "RU",
    "Slovak": "SK",
    "Slovenian": "SL",
    "Swedish": "SV",
    "Turkish": "TR",
    "Ukrainian": "UK",
    "Chinese": "ZH",
}

# FIX: translate_transcriptions() looks codes up via the name
# `DeepL_language_codes`, which is never defined anywhere in the file and
# would raise NameError on the first translation. Alias it here so both
# names resolve to the same table (backward compatible).
DeepL_language_codes = DeepL_language_codes_for_translation
90 |
|
91 |
|
|
|
94 |
|
95 |
|
96 |
source_language_list = [key[0] for key in source_languages.items()]
|
97 |
+
source_language_list_2 = [key[0] for key in DeepL_language_codes_for_translation.items()]
|
98 |
translation_models_list = [key[0] for key in translation_models.items()]
|
99 |
|
100 |
|
|
|
194 |
|
195 |
|
196 |
def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
|
|
|
|
|
197 |
if selected_translation_lang_2 is None:
|
198 |
+
selected_translation_lang_2 = 'English'
|
199 |
df.reset_index(inplace=True)
|
200 |
|
|
|
|
|
|
|
|
|
|
|
201 |
print("start_translation")
|
202 |
translations = []
|
203 |
+
|
204 |
if selected_translation_lang_2 != selected_source_lang_2:
|
205 |
+
|
206 |
+
text_combined = ""
|
207 |
+
for i, sentence in enumerate(init__df['text']):
|
208 |
+
if i == 0:
|
209 |
+
text_combined = sentence
|
210 |
+
else:
|
211 |
+
text_combined = text_combined + '\n' + sentence
|
212 |
+
|
213 |
+
data = {'text': text_combined,
|
214 |
+
'tag_spitting': 'xml',
|
215 |
+
'target_lang': DeepL_language_codes.get(selected_source_lang_2)
|
216 |
+
}
|
217 |
+
response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
|
218 |
+
|
219 |
+
# Print the response from the server
|
220 |
+
translated_sentences = json.loads(response.text)
|
221 |
+
translated_sentences['translations'][0]['text'].split('\n')
|
222 |
+
df['translation'] = translated_sentences
|
223 |
else:
|
224 |
df['translation'] = df['text']
|
225 |
print("translations done")
|