RASMUS committed on
Commit
e8a76b4
1 Parent(s): fce64ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -48
app.py CHANGED
@@ -1,8 +1,14 @@
1
  import os
2
 
 
 
3
 
4
  os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
5
  os.system('make -C ./whisper.cpp')
 
 
 
 
6
  os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
7
  os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
8
  os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
@@ -15,17 +21,14 @@ os.system('bash ./whisper.cpp/models/download-ggml-model.sh base.en')
15
 
16
 
17
 
18
- import os
19
-
20
-
21
  import gradio as gr
22
- import os
23
  from pathlib import Path
24
  import pysrt
25
  import pandas as pd
26
  import re
27
  import time
28
  import os
 
29
 
30
  from pytube import YouTube
31
  from transformers import MarianMTModel, MarianTokenizer
@@ -33,32 +36,7 @@ from transformers import MarianMTModel, MarianTokenizer
33
  import psutil
34
  num_cores = psutil.cpu_count()
35
  os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
36
-
37
-
38
- import torch
39
-
40
-
41
- finnish_marian_nmt_model = "Helsinki-NLP/opus-mt-tc-big-en-fi"
42
- finnish_tokenizer_marian = MarianTokenizer.from_pretrained(finnish_marian_nmt_model, max_length=40)
43
- finnish_tokenizer_marian.max_new_tokens = 30
44
- finnish_translation_model = MarianMTModel.from_pretrained(finnish_marian_nmt_model)
45
-
46
- swedish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-sv"
47
- swedish_tokenizer_marian = MarianTokenizer.from_pretrained(swedish_marian_nmt_model, max_length=40)
48
- swedish_tokenizer_marian.max_new_tokens = 30
49
- swedish_translation_model = MarianMTModel.from_pretrained(swedish_marian_nmt_model)
50
-
51
- danish_marian_nmt_model = "Helsinki-NLP/opus-mt-en-da"
52
- danish_tokenizer_marian = MarianTokenizer.from_pretrained(danish_marian_nmt_model, max_length=40)
53
- danish_tokenizer_marian.max_new_tokens = 30
54
- danish_translation_model = MarianMTModel.from_pretrained(danish_marian_nmt_model)
55
-
56
-
57
- translation_models = {
58
- "Finnish": [finnish_tokenizer_marian, finnish_translation_model],
59
- "Swedish": [swedish_tokenizer_marian, swedish_translation_model],
60
- "Danish": [danish_tokenizer_marian, danish_translation_model]
61
- }
62
 
63
  whisper_models = ["base", "small", "medium", "base.en"]
64
 
@@ -80,8 +58,34 @@ source_languages = {
80
  "Let the model analyze": "Let the model analyze"
81
  }
82
 
83
- source_languages_2 = {
84
- "English":"en",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  }
86
 
87
 
@@ -90,7 +94,7 @@ transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
90
 
91
 
92
  source_language_list = [key[0] for key in source_languages.items()]
93
- source_language_list_2 = [key[0] for key in source_languages_2.items()]
94
  translation_models_list = [key[0] for key in translation_models.items()]
95
 
96
 
@@ -190,27 +194,32 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model):
190
 
191
 
192
  def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
193
- print("IN TRANSLATE")
194
-
195
  if selected_translation_lang_2 is None:
196
- selected_translation_lang_2 = 'Finnish'
197
  df.reset_index(inplace=True)
198
 
199
- print("Getting models")
200
-
201
- tokenizer_marian = translation_models.get(selected_translation_lang_2)[0]
202
- translation_model = translation_models.get(selected_translation_lang_2)[1]
203
-
204
  print("start_translation")
205
  translations = []
206
- print(df.head())
207
  if selected_translation_lang_2 != selected_source_lang_2:
208
- print("TRASNLATING")
209
- sentences = list(df['text'])
210
- sentences = [stringi.replace('[','').replace(']','') for stringi in sentences]
211
- translations = translation_model.generate(**tokenizer_marian(sentences, return_tensors="pt", padding=True, truncation=True))
212
- print(translations)
213
- df['translation'] = translations
 
 
 
 
 
 
 
 
 
 
 
 
214
  else:
215
  df['translation'] = df['text']
216
  print("translations done")
 
1
  import os
2
 
3
+ # Download and build ggerganov/whisper.cpp. Kudos to the author for a wonderful Whisper implementation!
4
+ # This means speed!
5
 
6
  os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
7
  os.system('make -C ./whisper.cpp')
8
+
9
+ # Download models; add fine-tuned languages later, once the Whisper fine-tuning event is ready
10
+ # Models are downloaded on the fly, so we can offer quite a few models :)
11
+
12
  os.system('bash ./whisper.cpp/models/download-ggml-model.sh small')
13
  os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
14
  os.system('bash ./whisper.cpp/models/download-ggml-model.sh medium')
 
21
 
22
 
23
 
 
 
 
24
  import gradio as gr
 
25
  from pathlib import Path
26
  import pysrt
27
  import pandas as pd
28
  import re
29
  import time
30
  import os
31
+ import json
32
 
33
  from pytube import YouTube
34
  from transformers import MarianMTModel, MarianTokenizer
 
36
  import psutil
37
  num_cores = psutil.cpu_count()
38
  os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
39
+ headers = {'Authorization': os.environ['DeepL_API_KEY']}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  whisper_models = ["base", "small", "medium", "base.en"]
42
 
 
58
  "Let the model analyze": "Let the model analyze"
59
  }
60
 
61
+ DeepL_language_codes_for_translation = {
62
+ "Bulgarian": "BG",
63
+ "Czech": "CS",
64
+ "Danish": "DA",
65
+ "German": "DE",
66
+ "Greek": "EL",
67
+ "English": "EN",
68
+ "Spanish": "ES",
69
+ "Estonian": "ET",
70
+ "Finnish": "FI",
71
+ "French": "FR",
72
+ "Hungarian": "HU",
73
+ "Indonesian": "ID",
74
+ "Italian": "IT",
75
+ "Japanese": "JA",
76
+ "Lithuanian": "LT",
77
+ "Latvian": "LV",
78
+ "Dutch": "NL",
79
+ "Polish": "PL",
80
+ "Portuguese": "PT",
81
+ "Romanian": "RO",
82
+ "Russian": "RU",
83
+ "Slovak": "SK",
84
+ "Slovenian": "SL",
85
+ "Swedish": "SV",
86
+ "Turkish": "TR",
87
+ "Ukrainian": "UK",
88
+ "Chinese": "ZH"
89
  }
90
 
91
 
 
94
 
95
 
96
  source_language_list = [key[0] for key in source_languages.items()]
97
+ source_language_list_2 = [key[0] for key in DeepL_language_codes_for_translation.items()]
98
  translation_models_list = [key[0] for key in translation_models.items()]
99
 
100
 
 
194
 
195
 
196
  def translate_transcriptions(df, selected_translation_lang_2, selected_source_lang_2):
 
 
197
  if selected_translation_lang_2 is None:
198
+ selected_translation_lang_2 = 'English'
199
  df.reset_index(inplace=True)
200
 
 
 
 
 
 
201
  print("start_translation")
202
  translations = []
203
+
204
  if selected_translation_lang_2 != selected_source_lang_2:
205
+
206
+ text_combined = ""
207
+ for i, sentence in enumerate(init__df['text']):
208
+ if i == 0:
209
+ text_combined = sentence
210
+ else:
211
+ text_combined = text_combined + '\n' + sentence
212
+
213
+ data = {'text': text_combined,
214
+ 'tag_spitting': 'xml',
215
+ 'target_lang': DeepL_language_codes.get(selected_source_lang_2)
216
+ }
217
+ response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data)
218
+
219
+ # Parse the JSON response from the server
220
+ translated_sentences = json.loads(response.text)
221
+ translated_sentences['translations'][0]['text'].split('\n')
222
+ df['translation'] = translated_sentences
223
  else:
224
  df['translation'] = df['text']
225
  print("translations done")