TIMBOVILL committed on
Commit
80d8416
1 Parent(s): 21bb7e7

Upload 3 files

Browse files
src/modules/Speech_Recognition/Whisper.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Whisper Speech Recognition Module"""
2
+
3
+ import sys
4
+
5
+ import whisperx
6
+ from torch.cuda import OutOfMemoryError
7
+
8
+ from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted
9
+ from modules.Speech_Recognition.TranscribedData import TranscribedData
10
+
11
+
12
def transcribe_with_whisper(
    audio_path: str,
    model: str,
    device="cpu",
    model_name: str = None,
    batch_size: int = 16,
    compute_type: str = None,
    language: str = None,
) -> tuple[list[TranscribedData], str]:
    """Transcribe an audio file with whisperx and word-align the result.

    Parameters:
        audio_path: path of the audio file to transcribe.
        model: whisper model size/name passed to whisperx.load_model.
        device: "cpu" or "cuda".
        model_name: optional alignment model (huggingface id).
        batch_size: whisperx transcription batch size.
        compute_type: ctranslate2 compute type; when None it defaults to
            "float16" on cuda and "int8" on cpu.
        language: language code; autodetected by whisper when None.

    Returns:
        A tuple of (word-level transcription data, detected language).

    Exits the process (sys.exit(1)) on unsupported float16 devices,
    GPU out-of-memory, or an unknown alignment language.
    """

    # Info: Regardless of the audio sampling rate used in the original audio file, whisper resample the audio signal to 16kHz (via ffmpeg). So the standard input from (44.1 or 48 kHz) should work.

    print(
        f"{ULTRASINGER_HEAD} Loading {blue_highlighted('whisper')} with model {blue_highlighted(model)} and {red_highlighted(device)} as worker"
    )
    if model_name is not None:
        print(f"{ULTRASINGER_HEAD} using alignment model {blue_highlighted(model_name)}")

    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"

    try:
        loaded_whisper_model = whisperx.load_model(
            model, language=language, device=device, compute_type=compute_type
        )
    except ValueError as value_error:
        if (
            "Requested float16 compute type, but the target device or backend do not support efficient float16 computation."
            in str(value_error.args[0])
        ):
            print(value_error)
            print(
                f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'"
            )
            sys.exit(1)

        # Any other ValueError is unexpected: re-raise with original traceback.
        raise
    except OutOfMemoryError as oom_exception:
        print(oom_exception)
        print(
            f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu"
        )
        sys.exit(1)

    audio = whisperx.load_audio(audio_path)

    print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}")

    result = loaded_whisper_model.transcribe(
        audio, batch_size=batch_size, language=language
    )

    detected_language = result["language"]
    if language is None:
        language = detected_language

    # load alignment model and metadata
    try:
        model_a, metadata = whisperx.load_align_model(
            language_code=language, device=device, model_name=model_name
        )
    except ValueError as ve:
        print(
            f"{red_highlighted(f'{ve}')}"
            f"\n"
            f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. "
            f"Try add it with --align_model [huggingface]."
        )
        sys.exit(1)

    # align whisper output to word-level timestamps
    result_aligned = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )

    transcribed_data = convert_to_transcribed_data(result_aligned)

    return transcribed_data, detected_language
96
+
97
+
98
def convert_to_transcribed_data(result_aligned):
    """Convert whisperx aligned output into a list of TranscribedData words.

    Words that whisperx failed to timestamp (the word dict has fewer than
    4 keys) are anchored 0.1 s after the previous word's end; a warning is
    printed so the user can fix the placement manually.
    """
    transcribed_data = []
    for segment in result_aligned["segments"]:
        for obj in segment["words"]:
            vtd = TranscribedData(obj)  # create custom Word object
            vtd.word = vtd.word + " "  # add space to end of word
            if len(obj) < 4:
                # No timestamp for this word — place it right after the
                # previous word. The original code indexed transcribed_data[-1]
                # unconditionally (IndexError when the very first word had no
                # timestamp) and contained dead assignments setting
                # previous.end to "" (which would then crash on `+ 0.1`).
                if transcribed_data:
                    previous_end = transcribed_data[-1].end
                    previous_word = transcribed_data[-1].word
                else:
                    previous_end = 0
                    previous_word = ""
                vtd.start = previous_end + 0.1
                vtd.end = previous_end + 0.2
                msg = f'Error: There is no timestamp for word: "{obj["word"]}". ' \
                      f'Fixing it by placing it after the previous word: "{previous_word}". At start: {vtd.start} end: {vtd.end}. Fix it manually!'
                print(f"{red_highlighted(msg)}")
            transcribed_data.append(vtd)  # and add it to list
    return transcribed_data
src/modules/Speech_Recognition/hyphenation.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hyphenation module"""
2
+
3
+ import string
4
+
5
+ from hyphen import Hyphenator, dictools
6
+
7
+ from modules.console_colors import (
8
+ ULTRASINGER_HEAD,
9
+ blue_highlighted,
10
+ )
11
+
12
# PyHyphen retrieves hyphenation dictionaries for download from
# 'https://cgit.freedesktop.org/libreoffice/dictionaries/plain/'.
# Updated copy of PyHyphen's dictools language keys so they can be installed.
LANGUAGES = [
    "af_ZA",
    "an_ES",
    "ar",
    "be_BY",
    "bg_BG",
    "bn_BD",
    "bo",
    "br_FR",
    "bs_BA",
    "ca",
    "ckb",
    "cs_CZ",
    "da_DK",
    "de",
    "el_GR",
    "en",
    "eo",
    "es",
    "et_EE",
    "fa_IR",
    "fr_FR",
    "gd_GB",
    "gl",
    "gu_IN",
    "gug",
    "he_IL",
    "hi_IN",
    "hr_HR",
    "hu_HU",
    "id",
    "is",
    "it_IT",
    "kmr_Latn",
    "ko_KR",
    "lo_LA",
    "lt_LT",
    "lv_LV",
    "mn_MN",
    "ne_NP",
    "nl_NL",
    "no",
    "oc_FR",
    "pl_PL",
    "pt_BR",
    "pt_PT",
    "ro",
    "ru_RU",
    "si_LK",
    "sk_SK",
    "sl_SI",
    "sq_AL",
    "sr",
    "sv_SE",
    "sw_TZ",
    "te_IN",
    "th_TH",
    "tr_TR",
    "uk_UA",
    "vi",
    "zu_ZA",
]
76
+
77
def language_check(language="en") -> str | None:
    """Resolve a hyphenation dictionary key for the given language code.

    Resolution order: an installed regional dictionary matching the
    "<lang>_<LANG>" guess, then any installed regional dictionary, then a
    downloadable key from LANGUAGES (exact match first, then the regional
    guess, then the first candidate). Returns the resolved key or None.
    """
    region_guess = f"{language}_{language.upper()}"
    installed_regional = [
        key
        for key in dictools.list_installed()
        if key.startswith(language) and "_" in key
    ]

    if region_guess in installed_regional:
        # Installed dictionary with the predicted region
        lang_region = region_guess
    elif installed_regional:
        # Fall back to the first installed regional dictionary
        lang_region = installed_regional[0]
    else:
        # Nothing installed — look at the downloadable keys
        downloadable = [key for key in LANGUAGES if key.startswith(language)]
        if language in downloadable:
            lang_region = downloadable[0]
        elif region_guess in downloadable:
            # Downloadable dictionary with the predicted region
            lang_region = region_guess
        elif downloadable:
            # Fall back to the first downloadable candidate
            lang_region = downloadable[0]
        else:
            lang_region = None

    if lang_region is None:
        return None

    print(
        f"{ULTRASINGER_HEAD} Hyphenate using language code: {blue_highlighted(lang_region)}"
    )
    return lang_region
112
+
113
+
114
def contains_punctuation(word: str) -> bool:
    """Return True when *word* contains at least one punctuation character."""

    for symbol in string.punctuation:
        if symbol in word:
            return True
    return False
118
+
119
+
120
def clean_word(word: str):
    """Strip punctuation and spaces from *word*.

    Returns a tuple of (cleaned word, indices of the removed characters
    in the original word, the removed characters themselves).
    """
    kept_chars = []
    removed_indices = []
    removed_symbols = []
    for index, char in enumerate(word):
        if char in string.punctuation or char == " ":
            removed_indices.append(index)
            removed_symbols.append(char)
        else:
            kept_chars.append(char)
    return "".join(kept_chars), removed_indices, removed_symbols
132
+
133
+
134
def insert_removed_symbols(separated_array, removed_indices, symbols):
    """Re-insert previously removed symbols into hyphenated syllables.

    *removed_indices* are positions in the original (uncleaned) word; each
    removed symbol is put back in front of the character that followed it.
    Symbols removed from the tail of the word are appended to the last
    syllable.
    """
    result = []
    symbol_index = 0
    position = 0  # running character index within the original word

    for syllable in separated_array:
        rebuilt = ""
        for char in syllable:
            # A removed symbol sat before this character in the original word
            if position in removed_indices:
                rebuilt += symbols[symbol_index]
                symbol_index += 1
                position += 1
            rebuilt += char
            position += 1
        result.append(rebuilt)

    # Symbols left over were removed from the end of the word
    if symbol_index < len(symbols):
        result[-1] += "".join(symbols[symbol_index:])

    return result
160
+
161
+
162
def create_hyphenator(lang_region: str) -> Hyphenator:
    """Build a PyHyphen Hyphenator for the given dictionary key."""
    return Hyphenator(lang_region)
166
+
167
+
168
def hyphenation(word: str, hyphenator: Hyphenator) -> list[str] | None:
    """Hyphenate *word* into syllables, keeping its punctuation.

    Punctuation and spaces are stripped before hyphenation and re-inserted
    into the resulting syllables afterwards.

    Returns the list of syllables, or None when the word cannot be split
    (single syllable, or cleaned word longer than 100 characters).
    """

    cleaned_string, removed_indices, removed_symbols = clean_word(word)

    # Hyphenation of a word longer than 100 characters throws an exception
    if len(cleaned_string) > 100:
        return None

    syllables = hyphenator.syllables(cleaned_string)

    if len(syllables) > 1:
        # list(...) replaces the original manual index-loop copy
        return insert_removed_symbols(list(syllables), removed_indices, removed_symbols)

    # A single syllable (or none) means the word is not hyphenatable
    return None
src/modules/Speech_Recognition/speech_recognition.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Docstring"""
2
+
3
+ import os
4
+
5
+ import speech_recognition as sr
6
+ from pydub import AudioSegment
7
+ from pydub.silence import split_on_silence
8
+
9
+ from modules.console_colors import ULTRASINGER_HEAD
10
+
11
+ # todo: Code from here: https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
12
+
13
+
14
def print_text(wav_file):
    """Recognize *wav_file* with the Google Web Speech API and print the text."""

    # English speech!
    recognizer = sr.Recognizer()

    # open the file
    with sr.AudioFile(wav_file) as audio_source:
        # load the whole recording into memory
        recorded = recognizer.record(audio_source)
        # convert from speech to text
        recognized = recognizer.recognize_google(recorded)

    print(recognized)
28
+
29
+
30
def get_large_audio_transcription(wav_file):
    """
    Split a large audio file into chunks on silence and run speech
    recognition on each chunk, returning the concatenated text.
    """
    # load the audio with pydub
    sound = AudioSegment.from_wav(wav_file)

    # cut wherever there is at least 500 ms of silence (relative to the
    # file's average loudness), keeping 500 ms of padding per chunk
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )

    # chunks are exported as wav files into this directory
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)

    whole_text = ""
    recognizer = sr.Recognizer()

    for index, chunk in enumerate(chunks, start=1):
        # export the chunk so the recognizer can read it back from disk
        chunk_filename = os.path.join(folder_name, f"chunk{index}.wav")
        chunk.export(chunk_filename, format="wav")

        with sr.AudioFile(chunk_filename) as source:
            chunk_audio = recognizer.record(source)
            try:
                recognized = recognizer.recognize_google(chunk_audio)
            except sr.UnknownValueError as error:
                # chunk could not be understood — skip it
                print("Error:", str(error))
            else:
                text = f"{recognized.capitalize()}. "
                print(chunk_filename, ":", text)
                whole_text += text

    # the text for all chunks detected
    return whole_text
77
+
78
+
79
def transcribe_audio(audio_file):
    """Transcribe *audio_file* with the Google Web Speech API.

    Returns a tuple (transcript, start_time, end_time) on success, or
    None (implicitly) when the audio could not be understood, the service
    failed, or the raw response carried no word timings.
    """

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
        try:
            # show_all=True returns the raw API response instead of a string
            transcript = recognizer.recognize_google(audio, show_all=True)
            # Dedupe the repeated ["result"][0]["alternative"][0] indexing.
            # NOTE(review): assumes the response includes word-level
            # "startTime"/"endTime" entries — verify against the API config.
            alternative = transcript["result"][0]["alternative"][0]
            start_time = alternative["words"][0]["startTime"]
            end_time = alternative["words"][-1]["endTime"]
            return (
                alternative["transcript"],
                start_time,
                end_time,
            )
        except sr.UnknownValueError:
            print(f"{ULTRASINGER_HEAD} Could not understand audio")
        except sr.RequestError as error:
            print(f"{ULTRASINGER_HEAD} Error with recognizing service; {error}")
        except (KeyError, IndexError, TypeError):
            # With show_all=True an empty recognition yields []/{} — the
            # original code crashed here instead of failing gracefully.
            print(f"{ULTRASINGER_HEAD} No word timings in recognition result")
102
+
103
+
104
class SpeechToText:
    """Speech-to-text placeholder.

    NOTE(review): no methods or attributes are defined here — confirm
    whether this class is still needed or should be removed.
    """