src/modules/Speech_Recognition/Whisper.py
ADDED
@@ -0,0 +1,115 @@
"""Whisper Speech Recognition Module"""

import sys

import whisperx
from torch.cuda import OutOfMemoryError

from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted
from modules.Speech_Recognition.TranscribedData import TranscribedData


def transcribe_with_whisper(
    audio_path: str,
    model: str,
    device="cpu",
    model_name: str = None,
    batch_size: int = 16,
    compute_type: str = None,
    language: str = None,
) -> tuple[list[TranscribedData], str]:
    """Transcribe with whisper"""

    # Info: Regardless of the sampling rate of the original audio file, whisper
    # resamples the signal to 16 kHz (via ffmpeg), so standard 44.1 or 48 kHz input works.

    print(
        f"{ULTRASINGER_HEAD} Loading {blue_highlighted('whisper')} with model {blue_highlighted(model)} and {red_highlighted(device)} as worker"
    )
    if model_name is not None:
        print(f"{ULTRASINGER_HEAD} using alignment model {blue_highlighted(model_name)}")

    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"

    try:
        loaded_whisper_model = whisperx.load_model(
            model, language=language, device=device, compute_type=compute_type
        )
    except ValueError as value_error:
        if (
            "Requested float16 compute type, but the target device or backend do not support efficient float16 computation."
            in str(value_error.args[0])
        ):
            print(value_error)
            print(
                f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'"
            )
            sys.exit(1)

        raise value_error
    except OutOfMemoryError as oom_exception:
        print(oom_exception)
        print(
            f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu"
        )
        sys.exit(1)

    audio = whisperx.load_audio(audio_path)

    print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}")

    result = loaded_whisper_model.transcribe(
        audio, batch_size=batch_size, language=language
    )

    detected_language = result["language"]
    if language is None:
        language = detected_language

    # load alignment model and metadata
    try:
        model_a, metadata = whisperx.load_align_model(
            language_code=language, device=device, model_name=model_name
        )
    except ValueError as ve:
        print(
            f"{red_highlighted(f'{ve}')}"
            f"\n"
            f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. "
            f"Try adding it with --align_model [huggingface]."
        )
        sys.exit(1)

    # align whisper output
    result_aligned = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )

    transcribed_data = convert_to_transcribed_data(result_aligned)

    return transcribed_data, detected_language


def convert_to_transcribed_data(result_aligned):
    """Convert the aligned whisperx result into a list of TranscribedData"""
    transcribed_data = []
    for segment in result_aligned["segments"]:
        for obj in segment["words"]:
            vtd = TranscribedData(obj)  # create custom Word object
            vtd.word = vtd.word + " "  # add space to end of word
            if len(obj) < 4:
                # The word has no timestamp; place it right after the previous word
                if transcribed_data:
                    previous_end = transcribed_data[-1].end
                    previous_word = transcribed_data[-1].word
                else:
                    previous_end = 0
                    previous_word = ""
                vtd.start = previous_end + 0.1
                vtd.end = previous_end + 0.2
                msg = (
                    f'Error: There is no timestamp for word: "{obj["word"]}". '
                    f'Fixing it by placing it after the previous word: "{previous_word}". '
                    f"At start: {vtd.start} end: {vtd.end}. Fix it manually!"
                )
                print(f"{red_highlighted(msg)}")
            transcribed_data.append(vtd)  # and add it to list
    return transcribed_data
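For reference, a minimal usage sketch of this module; the audio path and model size are hypothetical examples, not taken from this diff, and it assumes the module is importable as modules.Speech_Recognition.Whisper:

from modules.Speech_Recognition.Whisper import transcribe_with_whisper

# Hypothetical invocation: "vocals.wav" and "large-v2" are example values.
transcribed_data, detected_language = transcribe_with_whisper(
    audio_path="vocals.wav",
    model="large-v2",
    device="cuda",  # with device="cpu" the compute type defaults to int8
    batch_size=16,
)
for word in transcribed_data:
    print(word.word, word.start, word.end)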
src/modules/Speech_Recognition/hyphenation.py
ADDED
@@ -0,0 +1,188 @@
"""Hyphenation module"""

import string

from hyphen import Hyphenator, dictools

from modules.console_colors import (
    ULTRASINGER_HEAD,
    blue_highlighted,
)

# PyHyphen retrieves dictionaries for download from 'https://cgit.freedesktop.org/libreoffice/dictionaries/plain/'
# Updated PyHyphen dictools languages so they can be installed
LANGUAGES = [
    "af_ZA",
    "an_ES",
    "ar",
    "be_BY",
    "bg_BG",
    "bn_BD",
    "bo",
    "br_FR",
    "bs_BA",
    "ca",
    "ckb",
    "cs_CZ",
    "da_DK",
    "de",
    "el_GR",
    "en",
    "eo",
    "es",
    "et_EE",
    "fa_IR",
    "fr_FR",
    "gd_GB",
    "gl",
    "gu_IN",
    "gug",
    "he_IL",
    "hi_IN",
    "hr_HR",
    "hu_HU",
    "id",
    "is",
    "it_IT",
    "kmr_Latn",
    "ko_KR",
    "lo_LA",
    "lt_LT",
    "lv_LV",
    "mn_MN",
    "ne_NP",
    "nl_NL",
    "no",
    "oc_FR",
    "pl_PL",
    "pt_BR",
    "pt_PT",
    "ro",
    "ru_RU",
    "si_LK",
    "sk_SK",
    "sl_SI",
    "sq_AL",
    "sr",
    "sv_SE",
    "sw_TZ",
    "te_IN",
    "th_TH",
    "tr_TR",
    "uk_UA",
    "vi",
    "zu_ZA",
]


def language_check(language="en") -> str | None:
    """Check if the language is supported and return its dictionary key"""

    lang_region = None
    installed = dictools.list_installed()
    installed_region_keys = [i for i in installed if i.startswith(language) and "_" in i]
    try:
        # Try to find an installed language with region prediction
        lang_region = next(i for i in installed_region_keys if i == f"{language}_{language.upper()}")
    except StopIteration:
        if installed_region_keys:
            # Take the first installed region language
            lang_region = installed_region_keys[0]
        else:
            # Take a downloadable language key
            downloadable_key = [i for i in LANGUAGES if i.startswith(language)]
            downloadable_folder_key = [i for i in downloadable_key if i == language]
            if downloadable_folder_key:
                lang_region = downloadable_folder_key[0]
            else:
                try:
                    # Try to find a downloadable language with region prediction
                    lang_region = next(i for i in downloadable_key if i == f"{language}_{language.upper()}")
                except StopIteration:
                    if downloadable_key:
                        # Take the first downloadable region language
                        lang_region = downloadable_key[0]

    if lang_region is None:
        return None

    print(
        f"{ULTRASINGER_HEAD} Hyphenate using language code: {blue_highlighted(lang_region)}"
    )
    return lang_region


def contains_punctuation(word: str) -> bool:
    """Check if the word contains punctuation"""

    return any(elem in word for elem in string.punctuation)


def clean_word(word: str):
    """Remove punctuation from the word, remembering what was removed and where"""
    cleaned_string = ""
    removed_indices = []
    removed_symbols = []
    for i, char in enumerate(word):
        if char not in string.punctuation and char not in " ":
            cleaned_string += char
        else:
            removed_indices.append(i)
            removed_symbols.append(char)
    return cleaned_string, removed_indices, removed_symbols


def insert_removed_symbols(separated_array, removed_indices, symbols):
    """Insert the removed symbols back into the syllables"""
    result = []
    symbol_index = 0
    i = 0

    # Add removed symbols to the syllables
    for syllable in separated_array:
        tmp = ""
        for char in syllable:
            if i in removed_indices:
                tmp += symbols[symbol_index]
                symbol_index += 1
                i += 1
            tmp += char
            i += 1
        result.append(tmp)

    # Add remaining symbols to the last syllable
    if symbol_index < len(symbols):
        tmp = result[-1]
        for i in range(symbol_index, len(symbols)):
            tmp += symbols[i]
        result[-1] = tmp

    return result


def create_hyphenator(lang_region: str) -> Hyphenator:
    """Create hyphenator"""
    hyphenator = Hyphenator(lang_region)
    return hyphenator


def hyphenation(word: str, hyphenator: Hyphenator) -> list[str] | None:
    """Hyphenate the word, or return None if it has a single syllable"""

    cleaned_string, removed_indices, removed_symbols = clean_word(word)

    # Hyphenating a word longer than 100 characters throws an exception
    if len(cleaned_string) > 100:
        return None

    syllables = hyphenator.syllables(cleaned_string)

    if len(syllables) > 1:
        hyphen = insert_removed_symbols(list(syllables), removed_indices, removed_symbols)
    else:
        hyphen = None

    return hyphen
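A short sketch of how these helpers fit together; the input word is an arbitrary example, and it assumes an English hyphenation dictionary is installed or downloadable via PyHyphen:

# Hypothetical example: resolve a dictionary key for "en" (e.g. "en_US"),
# then hyphenate a word; punctuation is stripped before syllabification
# and re-inserted afterwards.
lang_region = language_check("en")
if lang_region is not None:
    hyphenator = create_hyphenator(lang_region)
    print(hyphenation("wonderful,", hyphenator))  # e.g. ['won', 'der', 'ful,']
    print(hyphenation("sun", hyphenator))         # None for single-syllable words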
src/modules/Speech_Recognition/speech_recognition.py
ADDED
@@ -0,0 +1,105 @@
"""Speech recognition via the SpeechRecognition library (Google Web Speech API)"""

import os

import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence

from modules.console_colors import ULTRASINGER_HEAD

# todo: Code from here: https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python


def print_text(wav_file):
    """Transcribe a wav file and print the recognized text"""

    # English speech!
    recognizer = sr.Recognizer()

    # open the file
    with sr.AudioFile(wav_file) as source:
        # listen for the data (load audio to memory)
        audio_data = recognizer.record(source)
        # recognize (convert from speech to text)
        text = recognizer.recognize_google(audio_data)

    print(text)


def get_large_audio_transcription(wav_file):
    """
    Split the large audio file into chunks
    and apply speech recognition to each of them
    """
    # open the audio file using pydub
    sound = AudioSegment.from_wav(wav_file)

    # split the audio where silence is 500 milliseconds or longer and get the chunks
    chunks = split_on_silence(
        sound,
        # experiment with this value for your target audio file
        min_silence_len=500,
        # adjust this per requirement
        silence_thresh=sound.dBFS - 14,
        # keep 500 ms of silence at the chunk edges, adjustable as well
        keep_silence=500,
    )

    folder_name = "audio-chunks"
    # create a directory to store the audio chunks
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""

    recognizer = sr.Recognizer()

    # process each chunk
    for i, audio_chunk in enumerate(chunks, start=1):
        # export the audio chunk and save it in
        # the `folder_name` directory
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = recognizer.record(source)
            # try converting it to text
            try:
                text = recognizer.recognize_google(audio_listened)
            except sr.UnknownValueError as error:
                print("Error:", str(error))
            else:
                text = f"{text.capitalize()}. "
                print(chunk_filename, ":", text)
                whole_text += text
    # return the text for all chunks detected
    return whole_text


def transcribe_audio(audio_file):
    """Transcribe an audio file and return the text with start and end timestamps"""

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    try:
        transcript = recognizer.recognize_google(audio, show_all=True)
        start_time = transcript["result"][0]["alternative"][0]["words"][0][
            "startTime"
        ]
        end_time = transcript["result"][0]["alternative"][0]["words"][-1][
            "endTime"
        ]
        return (
            transcript["result"][0]["alternative"][0]["transcript"],
            start_time,
            end_time,
        )
    except sr.UnknownValueError:
        print(f"{ULTRASINGER_HEAD} Could not understand audio")
    except sr.RequestError as error:
        print(f"{ULTRASINGER_HEAD} Error with the recognition service; {error}")


class SpeechToText:
    """Speech-to-text wrapper (placeholder)"""
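For completeness, a sketch of calling these helpers; the wav path is a hypothetical placeholder, and recognize_google queries the Google Web Speech API, so an internet connection is required:

# Hypothetical example usage; "vocals.wav" is a placeholder path.
if __name__ == "__main__":
    print(get_large_audio_transcription("vocals.wav"))

    result = transcribe_audio("vocals.wav")
    if result is not None:  # transcribe_audio returns None on recognition errors
        text, start_time, end_time = result
        print(f"{text} ({start_time} - {end_time})")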