ajayarora1235 committed
Commit 29e1d72 · 1 parent: 87c0d35
fix env issues

Files changed:
- app.py (+30 -36)
- lib/voicecraft/inference_speech_editing_scale.py (+2 -2)
- lib/voicecraft/inference_tts_scale.py (+2 -2)
- requirements.txt (+1 -3)
app.py
CHANGED
@@ -6,9 +6,10 @@ from lib.voicecraft.data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
 )
-import
+import whisperx
 import os
 import time
+import gc
 
 from mega import Mega
 os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
@@ -1474,15 +1475,28 @@ def ilariaTTS(text, ttsvoice):
     aud_path = save_to_wav('./temp_ilaria.mp3')
     return aud_path, aud_path
 
-def transcribe_btn_click(
-        model_choice,
-        audio_choice,
-        transcribed_text):
+def transcribe_btn_click(audio_choice):
+    batch_size = 1  # Adjust based on your GPU memory availability
+    compute_type = "float16"
+
+    model = whisperx.load_model("large-v2", config.device, compute_type=compute_type)
+    pre_result = model.transcribe(audio_choice, batch_size=batch_size)
+
+    # Correctly handle the transcription result based on its structure
+    if 'segments' in pre_result:
+        result = " ".join([segment['text'] for segment in pre_result['segments']])
+    else:
+        result = pre_result.get('text', '')
+
+    print("Transcribe text: " + result)  # Directly print the result as it is now a string
+
+    # remove model to save VRAM
+    gc.collect(); torch.cuda.empty_cache(); del model
 
     # point to the original file or record the file
     # write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
     orig_audio = audio_choice
-    orig_transcript = result
+    orig_transcript = result
     # move the audio and transcript to temp folder
     temp_folder = "./demo/temp"
     os.makedirs(temp_folder, exist_ok=True)
@@ -1494,42 +1508,22 @@ def transcribe_btn_click(model_choice, audio_choice, transcribed_text):
     align_temp = f"{temp_folder}/mfa_alignments"
     os.makedirs(align_temp, exist_ok=True)
 
-    if os.path.exists(f"{align_temp}/{filename}.csv"):
-        pass
-        print("mfa.cvs file exists already")
-    else:
-        print(align_temp + " is None")
-        os.system(f"mfa align -j 1 --output_format csv --clean {temp_folder} english_us_arpa english_us_arpa {align_temp}")
-
-
-    # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue
-    # or try a larger model
-    # os.system(f"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
-    print("yes")
     global audio_fn
     audio_fn = f"{temp_folder}/{filename}.wav"
     global transcript_fn
     transcript_fn = f"{temp_folder}/{filename}.txt"
-    global align_fn
-    align_fn = f"{align_temp}/{filename}.csv"
-
-    df = pd.read_csv(align_fn)
-    # Select the first three columns
-    df = df.iloc[:, :3]
 
-    # Convert DataFrame to HTML
-    html = df.to_html(index=False)
 
-    return
+    return result
 
 
 def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
-        temperature, kvcache, cutoff_value, target_transcript, silence_tokens):
+        temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
-    target_transcript = target_transcript
+    target_transcript = transcribed_text + target_transcript
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
@@ -1688,9 +1682,9 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
         with gr.Row():
             with gr.Column():
                 input_audio = gr.Audio(label="Input Audio", type="filepath")
-                transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
-                                                choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
-                                                info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
+                # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
+                #                                 choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
+                #                                 info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
                 transcribed_text = gr.Textbox(label="transcibed text + mfa",
                                               info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
                 transcribe_info_text = gr.TextArea(label="How to use",
@@ -1720,10 +1714,9 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                 cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
                 run_btn = gr.Button(value="run")
                 target_transcript = gr.Textbox(label="target transcript")
-                cvs_file_html = gr.HTML()
 
-                transcribe_btn.click(fn=transcribe_btn_click, inputs=[
-                                     outputs=[transcribed_text
+                transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
+                                     outputs=[transcribed_text])
 
                 run_btn.click(fn=run,
                               inputs=[
@@ -1740,7 +1733,8 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                               kvcache,
                               cutoff_value,
                               target_transcript,
-                              silence_tokens
+                              silence_tokens,
+                              transcribed_text],
                               outputs=[
                               output_audio_con,
                               output_audio_gen
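Note: the new transcribe_btn_click above replaces the previous whisper-based transcription with whisperx. For reference, a minimal standalone sketch of the same flow, assuming only that whisperx is installed; config.device lives elsewhere in app.py, so a plain device string stands in for it here, and the sketch drops the model reference before collecting, since the committed order (gc.collect(); torch.cuda.empty_cache(); del model) only frees the weights on a later cycle:

import gc

import torch
import whisperx

def transcribe(audio_path: str, device: str = "cuda") -> str:
    # one sample per batch is the safest default for limited VRAM
    batch_size = 1
    compute_type = "float16" if device == "cuda" else "int8"

    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
    pre_result = model.transcribe(audio_path, batch_size=batch_size)

    # whisperx returns {"segments": [{"text": ...}, ...], "language": ...}
    if "segments" in pre_result:
        text = " ".join(seg["text"].strip() for seg in pre_result["segments"])
    else:
        text = pre_result.get("text", "")

    # release VRAM: delete the last reference first, then collect and empty the cache
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return text

The matching change in run() (target_transcript = transcribed_text + target_transcript) follows the VoiceCraft demo convention that the target transcript must begin with the prompt audio's transcript.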
lib/voicecraft/inference_speech_editing_scale.py
CHANGED
@@ -5,14 +5,14 @@ import numpy as np
 import torch
 import torchaudio
 
-from data.tokenizer import (
+from lib.voicecraft.data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
     tokenize_audio,
     tokenize_text
 )
 
-from models import voicecraft
+from lib.voicecraft.models import voicecraft
 import argparse, time, tqdm
 
 # this script only works for the musicgen architecture
lib/voicecraft/inference_tts_scale.py
CHANGED
@@ -5,14 +5,14 @@ import numpy as np
 import torch
 import torchaudio
 
-from data.tokenizer import (
+from lib.voicecraft.data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
     tokenize_audio,
     tokenize_text
 )
 
-from models import voicecraft
+from lib.voicecraft.models import voicecraft
 import argparse, time, tqdm
 
 
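Both inference scripts receive the same two-line fix: their imports were written relative to the lib/voicecraft directory (from data.tokenizer import ..., from models import voicecraft), which only resolves when Python starts inside that directory; qualifying them as lib.voicecraft.* lets app.py at the repo root import them directly. A quick check of the new layout, run from the repo root; it assumes the repo's dependencies are installed and that the intermediate directories are importable as packages (e.g. via __init__.py files or implicit namespace packages):

import importlib

for mod in (
    "lib.voicecraft.inference_speech_editing_scale",
    "lib.voicecraft.inference_tts_scale",
):
    importlib.import_module(mod)  # raises ModuleNotFoundError if the layout is wrong
print("import paths OK")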
requirements.txt
CHANGED
@@ -29,7 +29,5 @@ tensorboard==2.16.2
 phonemizer==3.2.1
 datasets==2.16.0
 torchmetrics==0.11.1
+whisperx @ git+https://github.com/m-bain/whisperx.git
 # install MFA for getting forced-alignment, this could take a few minutes
-montreal-forced-aligner==2.2.17
-openfst==1.8.2
-kaldi==5.5.1068
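The requirements change is likely the "env issues" of the commit title: montreal-forced-aligner 2.x and its openfst/kaldi companions are distributed through conda-forge rather than PyPI, so pip cannot resolve those three pins; dropping them (the new transcribe_btn_click no longer shells out to mfa align) and pulling whisperx as a PEP 508 direct reference from GitHub makes the file pip-installable again. A small post-install sanity check, under the assumption that MFA, if still wanted for forced alignment per the remaining comment, is now installed separately from conda-forge:

import importlib.util
import shutil

# pip half: after `pip install -r requirements.txt`, the git-sourced whisperx must resolve
assert importlib.util.find_spec("whisperx"), "whisperx missing; check the git+https requirement"
# conda half (optional): the mfa CLI is only needed if you still run forced alignment
assert shutil.which("mfa"), "mfa CLI not found; install montreal-forced-aligner from conda-forge"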