ajayarora1235 committed
Commit 29e1d72 · Parent(s): 87c0d35

fix env issues

app.py CHANGED

@@ -6,9 +6,10 @@ from lib.voicecraft.data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
)
-import whisper
+import whisperx
import os
import time
+import gc

from mega import Mega
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
@@ -1474,15 +1475,28 @@ def ilariaTTS(text, ttsvoice):
    aud_path = save_to_wav('./temp_ilaria.mp3')
    return aud_path, aud_path

-def transcribe_btn_click(model_choice, audio_choice, transcribed_text):
-    model = whisper.load_model(model_choice) # pass the value of model_choice to whisper.load_model()
-    result = model.transcribe(audio_choice) # pass the value of audio_choice to model.transcribe()
-    print("transcribe text: " + result["text"])
+def transcribe_btn_click(audio_choice):
+    batch_size = 1 # Adjust based on your GPU memory availability
+    compute_type = "float16"
+
+    model = whisperx.load_model("large-v2", config.device, compute_type=compute_type)
+    pre_result = model.transcribe(audio_choice, batch_size=batch_size)
+
+    # Correctly handle the transcription result based on its structure
+    if 'segments' in pre_result:
+        result = " ".join([segment['text'] for segment in pre_result['segments']])
+    else:
+        result = pre_result.get('text', '')
+
+    print("Transcribe text: " + result) # Directly print the result as it is now a string
+
+    # remove model to save VRAM
+    gc.collect(); torch.cuda.empty_cache(); del model

    # point to the original file or record the file
    # write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
    orig_audio = audio_choice
-    orig_transcript = result["text"]
+    orig_transcript = result
    # move the audio and transcript to temp folder
    temp_folder = "./demo/temp"
    os.makedirs(temp_folder, exist_ok=True)
@@ -1494,42 +1508,22 @@ def transcribe_btn_click(model_choice, audio_choice, transcribed_text):
    align_temp = f"{temp_folder}/mfa_alignments"
    os.makedirs(align_temp, exist_ok=True)

-    if os.path.exists(f"{align_temp}/{filename}.csv"):
-        pass
-        print("mfa.cvs file exists already")
-    else:
-        print(align_temp + " is None")
-        os.system(f"mfa align -j 1 --output_format csv --clean {temp_folder} english_us_arpa english_us_arpa {align_temp}")
-
-
-    # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue
-    # or try a larger model
-    # os.system(f"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
-    print("yes")
    global audio_fn
    audio_fn = f"{temp_folder}/{filename}.wav"
    global transcript_fn
    transcript_fn = f"{temp_folder}/{filename}.txt"
-    global align_fn
-    align_fn = f"{align_temp}/{filename}.csv"
-
-    df = pd.read_csv(align_fn)
-    # Select the first three columns
-    df = df.iloc[:, :3]

-    # Convert DataFrame to HTML
-    html = df.to_html(index=False)

-    return [result["text"], html]
+    return result


def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
-        temperature, kvcache, cutoff_value, target_transcript, silence_tokens):
+        temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
    cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
-    target_transcript = target_transcript
+    target_transcript = transcribed_text + target_transcript
    info = torchaudio.info(audio_fn)
    audio_dur = info.num_frames / info.sample_rate

@@ -1688,9 +1682,9 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(label="Input Audio", type="filepath")
-            transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
-                                            choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
-                                            info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
+            # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
+            #                                 choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
+            #                                 info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
            transcribed_text = gr.Textbox(label="transcibed text + mfa",
                                          info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
            transcribe_info_text = gr.TextArea(label="How to use",
@@ -1720,10 +1714,9 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
            cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
            run_btn = gr.Button(value="run")
            target_transcript = gr.Textbox(label="target transcript")
-            cvs_file_html = gr.HTML()

-    transcribe_btn.click(fn=transcribe_btn_click, inputs=[transcribe_btn_model, input_audio, transcribed_text],
-                         outputs=[transcribed_text, cvs_file_html])
+    transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
+                         outputs=[transcribed_text])

    run_btn.click(fn=run,
                  inputs=[
@@ -1740,7 +1733,8 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                      kvcache,
                      cutoff_value,
                      target_transcript,
-                      silence_tokens],
+                      silence_tokens,
+                      transcribed_text],
                  outputs=[
                      output_audio_con,
                      output_audio_gen
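
For reference, the whisperx call pattern that the new transcribe_btn_click depends on can be exercised on its own. A minimal sketch, assuming whisperx and torch are installed, a CUDA device is available, and "sample.wav" stands in for the Gradio-provided filepath (the app itself passes config.device, which is assumed to be defined elsewhere in app.py):

    # Standalone sketch of the whisperx flow used by transcribe_btn_click above.
    # "cuda" and "sample.wav" are placeholders, not values from this commit.
    import gc
    import torch
    import whisperx

    def transcribe_once(audio_path: str, device: str = "cuda") -> str:
        model = whisperx.load_model("large-v2", device, compute_type="float16")
        out = model.transcribe(audio_path, batch_size=1)
        # whisperx returns per-segment dicts; join them into one transcript string
        text = " ".join(seg["text"] for seg in out.get("segments", []))
        # release the model's VRAM before returning, mirroring the handler above
        del model
        gc.collect()
        torch.cuda.empty_cache()
        return text.strip()

    # Example: print(transcribe_once("sample.wav"))
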
lib/voicecraft/inference_speech_editing_scale.py CHANGED

@@ -5,14 +5,14 @@ import numpy as np
import torch
import torchaudio

-from data.tokenizer import (
+from lib.voicecraft.data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text
)

-from models import voicecraft
+from lib.voicecraft.models import voicecraft
import argparse, time, tqdm

# this script only works for the musicgen architecture
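
The same relative-to-absolute import fix is applied to inference_tts_scale.py below. One quick way to confirm that the new lib.voicecraft package paths resolve is to import both modules from the repository root (the directory that contains app.py); a minimal sketch, assuming the project's dependencies are already installed:

    # Run from the repo root so that `lib` is importable as a package.
    import importlib

    for name in ("lib.voicecraft.inference_speech_editing_scale",
                 "lib.voicecraft.inference_tts_scale"):
        importlib.import_module(name)  # raises ModuleNotFoundError if the path is wrong
    print("lib.voicecraft imports resolve")
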
lib/voicecraft/inference_tts_scale.py CHANGED

@@ -5,14 +5,14 @@ import numpy as np
import torch
import torchaudio

-from data.tokenizer import (
+from lib.voicecraft.data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text
)

-from models import voicecraft
+from lib.voicecraft.models import voicecraft
import argparse, time, tqdm

requirements.txt CHANGED

@@ -29,7 +29,5 @@ tensorboard==2.16.2
phonemizer==3.2.1
datasets==2.16.0
torchmetrics==0.11.1
+whisperx @ git+https://github.com/m-bain/whisperx.git
# install MFA for getting forced-alignment, this could take a few minutes
-montreal-forced-aligner==2.2.17
-openfst==1.8.2
-kaldi==5.5.1068
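
With this change, pip install -r requirements.txt pulls whisperx straight from its GitHub repository, while the Montreal Forced Aligner stack is no longer pip-installed; MFA is normally distributed through conda-forge, so installing it separately (for example via conda) is an assumption about the intended setup rather than something this commit states. A small Python check of the resulting environment:

    # Sanity-check the environment after installing requirements.txt.
    # Expecting the `mfa` CLI to come from a separate (e.g. conda-forge) install
    # is an assumption, not part of this commit.
    import importlib.util
    import shutil

    assert importlib.util.find_spec("whisperx") is not None, "whisperx is not installed"
    assert shutil.which("mfa") is not None, "mfa CLI not found; install MFA separately"
    print("whisperx importable and mfa CLI on PATH")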