ajayarora1235 committed
Commit 29e1d72 · 1 parent: 87c0d35
fix env issues

Files changed:
- app.py (+30 -36)
- lib/voicecraft/inference_speech_editing_scale.py (+2 -2)
- lib/voicecraft/inference_tts_scale.py (+2 -2)
- requirements.txt (+1 -3)
app.py
CHANGED
@@ -6,9 +6,10 @@ from lib.voicecraft.data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
 )
-import
+import whisperx
 import os
 import time
+import gc
 
 from mega import Mega
 os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
@@ -1474,15 +1475,28 @@ def ilariaTTS(text, ttsvoice):
     aud_path = save_to_wav('./temp_ilaria.mp3')
     return aud_path, aud_path
 
-def transcribe_btn_click(
-        model_choice,
-        audio_choice,
-        transcribed_text):
+def transcribe_btn_click(audio_choice):
+    batch_size = 1  # Adjust based on your GPU memory availability
+    compute_type = "float16"
+
+    model = whisperx.load_model("large-v2", config.device, compute_type=compute_type)
+    pre_result = model.transcribe(audio_choice, batch_size=batch_size)
+
+    # Correctly handle the transcription result based on its structure
+    if 'segments' in pre_result:
+        result = " ".join([segment['text'] for segment in pre_result['segments']])
+    else:
+        result = pre_result.get('text', '')
+
+    print("Transcribe text: " + result)  # Directly print the result as it is now a string
+
+    # remove model to save VRAM
+    gc.collect(); torch.cuda.empty_cache(); del model
 
     # point to the original file or record the file
     # write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
     orig_audio = audio_choice
-    orig_transcript = result
+    orig_transcript = result
     # move the audio and transcript to temp folder
     temp_folder = "./demo/temp"
     os.makedirs(temp_folder, exist_ok=True)
@@ -1494,42 +1508,22 @@ def transcribe_btn_click(model_choice, audio_choice, transcribed_text):
     align_temp = f"{temp_folder}/mfa_alignments"
     os.makedirs(align_temp, exist_ok=True)
 
-    if os.path.exists(f"{align_temp}/{filename}.csv"):
-        pass
-        print("mfa.cvs file exists already")
-    else:
-        print(align_temp + " is None")
-        os.system(f"mfa align -j 1 --output_format csv --clean {temp_folder} english_us_arpa english_us_arpa {align_temp}")
-
-
-    # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue
-    # or try a larger model
-    # os.system(f"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
-    print("yes")
     global audio_fn
     audio_fn = f"{temp_folder}/{filename}.wav"
     global transcript_fn
     transcript_fn = f"{temp_folder}/{filename}.txt"
-    global align_fn
-    align_fn = f"{align_temp}/{filename}.csv"
-
-    df = pd.read_csv(align_fn)
-    # Select the first three columns
-    df = df.iloc[:, :3]
 
-    # Convert DataFrame to HTML
-    html = df.to_html(index=False)
 
-    return
+    return result
 
 
 def run(seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
-        temperature, kvcache, cutoff_value, target_transcript, silence_tokens):
+        temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
     # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
     cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
-    target_transcript = target_transcript
+    target_transcript = transcribed_text + target_transcript
     info = torchaudio.info(audio_fn)
     audio_dur = info.num_frames / info.sample_rate
 
@@ -1688,9 +1682,9 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
         with gr.Row():
             with gr.Column():
                 input_audio = gr.Audio(label="Input Audio", type="filepath")
-                transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
-                                                choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
-                                                info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
+                # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
+                #                                 choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
+                #                                 info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
                 transcribed_text = gr.Textbox(label="transcibed text + mfa",
                                               info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
                 transcribe_info_text = gr.TextArea(label="How to use",
@@ -1720,10 +1714,9 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                 cutoff_value = gr.Number(label="cutoff_time", interactive=True, step=0.01)
                 run_btn = gr.Button(value="run")
                 target_transcript = gr.Textbox(label="target transcript")
-                cvs_file_html = gr.HTML()
 
-                transcribe_btn.click(fn=transcribe_btn_click, inputs=[
-                                     outputs=[transcribed_text
+                transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
+                                     outputs=[transcribed_text])
 
                 run_btn.click(fn=run,
                               inputs=[
@@ -1740,7 +1733,8 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
                               kvcache,
                               cutoff_value,
                               target_transcript,
-                              silence_tokens
+                              silence_tokens,
+                              transcribed_text],
                               outputs=[
                               output_audio_con,
                               output_audio_gen
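Note: the new transcribe_btn_click above replaces the previous whisper-based transcription with whisperx. For reference, a minimal standalone sketch of the same flow, assuming only that whisperx is installed; config.device lives elsewhere in app.py, so a plain device string stands in for it here, and the sketch drops the model reference before collecting, since the committed order (gc.collect(); torch.cuda.empty_cache(); del model) only frees the weights on a later cycle:

import gc

import torch
import whisperx

def transcribe(audio_path: str, device: str = "cuda") -> str:
    # one sample per batch is the safest default for limited VRAM
    batch_size = 1
    compute_type = "float16" if device == "cuda" else "int8"

    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
    pre_result = model.transcribe(audio_path, batch_size=batch_size)

    # whisperx returns {"segments": [{"text": ...}, ...], "language": ...}
    if "segments" in pre_result:
        text = " ".join(seg["text"].strip() for seg in pre_result["segments"])
    else:
        text = pre_result.get("text", "")

    # release VRAM: delete the last reference first, then collect and empty the cache
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return text

The matching change in run() (target_transcript = transcribed_text + target_transcript) follows the VoiceCraft demo convention that the target transcript must begin with the prompt audio's transcript.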
lib/voicecraft/inference_speech_editing_scale.py
CHANGED
@@ -5,14 +5,14 @@ import numpy as np
 import torch
 import torchaudio
 
-from data.tokenizer import (
+from lib.voicecraft.data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
     tokenize_audio,
     tokenize_text
 )
 
-from models import voicecraft
+from lib.voicecraft.models import voicecraft
 import argparse, time, tqdm
 
 # this script only works for the musicgen architecture
lib/voicecraft/inference_tts_scale.py
CHANGED
@@ -5,14 +5,14 @@ import numpy as np
 import torch
 import torchaudio
 
-from data.tokenizer import (
+from lib.voicecraft.data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
     tokenize_audio,
     tokenize_text
 )
 
-from models import voicecraft
+from lib.voicecraft.models import voicecraft
 import argparse, time, tqdm
 
 
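Both inference scripts receive the same two-line fix: their imports were written relative to the lib/voicecraft directory (from data.tokenizer import ..., from models import voicecraft), which only resolves when Python starts inside that directory; qualifying them as lib.voicecraft.* lets app.py at the repo root import them directly. A quick check of the new layout, run from the repo root; it assumes the repo's dependencies are installed and that the intermediate directories are importable as packages (e.g. via __init__.py files or implicit namespace packages):

import importlib

for mod in (
    "lib.voicecraft.inference_speech_editing_scale",
    "lib.voicecraft.inference_tts_scale",
):
    importlib.import_module(mod)  # raises ModuleNotFoundError if the layout is wrong
print("import paths OK")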
requirements.txt
CHANGED
@@ -29,7 +29,5 @@ tensorboard==2.16.2
 phonemizer==3.2.1
 datasets==2.16.0
 torchmetrics==0.11.1
+whisperx @ git+https://github.com/m-bain/whisperx.git
 # install MFA for getting forced-alignment, this could take a few minutes
-montreal-forced-aligner==2.2.17
-openfst==1.8.2
-kaldi==5.5.1068
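The requirements change is likely the "env issues" of the commit title: montreal-forced-aligner 2.x and its openfst/kaldi companions are distributed through conda-forge rather than PyPI, so pip cannot resolve those three pins; dropping them (the new transcribe_btn_click no longer shells out to mfa align) and pulling whisperx as a PEP 508 direct reference from GitHub makes the file pip-installable again. A small post-install sanity check, under the assumption that MFA, if still wanted for forced alignment per the remaining comment, is now installed separately from conda-forge:

import importlib.util
import shutil

# pip half: after `pip install -r requirements.txt`, the git-sourced whisperx must resolve
assert importlib.util.find_spec("whisperx"), "whisperx missing; check the git+https requirement"
# conda half (optional): the mfa CLI is only needed if you still run forced alignment
assert shutil.which("mfa"), "mfa CLI not found; install montreal-forced-aligner from conda-forge"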