# NOTE: page-scrape residue, not program code -- "Spaces: Build error".
import gradio as gr
import git
import os
import gc
import torch
from datetime import datetime
from huggingface_hub import hf_hub_download
from pynvml import *

# Initialise NVML and keep a handle to GPU 0; the handle is used to log VRAM
# usage on every chat request.
nvmlInit()
gpu_h = nvmlDeviceGetHandleByIndex(0)

# Only the last `ctx_limit` prompt tokens are fed to the language model.
ctx_limit = 1024
title1 = "RWKV-4-Raven-7B-v9-Eng99%-Other1%-20230412-ctx8192"

# These switches have to be in the environment BEFORE `rwkv.model` is
# imported; setting them afterwards (as this file previously did) is too late
# for the library to pick them up.
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1'  # if '1' then use CUDA kernel for seq mode (much faster)

from rwkv.model import RWKV
from rwkv.utils import PIPELINE, PIPELINE_ARGS

# Download the checkpoint from the Hub (cached by huggingface_hub) and load it.
model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename=f"{title1}.pth")
# NOTE: the module-level name `model` is reassigned to the TTS network further
# down in this file. `pipeline` is constructed from this RWKV instance before
# that happens.
model = RWKV(model=model_path, strategy='cuda fp16i8 *8 -> cuda fp16')
pipeline = PIPELINE(model, "20B_tokenizer.json")
def _run_setup_command(command):
    """Run a start-up shell command and warn (without aborting) if it fails."""
    status = os.system(command)
    if status != 0:
        print(f"warning: setup command exited with status {status}: {command}")
    return status


# Fetch and install the TTS fork and the audio tooling at start-up. These are
# best-effort: a failure is reported but does not stop the script here.
_run_setup_command('git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS')
_run_setup_command('pip install -q -e TTS/')
_run_setup_command('pip install -q torchaudio==0.9.0')
_run_setup_command('pip install voicefixer --upgrade')

from voicefixer import VoiceFixer

voicefixer = VoiceFixer()

import sys

TTS_PATH = "TTS/"
# add libraries into environment
sys.path.append(TTS_PATH)  # set this if TTS is not installed globally

import string
import time
import argparse
import json
import numpy as np
import IPython
from IPython.display import Audio
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# Speech-enhancement model applied to the synthesized audio as a final step.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols

# The same import is attempted twice on purpose.
# NOTE(review): presumably the first attempt can fail transiently in some
# environments -- confirm before removing the retry.
try:
    from TTS.utils.audio import AudioProcessor
except Exception:
    from TTS.utils.audio import AudioProcessor

from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

OUT_PATH = 'out/'
# create output path
os.makedirs(OUT_PATH, exist_ok=True)

# model vars
MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
CONFIG_PATH = '/home/user/app/config.json'
TTS_LANGUAGES = "/home/user/app/language_ids.json"
TTS_SPEAKERS = "/home/user/app/speakers.json"
USE_CUDA = torch.cuda.is_available()

# load the config
C = load_config(CONFIG_PATH)
# load the audio processor
ap = AudioProcessor(**C.audio)

speaker_embedding = None
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

# NOTE: this rebinds the module-level name `model`, which was assigned earlier
# in the file; from here on `model` is the TTS network.
model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# remove speaker encoder
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
    if "speaker_encoder" in key:
        del model_weights[key]

model.load_state_dict(model_weights)
model.eval()
if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

_run_setup_command('pip install -q pydub ffmpeg-normalize')

CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

# Speaker encoder used to turn a reference clip into a d-vector.
SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
def compute_spec(ref_file):
    """Return the spectrogram of ``ref_file`` as a float tensor.

    The audio is loaded with librosa at the audio processor's sample rate,
    converted by ``ap.spectrogram`` and given an extra leading axis of size 1.
    """
    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)
def greet(Text, Voicetoclone, VoiceMicrophone):
    """Speak ``Text`` in the voice of a reference clip and return the result path.

    Args:
        Text: the text to synthesize.
        Voicetoclone: path of an uploaded reference clip, or ``None``.
        VoiceMicrophone: path of a microphone recording, used only when
            ``Voicetoclone`` is ``None``.

    Returns:
        ``"enhanced.wav"``, the path of the restored and enhanced audio file.

    Raises:
        gr.Error: when no reference clip is supplied, the clip is larger than
            30 MB, or the text is longer than 2000 characters.
    """
    import subprocess  # local import: not imported at module level in this file

    max_reference_bytes = 30 * 1024 * 1024
    max_text_chars = 2000

    text = str(Text)

    # The uploaded file wins; the microphone recording is the fallback.
    reference_file = Voicetoclone if Voicetoclone is not None else VoiceMicrophone
    if reference_file is None:
        raise gr.Error("Please upload or record a reference voice first.")
    reference_file = str(reference_file)
    print("path url")
    print(reference_file)

    # Measure the file on disk (the previous check measured the path string).
    if os.path.getsize(reference_file) > max_reference_bytes or len(text) > max_text_chars:
        message = "File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        print(message)
        # gr.Error is reported to the user; SystemExit is not meant for handlers.
        raise gr.Error(message)

    # Normalise the reference clip in place. The path is passed as a real
    # argument: the earlier `$sample` shell placeholder was never substituted.
    # Best-effort: on failure the un-normalised clip is used.
    try:
        normalised = subprocess.run(
            [
                "ffmpeg-normalize", reference_file,
                "-nt", "rms",
                "-t=-27",
                "-o", reference_file,
                "-ar", "16000",
                "-f",
            ],
            check=False,
        )
        if normalised.returncode != 0:
            print(f"warning: ffmpeg-normalize exited with status {normalised.returncode}")
    except OSError as err:
        print(f"warning: could not run ffmpeg-normalize: {err}")

    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_file)

    model.length_scale = 1  # scaler for the duration predictor. The larger it is, the slower the speech.
    model.inference_noise_scale = 0.3  # defines the noise variance applied to the random z vector at inference.
    model.inference_noise_scale_dp = 0.3  # defines the noise variance applied to the duration predictor z vector at inference.
    language_id = 0

    print(" > text: {}".format(text))
    # Only the first of the four returned values (the waveform) is used below.
    wav, alignment, _, _ = synthesis(
        model,
        text,
        C,
        "cuda" in str(next(model.parameters()).device),
        ap,
        speaker_id=None,
        d_vector=reference_emb,
        style_wav=None,
        language_id=language_id,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    print("Generated Audio")
    # NOTE(review): presumably a notebook leftover -- it only renders a player
    # inside an IPython front end.
    IPython.display.display(Audio(wav, rate=ap.sample_rate))

    file_name = "Audio.wav"
    out_path = os.path.join(OUT_PATH, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)

    # Post-process: restore with VoiceFixer, then enhance with the
    # spectral-mask enhancement model.
    voicefixer.restore(input=out_path,  # input wav file path
                       output="audio1.wav",  # output wav file path
                       cuda=True,  # whether to use gpu acceleration
                       mode=0)  # You can try out mode 0, 1, or 2 to find out the best result

    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
def generate_prompt(instruction, input=None):
    """Build the instruction-style prompt handed to the language model.

    When ``input`` is truthy the prompt carries an extra ``# Input:`` section;
    otherwise only the ``# Instruction:`` and ``# Response:`` sections appear.
    """
    # NOTE(review): the sections are separated by single newlines here; confirm
    # this matches the prompt format the model expects.
    if not input:
        return (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n"
            "# Instruction:\n"
            f"{instruction}\n"
            "# Response:\n"
        )
    return (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. "
        "Write a response that appropriately completes the request.\n"
        "# Instruction:\n"
        f"{instruction}\n"
        "# Input:\n"
        f"{input}\n"
        "# Response:\n"
    )
def evaluate(
    instruction,
    input=None,
    token_count=200,
    temperature=1.0,
    top_p=0.5,
    presence_penalty=0.4,
    count_penalty=0.4,
):
    """Stream the language model's answer to ``instruction``.

    This is a generator: it yields the stripped answer accumulated so far each
    time newly sampled tokens decode cleanly, and yields the final text once
    more after generation ends.

    Args:
        instruction: the user's request.
        input: optional extra context; ``None`` is treated as empty.
        token_count: maximum number of tokens to sample.
        temperature: sampling temperature (values below 0.2 are raised to 0.2).
        top_p: nucleus-sampling threshold.
        presence_penalty: flat logit penalty for any token already generated.
        count_penalty: additional logit penalty per previous occurrence.
    """
    args = PIPELINE_ARGS(
        temperature=max(0.2, float(temperature)),
        top_p=float(top_p),
        alpha_frequency=count_penalty,
        alpha_presence=presence_penalty,
        token_ban=[],  # ban the generation of some tokens
        token_stop=[0],  # stop generation whenever you see any token here
    )

    # Tolerate None for either field (``input`` defaults to None).
    instruction = (instruction or "").strip()
    input = (input or "").strip()
    ctx = generate_prompt(instruction, input)

    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
    print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')

    # The module-level name `model` is reassigned to the TTS network after the
    # pipeline is built, so reach the language model through the pipeline.
    language_model = pipeline.model

    all_tokens = []
    out_last = 0  # index of the first token not yet flushed into out_str
    out_str = ''
    occurrence = {}
    state = None
    token = None
    for i in range(int(token_count)):
        # First step feeds the (truncated) prompt; later steps feed one token.
        step_tokens = pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token]
        out, state = language_model.forward(step_tokens, state)

        # Repetition penalties on tokens that were already produced.
        for seen_token, seen_count in occurrence.items():
            out[seen_token] -= (args.alpha_presence + seen_count * args.alpha_frequency)

        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens.append(token)
        occurrence[token] = occurrence.get(token, 0) + 1

        # Hold text back while it still contains U+FFFD, i.e. the pending
        # tokens do not yet decode to complete characters.
        tmp = pipeline.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1

    gc.collect()
    torch.cuda.empty_cache()
    yield out_str.strip()
# Gradio UI: a chat panel (prompt -> streamed answer) and a voice panel that
# reads the answer aloud in a user-supplied voice.
# NOTE(review): the nesting of the layout containers below was reconstructed;
# confirm it against the intended layout.
block = gr.Blocks()
with block:
    with gr.Group():
        gr.Markdown(
            """ <center>🥳💬💕 - TalktoAI,随时随地,谈天说地!</center>
## <center>🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!</center>
### <center>注意❗:请不要输入或生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关,请自觉合法合规使用,违反者一切后果自负。</center>
### <center>Model by [Raven](https://huggingface.co/spaces/BlinkDL/Raven-RWKV-7B). Thanks to [PENG Bo](https://github.com/BlinkDL). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>
"""
        )
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                inp1 = gr.components.Textbox(lines=2, label="说些什么吧(中英皆可,英文对话效果更好)", value="Tell me a joke.")
                inp2 = gr.components.Textbox(lines=2, label="对话的背景信息(选填,请合理合规使用此程序)", placeholder="none")
                btn = gr.Button("开始对话吧")
                text = gr.Textbox(lines=5, label="Raven的回答")
            btn.click(evaluate, [inp1, inp2], [text])
        with gr.Box():
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # The model's answer doubles as the text to be spoken.
                inp3 = text
                inp4 = gr.Audio(source="upload", label = "请上传您喜欢的声音(wav/mp3文件, max. 30mb)", type="filepath")
                inp5 = gr.Audio(source="microphone", type="filepath", label = '请用麦克风上传您喜欢的声音,与文件上传二选一即可')
                btn1 = gr.Button("用喜欢的声音听一听吧")
                out1 = gr.Audio(label="合成的专属声音")
            btn1.click(greet, [inp3, inp4, inp5], [out1])
        gr.HTML('''
<div class="footer">
<p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
</p>
</div>
''')

# `evaluate` is a generator (it streams partial answers), so the request queue
# is enabled before launching.
block.queue()
block.launch(show_error=True)