Raven-with-Voice-Cloning

Runtime error

App Files Community

Kevin676 committed on Apr 9, 2023

Commit

1de6169

•

1 Parent(s): 81e7dbf

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -68

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ from pynvml import *
 nvmlInit()
 gpu_h = nvmlDeviceGetHandleByIndex(0)
 ctx_limit = 1024
 title1 = "RWKV-4-Raven-7B-v8-Eng-20230408-ctx4096"
 os.environ["RWKV_JIT_ON"] = '1'
@@ -17,24 +19,6 @@ model = RWKV(model=model_path, strategy='cuda fp16i8 *8 -> cuda fp16')
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "20B_tokenizer.json")
-from TTS.api import TTS
-tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
-import whisper
-model1 = whisper.load_model("small")
-os.system('pip install voicefixer --upgrade')
-from voicefixer import VoiceFixer
-voicefixer = VoiceFixer()
-import torchaudio
-from speechbrain.pretrained import SpectralMaskEnhancement
-enhance_model = SpectralMaskEnhancement.from_hparams(
-source="speechbrain/metricgan-plus-voicebank",
-savedir="pretrained_models/metricgan-plus-voicebank",
-run_opts={"device":"cuda"},
-)
 def generate_prompt(instruction, input=None):
     if input:
         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
@@ -52,17 +36,17 @@ def generate_prompt(instruction, input=None):
 """
 def evaluate(
-    upload,
-    audio,
 #    instruction,
-#    input=None,
-#    token_count=200,
-#    temperature=1.0,
-#    top_p=0.7,
-#    presencePenalty = 0.1,
-#    countPenalty = 0.1,
 ):
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
@@ -76,18 +60,15 @@ def evaluate(
     # decode the audio
     options = whisper.DecodingOptions()
     result = whisper.decode(model1, mel, options)
-    res = []
-    args = PIPELINE_ARGS(temperature = max(0.2, float(1)), top_p = float(0.5),
-                     alpha_frequency = 0.4,
-                     alpha_presence = 0.4,
                      token_ban = [], # ban the generation of some tokens
                      token_stop = [0]) # stop generation whenever you see any token here
-    instruction = result.text.strip()
-    input=None
-#    input = input.strip()
     ctx = generate_prompt(instruction, input)
     gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
@@ -98,7 +79,7 @@ def evaluate(
     out_str = ''
     occurrence = {}
     state = None
-    for i in range(int(150)):
         out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
         for n in occurrence:
             out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
@@ -119,46 +100,25 @@ def evaluate(
             out_last = i + 1
     gc.collect()
     torch.cuda.empty_cache()
-    res.append(out_str.strip())
-#    res1 = ''.join(str(x) for x in res)
-    tts.tts_to_file(res, speaker_wav = upload, language="en", file_path="output.wav")
-    voicefixer.restore(input="output.wav", # input wav file path
-                    output="audio1.wav", # output wav file path
-                    cuda=True, # whether to use gpu acceleration
-                    mode = 0) # You can try out mode 0, 1, or 2 to find out the best result
-    noisy = enhance_model.load_audio(
-    "audio1.wav"
-    ).unsqueeze(0)
-    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
-    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
-    return [result.text, res, "enhanced.wav"]
-#    yield out_str.strip()
 g = gr.Interface(
     fn=evaluate,
     inputs=[
-        gr.Audio(source="upload", label = "请上传您喜欢的声音(wav文件)", type="filepath"),
-        gr.Audio(source="microphone", label = "和您的专属AI聊天吧！", type="filepath"),
 #        gr.components.Textbox(lines=2, label="Instruction", value="Tell me about ravens."),
-#        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
-#        gr.components.Slider(minimum=10, maximum=200, step=10, value=150), # token_count
-#        gr.components.Slider(minimum=0.2, maximum=2.0, step=0.1, value=1.0), # temperature
-#        gr.components.Slider(minimum=0, maximum=1, step=0.05, value=0.5), # top_p
-#        gr.components.Slider(0.0, 1.0, step=0.1, value=0.4),  # presencePenalty
-#        gr.components.Slider(0.0, 1.0, step=0.1, value=0.4),  # countPenalty
     ],
     outputs=[
-        gr.Textbox(label="Speech to Text"),
-        gr.Textbox(label="Raven Output"),
-        gr.Audio(label="Audio with Custom Voice"),
     ],
     title="🥳💬💕 - TalktoAI，随时随地，谈天说地！",
     description="🤖 - 让有人文关怀的AI造福每一个人！AI向善，文明璀璨！TalktoAI - Enable the future！",

 nvmlInit()
 gpu_h = nvmlDeviceGetHandleByIndex(0)
 ctx_limit = 1024
+import whisper
+model1 = whisper.load_model("small")
 title1 = "RWKV-4-Raven-7B-v8-Eng-20230408-ctx4096"
 os.environ["RWKV_JIT_ON"] = '1'
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "20B_tokenizer.json")
 def generate_prompt(instruction, input=None):
     if input:
         return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 """
 def evaluate(
 #    instruction,
+    audio,
+    input=None,
+    token_count=200,
+    temperature=1.0,
+    top_p=0.7,
+    presencePenalty = 0.1,
+    countPenalty = 0.1,
 ):
+    # load audio and pad/trim it to fit 30 seconds
     audio = whisper.load_audio(audio)
     audio = whisper.pad_or_trim(audio)
     # decode the audio
     options = whisper.DecodingOptions()
     result = whisper.decode(model1, mel, options)
+    args = PIPELINE_ARGS(temperature = max(0.2, float(temperature)), top_p = float(top_p),
+                     alpha_frequency = countPenalty,
+                     alpha_presence = presencePenalty,
                      token_ban = [], # ban the generation of some tokens
                      token_stop = [0]) # stop generation whenever you see any token here
+    instruction = result.text
+    input = input.strip()
     ctx = generate_prompt(instruction, input)
     gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
     out_str = ''
     occurrence = {}
     state = None
+    for i in range(int(token_count)):
         out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:] if i == 0 else [token], state)
         for n in occurrence:
             out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
             out_last = i + 1
     gc.collect()
     torch.cuda.empty_cache()
+    yield out_str.strip()
 g = gr.Interface(
     fn=evaluate,
     inputs=[
 #        gr.components.Textbox(lines=2, label="Instruction", value="Tell me about ravens."),
+        gr.Audio(source="microphone", label = "请开始对话吧！", type="filepath"),
+        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
+        gr.components.Slider(minimum=10, maximum=200, step=10, value=150), # token_count
+        gr.components.Slider(minimum=0.2, maximum=2.0, step=0.1, value=1.0), # temperature
+        gr.components.Slider(minimum=0, maximum=1, step=0.05, value=0.5), # top_p
+        gr.components.Slider(0.0, 1.0, step=0.1, value=0.4),  # presencePenalty
+        gr.components.Slider(0.0, 1.0, step=0.1, value=0.4),  # countPenalty
     ],
     outputs=[
+        gr.inputs.Textbox(
+            lines=5,
+            label="Output",
+        )
     ],
     title="🥳💬💕 - TalktoAI，随时随地，谈天说地！",
     description="🤖 - 让有人文关怀的AI造福每一个人！AI向善，文明璀璨！TalktoAI - Enable the future！",