Spaces:
Runtime error
Runtime error
File size: 4,722 Bytes
3824a4a 9617099 bb7c6aa e982bb1 65fbe83 e982bb1 5e62e0a 65fbe83 9617099 3824a4a 74b879d b3056e9 4d66286 bb900d4 74b879d bb900d4 4d66286 1cd7901 4d66286 1cd7901 4d66286 74b879d 1cd7901 4d66286 1cd7901 4d66286 1cd7901 4d66286 bb900d4 4d66286 d0825e8 08a9183 caf94e8 1cd7901 335c461 1cd7901 74b879d 08a9183 1cd7901 8583fc2 335c461 8583fc2 44d5308 6c15fe9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import gradio as gr
import os
import wave
import json
import requests
import urllib.request
import speech_recognition
from vosk_tts import Model, Synth
from scipy.io.wavfile import write
from pydub import AudioSegment
from pydub.playback import play
# --- Configuration -----------------------------------------------------------
# Local Vosk TTS model directory and output directory for generated audio.
PATH_TO_MODEL = os.path.join(os.getcwd(), "vosk-model-tts-ru-0.4-multi")
PATH_TO_OUTPUT = os.path.join(os.getcwd(), "content")

# SECURITY: these API keys were committed in plain text. They are now read
# from the environment, falling back to the original values so existing
# deployments keep working — but the leaked keys should be rotated.
k = os.environ.get(
    "GOOEY_API_KEY", "sk-YOVNQzHmpga9My3dwlSo9BQN907TuPZQXcHn50ztigTwm3I2"
)
YA_GPT_KEY = os.environ.get(
    "YA_GPT_KEY", "AQVNyVqBi-XoJ1cAo7VIxq6ztgXm3owqowtso5Qb"
)

# YandexGPT system prompt (Russian): "answer as Pavel Volya would, in his
# style, using his biography if needed; at most 10 sentences."
# Runtime string — content intentionally left in Russian.
instruction = """
Ответь на запрос так, как ответил бы на него Павел Воля. Используй данные из биографии Павла Воли, если это потребуется. Отвечай на запрос в его стиле. Ответ должен содержать не болеее 10 предложений.
"""
#files = [
# ("input_face", open("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\scale_1200.jpg", "rb")), #TODO: IT
# ("input_audio", open("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\audio\\output.wav", "rb")), #TODO: IT
#]
# while True:
# # Start recording speech, then print the recognized text
# # and delete the audio captured from the microphone.
# voice_input = record_and_recognize_audio()
# os.remove("microphone-results.wav")
# print(voice_input)
# path_to_file = vg.generate(ask(voice_input))
# print(path_to_file)
# response = requests.post(
# "https://api.gooey.ai/v2/Lipsync/form/",
# headers={
# "Authorization": "Bearer " + k,
# },
# files=files,
# data={"json": json.dumps(payload)},
# )
# assert response.ok, response.content
# #song = AudioSegment.from_wav(path_to_file)
# result = response.json()
# print(response.status_code, result["output"]["output_video"])
# #play(song)
# urllib.request.urlretrieve(result["output"]["output_video"], "C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\video.mp4")
# os.startfile("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\video.mp4")
# break;
class VoiceGenerator:
    """Text-to-speech wrapper around the local Vosk TTS model.

    The model and the synthesizer are loaded once at construction time;
    ``generate`` can then be called repeatedly without reloading anything.
    """

    def __init__(self):
        # Loading the model from disk is expensive — do it exactly once.
        self.model = Model(PATH_TO_MODEL)
        # The original rebuilt a Synth on every generate() call; reuse one.
        self.synth = Synth(self.model)

    def generate(self, text, file_name='output.wav'):
        """Synthesize ``text`` to a WAV file and return the file's path.

        Args:
            text: text to speak (Russian, per the bundled model).
            file_name: output file name inside ``PATH_TO_OUTPUT``.

        Returns:
            Absolute path of the written WAV file.
        """
        path = os.path.join(PATH_TO_OUTPUT, file_name)
        self.synth.synth(text, path)
        return path
def recognize_audio(file_path):
    """Transcribe a WAV file via Google STT; return "" when it fails.

    Bug fix: the original left ``recognized_data`` unbound when either
    exception fired (both handlers just ``pass``), so the final ``return``
    raised UnboundLocalError. Initializing it makes the best-effort
    contract (empty string on failure) actually hold.

    Args:
        file_path: path to a WAV file readable by speech_recognition.

    Returns:
        Lower-cased recognized text, or "" if recognition failed.
    """
    recognized_data = ""
    with speech_recognition.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    try:
        recognized_data = recognizer.recognize_google(audio, language="ru").lower()
    except speech_recognition.UnknownValueError:
        # Speech was unintelligible — keep the empty best-effort result.
        pass
    except speech_recognition.RequestError:
        # Google API unreachable or quota exceeded — same best-effort result.
        pass
    return recognized_data
def ask_gpt(request):
    """Query YandexGPT (instruct endpoint) and return the reply text.

    Fixes over the original: a request timeout (the call could hang
    forever), an explicit HTTP status check (an API error previously
    surfaced as an opaque KeyError), and ``result.json()`` instead of
    ``json.loads(result.text)``.

    Args:
        request: the user's prompt text.

    Returns:
        The first generated alternative's text.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the API does not answer within 60 seconds.
    """
    result = requests.post(
        url='https://llm.api.cloud.yandex.net/llm/v1alpha/instruct',
        headers={
            "Authorization": "Api-Key " + YA_GPT_KEY,
        },
        json={
            "model": "general",
            "instruction_text": instruction,
            "request_text": request,
            "generation_options": {
                "max_tokens": 1500,
                "temperature": 0.5
            }
        },
        timeout=60,
    )
    result.raise_for_status()
    data = result.json()
    return data['result']['alternatives'][0]['text']
# Shared speech-to-text recognizer used by recognize_audio().
recognizer = speech_recognition.Recognizer()
# NOTE: constructing VoiceGenerator loads the Vosk model from disk here,
# at import time — startup is slow by design so per-request calls are fast.
vg = VoiceGenerator()
def get_video(pathToWav, pathToImage):
    """Submit audio + face image to the gooey.ai Lipsync API.

    Fixes over the original: the two file handles are now closed via
    context managers (they leaked), the ``assert`` is replaced with an
    explicit raise (asserts are stripped under ``-O``), and the parsed
    response is returned instead of being discarded.

    Args:
        pathToWav: path to the driving audio file.
        pathToImage: path to the face image.

    Returns:
        The parsed JSON response (the video URL is under
        ``result["output"]["output_video"]`` per the commented prototype
        above — TODO confirm against the API docs).

    Raises:
        RuntimeError: if the API responds with a non-OK status.
    """
    with open(pathToImage, "rb") as face, open(pathToWav, "rb") as audio:
        response = requests.post(
            "https://api.gooey.ai/v2/Lipsync/form/",
            headers={
                "Authorization": "Bearer " + k,
            },
            files=[
                ("input_face", face),
                ("input_audio", audio),
            ],
            data={"json": json.dumps({})},
            timeout=300,  # lipsync rendering can take a while
        )
    if not response.ok:
        raise RuntimeError(response.content)
    return response.json()
def result(audio):
    """Gradio callback: microphone audio in, synthesized voice answer out.

    Pipeline: save the recording → speech-to-text → YandexGPT reply →
    local TTS of the reply. The dead locals of the original
    (``referenceName``/``pathToReference``, computed and never used) and
    the commented-out debris are removed.

    Args:
        audio: ``(sample_rate, data)`` tuple from ``gr.Audio`` (data is a
            numpy array — format supplied by Gradio).

    Returns:
        Path to the generated WAV file with the spoken answer.
    """
    sample_rate, data = audio
    # Persist the capture so the recognizer can read it back from disk.
    file_path = os.path.join(PATH_TO_OUTPUT, 'voice_input.wav')
    write(file_path, sample_rate, data)
    text_from_audio = recognize_audio(file_path)
    # Ask the LLM, then voice its reply with the local TTS model.
    return vg.generate(ask_gpt(text_from_audio))
# Gradio UI: record from the microphone, play back the generated answer.
# (The stray trailing token after launch() in the original was removed.)
demo = gr.Interface(
    result,
    gr.Audio(sources=["microphone"]),
    "audio",  # output component; switch to playable video once lipsync is wired in
)

demo.launch()