# deepfake_sirius / app.py
# NOTE(review): the HuggingFace Spaces page chrome that preceded this file
# ("raw / history / blame / No virus / 4.72 kB") was web-scrape residue,
# not source code, and has been converted to this comment header.
import gradio as gr
import os
import wave
import json
import requests
import urllib.request
import speech_recognition
from vosk_tts import Model, Synth
from scipy.io.wavfile import write
from pydub import AudioSegment
from pydub.playback import play
# --- Configuration ---------------------------------------------------------
# Directory of the local Vosk Russian TTS model (expected next to app.py).
PATH_TO_MODEL = os.path.join(os.getcwd(), "vosk-model-tts-ru-0.4-multi")
# Directory where captured input and synthesized output .wav files are written.
PATH_TO_OUTPUT = os.path.join(os.getcwd(), "content")

# SECURITY(review): these API keys were hard-coded in source control and
# should be rotated. They are now read from the environment, with the
# original literals kept as fallbacks for backward compatibility.
k = os.environ.get(
    "GOOEY_API_KEY", "sk-YOVNQzHmpga9My3dwlSo9BQN907TuPZQXcHn50ztigTwm3I2"
)
YA_GPT_KEY = os.environ.get(
    "YA_GPT_KEY", "AQVNyVqBi-XoJ1cAo7VIxq6ztgXm3owqowtso5Qb"
)

# System instruction for YandexGPT (Russian; answer in the persona of
# Pavel Volya, at most 10 sentences). Runtime string — kept byte-identical.
instruction = """
Ответь на запрос так, как ответил бы на него Павел Воля. Используй данные из биографии Павла Воли, если это потребуется. Отвечай на запрос в его стиле. Ответ должен содержать не болеее 10 предложений.
"""
#files = [
# ("input_face", open("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\scale_1200.jpg", "rb")), #TODO: IT
# ("input_audio", open("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\audio\\output.wav", "rb")), #TODO: IT
#]
# while True:
# # start recording speech, then print the recognized text,
# # deleting the recorded microphone audio afterwards
# voice_input = record_and_recognize_audio()
# os.remove("microphone-results.wav")
# print(voice_input)
# path_to_file = vg.generate(ask(voice_input))
# print(path_to_file)
# response = requests.post(
# "https://api.gooey.ai/v2/Lipsync/form/",
# headers={
# "Authorization": "Bearer " + k,
# },
# files=files,
# data={"json": json.dumps(payload)},
# )
# assert response.ok, response.content
# #song = AudioSegment.from_wav(path_to_file)
# result = response.json()
# print(response.status_code, result["output"]["output_video"])
# #play(song)
# urllib.request.urlretrieve(result["output"]["output_video"], "C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\video.mp4")
# os.startfile("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\video.mp4")
# break;
class VoiceGenerator:
    """Text-to-speech wrapper around the local Vosk Russian TTS model."""

    def __init__(self):
        # Loading the model is expensive; do it once per instance.
        self.model = Model(PATH_TO_MODEL)
        # Fix: the original built a fresh Synth on every generate() call;
        # the synthesizer is stateless per call, so construct it once.
        self._synth = Synth(self.model)

    def generate(self, text, file_name='output.wav'):
        """Synthesize `text` to PATH_TO_OUTPUT/`file_name`; return the path."""
        path = os.path.join(PATH_TO_OUTPUT, file_name)
        self._synth.synth(text, path)
        return path
def recognize_audio(file_path):
    """Transcribe a Russian-language wav file via Google Speech Recognition.

    Parameters:
        file_path: path to a wav file readable by speech_recognition.

    Returns:
        The lowercased transcript, or "" when recognition fails
        (unintelligible audio or an API/network error).
    """
    # Bug fix: `recognized_data` was previously unbound on either except
    # branch, so the final `return` raised NameError instead of degrading
    # gracefully. Initialize to an empty transcript up front.
    recognized_data = ""
    with speech_recognition.AudioFile(file_path) as source:
        audio = recognizer.record(source)
        try:
            recognized_data = recognizer.recognize_google(audio, language="ru").lower()
        except speech_recognition.UnknownValueError:
            # Speech was unintelligible; keep the empty transcript.
            pass
        except speech_recognition.RequestError:
            # API unreachable or quota exceeded; best-effort, keep "".
            pass
    return recognized_data
def ask_gpt(request):
    """Ask YandexGPT for an answer in the configured persona.

    Parameters:
        request: the user's question (Russian text from speech recognition).

    Returns:
        The text of the first generated alternative.

    Raises:
        requests.HTTPError: on a non-2xx API response (previously such a
            response surfaced as a confusing KeyError/JSON decode error).
    """
    result = requests.post(
        url='https://llm.api.cloud.yandex.net/llm/v1alpha/instruct',
        headers={
            "Authorization": "Api-Key " + YA_GPT_KEY,
        },
        json={
            "model": "general",
            "instruction_text": instruction,
            "request_text": request,
            "generation_options": {
                "max_tokens": 1500,
                "temperature": 0.5,
            },
        },
        timeout=60,  # don't hang the Gradio request forever on a stuck API
    )
    result.raise_for_status()
    data = result.json()  # equivalent to json.loads(result.text)
    return data['result']['alternatives'][0]['text']
# Module-level singletons created at import time: a shared speech-to-text
# recognizer and the TTS generator (loads the Vosk model from disk once).
recognizer = speech_recognition.Recognizer()
vg = VoiceGenerator()
def get_video(pathToWav, pathToImage):
    """Request a lip-synced video from the Gooey.ai Lipsync API.

    Parameters:
        pathToWav: path to the driving audio file.
        pathToImage: path to the face image to animate.

    Returns:
        The parsed JSON API response (previously the response was computed
        and then discarded, so the function always returned None).

    Raises:
        AssertionError: if the API responds with a non-OK status.
    """
    # Bug fix: the two file handles were never closed (resource leak);
    # `with` guarantees cleanup even if the request raises.
    with open(pathToImage, "rb") as face, open(pathToWav, "rb") as audio:
        files = [
            ("input_face", face),
            ("input_audio", audio),
        ]
        payload = {}  # no extra Lipsync options requested
        response = requests.post(
            "https://api.gooey.ai/v2/Lipsync/form/",
            headers={
                "Authorization": "Bearer " + k,
            },
            files=files,
            data={"json": json.dumps(payload)},
        )
    assert response.ok, response.content
    return response.json()
def result(audio):
    """Gradio handler: spoken question in, synthesized persona answer out.

    Parameters:
        audio: (sample_rate, data) tuple as produced by gr.Audio.

    Returns:
        Path to the generated answer wav file.
    """
    sample_rate, data = audio
    # Persist the microphone capture so the recognizer can read it from disk.
    file_path = os.path.join(PATH_TO_OUTPUT, 'voice_input.wav')
    write(file_path, sample_rate, data)
    # Pipeline: speech -> text -> LLM answer -> speech.
    text_from_audio = recognize_audio(file_path)
    generated_audio = vg.generate(ask_gpt(text_from_audio))
    # Removed dead code: `referenceName`/`pathToReference` were computed but
    # never used, and the commented-out debug prints added no value.
    return generated_audio
# Gradio UI: record a question from the microphone, play back the
# synthesized answer audio.
demo = gr.Interface(
    result,
    gr.Audio(sources=["microphone"]),
    "audio", # output component; switch to "playable_video" once get_video is wired in
)
demo.launch()