# deepfake_sirius / app.py
# Author: ledddev — "Update app.py" (commit 4e6667e)
import gradio as gr
import os
import wave
import json
import requests
import urllib.request
import speech_recognition
from vosk_tts import Model, Synth
from scipy.io.wavfile import write
from pydub import AudioSegment
from pydub.playback import play
# --- Configuration -------------------------------------------------------
# Local Vosk TTS model directory and the folder used for every generated
# or intermediate media file (audio, reference images, rendered video).
PATH_TO_MODEL = os.path.join(os.getcwd(), "vosk-model-tts-ru-0.4-multi")
PATH_TO_OUTPUT = os.path.join(os.getcwd(), "content")

# SECURITY: credentials must come from the environment, not source control.
# The hard-coded fallbacks preserve the previous behavior, but the lipsync
# key that was committed here must be treated as leaked and rotated.
LIPSYNC_KEY = os.environ.get("LIPSYNC_KEY", "sk-YOVNQzHmpga9My3dwlSo9BQN907TuPZQXcHn50ztigTwm3I2")
YA_GPT_KEY = os.environ.get("YA_GPT_KEY", "NONE")
IAM_TOKEN = os.environ.get("IAM_TOKEN", "NONE")
X_FOLDER_ID = os.environ.get("X_FOLDER_ID", "NONE")

# System prompt (Russian) for YandexGPT: answer in the persona of Pavel
# Volya, at most ten sentences, all numbers spelled out as words.
instruction = """
Ответь на запрос так, как ответил бы на него Павел Воля. Используй данные из биографии Павла Воли, если это потребуется. Отвечай на запрос в его стиле. Ответ должен содержать не болеее 10 предложений. Все цифры пиши словами.
"""
class VoiceGenerator:
    """Text-to-speech wrapper around the local Vosk TTS model."""

    def __init__(self):
        # Loading the model is expensive; do it once, at construction time.
        self.model = Model(PATH_TO_MODEL)

    def generate(self, text, file_name='output.wav'):
        """Synthesize *text* into a WAV file under PATH_TO_OUTPUT.

        Returns the full path of the written file.
        """
        destination = os.path.join(PATH_TO_OUTPUT, file_name)
        Synth(self.model).synth(text, destination)
        return destination
def recognize_audio(file_path):
    """Transcribe a Russian-language WAV file with Google Speech Recognition.

    Returns the lower-cased transcript on success; on failure returns the
    Russian error message with the exception class name appended.
    """
    with speech_recognition.AudioFile(file_path) as source:
        captured = recognizer.record(source)

    transcript = "Ошибка распознавания речи"
    try:
        transcript = recognizer.recognize_google(captured, language="ru").lower()
    except speech_recognition.UnknownValueError:
        transcript += ' UnknownValueError'
    except speech_recognition.RequestError:
        transcript += ' RequestError'
    print(transcript)
    return transcript
def ask_gpt(request):
    """Send *request* to the YandexGPT instruct endpoint and return the reply.

    Returns the model's first alternative text, or a canned Russian apology
    when the service is unreachable, times out, or returns an unexpected
    payload (the original code crashed on any network failure).
    """
    fallback = 'Я пока не могу ответить на ваш вопрос. Все мои мозги на сервере, а он не отвечает.'
    try:
        result = requests.post(
            url='https://llm.api.cloud.yandex.net/llm/v1alpha/instruct',
            headers={'Authorization': f'Bearer {IAM_TOKEN}', 'x-folder-id': X_FOLDER_ID},
            json={
                "model": "general",
                "instruction_text": instruction,
                "request_text": request,
                "generation_options": {
                    "max_tokens": 1500,
                    "temperature": 0.5
                }
            },
            timeout=60,  # don't let a stuck request hang the UI forever
        )
        data = result.json()  # same as json.loads(result.text), built in
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: degrade gracefully, as the
        # 'result' missing branch below already does for API-level errors.
        return fallback
    print(data)
    if 'result' in data:
        return data['result']['alternatives'][0]['text']
    return fallback
# Shared, module-level instances: one speech recognizer and one TTS wrapper.
# VoiceGenerator() loads the Vosk model from disk, so it is built once here.
recognizer = speech_recognition.Recognizer()
vg = VoiceGenerator()
def get_video(pathToWav, pathToImage, pathToResult):
    """Lip-sync the face in *pathToImage* to the audio in *pathToWav*.

    Calls the Gooey.ai Lipsync API, then downloads the rendered video to
    *pathToResult*. Raises RuntimeError when the API responds with an error.
    """
    # Open both inputs in a `with` block so the handles are always closed
    # (the previous version leaked both file objects).
    with open(pathToImage, "rb") as face, open(pathToWav, "rb") as audio:
        response = requests.post(
            "https://api.gooey.ai/v2/Lipsync/form/",
            headers={
                "Authorization": "Bearer " + LIPSYNC_KEY,
            },
            files=[
                ("input_face", face),
                ("input_audio", audio),
            ],
            data={"json": json.dumps({})},
            timeout=600,  # rendering can legitimately take several minutes
        )
    # Explicit exception instead of `assert`: asserts vanish under `python -O`.
    if not response.ok:
        raise RuntimeError(response.content)
    result = response.json()
    urllib.request.urlretrieve(result["output"]["output_video"], pathToResult)
def resultSay(text):
    """Voice *text* with the TTS model, lip-sync the stock reference photo,
    and return the path of the rendered video."""
    audio_path = vg.generate(text)
    reference_path = os.path.join(PATH_TO_OUTPUT, 'reference.jpg')
    video_path = os.path.join(PATH_TO_OUTPUT, 'video.mp4')
    get_video(audio_path, reference_path, video_path)
    return video_path
def resultText(text):
    """Answer *text* with GPT, then render the answer as a talking-head video."""
    answer = ask_gpt(text)
    return resultSay(answer)
def resultAudio(audio):
    """Handle a microphone recording end to end.

    *audio* is the Gradio tuple ``(sample_rate, numpy_samples)``; it is saved
    to disk, transcribed, answered by GPT, and rendered as a video.
    """
    rate, samples = audio
    wav_path = os.path.join(PATH_TO_OUTPUT, 'voice_input.wav')
    write(wav_path, rate, samples)
    transcript = recognize_audio(wav_path)
    return resultText(transcript)
def resultOnlyAudio(text):
    """Synthesize *text* to speech only (no video); return the WAV path."""
    audio_path = vg.generate(text)
    return audio_path
def resultCustom(text, photo):
    """Voice *text* and lip-sync it onto *photo* (a PIL image), or onto the
    stock reference picture when no photo was uploaded.

    Returns the path of the rendered video.
    """
    print(text, photo)
    generated_audio = vg.generate(text)
    referenceName = 'reference.jpg'
    if photo is not None:  # identity check is the idiomatic None test
        referenceName = 'custom.jpg'
        photo.save(os.path.join(PATH_TO_OUTPUT, referenceName))
    pathToReference = os.path.join(PATH_TO_OUTPUT, referenceName)
    pathToResult = os.path.join(PATH_TO_OUTPUT, 'video.mp4')
    get_video(generated_audio, pathToReference, pathToResult)
    return pathToResult
# --- Gradio UI -----------------------------------------------------------
# Four tabs, each wiring one handler to its input/output widgets.
demoMic = gr.Interface(
    fn=resultAudio,
    inputs=gr.Audio(sources=["microphone"]),
    outputs="playable_video",
)
demoText = gr.Interface(
    fn=resultText,
    inputs=gr.Textbox(label="Query"),
    outputs="playable_video",
)
demoSay = gr.Interface(
    fn=resultCustom,
    inputs=[gr.Textbox(label="Query"), gr.Image(type="pil")],
    outputs="playable_video",
)
demoOnly = gr.Interface(
    fn=resultOnlyAudio,
    inputs=gr.Textbox(label="Query"),
    outputs="audio",
)
demo = gr.TabbedInterface(
    [demoMic, demoText, demoSay, demoOnly],
    ["Ask by micro", "Ask by text", "Generate video", "Say this text"],
)
demo.launch()