"""Gradio demo: record Russian speech, answer via YandexGPT in the persona of
Pavel Volya, and synthesize the reply with Vosk TTS.

Pipeline (see ``result``): microphone audio -> Google speech recognition ->
YandexGPT instruct endpoint -> Vosk TTS wav, returned to the Gradio UI.
``get_video`` is an (currently unused by the UI) helper that lipsyncs the
generated audio onto a face image via the Gooey.ai API.
"""
import gradio as gr
import os
import wave
import json
import requests
import urllib.request
import speech_recognition
from vosk_tts import Model, Synth
from scipy.io.wavfile import write
from pydub import AudioSegment
from pydub.playback import play

# Local paths: the Vosk TTS model directory and the directory for generated wavs.
PATH_TO_MODEL = os.path.join(os.getcwd(), "vosk-model-tts-ru-0.4-multi")
PATH_TO_OUTPUT = os.path.join(os.getcwd(), "content")

# SECURITY: API keys must never be hardcoded in source. They are now read from
# the environment; set GOOEY_API_KEY and YA_GPT_KEY before running.
k = os.environ.get("GOOEY_API_KEY", "")          # Gooey.ai lipsync API key
YA_GPT_KEY = os.environ.get("YA_GPT_KEY", "")    # YandexGPT API key

# System prompt sent to YandexGPT (kept in Russian — it is runtime data).
instruction = """
Ответь на запрос так, как ответил бы на него Павел Воля.
Используй данные из биографии Павла Воли, если это потребуется.
Отвечай на запрос в его стиле.
Ответ должен содержать не болеее 10 предложений.
"""


class VoiceGenerator:
    """Thin wrapper around Vosk TTS that writes synthesized speech to disk."""

    def __init__(self):
        # Loading the model is expensive; do it once. The synthesizer is
        # likewise reusable, so build it here instead of per-call.
        self.model = Model(PATH_TO_MODEL)
        self._synth = Synth(self.model)

    def generate(self, text, file_name='output.wav'):
        """Synthesize *text* to PATH_TO_OUTPUT/*file_name* and return the path."""
        path = os.path.join(PATH_TO_OUTPUT, file_name)
        self._synth.synth(text, path)
        return path


def recognize_audio(file_path):
    """Transcribe a wav file to lowercase Russian text via Google ASR.

    Returns an empty string on recognition or network failure (best-effort);
    the original code raised UnboundLocalError in those branches.
    """
    recognized_data = ""
    with speech_recognition.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    try:
        recognized_data = recognizer.recognize_google(audio, language="ru").lower()
    except speech_recognition.UnknownValueError:
        pass  # speech was unintelligible — fall through with ""
    except speech_recognition.RequestError:
        pass  # ASR service unreachable — fall through with ""
    return recognized_data


def ask_gpt(request):
    """Send *request* to the YandexGPT instruct endpoint and return the reply text."""
    result = requests.post(
        url='https://llm.api.cloud.yandex.net/llm/v1alpha/instruct',
        headers={
            "Authorization": "Api-Key " + YA_GPT_KEY,
        },
        json={
            "model": "general",
            "instruction_text": instruction,
            "request_text": request,
            "generation_options": {
                "max_tokens": 1500,
                "temperature": 0.5,
            },
        },
    )
    data = json.loads(result.text)
    return data['result']['alternatives'][0]['text']


# Module-level singletons shared by the request handlers.
recognizer = speech_recognition.Recognizer()
vg = VoiceGenerator()


def get_video(pathToWav, pathToImage):
    """Lipsync *pathToImage* to *pathToWav* via the Gooey.ai API.

    Returns the parsed JSON response (the original leaked both file handles
    and discarded the response).
    """
    payload = {}
    # Context managers guarantee the uploaded files are closed even on error.
    with open(pathToImage, "rb") as face, open(pathToWav, "rb") as audio:
        response = requests.post(
            "https://api.gooey.ai/v2/Lipsync/form/",
            headers={
                "Authorization": "Bearer " + k,
            },
            files=[
                ("input_face", face),
                ("input_audio", audio),
            ],
            data={"json": json.dumps(payload)},
        )
    assert response.ok, response.content
    return response.json()


def result(audio):
    """Gradio handler: (sample_rate, ndarray) in -> path of the reply wav out."""
    sample_rate, data = audio
    # Persist the microphone capture so the ASR library can read it from disk.
    file_path = os.path.join(PATH_TO_OUTPUT, 'voice_input.wav')
    write(file_path, sample_rate, data)
    text_from_audio = recognize_audio(file_path)
    return vg.generate(ask_gpt(text_from_audio))


demo = gr.Interface(
    result,
    gr.Audio(sources=["microphone"]),
    "audio",
)

# Guard the server launch so importing this module has no side effect.
if __name__ == "__main__":
    demo.launch()