"""Gradio web app: a "Pavel Volya" talking-head voice assistant.

Pipeline: user query (microphone or text) -> Google speech recognition ->
YandexGPT answer in character -> Vosk Russian TTS -> Gooey.ai Lipsync video
rendered over a reference photo.
"""
import json
import os
import urllib.request
import wave  # noqa: F401 -- kept: may be relied on elsewhere / by tooling

import gradio as gr
import requests
import speech_recognition
from pydub import AudioSegment  # noqa: F401 -- kept (see above)
from pydub.playback import play  # noqa: F401 -- kept (see above)
from scipy.io.wavfile import write
from vosk_tts import Model, Synth

PATH_TO_MODEL = os.path.join(os.getcwd(), "vosk-model-tts-ru-0.4-multi")
PATH_TO_OUTPUT = os.path.join(os.getcwd(), "content")

# SECURITY NOTE(review): API credentials were hard-coded in source. They are
# now read from the environment, with the original literals as fallbacks so
# existing deployments keep working. The committed lipsync key should be
# rotated and removed from version control.
LIPSYNC_KEY = os.environ.get(
    "LIPSYNC_KEY", "sk-YOVNQzHmpga9My3dwlSo9BQN907TuPZQXcHn50ztigTwm3I2"
)
YA_GPT_KEY = os.environ.get("YA_GPT_KEY", "NONE")
IAM_TOKEN = os.environ.get("IAM_TOKEN", "NONE")
X_FOLDER_ID = os.environ.get("X_FOLDER_ID", "NONE")

# System prompt sent to YandexGPT (model input, intentionally in Russian).
# Fixed typo: "болеее" -> "более".
instruction = """
Ответь на запрос так, как ответил бы на него Павел Воля.
Используй данные из биографии Павла Воли, если это потребуется.
Отвечай на запрос в его стиле.
Ответ должен содержать не более 10 предложений.
Все цифры пиши словами.
"""


class VoiceGenerator:
    """Thin wrapper around the Vosk TTS model."""

    def __init__(self):
        # Loading the model is expensive; do it once per process.
        self.model = Model(PATH_TO_MODEL)

    def generate(self, text, file_name='output.wav'):
        """Synthesize ``text`` into a WAV file and return the file path."""
        synth = Synth(self.model)
        path = os.path.join(PATH_TO_OUTPUT, file_name)
        synth.synth(text, path)
        return path


recognizer = speech_recognition.Recognizer()
vg = VoiceGenerator()


def recognize_audio(file_path):
    """Transcribe a WAV file to lower-case Russian text via Google ASR.

    On failure returns the error-marker string instead of raising, so the
    UI pipeline keeps going and the user sees a diagnostic answer.
    """
    with speech_recognition.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    recognized_data = "Ошибка распознавания речи"
    try:
        recognized_data = recognizer.recognize_google(audio, language="ru").lower()
    except speech_recognition.UnknownValueError:
        recognized_data += ' UnknownValueError'
    except speech_recognition.RequestError:
        recognized_data += ' RequestError'
    print(recognized_data)
    return recognized_data


def ask_gpt(request):
    """Ask YandexGPT to answer ``request`` in character; return the text."""
    result = requests.post(
        url='https://llm.api.cloud.yandex.net/llm/v1alpha/instruct',
        headers={'Authorization': f'Bearer {IAM_TOKEN}', 'x-folder-id': X_FOLDER_ID},
        json={
            "model": "general",
            "instruction_text": instruction,
            "request_text": request,
            "generation_options": {
                "max_tokens": 1500,
                "temperature": 0.5
            }
        }
    )
    data = json.loads(result.text)
    print(data)
    if 'result' in data:
        return data['result']['alternatives'][0]['text']
    # Fallback phrase shown to the user when the backend is unavailable.
    return ('Я пока не могу ответить на ваш вопрос. '
            'Все мои мозги на сервере, а он не отвечает.')


def get_video(pathToWav, pathToImage, pathToResult):
    """Render a lipsync video via the Gooey.ai API and save it locally.

    Uploads the reference face image and the synthesized audio, then
    downloads the resulting video to ``pathToResult``.
    """
    # Context managers so both upload handles are always closed
    # (the original leaked both file objects).
    with open(pathToImage, "rb") as face, open(pathToWav, "rb") as voice:
        response = requests.post(
            "https://api.gooey.ai/v2/Lipsync/form/",
            headers={"Authorization": "Bearer " + LIPSYNC_KEY},
            files=[("input_face", face), ("input_audio", voice)],
            data={"json": json.dumps({})},
        )
    # ``assert`` is stripped under ``python -O``; raise explicitly instead.
    if not response.ok:
        raise RuntimeError(response.content)
    result = response.json()
    urllib.request.urlretrieve(result["output"]["output_video"], pathToResult)


def resultSay(text):
    """Voice ``text`` and lipsync it over the stock reference photo."""
    generated_audio = vg.generate(text)
    pathToReference = os.path.join(PATH_TO_OUTPUT, 'reference.jpg')
    pathToResult = os.path.join(PATH_TO_OUTPUT, 'video.mp4')
    get_video(generated_audio, pathToReference, pathToResult)
    return pathToResult


def resultText(text):
    """Full text pipeline: GPT answer -> talking-head video path."""
    return resultSay(ask_gpt(text))


def resultAudio(audio):
    """Full voice pipeline: mic capture -> ASR -> GPT -> video path.

    ``audio`` is the Gradio microphone payload: ``(sample_rate, ndarray)``.
    """
    sample_rate, data = audio
    file_path = os.path.join(PATH_TO_OUTPUT, 'voice_input.wav')
    write(file_path, sample_rate, data)
    return resultText(recognize_audio(file_path))


def resultOnlyAudio(text):
    """Synthesize ``text`` to audio only (no video) and return the path."""
    return vg.generate(text)


def resultCustom(text, photo):
    """Voice ``text`` over a user-supplied PIL ``photo`` (or the default)."""
    print(text, photo)
    generated_audio = vg.generate(text)
    referenceName = 'reference.jpg'
    if photo is not None:  # identity check instead of ``!= None``
        referenceName = 'custom.jpg'
        photo.save(os.path.join(PATH_TO_OUTPUT, referenceName))
    pathToReference = os.path.join(PATH_TO_OUTPUT, referenceName)
    pathToResult = os.path.join(PATH_TO_OUTPUT, 'video.mp4')
    get_video(generated_audio, pathToReference, pathToResult)
    return pathToResult


demoMic = gr.Interface(
    resultAudio,
    gr.Audio(sources=["microphone"]),
    "playable_video",
)
demoText = gr.Interface(
    resultText,
    gr.Textbox(label="Query"),
    "playable_video",
)
demoSay = gr.Interface(
    resultCustom,
    [gr.Textbox(label="Query"), gr.Image(type="pil")],
    "playable_video",
)
demoOnly = gr.Interface(
    resultOnlyAudio,
    gr.Textbox(label="Query"),
    "audio",
)

demo = gr.TabbedInterface(
    [demoMic, demoText, demoSay, demoOnly],
    ["Ask by micro", "Ask by text", "Generate video", "Say this text"],
)

# Guard the launch so importing this module does not start the server.
if __name__ == "__main__":
    demo.launch()