import gradio as gr

import os
import wave
import json
import requests
import urllib.request
import speech_recognition
from vosk_tts import Model, Synth
from scipy.io.wavfile import write
from pydub import AudioSegment
from pydub.playback import play
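
# Overview: a Gradio demo that records microphone speech, transcribes it with
# Google speech recognition (Russian), asks YandexGPT for a reply in Pavel Volya's
# style, voices the reply with vosk-tts, and can lip-sync it via the gooey.ai API.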


PATH_TO_MODEL = os.path.join(os.getcwd(), "vosk-model-tts-ru-0.4-multi")
PATH_TO_OUTPUT = os.path.join(os.getcwd(), "content")
k = "sk-YOVNQzHmpga9My3dwlSo9BQN907TuPZQXcHn50ztigTwm3I2"  # gooey.ai API key
YA_GPT_KEY = "AQVNyVqBi-XoJ1cAo7VIxq6ztgXm3owqowtso5Qb"  # YandexGPT API key
# Prompt for YandexGPT (Russian): "Answer the request as Pavel Volya would answer it.
# Use facts from Pavel Volya's biography if needed, and reply in his style.
# The answer must contain no more than 10 sentences."
instruction = """
Ответь на запрос так, как ответил бы на него Павел Воля. Используй данные из биографии Павла Воли, если это потребуется. Отвечай на запрос в его стиле. Ответ должен содержать не более 10 предложений.
"""


#files = [
#    ("input_face", open("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\scale_1200.jpg", "rb")), #TODO: IT
#    ("input_audio", open("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\audio\\output.wav", "rb")), #TODO: IT
#]


#     while True:
#         # start recording speech, then print the recognized result
#         # and delete the audio captured from the microphone
#         voice_input = record_and_recognize_audio()
#         os.remove("microphone-results.wav")
#         print(voice_input)
#         path_to_file = vg.generate(ask(voice_input))
#         print(path_to_file)
#         response = requests.post(
#             "https://api.gooey.ai/v2/Lipsync/form/",
#             headers={
#                 "Authorization": "Bearer " + k,
#             },
#             files=files,
#             data={"json": json.dumps(payload)},
#         )
#         assert response.ok, response.content
#         #song = AudioSegment.from_wav(path_to_file)
#         result = response.json()
#         print(response.status_code, result["output"]["output_video"])
#         #play(song)
#         urllib.request.urlretrieve(result["output"]["output_video"], "C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\video.mp4")
#         os.startfile("C:\\Users\\user\\Desktop\\deepfake_sirius\\materials\\video.mp4")
#         break;


class VoiceGenerator:
    """Thin wrapper around the vosk-tts model for speech synthesis."""

    def __init__(self):
        self.model = Model(PATH_TO_MODEL)

    def generate(self, text, file_name='output.wav'):
        """Synthesize `text` into a WAV file and return the path to it."""
        synth = Synth(self.model)
        path = os.path.join(PATH_TO_OUTPUT, file_name)
        synth.synth(text, path)
        return path


def recognize_audio(file_path):
    """Transcribe a WAV file to lower-case Russian text via Google speech recognition."""
    recognized_data = ""
    with speech_recognition.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    try:
        recognized_data = recognizer.recognize_google(audio, language="ru").lower()
    except speech_recognition.UnknownValueError:
        pass  # speech not understood: return the empty string
    except speech_recognition.RequestError:
        pass  # recognition service unreachable: return the empty string
    return recognized_data


def ask_gpt(request):
    """Send the user request to the YandexGPT instruct endpoint and return the reply text."""
    result = requests.post(
        url='https://llm.api.cloud.yandex.net/llm/v1alpha/instruct',
        headers={
            "Authorization": "Api-Key " + YA_GPT_KEY,
        },
        json={
            "model": "general",
            "instruction_text": instruction,
            "request_text": request,
            "generation_options": {
                "max_tokens": 1500,
                "temperature": 0.5
            }
        }
    )
    data = json.loads(result.text)
    return data['result']['alternatives'][0]['text']


recognizer = speech_recognition.Recognizer()
vg = VoiceGenerator()


def get_video(pathToWav, pathToImage):
    """Send the audio and the face image to the gooey.ai Lipsync API and return the video URL."""
    with open(pathToImage, "rb") as face_file, open(pathToWav, "rb") as audio_file:
        files = [
            ("input_face", face_file),
            ("input_audio", audio_file),
        ]
        payload = {}
        response = requests.post(
            "https://api.gooey.ai/v2/Lipsync/form/",
            headers={
                "Authorization": "Bearer " + k,
            },
            files=files,
            data={"json": json.dumps(payload)},
        )
    assert response.ok, response.content
    return response.json()["output"]["output_video"]
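
# A minimal usage sketch (assumptions: "face.jpg" is a hypothetical reference image,
# and the response JSON exposes output.output_video as in the commented block above):
# video_url = get_video(os.path.join(PATH_TO_OUTPUT, "output.wav"), "face.jpg")
# urllib.request.urlretrieve(video_url, os.path.join(PATH_TO_OUTPUT, "video.mp4"))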


def result(audio):
    """Gradio handler: save the recording, transcribe it, ask GPT and voice the reply."""
    sample_rate, data = audio
    file_name = 'voice_input.wav'
    file_path = os.path.join(PATH_TO_OUTPUT, file_name)
    write(file_path, sample_rate, data)
    text_from_audio = recognize_audio(file_path)
    generated_audio = vg.generate(ask_gpt(text_from_audio))
    # placeholders for the planned lipsync video output (see get_video and the
    # "playable_video" note below)
    # referenceName = ''
    # pathToReference = os.path.join(os.getcwd(), "content", referenceName)
    return generated_audio


demo = gr.Interface(
    result,
    gr.Audio(sources=["microphone"]),
    "audio", #playable_video
)

demo.launch()