import os

# whisper is installed from source at startup (Hugging Face Spaces convention)
os.system("pip install git+https://github.com/openai/whisper.git")

import json
import requests
import numpy as np
import torch
import gradio as gr
import whisper
from transformers import pipeline, VitsModel, VitsTokenizer

MODEL = "gpt-3.5-turbo"
API_URL = os.getenv("API_URL")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
NUM_THREADS = int(os.getenv("NUM_THREADS", "1"))  # fall back to 1 so a missing env var does not crash startup

# Whisper checkpoint used by the standalone `inference` helper below
model = whisper.load_model("small")
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Alternative ASR path using the raw whisper API (kept for reference; `translate` below is used instead)
def inference(audio):
    audio = whisper.load_audio(audio)
    print("loading finished")
    audio = whisper.pad_or_trim(audio)
    print("audio trimmed")
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    print("spectrogram finished")
    _, probs = model.detect_language(mel)
    print("language detected")
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    print(result.text)
    return result.text

    
# Load the Whisper-small ASR pipeline
pipe = pipeline("automatic-speech-recognition",
                model="openai/whisper-small",
                device=device
)
#pipe = pipeline(model="Sleepyp00/whisper-small-Swedish") 

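# English text-to-speech: Meta's MMS-TTS English checkpoint (VITS architecture)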
model2 = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")


# Translate an audio file to English text
def translate(audio):
    # return inference(audio)
    outputs = pipe(audio, max_new_tokens=256,
                   generate_kwargs={"task": "translate"})
    return outputs["text"]


# Generate the waveform output from English text
def synthesise(text):
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        outputs = model2(input_ids)

    # VitsModel exposes the generated audio as `waveform`, not `audio`
    return outputs.waveform[0]
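
# Note: mms-tts-eng synthesises at 16 kHz (model2.config.sampling_rate), which is
# why the speech-to-speech pipeline below packages the waveform with rate 16000.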


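# `parse_codeblock` is used by gpt_predict below but was never defined in this
# file; a minimal passthrough stand-in (an assumption — the original helper
# likely wrapped fenced code blocks in HTML for the chat display).
def parse_codeblock(text):
    return text
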
def gpt_predict(inputs, top_p=1, temperature=1, chat_counter=0, history=None, request: gr.Request = None):
    # all arguments default so this helper can also be called directly (see `predict` below)
    if history is None:
        history = []
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": f"{inputs}"}],
        "temperature": 1.0,
        "top_p": 1.0,
        "n": 1,
        "stream": True,
        "presence_penalty": 0,
        "frequency_penalty": 0,
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}",
    }
    if request is not None:
        # forward the incoming request headers when invoked through a Gradio request
        headers["Headers"] = f"{request.headers}"

    if chat_counter != 0:
        # rebuild the conversation: even history indices are user turns, odd are assistant turns
        messages = []
        for i, data in enumerate(history):
            role = "user" if i % 2 == 0 else "assistant"
            messages.append({"role": role, "content": data})
        messages.append({"role": "user", "content": inputs})
        payload = {
            "model": MODEL,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "n": 1,
            "stream": True,
            "presence_penalty": 0,
            "frequency_penalty": 0,
        }

    chat_counter += 1

    history.append(inputs)
    token_counter = 0
    partial_words = ""
    counter = 0
    response = None  # pre-set so the final yield below is safe even if the request fails

    try:
        # POST to the API endpoint with stream=True so tokens arrive as server-sent events
        response = requests.post(API_URL, headers=headers, json=payload, stream=True)
        # no explicit status-code check: rate-limit errors surface via the except clause below

        for chunk in response.iter_lines():
            # skip the first chunk
            if counter == 0:
                counter += 1
                continue
            # response data arrives as bytes; ignore empty keep-alive lines
            line = chunk.decode()
            if line:
                # each event line looks like 'data: {...}'; strip the "data: " prefix before parsing
                if len(line) > 12 and "content" in json.loads(line[6:])['choices'][0]['delta']:
                    partial_words = partial_words + json.loads(line[6:])['choices'][0]["delta"]["content"]
                    if token_counter == 0:
                        history.append(" " + partial_words)
                    else:
                        history[-1] = partial_words
                    token_counter += 1
                    # resembles {chatbot: chat, state: history}
                    yield [(parse_codeblock(history[i]), parse_codeblock(history[i + 1])) for i in range(0, len(history) - 1, 2)], history, chat_counter, response, gr.update(interactive=False), gr.update(interactive=False)
    except Exception as e:
        print(f"error found: {e}")
    yield [(parse_codeblock(history[i]), parse_codeblock(history[i + 1])) for i in range(0, len(history) - 1, 2)], history, chat_counter, response, gr.update(interactive=True), gr.update(interactive=True)
    print(json.dumps({"chat_counter": chat_counter, "payload": payload, "partial_words": partial_words, "token_counter": token_counter, "counter": counter}))
         

# Full speech-to-speech pipeline: translate to English text, then synthesise speech
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # scale the float waveform to 16-bit PCM for Gradio playback
    synthesised_speech = (
        synthesised_speech.numpy() * 32767).astype(np.int16)
    return [translated_text, (16000, synthesised_speech)]
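
# Example usage (assuming a local recording "sample.wav" exists):
#   text, (rate, wav) = speech_to_speech_translation("sample.wav")
#   # -> English text plus a (16000, int16 ndarray) tuple Gradio can play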

def predict(transType, language, audio, audio_mic=None):
    print("debug1:", audio, "debug2", audio_mic)
    # fall back to the microphone recording when no file was uploaded
    if not audio and audio_mic:
        audio = audio_mic

    if transType == "Text":
        return translate(audio), None
    if transType == "GPT answer":
        req = translate(audio)
        # gpt_predict is a streaming generator; drain it and keep the last answer so
        # this function always returns the (text, audio) pair the interface expects
        answer = ""
        for _, history, *_ in gpt_predict(req):
            if history:
                answer = history[-1]
        return answer, None
    if transType == "Audio":
        return speech_to_speech_translation(audio)
    return None, None
            
# Interface metadata
title = "Swedish STSOT (Speech To Speech Or Text)"
description = "Use a pretrained Whisper model to convert Swedish audio to English (text or audio)"


supportLangs = ["Swedish", "French (in training)"]
transTypes = ["Text", "Audio", "GPT answer"]

#examples = [
#    ["Text", "Swedish", "./ex1.wav", None],
#    ["Audio", "Swedish", "./ex2.wav", None]
#]

examples = []
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Radio(label="Choose your output format", choices=transTypes),
        gr.Radio(label="Choose a source language", choices=supportLangs, value="Swedish"),
        gr.Audio(label="Import an audio", sources=["upload"], type="filepath"),
        #gr.Audio(label="Import an audio", sources=["upload"], type="numpy"),
        gr.Audio(label="Record an audio", sources=["microphone"], type="filepath"),
    ],
    outputs=[
        gr.Text(label="Text translation or GPT answer"),
        gr.Audio(label="Audio translation", type="numpy"),
    ],
    title=title,
    description=description,
    article="",
    examples=examples,
)


demo.launch()