pedropauletti committed on
Commit
3efe6ac
1 Parent(s): c38177b

Create helpers.py

Files changed (1)
  1. helpers.py +445 -0
helpers.py ADDED
@@ -0,0 +1,445 @@
+ import os
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ import tensorflow_hub as hub  # required by load_model_hub()
+ import tensorflow_io as tfio
+ import csv
+ from scipy.io import wavfile
+ import scipy
+ import librosa
+ import soundfile as sf
+ import time
+ import gradio as gr
+
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from transformers import AutoProcessor
+ from transformers import BarkModel
+ from optimum.bettertransformer import BetterTransformer
+ import torch
+
+ from nemo.collections.tts.models import FastPitchModel
+ from nemo.collections.tts.models import HifiGanModel
+
+ from deep_translator import GoogleTranslator
+ from haystack.document_stores import InMemoryDocumentStore
+ from haystack.nodes import EmbeddingRetriever
+
+
+ # --- Load models ---
+
+ # Load a model from TensorFlow Hub
+ def load_model_hub(model_url):
+     model = hub.load(model_url)
+     return model
+
+ # Load a TFLite model from the project folder
+ def load_model_file(model_path):
+     interpreter = tf.lite.Interpreter(model_path)
+     interpreter.allocate_tensors()
+     return interpreter
+
+ # --- Initialize models ---
+
+ def initialize_text_to_speech_model():
+     spec_generator = FastPitchModel.from_pretrained("nvidia/tts_en_fastpitch")
+     # Load vocoder
+     model = HifiGanModel.from_pretrained(model_name="nvidia/tts_hifigan")
+     return spec_generator, model
+
+
+ def initialize_tt5_model():
+     from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
+     from datasets import load_dataset
+
+     dataset = load_dataset("pedropauletti/librispeech-portuguese")
+
+     model = SpeechT5ForTextToSpeech.from_pretrained("pedropauletti/speecht5_finetuned_librispeech_pt")
+     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+     example = dataset["test"][100]
+     speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
+
+     return model, processor, vocoder, speaker_embeddings
+
+
+ def load_qa_model():
+     document_store = InMemoryDocumentStore()
+     retriever = EmbeddingRetriever(
+         document_store=document_store,
+         embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+         use_gpu=False,
+         scale_score=False,
+     )
+     # Get dataframe with columns "question", "answer" and some custom metadata
+     df = pd.read_csv('content/social-faq.csv', on_bad_lines='skip', delimiter=';')
+     # Minimal cleaning
+     df.fillna(value="", inplace=True)
+     df["question"] = df["question"].apply(lambda x: x.strip())
+
+     questions = list(df["question"].values)
+     df["embedding"] = retriever.embed_queries(queries=questions).tolist()
+     df = df.rename(columns={"question": "content"})
+
+     # Convert the dataframe to a list of dicts and index them in the DocumentStore
+     docs_to_index = df.to_dict(orient="records")
+     document_store.write_documents(docs_to_index)
+
+     return retriever
+
+
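Note: load_qa_model expects content/social-faq.csv to be semicolon-delimited with at least question and answer columns. The file itself is not part of this commit; a hypothetical row, for illustration only, would look like:

question;answer
What does the sound classification tab do?;It shows the three most likely YAMNet labels for the audio you record.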
+ # --- Audio pre-processing ---
+
+ # Utility functions for loading audio files and making sure the sample rate is correct.
+ @tf.function
+ def load_wav_16k_mono(filename):
+     """Load a WAV file, convert it to a float tensor, and resample to 16 kHz single-channel audio."""
+     file_contents = tf.io.read_file(filename)
+     wav, sample_rate = tf.audio.decode_wav(
+         file_contents,
+         desired_channels=1)
+     wav = tf.squeeze(wav, axis=-1)
+     sample_rate = tf.cast(sample_rate, dtype=tf.int64)
+     wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
+     return wav
+
+
+ def load_wav_16k_mono_librosa(filename):
+     """Load a WAV file and resample to 16 kHz single-channel audio using librosa."""
+     wav, sample_rate = librosa.load(filename, sr=16000, mono=True)
+     return wav
+
+
+ def load_wav_16k_mono_soundfile(filename):
+     """Load a WAV file and resample to 16 kHz single-channel audio using soundfile."""
+     wav, sample_rate = sf.read(filename, dtype='float32')
+     # Resample to 16 kHz if necessary
+     if sample_rate != 16000:
+         wav = librosa.resample(wav, orig_sr=sample_rate, target_sr=16000)
+     return wav
+
+
+ # --- History ---
+ def updateHistory():
+     global history
+     return history
+
+ def clearHistory():
+     global history
+     history = ""
+     return history
+
+ def clear():
+     return None
+
+ # --- Output Format ---
+
+ def format_dictionary(dictionary):
+     result = []
+     for key, value in dictionary.items():
+         percentage = int(value * 100)
+         result.append(f"{key}: {percentage}%")
+     return ', '.join(result)
+
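For reference, a worked example of format_dictionary with made-up confidence values:

# Hypothetical values, for illustration only
print(format_dictionary({"Speech": 0.87, "Dog": 0.07, "Silence": 0.03}))  # Speech: 87%, Dog: 7%, Silence: 3%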
+
+ def format_json(json_data):
+     confidence_strings = [f"{item['label']}: {round(item['confidence']*100)}%" for item in json_data['confidences']]
+     result_string = f"{', '.join(confidence_strings)}"
+     return result_string
+
+ def format_json_pt(json_data):
+     from unidecode import unidecode
+     confidence_strings = [f"{item['label']}... " for item in json_data['confidences']]
+     result_string = f"{', '.join(confidence_strings)}"
+     return unidecode(result_string)
+
+
+ # --- Classification ---
+
+ def load_label_mapping(csv_path):
+     label_mapping = {}
+     with open(csv_path, newline='', encoding='utf-8') as csvfile:
+         reader = csv.DictReader(csvfile)
+         for row in reader:
+             label_mapping[int(row['index'])] = row['display_name']
+     return label_mapping
+
+
+ def predict_yamnet(interpreter, waveform, input_details, output_details, label_mapping):
+     # Pre-process the waveform to match the model's input requirements
+     input_shape = input_details[0]['shape']
+     input_data = np.array(waveform, dtype=np.float32)
+
+     if input_data.shape != input_shape:
+         # Pad or trim the waveform to the expected length
+         if input_data.shape[0] < input_shape[0]:
+             # Pad the waveform with zeros
+             padding = np.zeros((input_shape[0] - input_data.shape[0],))
+             input_data = np.concatenate((input_data, padding))
+         elif input_data.shape[0] > input_shape[0]:
+             # Trim the waveform
+             input_data = input_data[:input_shape[0]]
+
+     input_data = np.reshape(input_data, input_shape)
+
+     # Run inference
+     interpreter.set_tensor(input_details[0]['index'], input_data)
+     interpreter.invoke()
+
+     # Get the inference results
+     output_data = interpreter.get_tensor(output_details[0]['index'])
+
+     # Process the results and look up the label names
+     top_labels_indices = np.argsort(output_data[0])[::-1][:3]
+     results = []
+     for i in top_labels_indices:
+         label_name = label_mapping.get(i, "Unknown Label")
+         probability = float(output_data[0][i])  # Convert to float
+         results.append({'label': label_name, 'probability': str(probability)})
+
+     return results  # Return the list of top-3 results
+
+
+ def classify(audio, language="en-us"):
+     # Pre-process the audio
+     wav_data = load_wav_16k_mono_librosa(audio)
+
+     # Label mapping
+     if language == "pt-br":
+         label_mapping = load_label_mapping('content/yamnet_class_map_ptbr.csv')
+     else:
+         label_mapping = load_label_mapping('content/yamnet_class_map.csv')
+
+     # Load the TFLite model from file
+     model = load_model_file('content/yamnet_classification.tflite')
+     input_details = model.get_input_details()
+     output_details = model.get_output_details()
+
+     # Classification
+     result = predict_yamnet(model, wav_data, input_details, output_details, label_mapping)
+
+     return result
+
+ def classify_realtime(language, audio, state):
+     # Pre-process the audio
+     wav_data = load_wav_16k_mono_librosa(audio)
+
+     # Label mapping
+     if language == "pt-br":
+         label_mapping = load_label_mapping('content/yamnet_class_map_ptbr.csv')
+     else:
+         label_mapping = load_label_mapping('content/yamnet_class_map.csv')
+
+     # Load the TFLite model from file
+     model = load_model_file('content/yamnet_classification.tflite')
+     input_details = model.get_input_details()
+     output_details = model.get_output_details()
+
+     # Classification
+     result = predict_yamnet(model, wav_data, input_details, output_details, label_mapping)
+
+     # Append the top label to the running history string
+     state += result[0]['label'] + " "
+
+     return result, state
+
+
+ # --- TTS ---
+
+ def generate_audio(spec_generator, model, input_text):
+     parsed = spec_generator.parse(input_text)
+     spectrogram = spec_generator.generate_spectrogram(tokens=parsed)
+     audio = model.convert_spectrogram_to_audio(spec=spectrogram)
+     return 22050, audio.cpu().detach().numpy().squeeze()
+
+
+ def generate_audio_tt5(model, processor, vocoder, speaker_embeddings, text):
+     inputs = processor(text=text, return_tensors="pt")
+     audio = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+     return 16000, audio.cpu().detach().numpy().squeeze()
+
+
+ def TTS(json_input, language):
+     global spec_generator, model_nvidia, history
+     global model_tt5, processor, vocoder, speaker_embeddings
+
+     if language == 'en-us':
+         sr, generatedAudio = generate_audio(spec_generator, model_nvidia, format_json(json_input))
+     else:
+         sr, generatedAudio = generate_audio_tt5(model_tt5, processor, vocoder, speaker_embeddings, format_json_pt(json_input))
+
+     return (sr, generatedAudio)
+
+
+ def TTS_ASR(json_input, language):
+     global spec_generator, model_nvidia, history
+     global model_tt5, processor, vocoder, speaker_embeddings
+
+     if language == 'en-us':
+         sr, generatedAudio = generate_audio(spec_generator, model_nvidia, json_input['label'])
+     else:
+         sr, generatedAudio = generate_audio_tt5(model_tt5, processor, vocoder, speaker_embeddings, json_input['label'])
+
+     return (sr, generatedAudio)
+
+
+ def TTS_chatbot(language):
+     global spec_generator, model_nvidia, history
+     global model_tt5, processor, vocoder, speaker_embeddings
+     global last_answer
+
+     if language == 'en-us':
+         sr, generatedAudio = generate_audio(spec_generator, model_nvidia, last_answer)
+     else:
+         sr, generatedAudio = generate_audio_tt5(model_tt5, processor, vocoder, speaker_embeddings, last_answer)
+
+     return (sr, generatedAudio)
+
+ # --- ASR ---
+
+ def transcribe_speech(filepath, language):
+     print(filepath)
+     if language == "pt-br":
+         output = pipe(
+             filepath,
+             max_new_tokens=256,
+             generate_kwargs={
+                 "task": "transcribe",
+                 "language": "portuguese",
+             },
+             chunk_length_s=30,
+             batch_size=8,
+         )
+     else:
+         output = pipe_en(
+             filepath,
+             max_new_tokens=256,
+             generate_kwargs={
+                 "task": "transcribe",
+                 "language": "english",
+             },
+             chunk_length_s=30,
+             batch_size=8,
+         )
+
+     return output["text"]
+
+
+ def transcribe_speech_realtime(filepath, state):
+     output = pipe(
+         filepath,
+         max_new_tokens=256,
+         generate_kwargs={
+             "task": "transcribe",
+             "language": "english",
+         },
+         chunk_length_s=30,
+         batch_size=8,
+     )
+     state += output["text"] + " "
+     return output["text"], state
+
+
+ def transcribe_realtime(new_chunk, stream):
+     sr, y = new_chunk
+     y = y.astype(np.float32)
+     y /= np.max(np.abs(y))
+
+     if stream is not None:
+         stream = np.concatenate([stream, y])
+     else:
+         stream = y
+     return stream, pipe_en({"sampling_rate": sr, "raw": stream})["text"]
+
+
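transcribe_realtime follows Gradio's streaming-audio pattern (new chunk in, rolling buffer as state). A minimal sketch of the wiring, assuming a recent Gradio release with gr.Audio(streaming=True) and not taken from this commit, would be:

# Sketch only: the actual app wiring lives outside this file.
demo = gr.Interface(
    fn=transcribe_realtime,
    inputs=[gr.Audio(sources=["microphone"], streaming=True), "state"],
    outputs=["state", "text"],
    live=True,
)
# demo.launch()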
+ # --- Translation ---
+
+ def translate_enpt(text):
+     global enpt_pipeline
+     translation = enpt_pipeline(f"translate English to Portuguese: {text}")
+     return translation[0]['generated_text']
+
+
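translate_enpt relies on a global enpt_pipeline that is never created in this file, so it is presumably initialized elsewhere in the Space. A plausible initialization, assuming a T5-style English-to-Portuguese text2text model (the model name below is illustrative, not from this commit), would be:

# Assumption: created in the app setup, not in helpers.py
enpt_pipeline = pipeline("text2text-generation", model="unicamp-dl/translation-en-pt-t5")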
+ # --- Gradio Interface ---
+
+ def interface(language, audio):
+     global classificationResult
+     result = classify(audio, language)
+     dic = {result[0]['label']: float(result[0]['probability']),
+            result[1]['label']: float(result[1]['probability']),
+            result[2]['label']: float(result[2]['probability'])
+            }
+     # history += result[0]['label'] + '\n'
+     classificationResult = dic
+
+     return dic
+
+ def interface_realtime(language, audio):
+     global history
+     result = classify(audio, language)
+     dic = {result[0]['label']: float(result[0]['probability']),
+            result[1]['label']: float(result[1]['probability']),
+            result[2]['label']: float(result[2]['probability'])
+            }
+     history = result[0]['label'] + '\n' + history
+     return dic
+
+
+ # --- QA Model ---
+
+ def get_answers(retriever, query):
+     from haystack.pipelines import FAQPipeline
+
+     pipe = FAQPipeline(retriever=retriever)
+
+     # Run any question and change top_k to see more or fewer answers
+     prediction = pipe.run(query=query, params={"Retriever": {"top_k": 1}})
+
+     answers = prediction['answers']
+
+     if answers:
+         return answers[0].answer
+     else:
+         return "I don't have an answer to that question"
+
+
+ def add_text(chat_history, text):
+     # Use a list (not a tuple) so chatbot_response can fill in the answer in place
+     chat_history = chat_history + [[text, None]]
+     return chat_history, gr.Textbox(value="", interactive=False)
+
+
+ def chatbot_response(chat_history, language):
+     chat_history[-1][1] = ""
+
+     global retriever
+     global last_answer
+
+     if language == 'pt-br':
+         response = get_answers(retriever, GoogleTranslator(source='pt', target='en').translate(chat_history[-1][0]))
+         response = GoogleTranslator(source='en', target='pt').translate(response)
+     else:
+         response = get_answers(retriever, chat_history[-1][0])
+
+     last_answer = response
+
+     for character in response:
+         chat_history[-1][1] += character
+         time.sleep(0.01)
+         yield chat_history
+
+
+ # --- Module-level state and model initialization ---
+
+ # Shared state used by the Gradio callbacks above
+ history = ""
+ last_answer = ""
+ classificationResult = {}
+
+ retriever = load_qa_model()
+ spec_generator, model_nvidia = initialize_text_to_speech_model()
+ model_tt5, processor, vocoder, speaker_embeddings = initialize_tt5_model()
+ pipe = pipeline("automatic-speech-recognition", model="pedropauletti/whisper-small-pt")
+ pipe_en = pipeline("automatic-speech-recognition", model="openai/whisper-small")
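Once the models above have loaded, a quick smoke test could exercise the classification and TTS paths end to end. This is a sketch, not part of the commit; the audio path is a hypothetical stand-in for a real recording:

if __name__ == "__main__":
    # Hypothetical input file, for illustration only
    predictions = classify("content/example.wav", language="en-us")
    print(format_dictionary({p["label"]: float(p["probability"]) for p in predictions}))
    confidences = {"confidences": [{"label": p["label"], "confidence": float(p["probability"])} for p in predictions]}
    sr, audio = TTS(confidences, "en-us")
    sf.write("tts_output.wav", audio, sr)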