Prathamesh1420 committed on
Commit
bacd0f5
1 Parent(s): ae32500

Update app.py

Browse files

new Groq API method

Files changed (1)
  1. app.py +124 -159
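
For context, the rewrite drops the local TensorFlow emotion model and Streamlit UI in favor of Groq-hosted models behind a Gradio UI. A minimal sketch of the two Groq calls the new app.py leans on, assuming the groq Python SDK is installed, a valid key in the GROQ_API_KEY environment variable, and a placeholder sample.wav recording:

import os
import groq

# Client construction mirrors the commit's groq.Client(api_key=...); Groq is the SDK's main class.
client = groq.Groq(api_key=os.environ["GROQ_API_KEY"])

# Speech-to-text with Distil-Whisper, as in transcribe_audio().
with open("sample.wav", "rb") as audio_file:  # placeholder file name
    transcription = client.audio.transcriptions.create(
        model="distil-whisper-large-v3-en",
        file=("sample.wav", audio_file),
        response_format="text",
    )

# Text generation with Llama 3 70B, as in generate_response().
chat = client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[{"role": "user", "content": str(transcription)}],
)
print(chat.choices[0].message.content)
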
app.py CHANGED
@@ -1,162 +1,127 @@
- import streamlit as st
- import pyttsx3
- import speech_recognition as sr
- from playsound import playsound
- import random
- import datetime
- import webbrowser as wb
- import tensorflow as tf
  import numpy as np
- import librosa
- import matplotlib.pyplot as plt
- import seaborn as sns
- from modules import commands_answers, load_agenda
-
- # Initial settings
- sns.set()
- commands = commands_answers.commands
- answers = commands_answers.answers
- my_name = 'Bob'
-
- # Paths for browser
- chrome_path = 'open -a /Applications/Google\ Chrome.app %s' # MacOS
- # chrome_path = 'C:/Program Files/Google/Chrome/Application/chrome.exe %s' # Windows
- # chrome_path = '/usr/bin/google-chrome %s' # Linux
-
- # Load model
- MODEL_TYPES = ['EMOTION']
- def load_model_by_name(model_type):
-     if model_type == MODEL_TYPES[0]:
-         model = tf.keras.models.load_model('models/speech_emotion_recognition.hdf5')
-         model_dict = list(['calm', 'happy', 'fear', 'nervous', 'neutral', 'disgust', 'surprise', 'sad'])
-         SAMPLE_RATE = 48000
-         return model, model_dict, SAMPLE_RATE
-
- loaded_model = load_model_by_name('EMOTION')
-
- # Functions
- def search(sentence):
-     wb.get(chrome_path).open('https://www.google.com/search?q=' + sentence)
-
- def predict_sound(AUDIO, SAMPLE_RATE, plot=True):
-     results = []
-     wav_data, sample_rate = librosa.load(AUDIO, sr=SAMPLE_RATE)
-     clip, index = librosa.effects.trim(wav_data, top_db=60, frame_length=512, hop_length=64)
-     splitted_audio_data = tf.signal.frame(clip, sample_rate, sample_rate, pad_end=True, pad_value=0)
-     for i, data in enumerate(splitted_audio_data.numpy()):
-         mfccs_features = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=40)
-         mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
-         mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)[:, :, np.newaxis]
-         predictions = loaded_model[0].predict(mfccs_scaled_features)
-         if plot:
-             plt.figure(figsize=(len(splitted_audio_data), 5))
-             plt.barh(loaded_model[1], predictions[0])
-             plt.tight_layout()
-             st.pyplot(plt)
-
-         predictions = predictions.argmax(axis=1)
-         predictions = predictions.astype(int).flatten()
-         predictions = loaded_model[1][predictions[0]]
-         results.append(predictions)
-
-     count_results = [[results.count(x), x] for x in set(results)]
-     return max(count_results)
-
- def play_music_youtube(emotion):
-     play = False
-     if emotion == 'sad' or emotion == 'fear':
-         wb.get(chrome_path).open('https://www.youtube.com/watch?v=k32IPg4dbz0&ab_channel=Amelhorm%C3%BAsicainstrumental')
-         play = True
-     if emotion == 'nervous' or emotion == 'surprise':
-         wb.get(chrome_path).open('https://www.youtube.com/watch?v=pWjmpSD-ph0&ab_channel=CassioToledo')
-         play = True
-     return play
-
- def speak(text):
-     engine = pyttsx3.init()
-     engine.setProperty('rate', 90) # number of words per second
-     engine.setProperty('volume', 1) # min: 0, max: 1
-     engine.say(text)
-     engine.runAndWait()
-
- def listen_microphone():
-     microphone = sr.Recognizer()
-     with sr.Microphone() as source:
-         microphone.adjust_for_ambient_noise(source, duration=0.8)
-         st.write('Listening...')
-         audio = microphone.listen(source)
-         with open('recordings/speech.wav', 'wb') as f:
-             f.write(audio.get_wav_data())
      try:
-         sentence = microphone.recognize_google(audio, language='en-US')
-         st.write('You said: ' + sentence)
-     except sr.UnknownValueError:
-         sentence = ''
-         st.write('Not understood')
-     return sentence
-
- def test_models():
-     audio_source = 'recordings/speech.wav'
-     prediction = predict_sound(audio_source, loaded_model[2], plot=False)
-     return prediction
-
- # Streamlit UI
- st.title("Virtual Assistant")
- st.write("This assistant can perform tasks based on your voice commands.")
-
- if st.button("Activate Assistant"):
-     result = listen_microphone()
-
-     if my_name.lower() in result.lower():
-         result = str(result.split(my_name + ' ')[1])
-         result = result.lower()
-
-         if result in commands[0]:
-             speak('I will read my list of functionalities: ' + answers[0])
-
-         elif result in commands[3]:
-             speak('It is now ' + datetime.datetime.now().strftime('%H:%M'))
-
-         elif result in commands[4]:
-             date = datetime.date.today().strftime('%d/%B/%Y').split('/')
-             speak('Today is ' + date[0] + ' of ' + date[1])
-
-         elif result in commands[1]:
-             speak('Please, tell me the activity!')
-             result = listen_microphone()
-             annotation = open('annotation.txt', mode='a+', encoding='utf-8')
-             annotation.write(result + '\n')
-             annotation.close()
-             speak(''.join(random.sample(answers[1], k=1)))
-             speak('Want me to read the notes?')
-             result = listen_microphone()
-             if result == 'yes' or result == 'sure':
-                 with open('annotation.txt') as file_source:
-                     lines = file_source.readlines()
-                     for line in lines:
-                         speak(line)
-             else:
-                 speak('Ok!')
-
-         elif result in commands[2]:
-             speak(''.join(random.sample(answers[2], k=1)))
-             result = listen_microphone()
-             search(result)
-
-         elif result in commands[6]:
-             if load_agenda.load_agenda():
-                 speak('These are the events for today:')
-                 for i in range(len(load_agenda.load_agenda()[1])):
-                     speak(load_agenda.load_agenda()[1][i] + ' ' + load_agenda.load_agenda()[0][i] + ' schedule for ' + str(load_agenda.load_agenda()[2][i]))
-             else:
-                 speak('There are no events for today considering the current time!')
-
-         elif result in commands[5]:
-             st.write('Emotion analysis mode activated!')
-             analyse = test_models()
-             st.write(f'I heard {analyse} in your voice!')
-             play_music_youtube(analyse[1])
-
-         elif result == 'turn off':
-             speak(''.join(random.sample(answers[4], k=1)))
-             st.write("Assistant turned off.")

+ import gradio as gr
+ import groq
+ import io
  import numpy as np
+ import soundfile as sf
+ import pyttsx3  # Text-to-speech conversion
+
+ # Initialize text-to-speech engine
+ tts_engine = pyttsx3.init()
+
+ def transcribe_audio(audio, api_key):
+     if audio is None:
+         return ""
+
+     client = groq.Client(api_key=api_key)
+
+     # Convert audio to the format expected by the model
+     audio_data = audio[1]  # Get the numpy array from the tuple
+     buffer = io.BytesIO()
+     sf.write(buffer, audio_data, audio[0], format='wav')
+     buffer.seek(0)
+
      try:
+         # Use Distil-Whisper English powered by Groq for transcription
+         completion = client.audio.transcriptions.create(
+             model="distil-whisper-large-v3-en",
+             file=("audio.wav", buffer),
+             response_format="text"
+         )
+         return completion
+     except Exception as e:
+         return f"Error in transcription: {str(e)}"
+
+ def generate_response(transcription, api_key):
+     if not transcription:
+         return "No transcription available. Please try speaking again."
+
+     client = groq.Client(api_key=api_key)
+
+     try:
+         # Use Llama 3 70B powered by Groq for text generation
+         completion = client.chat.completions.create(
+             model="llama3-70b-8192",
+             messages=[{"role": "user", "content": transcription}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error in response generation: {str(e)}"
+
+ def convert_text_to_speech(text):
+     tts_engine.save_to_file(text, 'response_output.wav')
+     tts_engine.runAndWait()
+     with open("response_output.wav", "rb") as f:
+         audio_bytes = f.read()
+     return audio_bytes
+
+ def process_audio(audio, api_key):
+     if not api_key:
+         return "Please enter your Groq API key.", "API key is required."
+
+     transcription = transcribe_audio(audio, api_key)
+     response = generate_response(transcription, api_key)
+
+     if "Error" in response:
+         return transcription, response, None  # In case of error, return empty audio
+
+     audio_output = convert_text_to_speech(response)
+     return transcription, response, audio_output
+
+ # Custom CSS
+ custom_css = """
+ .gradio-container {
+     background-color: #f5f5f5;
+ }
+ .gr-button-primary {
+     background-color: #f55036 !important;
+     border-color: #f55036 !important;
+ }
+ .gr-button-secondary {
+     color: #f55036 !important;
+     border-color: #f55036 !important;
+ }
+ #groq-badge {
+     position: fixed;
+     bottom: 20px;
+     right: 20px;
+     z-index: 1000;
+ }
+ """
+
+ # Gradio Interface
+ with gr.Blocks(theme=gr.themes.Default()) as demo:
+     gr.Markdown("# 🎙️ Groq x Gradio Voice-Powered AI Assistant")
+
+     api_key_input = gr.Textbox(type="password", label="Enter your Groq API Key")
+
+     with gr.Row():
+         audio_input = gr.Audio(label="Speak!", type="numpy")
+
+     with gr.Row():
+         transcription_output = gr.Textbox(label="Transcription")
+         response_output = gr.Textbox(label="AI Assistant Response")
+         audio_output = gr.Audio(label="Voice Response", type="file")
+
+     submit_button = gr.Button("Process", variant="primary")
+
+     gr.HTML("""
+     <div id="groq-badge">
+         <div style="color: #f55036; font-weight: bold;">POWERED BY GROQ</div>
+     </div>
+     """)
+
+     submit_button.click(
+         process_audio,
+         inputs=[audio_input, api_key_input],
+         outputs=[transcription_output, response_output, audio_output]
+     )
+
+     gr.Markdown("""
+     ## How to use this app:
+     1. Enter your [Groq API Key](https://console.groq.com/keys) in the provided field.
+     2. Click on the microphone icon and speak your message (or upload an audio file).
+     3. Click the "Process" button to transcribe your speech and generate a response from our AI assistant.
+     4. The transcription, AI assistant response, and voice response will appear.
+     """)
+
+ demo.launch()
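
One wiring detail to check when reusing this code: submit_button.click declares three outputs, but the missing-API-key branch of process_audio returns only two values, which Gradio reports as too few return values. A minimal sketch of that branch padded to one value per declared output (a suggested adjustment, not part of this commit):

def process_audio(audio, api_key):
    # Every return path yields (transcription, response, audio) to match the three Gradio outputs.
    if not api_key:
        return "Please enter your Groq API key.", "API key is required.", None

    transcription = transcribe_audio(audio, api_key)
    response = generate_response(transcription, api_key)

    if "Error" in response:
        return transcription, response, None  # skip text-to-speech on error

    return transcription, response, convert_text_to_speech(response)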