Prathamesh1420 committed on
Commit
bacd0f5
1 Parent(s): ae32500

Update app.py

Browse files

new Groq API method

Files changed (1)
  1. app.py +124 -159
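
For context, the rewrite drops the local TensorFlow emotion model and Streamlit UI in favor of Groq-hosted models behind a Gradio UI. A minimal sketch of the two Groq calls the new app.py leans on, assuming the groq Python SDK is installed, a valid key in the GROQ_API_KEY environment variable, and a placeholder sample.wav recording:

import os
import groq

# Client construction mirrors the commit's groq.Client(api_key=...); Groq is the SDK's main class.
client = groq.Groq(api_key=os.environ["GROQ_API_KEY"])

# Speech-to-text with Distil-Whisper, as in transcribe_audio().
with open("sample.wav", "rb") as audio_file:  # placeholder file name
    transcription = client.audio.transcriptions.create(
        model="distil-whisper-large-v3-en",
        file=("sample.wav", audio_file),
        response_format="text",
    )

# Text generation with Llama 3 70B, as in generate_response().
chat = client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[{"role": "user", "content": str(transcription)}],
)
print(chat.choices[0].message.content)
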
app.py CHANGED
@@ -1,162 +1,127 @@
- import streamlit as st
- import pyttsx3
- import speech_recognition as sr
- from playsound import playsound
- import random
- import datetime
- import webbrowser as wb
- import tensorflow as tf
  import numpy as np
- import librosa
- import matplotlib.pyplot as plt
- import seaborn as sns
- from modules import commands_answers, load_agenda
-
- # Initial settings
- sns.set()
- commands = commands_answers.commands
- answers = commands_answers.answers
- my_name = 'Bob'
-
- # Paths for browser
- chrome_path = 'open -a /Applications/Google\ Chrome.app %s' # MacOS
- # chrome_path = 'C:/Program Files/Google/Chrome/Application/chrome.exe %s' # Windows
- # chrome_path = '/usr/bin/google-chrome %s' # Linux
-
- # Load model
- MODEL_TYPES = ['EMOTION']
- def load_model_by_name(model_type):
-     if model_type == MODEL_TYPES[0]:
-         model = tf.keras.models.load_model('models/speech_emotion_recognition.hdf5')
-         model_dict = list(['calm', 'happy', 'fear', 'nervous', 'neutral', 'disgust', 'surprise', 'sad'])
-         SAMPLE_RATE = 48000
-         return model, model_dict, SAMPLE_RATE
-
- loaded_model = load_model_by_name('EMOTION')
-
- # Functions
- def search(sentence):
-     wb.get(chrome_path).open('https://www.google.com/search?q=' + sentence)
-
- def predict_sound(AUDIO, SAMPLE_RATE, plot=True):
-     results = []
-     wav_data, sample_rate = librosa.load(AUDIO, sr=SAMPLE_RATE)
-     clip, index = librosa.effects.trim(wav_data, top_db=60, frame_length=512, hop_length=64)
-     splitted_audio_data = tf.signal.frame(clip, sample_rate, sample_rate, pad_end=True, pad_value=0)
-     for i, data in enumerate(splitted_audio_data.numpy()):
-         mfccs_features = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=40)
-         mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
-         mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)[:, :, np.newaxis]
-         predictions = loaded_model[0].predict(mfccs_scaled_features)
-         if plot:
-             plt.figure(figsize=(len(splitted_audio_data), 5))
-             plt.barh(loaded_model[1], predictions[0])
-             plt.tight_layout()
-             st.pyplot(plt)
-
-         predictions = predictions.argmax(axis=1)
-         predictions = predictions.astype(int).flatten()
-         predictions = loaded_model[1][predictions[0]]
-         results.append(predictions)
-
-     count_results = [[results.count(x), x] for x in set(results)]
-     return max(count_results)
-
- def play_music_youtube(emotion):
-     play = False
-     if emotion == 'sad' or emotion == 'fear':
-         wb.get(chrome_path).open('https://www.youtube.com/watch?v=k32IPg4dbz0&ab_channel=Amelhorm%C3%BAsicainstrumental')
-         play = True
-     if emotion == 'nervous' or emotion == 'surprise':
-         wb.get(chrome_path).open('https://www.youtube.com/watch?v=pWjmpSD-ph0&ab_channel=CassioToledo')
-         play = True
-     return play
-
- def speak(text):
-     engine = pyttsx3.init()
-     engine.setProperty('rate', 90) # number of words per second
-     engine.setProperty('volume', 1) # min: 0, max: 1
-     engine.say(text)
-     engine.runAndWait()
-
- def listen_microphone():
-     microphone = sr.Recognizer()
-     with sr.Microphone() as source:
-         microphone.adjust_for_ambient_noise(source, duration=0.8)
-         st.write('Listening...')
-         audio = microphone.listen(source)
-         with open('recordings/speech.wav', 'wb') as f:
-             f.write(audio.get_wav_data())
      try:
-         sentence = microphone.recognize_google(audio, language='en-US')
-         st.write('You said: ' + sentence)
-     except sr.UnknownValueError:
-         sentence = ''
-         st.write('Not understood')
-     return sentence
-
- def test_models():
-     audio_source = 'recordings/speech.wav'
-     prediction = predict_sound(audio_source, loaded_model[2], plot=False)
-     return prediction
-
- # Streamlit UI
- st.title("Virtual Assistant")
- st.write("This assistant can perform tasks based on your voice commands.")
-
- if st.button("Activate Assistant"):
-     result = listen_microphone()
-
-     if my_name.lower() in result.lower():
-         result = str(result.split(my_name + ' ')[1])
-         result = result.lower()
-
-         if result in commands[0]:
-             speak('I will read my list of functionalities: ' + answers[0])
-
-         elif result in commands[3]:
-             speak('It is now ' + datetime.datetime.now().strftime('%H:%M'))
-
-         elif result in commands[4]:
-             date = datetime.date.today().strftime('%d/%B/%Y').split('/')
-             speak('Today is ' + date[0] + ' of ' + date[1])
-
-         elif result in commands[1]:
-             speak('Please, tell me the activity!')
-             result = listen_microphone()
-             annotation = open('annotation.txt', mode='a+', encoding='utf-8')
-             annotation.write(result + '\n')
-             annotation.close()
-             speak(''.join(random.sample(answers[1], k=1)))
-             speak('Want me to read the notes?')
-             result = listen_microphone()
-             if result == 'yes' or result == 'sure':
-                 with open('annotation.txt') as file_source:
-                     lines = file_source.readlines()
-                     for line in lines:
-                         speak(line)
-             else:
-                 speak('Ok!')
-
-         elif result in commands[2]:
-             speak(''.join(random.sample(answers[2], k=1)))
-             result = listen_microphone()
-             search(result)
-
-         elif result in commands[6]:
-             if load_agenda.load_agenda():
-                 speak('These are the events for today:')
-                 for i in range(len(load_agenda.load_agenda()[1])):
-                     speak(load_agenda.load_agenda()[1][i] + ' ' + load_agenda.load_agenda()[0][i] + ' schedule for ' + str(load_agenda.load_agenda()[2][i]))
-             else:
-                 speak('There are no events for today considering the current time!')
-
-         elif result in commands[5]:
-             st.write('Emotion analysis mode activated!')
-             analyse = test_models()
-             st.write(f'I heard {analyse} in your voice!')
-             play_music_youtube(analyse[1])
-
-         elif result == 'turn off':
-             speak(''.join(random.sample(answers[4], k=1)))
-             st.write("Assistant turned off.")

+ import gradio as gr
+ import groq
+ import io
  import numpy as np
+ import soundfile as sf
+ import pyttsx3  # Text-to-speech conversion
+
+ # Initialize text-to-speech engine
+ tts_engine = pyttsx3.init()
+
+ def transcribe_audio(audio, api_key):
+     if audio is None:
+         return ""
+
+     client = groq.Client(api_key=api_key)
+
+     # Convert audio to the format expected by the model
+     audio_data = audio[1]  # Get the numpy array from the tuple
+     buffer = io.BytesIO()
+     sf.write(buffer, audio_data, audio[0], format='wav')
+     buffer.seek(0)
+
      try:
+         # Use Distil-Whisper English powered by Groq for transcription
+         completion = client.audio.transcriptions.create(
+             model="distil-whisper-large-v3-en",
+             file=("audio.wav", buffer),
+             response_format="text"
+         )
+         return completion
+     except Exception as e:
+         return f"Error in transcription: {str(e)}"
+
+ def generate_response(transcription, api_key):
+     if not transcription:
+         return "No transcription available. Please try speaking again."
+
+     client = groq.Client(api_key=api_key)
+
+     try:
+         # Use Llama 3 70B powered by Groq for text generation
+         completion = client.chat.completions.create(
+             model="llama3-70b-8192",
+             messages=[{"role": "user", "content": transcription}]
+         )
+         return completion.choices[0].message.content
+     except Exception as e:
+         return f"Error in response generation: {str(e)}"
+
+ def convert_text_to_speech(text):
+     tts_engine.save_to_file(text, 'response_output.wav')
+     tts_engine.runAndWait()
+     with open("response_output.wav", "rb") as f:
+         audio_bytes = f.read()
+     return audio_bytes
+
+ def process_audio(audio, api_key):
+     if not api_key:
+         return "Please enter your Groq API key.", "API key is required."
+
+     transcription = transcribe_audio(audio, api_key)
+     response = generate_response(transcription, api_key)
+
+     if "Error" in response:
+         return transcription, response, None  # In case of error, return empty audio
+
+     audio_output = convert_text_to_speech(response)
+     return transcription, response, audio_output
+
+ # Custom CSS
+ custom_css = """
+ .gradio-container {
+     background-color: #f5f5f5;
+ }
+ .gr-button-primary {
+     background-color: #f55036 !important;
+     border-color: #f55036 !important;
+ }
+ .gr-button-secondary {
+     color: #f55036 !important;
+     border-color: #f55036 !important;
+ }
+ #groq-badge {
+     position: fixed;
+     bottom: 20px;
+     right: 20px;
+     z-index: 1000;
+ }
+ """
+
+ # Gradio Interface
+ with gr.Blocks(theme=gr.themes.Default()) as demo:
+     gr.Markdown("# 🎙️ Groq x Gradio Voice-Powered AI Assistant")
+
+     api_key_input = gr.Textbox(type="password", label="Enter your Groq API Key")
+
+     with gr.Row():
+         audio_input = gr.Audio(label="Speak!", type="numpy")
+
+     with gr.Row():
+         transcription_output = gr.Textbox(label="Transcription")
+         response_output = gr.Textbox(label="AI Assistant Response")
+         audio_output = gr.Audio(label="Voice Response", type="file")
+
+     submit_button = gr.Button("Process", variant="primary")
+
+     gr.HTML("""
+     <div id="groq-badge">
+         <div style="color: #f55036; font-weight: bold;">POWERED BY GROQ</div>
+     </div>
+     """)
+
+     submit_button.click(
+         process_audio,
+         inputs=[audio_input, api_key_input],
+         outputs=[transcription_output, response_output, audio_output]
+     )
+
+     gr.Markdown("""
+     ## How to use this app:
+     1. Enter your [Groq API Key](https://console.groq.com/keys) in the provided field.
+     2. Click on the microphone icon and speak your message (or upload an audio file).
+     3. Click the "Process" button to transcribe your speech and generate a response from our AI assistant.
+     4. The transcription, AI assistant response, and voice response will appear.
+     """)
+
+ demo.launch()
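
One wiring detail to check when reusing this code: submit_button.click declares three outputs, but the missing-API-key branch of process_audio returns only two values, which Gradio reports as too few return values. A minimal sketch of that branch padded to one value per declared output (a suggested adjustment, not part of this commit):

def process_audio(audio, api_key):
    # Every return path yields (transcription, response, audio) to match the three Gradio outputs.
    if not api_key:
        return "Please enter your Groq API key.", "API key is required.", None

    transcription = transcribe_audio(audio, api_key)
    response = generate_response(transcription, api_key)

    if "Error" in response:
        return transcription, response, None  # skip text-to-speech on error

    return transcription, response, convert_text_to_speech(response)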