# VocalChat / app.py
# Falln87's picture
# Update app.py
# 0feaeea verified
import os

import diffusers
import pyttsx3
import speech_recognition as sr
import streamlit as st
import torch
from huggingface_hub import inference_api
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Set up speech recognition and synthesis
r = sr.Recognizer()
engine = pyttsx3.init()

# Set up the Hugging Face Hub model and tokenizer.
# The checkpoint is a DistilBERT sentiment classifier (SST-2), loaded locally.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the Serverless Inference API.
# The token is read from the environment instead of an undefined ``HF_TOKEN``
# name; it is None when the variable is not set.
inference_api_token = os.environ.get("HF_TOKEN")
# ``InferenceApi`` is bound to a single repository, so ``repo_id`` is required.
# NOTE: this rebinds the imported ``inference_api`` module name to the client
# instance, as the original code did.
inference_api = inference_api.InferenceApi(
    repo_id=model_name,
    token=inference_api_token,
)

# Set up the Diffusers library
diffusers_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): DDPMPipeline is an unconditional image pipeline, while
# "ByteDance/SDXL-Lightning" is a text-to-image SDXL checkpoint -- confirm this
# pairing loads and accepts a text prompt before relying on it.
diffusers_model = diffusers.DDPMPipeline.from_pretrained("ByteDance/SDXL-Lightning")
def recognize_speech():
    """Record one utterance from the microphone and transcribe it.

    Audio is captured with the module-level recognizer ``r`` and transcribed
    with ``recognize_google`` using the ``en-US`` language setting.

    Returns:
        The recognized text, or ``None`` when the speech could not be
        understood or the recognition request failed.
    """
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)

    try:
        return r.recognize_google(audio, language="en-US")
    except sr.UnknownValueError:
        print("Sorry, I didn't catch that. Try again!")
    except sr.RequestError as err:
        # Raised when the recognition request itself fails; report it and
        # return None like any other failed attempt instead of crashing.
        print(f"Speech recognition request failed: {err}")
    return None
def respond_to_text(text):
    """Classify *text* with the locally loaded model and return its label.

    The module-level ``model`` is a sequence-classification model, so the
    "response" is the name of the highest-scoring class taken from
    ``model.config.id2label``, not generated text.

    Args:
        text: The user's message.

    Returns:
        The predicted class label as a string.
    """
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        # ``max_length`` only takes effect when truncation is enabled.
        truncation=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted = int(torch.argmax(logits, dim=1)[0])
    # The class index is mapped through the model's label table; decoding it
    # with the tokenizer would treat it as a vocabulary token id.
    return model.config.id2label[predicted]
def generate_image(prompt):
    """Generate a single image for *prompt* with the module-level pipeline.

    Args:
        prompt: Text passed to the pipeline as its first positional argument.

    Returns:
        The first image from the pipeline output's ``images`` list, which can
        be handed directly to ``st.image``.
    """
    # Device placement is done on the pipeline object; the pipeline call does
    # not take a ``device`` argument. Moving an already-placed pipeline is
    # cheap, so this is safe to repeat on every call.
    diffusers_model.to(diffusers_device)
    output = diffusers_model(prompt, num_inference_steps=50)
    # Return the image itself rather than the pipeline output wrapper.
    return output.images[0]
def speak_text(text):
    """Read *text* aloud through the module-level pyttsx3 engine.

    Queues the utterance, then blocks in ``runAndWait`` until the engine has
    processed it.
    """
    synthesizer = engine
    synthesizer.say(text)
    synthesizer.runAndWait()
# --- Streamlit page -------------------------------------------------------
# Streamlit re-executes this script on every widget interaction, and a button
# returns True only for the run triggered by its click.
st.title("Chat with LLM and Generate Images")

# Typed chat: classify the message, show the result and speak it.
chat_input = st.text_input("Type or speak something:")
if chat_input:
    response = respond_to_text(chat_input)
    st.write("LLM Response:", response)
    speak_text(response)

# Image generation. The prompt box is rendered unconditionally so its value is
# available on the run in which the button is clicked; created inside the
# button branch it would always be empty when ``generate_image`` is called.
prompt = st.text_input("Enter a prompt for the image:")
generate_image_button = st.button("Generate Image")
if generate_image_button:
    if prompt:
        image = generate_image(prompt)
        st.image(image, use_column_width=True)
    else:
        st.warning("Please enter a prompt for the image first.")

# Spoken chat: transcribe from the microphone, then respond as for typed text.
mic_button = st.button("Speak")
if mic_button:
    text = recognize_speech()
    if text:
        response = respond_to_text(text)
        st.write("LLM Response:", response)
        speak_text(response)
    else:
        # recognize_speech() only prints to the console; surface it in the UI.
        st.warning("Sorry, I didn't catch that. Try again!")