import os

import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from huggingface_hub import InferenceClient
import speech_recognition as sr
import pyttsx3
import diffusers
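# Dependencies (a likely minimal set for this app; versions not pinned here):
#   pip install streamlit torch transformers huggingface_hub diffusers
#   pip install SpeechRecognition pyttsx3 pyaudio
# SpeechRecognition's Microphone needs PyAudio; pyttsx3 needs a system TTS
# backend (SAPI5 on Windows, NSSpeechSynthesizer on macOS, eSpeak on Linux).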
# Set up speech recognition and synthesis
r = sr.Recognizer()
engine = pyttsx3.init()
# Set up the Hugging Face Hub model and tokenizer.
# Note: this checkpoint is a binary sentiment classifier (SST-2), not a
# generative LLM, so the "responses" below are sentiment labels.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set up the Serverless Inference API client. The token is assumed to come
# from the HF_TOKEN environment variable; the client is kept as an optional
# remote alternative to the local model used below.
inference_client = InferenceClient(token=os.environ.get("HF_TOKEN"))
# Set up the Diffusers pipeline. Stable Diffusion checkpoints need
# StableDiffusionPipeline (DDPMPipeline is for unconditional DDPM models),
# and the pipeline must be moved to the device explicitly.
diffusers_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diffusers_model = diffusers.StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2"
)
diffusers_model = diffusers_model.to(diffusers_device)
def recognize_speech():
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
    try:
        text = r.recognize_google(audio, language="en-US")
        return text
    except sr.UnknownValueError:
        print("Sorry, I didn't catch that. Try again!")
        return None
    except sr.RequestError:
        print("Speech recognition service is unavailable.")
        return None
def respond_to_text(text):
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**inputs)
    predicted = outputs.logits.argmax(dim=-1).item()
    # The classifier predicts a class index, so the "response" is the
    # corresponding label (POSITIVE/NEGATIVE), not decoded token text
    response = model.config.id2label[predicted]
    # Alternatively, via the Serverless Inference API client set up above:
    # response = inference_client.text_classification(text, model=model_name)[0].label
    return response
def generate_image(prompt):
    # The pipeline call returns an output object; take the first generated image.
    # The device was set via .to() above, not as a call argument.
    result = diffusers_model(prompt, num_inference_steps=50)
    return result.images[0]
def speak_text(text):
    engine.say(text)
    engine.runAndWait()
st.title("Chat with LLM and Generate Images")
chat_input = st.text_input("Type or speak something:")
if chat_input:
    response = respond_to_text(chat_input)
    st.write("LLM Response:", response)
    speak_text(response)
# Ask for the prompt before the button: a text_input created inside the
# button branch would vanish on the next Streamlit rerun, before the user
# could type into it.
prompt = st.text_input("Enter a prompt for the image:")
generate_image_button = st.button("Generate Image")
if generate_image_button and prompt:
    image = generate_image(prompt)
    st.image(image, use_column_width=True)
mic_button = st.button("Speak")
if mic_button:
    text = recognize_speech()
    if text:
        response = respond_to_text(text)
        st.write("LLM Response:", response)
        speak_text(response)
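# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py
# Note: sr.Microphone and pyttsx3 use the audio devices of the machine
# running the script, so the speech features work for local runs but not
# for visitors of a hosted Space.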