"""Streamlit demo: sentiment replies for typed or spoken text, plus image generation.

The page classifies the user's text with a locally loaded sequence
classification model, shows and speaks the predicted label, and can render an
image from a separate text prompt.
"""

import os

import diffusers
import pyttsx3
import speech_recognition as sr
import streamlit as st
import torch
from huggingface_hub import hf_hub_download, inference_api
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set up speech recognition and synthesis
r = sr.Recognizer()
engine = pyttsx3.init()

# Set up the Hugging Face Hub model and tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"


@st.cache_resource
def _load_classifier(name):
    """Load the classification model and its tokenizer once per server process."""
    return (
        AutoModelForSequenceClassification.from_pretrained(name),
        AutoTokenizer.from_pretrained(name),
    )


# Streamlit re-executes this script on every interaction; the cached loader
# keeps the weights from being reloaded each time.
model, tokenizer = _load_classifier(model_name)

# Set up the Serverless Inference API
# The token comes from the environment (the original referenced an undefined
# HF_TOKEN name); None means anonymous access.
inference_api_token = os.environ.get("HF_TOKEN")
# InferenceApi needs the target repo id. The name `inference_api` is rebound
# from the imported module to the client instance, as in the original script.
# NOTE(review): respond_to_text() runs the local model, so this client is
# currently unused. InferenceApi also appears to be deprecated in favour of
# InferenceClient in newer huggingface_hub releases - confirm before relying on it.
inference_api = inference_api.InferenceApi(
    repo_id=model_name, token=inference_api_token
)

# Set up the Diffusers library
diffusers_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the values below follow the SDXL-Lightning model card as I
# understand it (LoRA weights applied on top of the SDXL base pipeline, with
# the step count matching the chosen checkpoint) - confirm against the card.
SDXL_BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
SDXL_LIGHTNING_REPO = "ByteDance/SDXL-Lightning"
SDXL_LIGHTNING_LORA = "sdxl_lightning_4step_lora.safetensors"
SDXL_LIGHTNING_STEPS = 4


@st.cache_resource
def _load_diffusion_pipeline():
    """Build the text-to-image pipeline once and move it to `diffusers_device`."""
    # Half precision only on GPU; on CPU fall back to the default dtype.
    load_kwargs = {}
    if diffusers_device.type == "cuda":
        load_kwargs = {"torch_dtype": torch.float16, "variant": "fp16"}
    pipe = diffusers.StableDiffusionXLPipeline.from_pretrained(
        SDXL_BASE_MODEL, **load_kwargs
    )
    # NOTE(review): recent diffusers releases may need the `peft` package
    # installed for LoRA loading - verify in the deployment environment.
    pipe.load_lora_weights(hf_hub_download(SDXL_LIGHTNING_REPO, SDXL_LIGHTNING_LORA))
    pipe.fuse_lora()
    pipe.scheduler = diffusers.EulerDiscreteScheduler.from_config(
        pipe.scheduler.config, timestep_spacing="trailing"
    )
    return pipe.to(diffusers_device)


diffusers_model = _load_diffusion_pipeline()


def recognize_speech():
    """Record one utterance from the default microphone and transcribe it.

    Returns:
        The recognized text, or None when the audio could not be understood
        or the recognition service could not be reached.
    """
    with sr.Microphone() as source:
        print("Say something!")
        audio = r.listen(source)
    try:
        return r.recognize_google(audio, language="en-US")
    except sr.UnknownValueError:
        print("Sorry, I didn't catch that. Try again!")
    except sr.RequestError as err:
        # Raised when the recognition request itself fails.
        print(f"Speech recognition request failed: {err}")
    return None


def respond_to_text(text):
    """Classify `text` with the local model and return the predicted label.

    Args:
        text: The user's message.

    Returns:
        The label string that the model config maps to the highest-scoring class.
    """
    inputs = tokenizer(
        text,
        truncation=True,  # without this, max_length alone does not shorten the input
        max_length=512,
        return_tensors="pt",
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted = int(logits.argmax(dim=1)[0])
    # The prediction is a class index, not a vocabulary token id, so it is
    # mapped through the model config rather than decoded by the tokenizer.
    return model.config.id2label[predicted]


def generate_image(prompt, num_inference_steps=SDXL_LIGHTNING_STEPS):
    """Generate one image for `prompt` with the cached diffusion pipeline.

    Args:
        prompt: Text description of the desired image.
        num_inference_steps: Number of denoising steps; the default matches
            the loaded Lightning checkpoint.

    Returns:
        The first image of the pipeline output.
    """
    result = diffusers_model(
        prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=0,  # NOTE(review): value taken from the model card - confirm
    )
    return result.images[0]


def speak_text(text):
    """Speak `text` through the text-to-speech engine and block until done."""
    engine.say(text)
    engine.runAndWait()


def _show_response(text):
    """Classify `text`, display the label, and read it aloud."""
    response = respond_to_text(text)
    st.write("LLM Response:", response)
    speak_text(response)


st.title("Chat with LLM and Generate Images")

chat_input = st.text_input("Type or speak something:")
if chat_input:
    _show_response(chat_input)

# The prompt box is created on every run, not only inside the button branch;
# otherwise the click would generate from an empty prompt and the box would
# disappear on the next rerun.
prompt = st.text_input("Enter a prompt for the image:")
generate_image_button = st.button("Generate Image")
if generate_image_button:
    if prompt:
        st.image(generate_image(prompt), use_column_width=True)
    else:
        st.warning("Enter a prompt for the image first.")

mic_button = st.button("Speak")
if mic_button:
    text = recognize_speech()
    if text:
        _show_response(text)
    else:
        st.warning("No speech was recognized. Please try again.")