import os

import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from huggingface_hub import InferenceClient, hf_hub_download
import speech_recognition as sr
import pyttsx3
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
from safetensors.torch import load_file
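
# Assumed dependency set for this app (PyPI names; PyAudio is what
# sr.Microphone needs under the hood):
#   pip install streamlit torch transformers huggingface_hub diffusers \
#       safetensors SpeechRecognition pyttsx3 PyAudio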

# Set up speech recognition and synthesis
r = sr.Recognizer()
engine = pyttsx3.init()

# Set up the Hugging Face Hub model and tokenizer. This checkpoint is a
# binary sentiment classifier, not a generative chat model. Caching the
# load keeps Streamlit from reloading the weights on every rerun.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

@st.cache_resource
def load_classifier():
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

model, tokenizer = load_classifier()

# Set up the Serverless Inference API client. The token is read from the
# HF_TOKEN environment variable rather than being hard-coded; the client is
# a remote alternative to the local model loaded above (see respond_to_text).
inference_client = InferenceClient(token=os.environ.get("HF_TOKEN"))

# Set up the Diffusers pipeline. ByteDance/SDXL-Lightning publishes distilled
# UNet checkpoints for SDXL rather than a standalone pipeline, so the UNet is
# loaded into the SDXL base pipeline, following the recipe on the model card.
# The fp16 weights assume a CUDA GPU.
diffusers_device = "cuda" if torch.cuda.is_available() else "cpu"

@st.cache_resource
def load_image_pipeline():
    base_model = "stabilityai/stable-diffusion-xl-base-1.0"
    unet = UNet2DConditionModel.from_config(
        UNet2DConditionModel.load_config(base_model, subfolder="unet")
    ).to(diffusers_device, torch.float16)
    unet.load_state_dict(
        load_file(
            hf_hub_download(
                "ByteDance/SDXL-Lightning",
                "sdxl_lightning_4step_unet.safetensors",
            ),
            device=diffusers_device,
        )
    )
    pipe = StableDiffusionXLPipeline.from_pretrained(
        base_model, unet=unet, torch_dtype=torch.float16, variant="fp16"
    ).to(diffusers_device)
    # Lightning checkpoints expect a trailing-timestep Euler scheduler.
    pipe.scheduler = EulerDiscreteScheduler.from_config(
        pipe.scheduler.config, timestep_spacing="trailing"
    )
    return pipe

pipe = load_image_pipeline()

def recognize_speech():
    # Capture one utterance from the default microphone and transcribe it
    # with the free Google Web Speech endpoint (needs network access).
    with sr.Microphone() as source:
        st.info("Say something!")
        audio = r.listen(source)
    try:
        return r.recognize_google(audio, language="en-US")
    except sr.UnknownValueError:
        st.warning("Sorry, I didn't catch that. Try again!")
    except sr.RequestError as err:
        st.error(f"Speech recognition request failed: {err}")
    return None

def respond_to_text(text):
    # Classify the sentiment of the input with the locally loaded model.
    # (The serverless client set up above could do the same remotely via
    # inference_client.text_classification(text, model=model_name).)
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted = logits.argmax(dim=-1).item()
    # The checkpoint is a classifier, so the "response" is the predicted
    # label ("NEGATIVE" or "POSITIVE"), not generated text.
    return model.config.id2label[predicted]
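
# Example (for this SST-2 checkpoint):
#   respond_to_text("I loved this movie!")  # -> "POSITIVE"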

def generate_image(prompt):
    # The 4-step Lightning checkpoint expects exactly 4 inference steps and
    # no classifier-free guidance.
    image = pipe(prompt, num_inference_steps=4, guidance_scale=0).images[0]
    return image

def speak_text(text):
    # Speak the response aloud with the local pyttsx3 engine (blocking call).
    engine.say(text)
    engine.runAndWait()

st.title("Chat with a Sentiment Model and Generate Images")

chat_input = st.text_input("Type or speak something:")
if chat_input:
    response = respond_to_text(chat_input)
    st.write("LLM Response:", response)
    speak_text(response)

# Collect the prompt before the button: widgets created only after a button
# press disappear on Streamlit's next rerun, so the original ordering never
# let the user type a prompt.
image_prompt = st.text_input("Enter a prompt for the image:")
if st.button("Generate Image") and image_prompt:
    image = generate_image(image_prompt)
    st.image(image, use_column_width=True)

mic_button = st.button("Speak")
if mic_button:
    text = recognize_speech()
    if text:
        response = respond_to_text(text)
        st.write("LLM Response:", response)
        speak_text(response)
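
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py
# The Speak button needs a microphone on the machine running the Streamlit
# server, and the fp16 SDXL-Lightning pipeline assumes a CUDA GPU.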