import os

import streamlit as st
from huggingface_hub import InferenceClient
from PIL import Image
from transformers import pipeline


def initialize():
    # Initialize the Hugging Face client only once per session
    if 'initialized' not in st.session_state:
        print("Initializing...")
        st.session_state['initialized'] = True
        st.session_state['api_key'] = os.getenv("HUGGINGFACE_TOKEN")
        st.session_state['client'] = InferenceClient(api_key=st.session_state['api_key'])


def main():
    initialize()

    st.header("Character Captions")
    st.write("Have a character caption any image you upload!")

    character = st.selectbox(
        "Choose a character",
        ["rapper", "shrek", "unintelligible", "cookie monster"]
    )

    uploaded_img = st.file_uploader("Upload an image")
    if uploaded_img is not None:
        # Open and display the uploaded image
        image = Image.open(uploaded_img)
        st.image(image)

        # Generate a plain caption for the image with BLIP
        image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
        response = image_captioner(image)
        caption = response[0]['generated_text']

        # Wrap the caption in a character-specific prompt
        character_prompts = {
            "rapper": f"Describe this caption like you're a rapper: {caption}.",
            "shrek": f"Describe this caption like you're Shrek: {caption}.",
            "unintelligible": f"Describe this caption in a way that makes no sense: {caption}.",
            "cookie monster": f"Describe this caption like you're Cookie Monster: {caption}."
        }
        prompt = character_prompts[character]

        messages = [{"role": "user", "content": prompt}]

        # Stream Llama's in-character take on the image caption
        stream = st.session_state['client'].chat.completions.create(
            model="meta-llama/Llama-3.2-3B-Instruct",
            messages=messages,
            max_tokens=500,
            stream=True
        )

        response = ''
        for chunk in stream:
            # delta.content can be None on the final streamed chunk, so fall back to ''
            response += chunk.choices[0].delta.content or ''
        st.write(response)


if __name__ == '__main__':
    main()
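
# Optional refinement (a sketch, not part of the original app): loading the BLIP
# pipeline inside main() re-initializes the model on every Streamlit rerun, which
# is slow. Assuming Streamlit >= 1.18, st.cache_resource can hold a single shared
# copy across reruns; the helper name get_captioner() is hypothetical:
#
# @st.cache_resource
# def get_captioner():
#     # Loaded once on first call, then reused for all subsequent reruns/sessions
#     return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
#
# ...and inside main(), replace the pipeline(...) call with:
# image_captioner = get_captioner()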