import os

import streamlit as st
from huggingface_hub import InferenceClient
from PIL import Image
from transformers import pipeline

# Authenticate against the Hugging Face Inference API with a token
# read from the environment.
api_key = os.getenv("HUGGINGFACE_TOKEN")
client = InferenceClient(api_key=api_key)

st.header("Character Captions (IN PROGRESS!)")
st.write("Have a character caption any image you upload!")

character = st.selectbox("Choose a character", ["rapper", "shrek", "unintelligible"])

uploaded_img = st.file_uploader("Upload an image")


# Cache the BLIP captioning pipeline so the model is loaded once,
# not on every Streamlit rerun.
@st.cache_resource
def load_captioner():
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")


if uploaded_img is not None:
    image = Image.open(uploaded_img)
    st.image(image)

    # Generate a plain caption for the uploaded image.
    image_captioner = load_captioner()
    captions = image_captioner(image)
    caption = captions[0]["generated_text"]

    # Wrap the caption in a character-specific prompt.
    character_prompts = {
        "rapper": f"Describe this scene like you're a rapper: {caption}.",
        "shrek": f"Describe this scene like you're Shrek: {caption}.",
        "unintelligible": f"Describe this scene in a way that makes no sense: {caption}.",
    }
    prompt = character_prompts[character]

    messages = [{"role": "user", "content": prompt}]

    # Stream the chat completion from the hosted Llama model.
    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages=messages,
        max_tokens=500,
        stream=True,
    )

    # Accumulate the streamed tokens, then render the full reply.
    response = ""
    for chunk in stream:
        # The final chunk's delta may carry no content; guard against None.
        response += chunk.choices[0].delta.content or ""
    st.write(response)
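
# Usage (a sketch, assuming the script above is saved as app.py; the
# filename is an assumption, and the token value is deliberately elided):
#
#   export HUGGINGFACE_TOKEN=hf_...   # your own Hugging Face access token
#   streamlit run app.py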