"""Gradio demo: caption an image, then ask Zephyr to draft a chatbot spec.

The uploaded image is captioned by a remote FuseCap Space; the caption is
appended to a fixed system prompt and passed to a local text-generation
pipeline, whose completion is shown in a textbox.
"""

import re

import gradio as gr
import torch
from gradio_client import Client
from transformers import pipeline

# Remote captioning Space; connecting happens at import time.
fusecap_client = Client("https://noamrot-fusecap-image-captioning.hf.space/")


def get_caption(image_in):
    """Return the FuseCap caption for the image at path ``image_in``.

    Args:
        image_in: Image reference passed to the Space's ``/predict`` endpoint
            (a string for the 'raw_image' Image component).

    Returns:
        Whatever the ``/predict`` endpoint returns; it is used downstream as
        the caption text.
    """
    fusecap_result = fusecap_client.predict(
        image_in,  # str representing input in 'raw_image' Image component
        api_name="/predict",
    )
    print(fusecap_result)
    return fusecap_result


# Local text-generation model; loading happens at import time.
pipe = pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# System prompt sent to the model. The text (including its whitespace) is
# reproduced exactly as stored in this file; it is a plain string because it
# contains no placeholders.
agent_maker_sys = (
    " You are an AI whose job it is to help users create their own chatbots. "
    "In particular, you need to respond succintly in a friendly tone, "
    "write a system prompt for an LLM, a catchy title for the chatbot, "
    "and a very short example user input. "
    "Make sure each part is included. "
    "To do so, user will provide an image description, "
    "from which you must write a system prompt corresponding to the "
    "character of the person or subject described. "
    'For example, if a user says, "make a bot that gives advice on how to '
    'grow your startup", first do a friendly response, '
    "then add the title, system prompt, and example user input. "
    "Immediately STOP after the example input. "
    "It should be EXACTLY in this format: "
    "Sure, I'd be happy to help you build a bot! "
    "I'm generating a title, system prompt, and an example input. "
    "How do they sound? Feel free to give me feedback! "
    "Title: Startup Coach "
    "System prompt: Your job as an LLM is to provide good startup advice. "
    "Do not provide extraneous comments on other topics. "
    "Be succinct but useful. "
    "Example input: Risks of setting up a non-profit board "
    "Here's another example. "
    'If a user types, "Make a chatbot that roasts tech ceos", respond: '
    "Sure, I'd be happy to help you build a bot! "
    "I'm generating a title, system prompt, and an example input. "
    "How do they sound? Feel free to give me feedback! "
    "Title: Tech Roaster "
    "System prompt: As an LLM, your primary function is to deliver "
    "hilarious and biting critiques of technology CEOs. \n"
    "Keep it witty and entertaining, but also make sure your jokes aren't "
    "too mean-spirited or factually incorrect. "
    "Example input: Elon Musk "
)

# Prompt prefix: system turn followed by the opening of the user turn.
# NOTE(review): Zephyr's published chat format terminates turns with "</s>"
# and ends the prompt with an "<|assistant|>" marker; neither is present
# here -- confirm whether that is intentional.
instruction = f" <|system|> {agent_maker_sys} <|user|> "

# Matches everything from the start of a completion up to and including the
# first "<|assistant|>" marker the model may emit before its actual reply.
_ASSISTANT_PREAMBLE = re.compile(r"\A.*?<\|assistant\|>", re.DOTALL)


def infer(image_in):
    """Caption ``image_in`` and return the model's reply for that caption.

    Args:
        image_in: File path of the uploaded image, or ``None`` when the
            request carries no image.

    Returns:
        The generated completion with any leading text up to an emitted
        ``<|assistant|>`` marker removed. The prompt itself is never part of
        the returned text.

    Raises:
        gr.Error: If no image was provided.
    """
    if image_in is None:
        # Fail with a readable UI message instead of forwarding None to the
        # remote captioning endpoint.
        raise gr.Error("Please upload an image first.")

    user_prompt = get_caption(image_in)
    prompt = f"{instruction.strip()}\n{user_prompt}"
    print(f"PROMPT: {prompt}")

    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        # Ask for the completion only. Previously the prompt was removed with
        # a regex that required the model to emit "<|assistant|>"; when it
        # did not, the whole system prompt leaked into the output.
        return_full_text=False,
    )
    print(outputs)

    completion = outputs[0]["generated_text"]
    # If the model opened an assistant turn itself, keep only what follows it;
    # with no marker the completion is returned unchanged.
    return _ASSISTANT_PREAMBLE.sub("", completion, count=1)


demo = gr.Interface(
    fn=infer,
    inputs=[gr.Image(type="filepath")],
    outputs=[gr.Textbox()],
)
demo.queue().launch()