import gradio as gr from deepface import DeepFace from transformers import pipeline import io import base64 import pandas as pd import numpy as ny from huggingface_hub import InferenceClient get_blip = pipeline("image-to-text",model="Salesforce/blip-image-captioning-large") # using deepface to detect age, gender, emotion def analyze_face(image): #convert PIL image to numpy array image_array = ny.array(image) face_result = DeepFace.analyze(image_array, actions=['age','gender','emotion'], enforce_detection=False) #convert the resulting dictionary to a dataframe df = pd.DataFrame(face_result) return df['dominant_gender'][0],df['age'][0],df['dominant_emotion'][0] #The [0] at the end is for accessing the value at the first row in a DataFrame column. #using blip to generate caption #image_to_base64_str function to convert image to base64 format def image_to_base64_str(pil_image): byte_arr = io.BytesIO() pil_image.save(byte_arr, format='PNG') byte_arr = byte_arr.getvalue() return str(base64.b64encode(byte_arr).decode('utf-8')) #captioner function to take an image def captioner(image): base64_image = image_to_base64_str(image) caption = get_blip(base64_image) return caption[0]['generated_text'] #The [0] at the beginning is for accessing the first element in a container (like a list or dictionary). def get_image_info(image): #call captioner() function image_caption = captioner(image) #call analyze_face() function gender, age, emotion = analyze_face(image) #return image_caption,face_attributes return image_caption, gender, age, emotion client = InferenceClient( "mistralai/Mistral-7B-Instruct-v0.1" ) def generate(image, temperature=0.9, max_new_tokens=1500, top_p=0.95, repetition_penalty=1.0): image_caption, gender, age, emotion = get_image_info(image) #prompt = f"[INS] Generate a story based on person’s emotion: {emotion}, age: {age}, gender: {gender} of the image, and image’s caption: {image_caption}.[/INS]" prompt = ( f"[INS] Please generate a detailed and engaging story based on the person's emotion: {emotion}, " f"age: {age}, and gender: {gender} shown in the image. Begin with the scene described in the image's caption: '{image_caption}'. " f"The generated story should include a beginning, middle, and end.[/INS]" ) print("prompt:",prompt) temperature = float(temperature) if temperature < 1e-2: temperature = 1e-2 top_p = float(top_p) generate_kwargs = dict( temperature=temperature, max_new_tokens=max_new_tokens, top_p=top_p, repetition_penalty=repetition_penalty, do_sample=True, seed=42, ) stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False) output = "" for response in stream: output += response.token.text # yield "".join(output) yield output return output demo = gr.Interface(fn=generate, inputs=[ #gr.Video(sources=["webcam"], label="video") gr.Image(sources=["upload", "webcam"], label="Upload Image", type="pil"), gr.Slider( label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs", ), gr.Slider( label="Max new tokens", value=1500, minimum=0, maximum=3000, step=1.0, interactive=True, info="The maximum numbers of new tokens"), gr.Slider( label="Top-p (nucleus sampling)", value=0.90, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens", ), gr.Slider( label="Repetition penalty", value=1.2, minimum=1.0, maximum=2.0, step=0.05, interactive=True, info="Penalize repeated tokens", ) ], outputs=[gr.Textbox(label="Generated Story")], title="story generation", description="generate a story for you", allow_flagging="never" ) demo.launch(debug=(True))