import gradio as gr
from deepface import DeepFace
from transformers import pipeline
import io
import base64
import pandas as pd
import numpy as np
from huggingface_hub import InferenceClient

# BLIP pipeline for image captioning
get_blip = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
# Use DeepFace to detect age, gender, and emotion
def analyze_face(image):
    # Convert the PIL image to a numpy array
    image_array = np.array(image)
    face_result = DeepFace.analyze(image_array, actions=['age', 'gender', 'emotion'], enforce_detection=False)
    # Convert the resulting list of dicts to a DataFrame
    df = pd.DataFrame(face_result)
    # [0] selects the value in the first row of each DataFrame column
    return df['dominant_gender'][0], df['age'][0], df['dominant_emotion'][0]
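# For reference, DeepFace.analyze returns roughly this shape (one dict per
# detected face; the exact set of keys can vary by DeepFace version):
# [{'age': 31, 'dominant_gender': 'Woman', 'dominant_emotion': 'happy', ...}]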
# Use BLIP to generate a caption.
# image_to_base64_str converts a PIL image to a base64-encoded PNG string.
def image_to_base64_str(pil_image):
    byte_arr = io.BytesIO()
    pil_image.save(byte_arr, format='PNG')
    byte_arr = byte_arr.getvalue()
    return str(base64.b64encode(byte_arr).decode('utf-8'))
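# e.g. image_to_base64_str(Image.open("x.png")) -> "iVBORw0KGgo..." (the
# "iVBORw0KGgo" prefix is the base64-encoded PNG file signature)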
# captioner takes an image and returns the BLIP caption
def captioner(image):
    base64_image = image_to_base64_str(image)
    caption = get_blip(base64_image)
    # The pipeline returns a list of dicts; [0] takes the first result
    return caption[0]['generated_text']
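# Note: recent transformers releases also accept a PIL image directly
# (get_blip(image)), which would make the base64 round-trip unnecessary.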
def get_image_info(image):
    # Caption the image with BLIP
    image_caption = captioner(image)
    # Detect the face attributes with DeepFace
    gender, age, emotion = analyze_face(image)
    return image_caption, gender, age, emotion
client = InferenceClient(
    "mistralai/Mistral-7B-Instruct-v0.1"
)
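# A Hugging Face token can be passed for higher rate limits, e.g. (assuming
# the HF_TOKEN environment variable is set and os is imported):
#   client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1", token=os.environ["HF_TOKEN"])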
def generate(image, temperature=0.9, max_new_tokens=1500, top_p=0.95, repetition_penalty=1.0):
    image_caption, gender, age, emotion = get_image_info(image)
    # Mistral-7B-Instruct expects prompts wrapped in [INST] ... [/INST]
    prompt = (
        f"[INST] Please generate a detailed and engaging story based on the person's emotion: {emotion}, "
        f"age: {age}, and gender: {gender} shown in the image. Begin with the scene described in the image's caption: '{image_caption}'. "
        f"The generated story should include a beginning, middle, and end. [/INST]"
    )
    print("prompt:", prompt)
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,  # fixed seed for reproducible sampling
    )
    # Stream tokens from the Inference API, yielding the growing story so the
    # Gradio textbox updates incrementally
    stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    output = ""
    for response in stream:
        output += response.token.text
        yield output
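# Minimal usage sketch outside Gradio (hypothetical; assumes "photo.png"
# exists on disk):
#   from PIL import Image
#   for partial_story in generate(Image.open("photo.png")):
#       pass  # each iteration is the story generated so far
#   print(partial_story)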
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(sources=["upload", "webcam"], label="Upload Image", type="pil"),
        gr.Slider(
            label="Temperature",
            value=0.9,
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            interactive=True,
            info="Higher values produce more diverse outputs",
        ),
        gr.Slider(
            label="Max new tokens",
            value=1500,
            minimum=0,
            maximum=3000,
            step=1.0,
            interactive=True,
            info="The maximum number of new tokens",
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            value=0.90,
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            interactive=True,
            info="Higher values sample more low-probability tokens",
        ),
        gr.Slider(
            label="Repetition penalty",
            value=1.2,
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            interactive=True,
            info="Penalize repeated tokens",
        ),
    ],
    outputs=[gr.Textbox(label="Generated Story")],
    title="Story Generation",
    description="Generate a story from an uploaded image.",
    allow_flagging="never",
)
demo.launch(debug=True)