# joy-caption-ko / app.py
import spaces
import gradio as gr
from huggingface_hub import InferenceClient
from torch import nn
from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
from pathlib import Path
import torch
from PIL import Image
import os
import re
# Paths and settings
CLIP_PATH = "google/siglip-so400m-patch14-384"
VLM_PROMPT = "A descriptive caption for this image:\n"
MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
CHECKPOINT_PATH = Path("wpkklhc6")
TITLE = "<h1><center>JoyCaption Pre-Alpha (2024-07-30a)</center></h1>"
HF_TOKEN = os.environ.get("HF_TOKEN", None)
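# Note: meta-llama/Meta-Llama-3.1-8B is a gated repository, so HF_TOKEN (picked up from the
# environment by huggingface_hub) must grant access to it, and CHECKPOINT_PATH is expected to
# contain the trained image_adapter.pt weights loaded further below.
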
# Image adapter definition
class ImageAdapter(nn.Module):
    """Two-layer MLP that projects CLIP/SigLIP vision features into the LLM's embedding space."""
    def __init__(self, input_features: int, output_features: int):
        super().__init__()
        self.linear1 = nn.Linear(input_features, output_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(output_features, output_features)

    def forward(self, vision_outputs: torch.Tensor):
        x = self.linear1(vision_outputs)
        x = self.activation(x)
        x = self.linear2(x)
        return x

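# Note (shapes assuming the default checkpoints above): the SigLIP vision tower emits one hidden
# state per image patch, so the adapter maps (batch, num_patches, clip_hidden_size) to
# (batch, num_patches, llm_hidden_size), and the result is spliced directly into the Llama
# embedding sequence in stream_chat() below.
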
# Load the CLIP (SigLIP) vision model
print("Loading CLIP")
clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
clip_model = AutoModel.from_pretrained(CLIP_PATH)
clip_model = clip_model.vision_model
clip_model.eval()
clip_model.requires_grad_(False)
clip_model.to("cuda")
# Load the tokenizer
print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)), f"Tokenizer is of type {type(tokenizer)}"
# Load the language model (LLM)
print("Loading LLM")
text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16)
text_model.eval()
# Load the image adapter
print("Loading image adapter")
image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size)
image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu"))
image_adapter.eval()
image_adapter.to("cuda")
# Caption generation function (@spaces.GPU allocates GPU time for this call on Hugging Face Spaces)
@spaces.GPU()
@torch.no_grad()
def stream_chat(input_image: Image.Image):
    torch.cuda.empty_cache()

    # Preprocess the image
    image = clip_processor(images=input_image, return_tensors='pt').pixel_values.to('cuda')

    # Tokenize the prompt
    prompt = tokenizer.encode(VLM_PROMPT, return_tensors='pt', add_special_tokens=False).to('cuda')

    # Embed the image: take the penultimate hidden layer of the vision tower and project it
    # into the LLM embedding space with the trained adapter
    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
        vision_outputs = clip_model(pixel_values=image, output_hidden_states=True)
        image_features = vision_outputs.hidden_states[-2]
        embedded_images = image_adapter(image_features).to('cuda')

    # Embed the prompt and the BOS token
    prompt_embeds = text_model.model.embed_tokens(prompt)
    embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device='cuda', dtype=torch.int64))

    # Assemble the input sequence: <BOS> <image embeddings> <prompt>
    inputs_embeds = torch.cat([
        embedded_bos.expand(embedded_images.shape[0], -1, -1),
        embedded_images,
        prompt_embeds,
    ], dim=1)

    # Build matching input_ids (zeros stand in for the image positions) and an attention mask
    input_ids = torch.cat([
        torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long).to('cuda'),
        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long).to('cuda'),
        prompt.to('cuda'),
    ], dim=1)
    attention_mask = torch.ones_like(input_ids)

    # Generate the caption, then strip the prompt tokens and a trailing EOS, if present
    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5)
    generate_ids = generate_ids[:, input_ids.shape[1]:]
    if generate_ids[0][-1] == tokenizer.eos_token_id:
        generate_ids = generate_ids[:, :-1]

    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
    return caption.strip()

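# Illustrative usage outside Gradio (not executed here; "example.jpg" is a placeholder path):
#   caption = stream_chat(Image.open("example.jpg"))
#   print(caption)
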
# Dictionary mapping the Korean UI options below to English phrases
translation_dict = {
    "한국 남자": "Korean man",
    "한국 여자": "Korean woman",
    "삭발": "shaved",
    "숏컷": "short cut",
    "미디엄": "medium",
    "롱헤어": "long hair",
    "레이어드": "layered",
    "밥": "bob",
    "펌": "perm",
    "업스타일": "upstyle",
    "포니테일": "ponytail",
    "브레이드": "braid",
    "컬": "curl",
    "웨이브": "wave",
    "블랙": "black",
    "브라운": "brown",
    "블론드": "blonde",
    "레드": "red",
    "애쉬": "ash",
    "퍼플": "purple",
    "핑크": "pink",
    "블루": "blue",
    "그린": "green",
    "오렌지": "orange",
    "화이트": "white",
    "헤어밴드": "headband",
    "머리핀": "hairpin",
    "리본": "ribbon",
    "스크런치": "scrunchie",
    "헤어클립": "hairclip",
    "티아라": "tiara",
    "꽃장식": "flower decoration",
}
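# The keys above are the exact option strings offered by the Gradio radios below; translate()
# falls back to the raw option when a key is missing (e.g. when a radio is left unselected).
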
# Gender-dependent word replacement
def translate(option):
    return translation_dict.get(option, option)


def replace_gender_specific_words(caption, gender_prefix):
    # Simple regex heuristic: swap common gendered words to match the selected gender.
    # (Pronouns are handled coarsely; e.g. object-case "her" is always mapped to "his".)
    if gender_prefix == "Korean man":
        caption = re.sub(r'\bwoman\b', "man", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bgirl\b', "boy", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\blady\b', "gentleman", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bshe\b', "he", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bher\b', "his", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bherself\b', "himself", caption, flags=re.IGNORECASE)
    elif gender_prefix == "Korean woman":
        caption = re.sub(r'\bman\b', "woman", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bboy\b', "girl", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bgentleman\b', "lady", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bhe\b', "she", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bhis\b', "her", caption, flags=re.IGNORECASE)
        caption = re.sub(r'\bhimself\b', "herself", caption, flags=re.IGNORECASE)
    return caption

def replace_gender_words(caption, gender, age, hair_length, hair_style, hair_color, hair_accessory):
    gender_prefix = translate(gender)
    hair_length_en = translate(hair_length)
    hair_style_en = translate(hair_style)
    hair_color_en = translate(hair_color)
    hair_accessory_en = translate(hair_accessory)
    hair_description = f"{hair_length_en} hair with {hair_style_en}, {hair_color_en} color"
    if hair_accessory_en:
        hair_description += f", wearing a {hair_accessory_en}"
    caption = replace_gender_specific_words(caption, gender_prefix)
    return f"{gender_prefix}, age {age}, {hair_description}: {caption}"

# Recaption function: caption the image, then prepend the user-selected attributes
def recaption(input_image: Image.Image, prefix: str, age: int, hair_length: str, hair_style: str, hair_color: str, hair_accessory: str):
    original_caption = stream_chat(input_image)
    updated_caption = replace_gender_words(original_caption, prefix, age, hair_length, hair_style, hair_color, hair_accessory)
    return updated_caption

# Gradio interface
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image")
            run_button = gr.Button("Caption")
            recaption_button = gr.Button("Recaption")
            gender_selection = gr.Radio(choices=["한국 남자", "한국 여자"], label="성별 선택")
            age_slider = gr.Slider(minimum=1, maximum=100, step=1, label="Age", value=25)
            hair_length = gr.Radio(choices=["삭발", "숏컷", "미디엄", "롱헤어", "레이어드", "밥"], label="헤어 길이")
            hair_style = gr.Radio(choices=["펌", "업스타일", "포니테일", "브레이드", "컬", "웨이브"], label="헤어 스타일")
            hair_color = gr.Radio(choices=["블랙", "브라운", "블론드", "레드", "애쉬", "퍼플", "핑크", "블루", "그린", "오렌지", "화이트"], label="헤어 색상")
            hair_accessory = gr.Radio(choices=["헤어밴드", "머리핀", "리본", "스크런치", "헤어클립", "티아라", "꽃장식"], label="헤어 액세서리")
        with gr.Column():
            output_caption = gr.Textbox(label="Caption")
            new_caption_output = gr.Textbox(label="Recaptioned Caption", placeholder="New caption will appear here")
    run_button.click(fn=stream_chat, inputs=[input_image], outputs=[output_caption])
    recaption_button.click(fn=recaption, inputs=[input_image, gender_selection, age_slider, hair_length, hair_style, hair_color, hair_accessory], outputs=[new_caption_output])

if __name__ == "__main__":
    demo.launch()
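# Running this file locally is possible in principle (a sketch, assuming a CUDA GPU, access to the
# gated meta-llama/Meta-Llama-3.1-8B repository via HF_TOKEN, and the wpkklhc6/image_adapter.pt
# checkpoint next to this file):
#   HF_TOKEN=<your token> python app.py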