Update app.py
app.py
CHANGED
@@ -267,92 +267,97 @@ image_adapter.to("cuda")
 def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int,
                 lens_type: str = "standard", film_stock: str = "digital",
                 composition: str = "rule of thirds", lighting: str = "natural") -> str:
-    [previous stream_chat body (old lines 270-355) removed; its contents are not recoverable from this rendering]
+    torch.cuda.empty_cache()
+
+    # 'any' means no length specified
+    length = None if caption_length == "any" else caption_length
+
+    if isinstance(length, str):
+        try:
+            length = int(length)
+        except ValueError:
+            pass
+
+    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
+    if caption_type == "rng-tags" or caption_type == "training_prompt":
+        caption_tone = "formal"
+
+    # Build prompt
+    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+    if prompt_key not in CAPTION_TYPE_MAP:
+        raise ValueError(f"Invalid caption type: {prompt_key}")
+
+    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
+
+    # Add style prompt details if applicable
+    if caption_type == "style_prompt":
+        prompt_str += (f" The prompt should specifically include details about using a {lens_type} lens, "
+                       f"{film_stock} film stock, {composition} composition, and {lighting} lighting. "
+                       f"Format the output as a comma-separated list of descriptors and modifiers, "
+                       f"suitable for direct input into a Stable Diffusion interface.")
+
+    print(f"Prompt: {prompt_str}")
+
+    # Preprocess image
+    image = input_image.resize((384, 384), Image.LANCZOS)
+    pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
+    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+    pixel_values = pixel_values.to('cuda')
+
+    # Tokenize the prompt
+    prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
+
+    # Embed image
+    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
+        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+        image_features = vision_outputs.hidden_states
+        embedded_images = image_adapter(image_features)
+        embedded_images = embedded_images.to('cuda')
+
+    # Embed prompt
+    prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
+    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
+
+    # Check if bos_token_id exists
+    if tokenizer.bos_token_id is None:
+        print("Warning: bos_token_id is None. Using default value of 1.")
+        bos_token_id = 1
+    else:
+        bos_token_id = tokenizer.bos_token_id
+
+    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
+    eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
+
+    # Construct prompts
+    inputs_embeds = torch.cat([
+        embedded_bos.expand(embedded_images.shape[0], -1, -1),
+        embedded_images.to(dtype=embedded_bos.dtype),
+        prompt_embeds.expand(embedded_images.shape[0], -1, -1),
+        eot_embed.expand(embedded_images.shape[0], -1, -1),
+    ], dim=1)
+
+    input_ids = torch.cat([
+        torch.tensor([[bos_token_id]], dtype=torch.long),
+        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
+        prompt,
+        torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
+    ], dim=1).to('cuda')
+    attention_mask = torch.ones_like(input_ids)
+
+    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)
+
+    # Trim off the prompt
+    generate_ids = generate_ids[:, input_ids.shape[1]:]
+    if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+        generate_ids = generate_ids[:, :-1]
+
+    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+
+    # For style_prompt, format the output for easy copying into image generation platforms
+    if caption_type == "style_prompt":
+        caption = "Stable Diffusion Prompt: " + caption.replace("\n", ", ")
+
+    return caption.strip()
 
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {
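A minimal sketch of how the new stream_chat entry point might be exercised outside the Gradio UI, for anyone testing this hunk. The image path and parameter values below are hypothetical examples; tokenizer, clip_model, text_model, image_adapter, and CAPTION_TYPE_MAP are assumed to be loaded earlier in app.py (as in the lines above this hunk), and the key ("style_prompt", "formal", False, False) is assumed to be present in CAPTION_TYPE_MAP:

from PIL import Image

img = Image.open("example.jpg").convert("RGB")  # hypothetical input file
caption = stream_chat(
    img,
    caption_type="style_prompt",
    caption_tone="formal",
    caption_length="any",              # "any" means no length constraint
    lens_type="85mm portrait",         # hypothetical style parameters
    film_stock="Kodak Portra 400",
    composition="rule of thirds",
    lighting="golden hour",
)
print(caption)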