"""Gradio app that turns an image into a Magic-style card idea.

Flow: an image is captioned by one of two remote Hugging Face Spaces
(Kosmos-2 for a "Literal" caption, Moondream for a "Fictional" one), and the
caption is then handed to a local text-generation pipeline together with a
few-shot system prompt that asks for exactly one card.
"""
import json
import re

import gradio as gr
import spaces
import torch
from gradio_client import Client
from transformers import pipeline

KOSMOS2_SPACE_URL = "https://ydshieh-kosmos-2.hf.space/"
MOONDREAM_SPACE_URL = "https://vikhyatk-moondream1.hf.space/"

# Kosmos-2 echoes its own instruction in front of the caption; capture what follows.
_KOSMOS_PREFIX_RE = re.compile(r'^Describe this image in detail:\s*(.*)$')

# Everything from the system tag up to the assistant tag is the echoed prompt.
_PROMPT_ECHO_RE = re.compile(r'<\|system\|>(.*?)<\|assistant\|>', flags=re.DOTALL)

# Few-shot system prompt. The wording is kept verbatim (including the stray
# "»" in Example 2) because any change here changes what the model is asked.
_AGENT_MAKER_SYS = '''
Your job is to generate new magic card from an image description given by user.
You will only provide one card idea. Example 1: "The image represents the famous painting "The Mona Lisa" by the Italian artist Leonardo da Vinci. The painting is a portrait of a woman with a distinctive smile, and it is known for its realistic style and the use of the sfumato technique, which creates a soft, smoky effect around the edges of the painting. The painting is displayed in a museum, and it is considered one of the most iconic and recognizable works of art in the world." Bot Response: "Mona Lisa's Enigma [2][W][U][B] Enchantment At the beginning of your upkeep, add one mana of any color to your mana pool for each color among permanents you control. [W][U][B]: Target player puts the top X cards of their library into their graveyard, where X is the number of colors among permanents you control. Her alluring smile masks unfathomable depths. Mythic" Example 2: "The image features a fluffy, white and gray cat sitting on a couch. The cat has a surprised expression on its face, as if it has just heard or seen something unexpected. The cat's position on the couch and its attentive gaze towards the camera give the impression that it is a well-known or famous cat, perhaps a popular pet or a subject in a movie or TV show. However, without more context or information, it is not possible to definitively identify the cat as something famous. » Bot Response: "Feline Dominator [2][G][W] Creature - Cat Whenever Feline Dominator attacks, it gets +1/+1 until end of turn for each other attacking Cat you control. Whenever Feline Dominator deals combat damage, you gain that much life. The true ruler of the house, demanding tribute from all who enter its domain. 2/4 Uncommon" Only provide one card example according to image description.
'''


def _extract_kosmos_description(full_sentence):
    """Return the caption that follows the Kosmos-2 instruction prefix.

    If the expected prefix is missing, the whole sentence is returned instead
    of failing, so the caller always receives a usable string.
    """
    match = _KOSMOS_PREFIX_RE.search(full_sentence)
    if match:
        description = match.group(1)
        print(description)
        return description

    print("Unable to locate valid description.")
    return full_sentence


def get_caption_from_kosmos(image_in):
    """Ask the remote Kosmos-2 Space for a detailed caption of ``image_in``.

    Args:
        image_in: File path or URL of the image to describe.

    Returns:
        The caption text with the "Describe this image in detail:" prefix
        removed when present, otherwise the full reconstructed sentence.
    """
    kosmos2_client = Client(KOSMOS2_SPACE_URL)
    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4,
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # The second element of the result is read as a path to a JSON file whose
    # entries carry one text fragment in their first position.
    with open(kosmos2_result[1], 'r', encoding="utf-8") as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)
    return _extract_kosmos_description(full_sentence)


def get_caption_from_MD(image_in):
    """Ask the remote Moondream Space to describe the image as a fictional character.

    Args:
        image_in: File path of the image to describe.

    Returns:
        The answer returned by the Space's ``/answer_question`` endpoint.
    """
    client = Client(MOONDREAM_SPACE_URL)
    result = client.predict(
        image_in,                                       # filepath in 'image' Image component
        "Describe character like if it was fictional",  # str in 'Question' Textbox component
        api_name="/answer_question",
    )
    print(result)
    return result


pipe = pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


@spaces.GPU(enable_queue=True)
def get_card_idea(user_prompt):
    """Generate one card idea from an image description.

    Args:
        user_prompt: Image caption to base the card on.

    Returns:
        The raw output of the text-generation pipeline; ``infer`` reads
        ``outputs[0]["generated_text"]`` from it.
    """
    instruction = f"""
<|system|>
{_AGENT_MAKER_SYS}
<|user|>
"""
    prompt = f"{instruction.strip()}\n{user_prompt}"

    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
    return outputs


def _strip_prompt_echo(generated_text):
    """Drop the echoed system/user prompt and leading newlines from model output."""
    # NOTE(review): this only removes the prompt when the model itself emits an
    # "<|assistant|>" tag; otherwise the text is returned unchanged.
    return _PROMPT_ECHO_RE.sub('', generated_text).lstrip("\n")


def infer(image_in, cap_type):
    """Caption the image, then build a card suggestion from that caption.

    Args:
        image_in: File path of the uploaded image.
        cap_type: "Fictional" (Moondream caption) or "Literal" (Kosmos-2 caption).

    Returns:
        The suggested card text with the echoed prompt removed.

    Raises:
        gr.Error: If no image was provided or ``cap_type`` is not recognised.
    """
    if image_in is None:
        raise gr.Error("Please provide an image first.")

    gr.Info("Getting image description...")
    if cap_type == "Fictional":
        user_prompt = get_caption_from_MD(image_in)
    elif cap_type == "Literal":
        user_prompt = get_caption_from_kosmos(image_in)
    else:
        # Without this branch user_prompt would be unbound further down.
        raise gr.Error(f"Unknown caption type: {cap_type!r}")

    gr.Info("Building a new card according to the image caption ...")
    outputs = get_card_idea(user_prompt)

    cleaned_text = _strip_prompt_echo(outputs[0]["generated_text"])
    print(f"SUGGESTED CARD: {cleaned_text}")
    return cleaned_text


title = "Magic Card Generator"
description = ""

css = """
#col-container{
    margin: 0 auto;
    max-width: 780px;
    text-align: left;
}
/* fix examples gallery width on mobile */
div#component-14 > .gallery > .gallery-item > .container > img {
    width: auto!important;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Heading and (currently empty) description shown above the form.
        gr.HTML(f"""

{title}

{description}

""")
        with gr.Row():
            with gr.Column():
                image_in = gr.Image(
                    label="Image reference",
                    type="filepath",
                    elem_id="image-in",
                )
                cap_type = gr.Radio(
                    label="Caption type",
                    choices=[
                        "Literal",
                        "Fictional",
                    ],
                    value="Fictional",
                )
                submit_btn = gr.Button("Make a card from my pic !")
                gr.Examples(
                    examples=[
                        ["examples/monalisa.png"],
                        ["examples/violonist.png"],
                        ["examples/frog.jpeg"],
                        ["examples/samourai.png"],
                    ],
                    fn=infer,
                    inputs=[image_in, cap_type],
                )
            with gr.Column():
                result = gr.Textbox(
                    label="Suggested Card",
                    lines=6,
                    max_lines=30,
                    elem_id="suggested-card",
                )

    submit_btn.click(
        fn=infer,
        inputs=[
            image_in,
            cap_type,
        ],
        outputs=[
            result,
        ],
    )

demo.queue().launch(show_api=False, show_error=True)