VictorSanh committed on
Commit
f20057b
•
1 Parent(s): 157a0b7
Files changed (2)
  1. app.py +261 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,261 @@
+import torch
+import gradio as gr
+import random
+import numpy as np
+from PIL import Image
+import imagehash
+import cv2
+import os
+
+from transformers import AutoProcessor, AutoModelForCausalLM
+from transformers.image_utils import to_numpy_array, PILImageResampling, ChannelDimension
+from transformers.image_transforms import resize, to_channel_dimension_format
+
+from typing import List
+from collections import Counter
+
+from datasets import load_dataset, concatenate_datasets
+
+
+DEVICE = torch.device("cuda")
+PROCESSOR = AutoProcessor.from_pretrained(
+    "HuggingFaceM4/idefics2_raven_finetuned",
+    token=os.environ["HF_AUTH_TOKEN"],
+)
+MODEL = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceM4/idefics2_raven_finetuned",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    token=os.environ["HF_AUTH_TOKEN"],
+).to(DEVICE)
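+# `image_seq_len` is the number of `<image>` placeholder tokens the prompt needs per image:
+# the perceiver resampler's latent count when the resampler is used, otherwise one token per
+# vision patch.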
+if MODEL.config.use_resampler:
+    image_seq_len = MODEL.config.perceiver_config.resampler_n_latents
+else:
+    image_seq_len = (
+        MODEL.config.vision_config.image_size // MODEL.config.vision_config.patch_size
+    ) ** 2
+BOS_TOKEN = PROCESSOR.tokenizer.bos_token
+BAD_WORDS_IDS = PROCESSOR.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+DATASET = load_dataset("HuggingFaceM4/RAVEN_rendered", split="validation")
+
+## Utils
+
+def convert_to_rgb(image):
+    # `image.convert("RGB")` only works reliably for .jpg images: for transparent images it
+    # produces a wrong background. Compositing onto a white canvas with `alpha_composite`
+    # handles that case.
+    if image.mode == "RGB":
+        return image
+
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+
+# The processor is the same as the Idefics processor except for the BICUBIC interpolation
+# inside siglip, so this is a hack to redefine ONLY the transform method
+def custom_transform(x):
+    x = convert_to_rgb(x)
+    x = to_numpy_array(x)
+    x = resize(x, (960, 960), resample=PILImageResampling.BILINEAR)
+    x = PROCESSOR.image_processor.rescale(x, scale=1 / 255)
+    x = PROCESSOR.image_processor.normalize(
+        x,
+        mean=PROCESSOR.image_processor.image_mean,
+        std=PROCESSOR.image_processor.image_std,
+    )
+    x = to_channel_dimension_format(x, ChannelDimension.FIRST)
+    x = torch.tensor(x)
+    return x
+
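+# Fuzzy image-equality check combining three cues: dominant non-white gray level, perceptual
+# hash of the Canny edge map, and non-white surface area. It returns True when at least two
+# of the three differences fall under their thresholds. (Not used by the demo below.)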
+def pixel_difference(image1, image2):
+    def color(im):
+        arr = np.array(im).flatten()
+        arr_list = arr.tolist()
+        counts = Counter(arr_list)
+        most_common = counts.most_common(2)
+        if most_common[0][0] == 255:
+            return most_common[1][0]
+        else:
+            return most_common[0][0]
+
+    def canny_edges(im):
+        im = cv2.Canny(np.array(im), 50, 100)
+        im[im != 0] = 255
+        return Image.fromarray(im)
+
+    def phash(im):
+        return imagehash.phash(canny_edges(im), hash_size=32)
+
+    def surface(im):
+        return (np.array(im) != 255).sum()
+
+    color_diff = np.abs(color(image1) - color(image2))
+    hash_diff = phash(image1) - phash(image2)
+    surface_diff = np.abs(surface(image1) - surface(image2))
+
+    if int(hash_diff / 7) < 10:
+        return color_diff < 10 or int(surface_diff / (160 * 160) * 100) < 10
+    elif color_diff < 10:
+        return int(surface_diff / (160 * 160) * 100) < 10 or int(hash_diff / 7) < 10
+    elif int(surface_diff / (160 * 160) * 100) < 10:
+        return int(hash_diff / 7) < 10 or color_diff < 10
+    else:
+        return False
+
+# End of Utils
+
+
+def load_sample():
+    n = len(DATASET)
+    found_sample = False
+    while not found_sample:
+        idx = random.randint(0, n - 1)  # randint is inclusive on both ends
+        sample = DATASET[idx]
+        found_sample = True
+    return sample["image"], sample["label"], "", "", ""
+
+
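+# Prompts the model with one `<image>` placeholder per visual token (wrapped in
+# <fake_token_around_image>), generates a short completion, and returns its last character
+# as the predicted answer letter.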
+# @spaces.GPU(duration=180)
+def model_inference(
+    image,
+):
+    if image is None:
+        raise ValueError("`image` is None. It should be a PIL image.")
+
+    # return "A"
+    inputs = PROCESSOR.tokenizer(
+        f"{BOS_TOKEN}User:<fake_token_around_image>{'<image>' * image_seq_len}<fake_token_around_image>Which figure should complete the logical sequence?<end_of_utterance>\nAssistant:",
+        return_tensors="pt",
+        add_special_tokens=False,
+    )
+    inputs["pixel_values"] = PROCESSOR.image_processor(
+        [image],
+        transform=custom_transform,
+    )
+    inputs = {
+        k: v.to(DEVICE)
+        for k, v in inputs.items()
+    }
+    generation_kwargs = dict(
+        inputs,
+        bad_words_ids=BAD_WORDS_IDS,
+        max_length=4,
+    )
+    # Regular generation version
+    generated_ids = MODEL.generate(**generation_kwargs)
+    generated_text = PROCESSOR.batch_decode(
+        generated_ids,
+        skip_special_tokens=True,
+    )[0]
+    return generated_text[-1]
+
+
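+# The guess/result boxes are created up front and placed into the layout later via `.render()`.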
+model_prediction = gr.TextArea(
+    label="AI's guess",
+    visible=True,
+    lines=1,
+    max_lines=1,
+    interactive=False,
+)
+user_prediction = gr.TextArea(
+    label="Your guess",
+    visible=True,
+    lines=1,
+    max_lines=1,
+    interactive=False,
+)
+result = gr.TextArea(
+    label="Win or lose?",
+    visible=True,
+    lines=1,
+    max_lines=1,
+    interactive=False,
+)
+
+
+
+css = """
+.gradio-container{max-width: 1000px!important}
+h1{display: flex;align-items: center;justify-content: center;gap: .25em}
+*{transition: width 0.5s ease, flex-grow 0.5s ease}
+"""
+
+
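+# UI: the puzzle image, eight answer buttons (A-H), and read-only boxes for the AI's guess,
+# the player's guess, the hidden ground-truth solution, and the win/lose result.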
+with gr.Blocks(title="Beat the AI", theme=gr.themes.Base(), css=css) as demo:
+    gr.Markdown(
+        "Are you smarter than the AI?"
+    )
+    load_new_sample = gr.Button(value="Load new sample")
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=4, min_width=250) as upload_area:
+            imagebox = gr.Image(
+                image_mode="L",
+                type="pil",
+                visible=True,
+                sources=None,
+            )
+        with gr.Column(scale=4):
+            with gr.Row():
+                a = gr.Button(value="A", min_width=1)
+                b = gr.Button(value="B", min_width=1)
+                c = gr.Button(value="C", min_width=1)
+                d = gr.Button(value="D", min_width=1)
+            with gr.Row():
+                e = gr.Button(value="E", min_width=1)
+                f = gr.Button(value="F", min_width=1)
+                g = gr.Button(value="G", min_width=1)
+                h = gr.Button(value="H", min_width=1)
+            with gr.Row():
+                model_prediction.render()
+                user_prediction.render()
+                solution = gr.TextArea(
+                    label="Solution",
+                    visible=False,
+                    lines=1,
+                    max_lines=1,
+                    interactive=False,
+                )
+            with gr.Row():
+                result.render()
+
+
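+    # Event wiring: "Load new sample" swaps in a fresh RAVEN puzzle, stores its label in the
+    # hidden `solution` box, and clears the guesses; each answer button records the player's
+    # pick, asks the model for its own guess, and then scores the round.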
+    load_new_sample.click(
+        fn=load_sample,
+        inputs=[],
+        outputs=[imagebox, solution, model_prediction, user_prediction, result],
+    )
+    gr.on(
+        triggers=[
+            a.click,
+            b.click,
+            c.click,
+            d.click,
+            e.click,
+            f.click,
+            g.click,
+            h.click,
+        ],
+        fn=model_inference,
+        inputs=[imagebox],
+        outputs=[model_prediction],
+    ).then(
+        fn=lambda x, y, z: "🥇" if x == y else f"💩 The solution is {chr(ord('A') + int(z))}",
+        inputs=[model_prediction, user_prediction, solution],
+        outputs=[result],
+    )
+
+    a.click(fn=lambda: "A", inputs=[], outputs=[user_prediction])
+    b.click(fn=lambda: "B", inputs=[], outputs=[user_prediction])
+    c.click(fn=lambda: "C", inputs=[], outputs=[user_prediction])
+    d.click(fn=lambda: "D", inputs=[], outputs=[user_prediction])
+    e.click(fn=lambda: "E", inputs=[], outputs=[user_prediction])
+    f.click(fn=lambda: "F", inputs=[], outputs=[user_prediction])
+    g.click(fn=lambda: "G", inputs=[], outputs=[user_prediction])
+    h.click(fn=lambda: "H", inputs=[], outputs=[user_prediction])
+
+    demo.load()
+
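+# Queue requests (at most 40 waiting, API access closed) and launch the app.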
+demo.queue(max_size=40, api_open=False)
+demo.launch(max_threads=400)
requirements.txt ADDED
@@ -0,0 +1,7 @@
+opencv-python
+torch
+imagehash
+transformers
+datasets
+pillow
+numpy