"""Gradio demo for PaliGemma: text generation plus detection/segmentation."""

import functools
import os
import re
import string

import flax.linen as nn
import gradio as gr
import jax
import jax.numpy as jnp
import numpy as np
import PIL.Image
import torch
import transformers
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor


hf_token = os.getenv("HF_TOKEN")
model_id = "google/paligemma-3b-mix-448"
COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `token=` replaces the deprecated `use_auth_token=`; the same token is passed
# to the processor so both downloads authenticate the same way.
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, token=hf_token).eval().to(device)
processor = PaliGemmaProcessor.from_pretrained(model_id, token=hf_token)

###### Transformers Inference

def infer(
    image: PIL.Image.Image,
    text: str,
    max_new_tokens: int
) -> str:
    """Run greedy generation for one image/prompt pair and return the new text.

    Args:
        image: Input image; ``None`` (nothing uploaded) is rejected.
        text: Prompt passed to the processor together with the image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (the prompt is not included).

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please provide an image.")

    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    # Remember how many tokens belong to the prompt so they can be cut off by
    # token position instead of by character count of the decoded string,
    # which breaks whenever the decoded prompt differs in length from `text`.
    prompt_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=False
        )
    new_tokens = generated_ids[:, prompt_len:]
    result = processor.batch_decode(new_tokens, skip_special_tokens=True)
    return result[0]

##### Parse segmentation output tokens into masks
##### Also returns bounding boxes with their labels

def parse_segmentation(input_image, input_text):
    """Run the model on a detect/segment prompt and build AnnotatedImage data.

    Args:
        input_image: PIL image to annotate.
        input_text: Prompt such as "segment cat" or "detect bee".

    Returns:
        A ``(image, annotations)`` tuple where each annotation is
        ``(mask_or_xyxy_box, label)``; objects without a mask fall back to
        their bounding box.
    """
    out = infer(input_image, input_text, max_new_tokens=100)
    width, height = input_image.size
    objs = extract_objs(out.lstrip("\n"), width, height, unique_labels=True)
    annotations = [
        (
            obj['mask'] if obj.get('mask') is not None else obj['xyxy'],
            obj['name'] or '',
        )
        for obj in objs
        # Plain-text chunks carry only 'content' and are not drawable.
        if 'mask' in obj or 'xyxy' in obj
    ]
    return input_image, annotations



######## Demo

INTRO_TEXT = """## PaliGemma demo\n\n
| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
| [Blogpost](https://huggingface.co/blog/paligemma)
|\n\n
PaliGemma is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
vision model and the [Gemma](https://arxiv.org/abs/2403.08295) language model. PaliGemma is designed as a versatile
model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
answering, text reading, object detection and object segmentation.
\n\n
This space includes models fine-tuned on a mix of downstream tasks, **inferred via 🤗 transformers**.
See the [Blogpost](https://huggingface.co/blog/paligemma) and
[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
for detailed information how to use and fine-tune PaliGemma models.
\n\n
**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
"""
# Two-tab UI: free-form prompting ("Conversation") and detection/segmentation
# ("Segment/Detect"). Components are created in the same order as before so
# the rendered layout is unchanged.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)

    # The same attribution line is shown under the examples of both tabs.
    credits_md = "Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve)."

    with gr.Tab("Conversation"):
        with gr.Column():
            image = gr.Image(type="pil")
            text_input = gr.Text(label="Input Text")

            text_output = gr.Text(label="Text Output")
            chat_btn = gr.Button()
            tokens = gr.Slider(
                label="Max New Tokens",
                info="Set to larger for longer generation.",
                minimum=10,
                maximum=100,
                value=20,
                step=10,
            )

        chat_inputs = [image, text_input, tokens]
        chat_outputs = [text_output]
        chat_btn.click(fn=infer, inputs=chat_inputs, outputs=chat_outputs)

        chat_examples = [
            ["./bee.jpg", "What is on the flower?"],
            ["./examples/billard1.jpg", "How many red balls are there?"],
            ["./examples/bowie.jpg", "Who is this?"],
            ["./examples/emu.jpg", "What animal is this?"],
            ["./howto.jpg", "What does this image show?"],
            ["./examples/password.jpg", "What is the password?"],
            ["./examples/ulges.jpg", "Who is the author of this book?"],
        ]
        gr.Markdown(credits_md)

        gr.Examples(examples=chat_examples, inputs=chat_inputs)

    with gr.Tab("Segment/Detect"):
        seg_image = gr.Image(type="pil")
        seg_input = gr.Text(label="Entities to Segment/Detect")
        seg_btn = gr.Button("Submit")
        annotated_image = gr.AnnotatedImage(label="Output")

        seg_examples = [
            ["./cats.png", "segment cats"],
            ["./bee.jpg", "detect bee"],
            ["./examples/barsik.jpg", "segment cat"],
            ["./bird.jpg", "segment bird ; bird ; plant"],
        ]
        gr.Markdown(credits_md)
        gr.Examples(examples=seg_examples, inputs=[seg_image, seg_input])

        seg_inputs = [seg_image, seg_input]
        seg_outputs = [annotated_image]
        seg_btn.click(fn=parse_segmentation, inputs=seg_inputs, outputs=seg_outputs)
### Postprocessing Utils for Segmentation Tokens
### Segmentation tokens are passed to another VAE which decodes them to a mask

_MODEL_PATH = 'vae-oid.npz'

# Groups, in order: leading free text, four <locDDDD> box coordinates
# (y1, x1, y2, x2), sixteen optional <segDDD> mask codes, and an optional
# label. A trailing "; " separator between objects is consumed as well.
_SEGMENT_DETECT_RE = re.compile(
    r'(.*?)' +
    r'<loc(\d{4})>' * 4 + r'\s*' +
    '(?:%s)?' % (r'<seg(\d{3})>' * 16) +
    r'\s*([^;<>]+)? ?(?:; )?',
)


def _get_params(checkpoint):
    """Converts PyTorch checkpoint to Flax params.

    Args:
        checkpoint: Mapping from parameter names (e.g. ``'decoder.0.weight'``)
            to arrays.

    Returns:
        A nested dict keyed by the module names that ``Decoder`` in
        ``_get_reconstruct_masks`` produces, plus the ``'_embeddings'``
        codebook.
    """

    def transp(kernel):
        # Reorders kernel axes to (2, 3, 1, 0); presumably PyTorch's
        # channel-first kernel layout to Flax's spatial-first layout.
        return np.transpose(kernel, (2, 3, 1, 0))

    def conv(name):
        return {
            'bias': checkpoint[name + '.bias'],
            'kernel': transp(checkpoint[name + '.weight']),
        }

    def resblock(name):
        return {
            'Conv_0': conv(name + '.0'),
            'Conv_1': conv(name + '.2'),
            'Conv_2': conv(name + '.4'),
        }

    return {
        '_embeddings': checkpoint['_vq_vae._embedding'],
        'Conv_0': conv('decoder.0'),
        'ResBlock_0': resblock('decoder.2.net'),
        'ResBlock_1': resblock('decoder.3.net'),
        'ConvTranspose_0': conv('decoder.4'),
        'ConvTranspose_1': conv('decoder.6'),
        'ConvTranspose_2': conv('decoder.8'),
        'ConvTranspose_3': conv('decoder.10'),
        'Conv_1': conv('decoder.12'),
    }


def _quantized_values_from_codebook_indices(codebook_indices, embeddings):
    """Looks up codebook vectors and arranges them on a 4x4 grid.

    Args:
        codebook_indices: Integer array of shape ``[B, 16]``.
        embeddings: Codebook of shape ``[num_embeddings, embedding_dim]``.

    Returns:
        Array of shape ``[B, 4, 4, embedding_dim]``.
    """
    batch_size, num_tokens = codebook_indices.shape
    assert num_tokens == 16, codebook_indices.shape
    unused_num_embeddings, embedding_dim = embeddings.shape

    encodings = jnp.take(embeddings, codebook_indices.reshape((-1)), axis=0)
    encodings = encodings.reshape((batch_size, 4, 4, embedding_dim))
    return encodings


@functools.cache
def _get_reconstruct_masks():
    """Reconstructs masks from codebook indices.

    The checkpoint at ``_MODEL_PATH`` is read once; the result is cached so
    later calls reuse the same jitted function.

    Returns:
        A function that expects indices shaped `[B, 16]` of dtype int32, each
        ranging from 0 to 127 (inclusive), and that returns a decoded masks sized
        `[B, 64, 64, 1]`, of dtype float32, in range [-1, 1].
    """

    class ResBlock(nn.Module):
        features: int

        @nn.compact
        def __call__(self, x):
            original_x = x
            x = nn.Conv(features=self.features, kernel_size=(3, 3), padding=1)(x)
            x = nn.relu(x)
            x = nn.Conv(features=self.features, kernel_size=(3, 3), padding=1)(x)
            x = nn.relu(x)
            x = nn.Conv(features=self.features, kernel_size=(1, 1), padding=0)(x)
            return x + original_x

    class Decoder(nn.Module):
        """Upscales quantized vectors to mask."""

        @nn.compact
        def __call__(self, x):
            num_res_blocks = 2
            dim = 128
            num_upsample_layers = 4

            x = nn.Conv(features=dim, kernel_size=(1, 1), padding=0)(x)
            x = nn.relu(x)

            for _ in range(num_res_blocks):
                x = ResBlock(features=dim)(x)

            # Each upsampling step doubles the spatial size (stride 2) and
            # halves the channel count for the next layer.
            for _ in range(num_upsample_layers):
                x = nn.ConvTranspose(
                    features=dim,
                    kernel_size=(4, 4),
                    strides=(2, 2),
                    padding=2,
                    transpose_kernel=True,
                )(x)
                x = nn.relu(x)
                dim //= 2

            x = nn.Conv(features=1, kernel_size=(1, 1), padding=0)(x)

            return x

    def reconstruct_masks(codebook_indices):
        quantized = _quantized_values_from_codebook_indices(
            codebook_indices, params['_embeddings']
        )
        return Decoder().apply({'params': params}, quantized)

    with open(_MODEL_PATH, 'rb') as f:
        params = _get_params(dict(np.load(f)))

    return jax.jit(reconstruct_masks, backend='cpu')


def extract_objs(text, width, height, unique_labels=False):
    """Returns objs for a string with "<loc>" and "<seg>" tokens.

    Args:
        text: Model output to parse.
        width: Image width in pixels, used to scale x coordinates.
        height: Image height in pixels, used to scale y coordinates.
        unique_labels: If true, repeated labels get a "'" appended until they
            are distinct from every label seen so far.

    Returns:
        A list of dicts. Detected objects have the keys ``content`` (matched
        text), ``xyxy`` (pixel box), ``mask`` (``[height, width]`` float array
        or ``None`` when no seg tokens were present) and ``name`` (label or
        ``None``). Text that is not part of an object is returned as dicts
        holding only ``content``.
    """
    objs = []
    seen = set()
    while text:
        m = _SEGMENT_DETECT_RE.match(text)
        if not m:
            break
        gs = list(m.groups())
        before = gs.pop(0)
        name = gs.pop()
        if name:
            # The label group can capture a trailing space in front of the
            # "; " separator; strip it so equal labels compare equal.
            name = name.strip()
        # Location tokens are in 1/1024ths of the image size.
        y1, x1, y2, x2 = [int(x) / 1024 for x in gs[:4]]

        y1, x1, y2, x2 = map(round, (y1*height, x1*width, y2*height, x2*width))
        seg_indices = gs[4:20]
        if seg_indices[0] is None:
            # Detection-only output: no mask tokens followed the box.
            mask = None
        else:
            seg_indices = np.array([int(x) for x in seg_indices], dtype=np.int32)
            m64, = _get_reconstruct_masks()(seg_indices[None])[..., 0]
            # Map the decoder output from [-1, 1] to [0, 1].
            m64 = np.clip(np.array(m64) * 0.5 + 0.5, 0, 1)
            m64 = PIL.Image.fromarray((m64 * 255).astype('uint8'))
            mask = np.zeros([height, width])
            if y2 > y1 and x2 > x1:
                # Paste the decoded mask, resized to the box, into a
                # full-image canvas.
                mask[y1:y2, x1:x2] = np.array(m64.resize([x2 - x1, y2 - y1])) / 255.0

        content = m.group()
        if before:
            objs.append(dict(content=before))
            content = content[len(before):]
        while unique_labels and name in seen:
            name = (name or '') + "'"
        seen.add(name)
        objs.append(dict(
            content=content, xyxy=(x1, y1, x2, y2), mask=mask, name=name))
        text = text[len(before) + len(content):]

    if text:
        objs.append(dict(content=text))

    return objs

#########

if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)
0000000000000000000000000000000000000000..55f855f13e882e57272a4eed142c919e907b84b6 Binary files /dev/null and b/examples/barsik.jpg differ diff --git a/examples/barsik.json b/examples/barsik.json new file mode 100644 index 0000000000000000000000000000000000000000..6d6f13e76e15985824ab27135a8b62d8b278d0dc --- /dev/null +++ b/examples/barsik.json @@ -0,0 +1,7 @@ +{ + "name": "barsik", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "segment cat", + "license": "CC0 by [maximneumann@](https://github.com/maximneumann)" +} \ No newline at end of file diff --git a/examples/biennale.jpg b/examples/biennale.jpg new file mode 100644 index 0000000000000000000000000000000000000000..05ba1292a74c2842df4b4341ecaf1a1c1ecbcce0 Binary files /dev/null and b/examples/biennale.jpg differ diff --git a/examples/biennale.json b/examples/biennale.json new file mode 100644 index 0000000000000000000000000000000000000000..532ff527f32ad4e5fa1ebd71ebacc14d537370e5 --- /dev/null +++ b/examples/biennale.json @@ -0,0 +1,7 @@ +{ + "name": "biennale", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "In which city is this?", + "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)" +} \ No newline at end of file diff --git a/examples/billard1.jpg b/examples/billard1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2fbf3c5a9e96df8099640c4d9700fccac7063648 Binary files /dev/null and b/examples/billard1.jpg differ diff --git a/examples/billard1.json b/examples/billard1.json new file mode 100644 index 0000000000000000000000000000000000000000..2667d173894c20049779f493091cb00be8205d07 --- /dev/null +++ b/examples/billard1.json @@ -0,0 +1,7 @@ +{ + "name": "billard1", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "How many red balls are there?", + "license": "CC0 by [mbosnjak@](https://github.com/mbosnjak)" +} \ No newline at end of file diff --git a/examples/billard2.jpg b/examples/billard2.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..a2a65c4b4c837082190bda4c1ec0d95aae757387 Binary files /dev/null and b/examples/billard2.jpg differ diff --git a/examples/billard2.json b/examples/billard2.json new file mode 100644 index 0000000000000000000000000000000000000000..1e66dd97b575f666c962436482fc18ee8682493e --- /dev/null +++ b/examples/billard2.json @@ -0,0 +1,7 @@ +{ + "name": "billard2", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "How many balls are there?", + "license": "CC0 by [mbosnjak@](https://github.com/mbosnjak)" +} \ No newline at end of file diff --git a/examples/bowie.jpg b/examples/bowie.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a470c3fbcd2af4e81af9de46f6ba26f17db81631 Binary files /dev/null and b/examples/bowie.jpg differ diff --git a/examples/bowie.json b/examples/bowie.json new file mode 100644 index 0000000000000000000000000000000000000000..deb4dfd631631946765c9e90fa4555822a453e03 --- /dev/null +++ b/examples/bowie.json @@ -0,0 +1,7 @@ +{ + "name": "bowie", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Who is this?", + "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)" +} \ No newline at end of file diff --git a/examples/branch.jpg b/examples/branch.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d95595728b845c0ec2b7ee508473541a48d84290 Binary files /dev/null and b/examples/branch.jpg differ diff --git a/examples/branch.json b/examples/branch.json new file mode 100644 index 0000000000000000000000000000000000000000..a86c14f5d3fe2f2d0512ce49fc2ab3b9b6012c61 --- /dev/null +++ b/examples/branch.json @@ -0,0 +1,7 @@ +{ + "name": "branch", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "What caused this?", + "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)" +} \ No newline at end of file diff --git a/examples/cc_fox.jpg b/examples/cc_fox.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..47c95d0a91241833574ccb19b8a355417a87bc7a Binary files /dev/null and b/examples/cc_fox.jpg differ diff --git a/examples/cc_fox.json b/examples/cc_fox.json new file mode 100644 index 0000000000000000000000000000000000000000..69ee0678e50e701e0167097f4c41ed360f449aed --- /dev/null +++ b/examples/cc_fox.json @@ -0,0 +1,7 @@ +{ + "name": "cc_fox", + "comment": "", + "model": "paligemma-3b-mix-448", + "prompt": "Which breed is this fox?", + "license": "CC0 by [XiaohuaZhai@](https://sites.google.com/view/xzhai)" +} diff --git a/examples/cc_landscape.jpg b/examples/cc_landscape.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0a4b610c4234ffa67305b7952584430f4d953dde Binary files /dev/null and b/examples/cc_landscape.jpg differ diff --git a/examples/cc_landscape.json b/examples/cc_landscape.json new file mode 100644 index 0000000000000000000000000000000000000000..c1a66ec2901cd5108c71a690f23cf6ef51a9fbee --- /dev/null +++ b/examples/cc_landscape.json @@ -0,0 +1,7 @@ +{ + "name": "cc_landscape", + "comment": "", + "model": "paligemma-3b-mix-448", + "prompt": "What does the image show?", + "license": "CC0 by [XiaohuaZhai@](https://sites.google.com/view/xzhai)" +} diff --git a/examples/cc_puffin.jpg b/examples/cc_puffin.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ae6bb3dc676dc40b2854c58d8238ff7770f3072d Binary files /dev/null and b/examples/cc_puffin.jpg differ diff --git a/examples/cc_puffin.json b/examples/cc_puffin.json new file mode 100644 index 0000000000000000000000000000000000000000..5ada0c694b286e2b27ace19084b32950ee56adf7 --- /dev/null +++ b/examples/cc_puffin.json @@ -0,0 +1,7 @@ +{ + "name": "cc_puffin", + "comment": "", + "model": "paligemma-3b-mix-448", + "prompt": "detect puffin in the back; puffin in front", + "license": "CC0 by [XiaohuaZhai@](https://sites.google.com/view/xzhai)" +} diff --git a/examples/couch.jpg b/examples/couch.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..81800961f0498e46fc06ef525311f0a5d88eb4cb Binary files /dev/null and b/examples/couch.jpg differ diff --git a/examples/couch.json b/examples/couch.json new file mode 100644 index 0000000000000000000000000000000000000000..32f4cba01ded6e629661c4f81ec9125f9af8409e --- /dev/null +++ b/examples/couch.json @@ -0,0 +1,7 @@ +{ + "name": "couch", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "How many yellow cushions are on the couch?", + "license": "CC0" +} \ No newline at end of file diff --git a/examples/couch_.json b/examples/couch_.json new file mode 100644 index 0000000000000000000000000000000000000000..22a288af099703296a1208279484354f88ed5c20 --- /dev/null +++ b/examples/couch_.json @@ -0,0 +1,7 @@ +{ + "name": "couch", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "How many painting do you see in the image?", + "license": "CC0" +} \ No newline at end of file diff --git a/examples/cups.jpg b/examples/cups.jpg new file mode 100644 index 0000000000000000000000000000000000000000..29fb745612887e7a0d4137a503831fe4dd0841d1 Binary files /dev/null and b/examples/cups.jpg differ diff --git a/examples/cups.json b/examples/cups.json new file mode 100644 index 0000000000000000000000000000000000000000..078e3df2986f38350c30eaf2e1e1522a842b7664 --- /dev/null +++ b/examples/cups.json @@ -0,0 +1,7 @@ +{ + "name": "cups", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "how many cups?", + "license": "CC0 by [mbosnjak@](https://github.com/mbosnjak)" +} \ No newline at end of file diff --git a/examples/dice.jpg b/examples/dice.jpg new file mode 100644 index 0000000000000000000000000000000000000000..76d0fbabee3a9aa31d3335a850f25b0c40952d70 Binary files /dev/null and b/examples/dice.jpg differ diff --git a/examples/dice.json b/examples/dice.json new file mode 100644 index 0000000000000000000000000000000000000000..a3fb3f9703dd6ea055569fba49b4a96b76df8235 --- /dev/null +++ b/examples/dice.json @@ 
-0,0 +1,7 @@ +{ + "name": "dice", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "segment dice ; dice", + "license": "CC0 by [andresusanopinto@](https://github.com/andresusanopinto)" +} \ No newline at end of file diff --git a/examples/emu.jpg b/examples/emu.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d298e271a108a89cb89473af26bd202902f8b901 Binary files /dev/null and b/examples/emu.jpg differ diff --git a/examples/emu.json b/examples/emu.json new file mode 100644 index 0000000000000000000000000000000000000000..23532eac207641e3d138ceb67f9a051d6d231539 --- /dev/null +++ b/examples/emu.json @@ -0,0 +1,7 @@ +{ + "name": "emu", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "What animal is this?", + "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)" +} \ No newline at end of file diff --git a/examples/fridge.jpg b/examples/fridge.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dd6af3f32f8b3bad650b2162f1b4628d8e5a26db Binary files /dev/null and b/examples/fridge.jpg differ diff --git a/examples/fridge.json b/examples/fridge.json new file mode 100644 index 0000000000000000000000000000000000000000..c6628d78020b331530c4c6a3726c38c454c4da2f --- /dev/null +++ b/examples/fridge.json @@ -0,0 +1,7 @@ +{ + "name": "fridge", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Describe the image.", + "license": "CC0 by [andresusanopinto@](https://github.com/andresusanopinto)" +} \ No newline at end of file diff --git a/examples/givt.jpg b/examples/givt.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b269c132464ecbd8c7ce8f5464cb6ebf142cc8a1 Binary files /dev/null and b/examples/givt.jpg differ diff --git a/examples/givt.json b/examples/givt.json new file mode 100644 index 0000000000000000000000000000000000000000..4e244d55bd0423fd8041accf1ba3d9bb43d494af --- /dev/null +++ b/examples/givt.json @@ -0,0 +1,7 @@ +{ + "name": "givt", + "comment": 
"", + "model": "paligemma-3b-mix-224", + "prompt": "What does the image show?", + "license": "CC-BY [GIVT paper](https://arxiv.org/abs/2312.02116)" +} \ No newline at end of file diff --git a/examples/greenlake.jpg b/examples/greenlake.jpg new file mode 100644 index 0000000000000000000000000000000000000000..65401579082eebb41a70869d8785b2c84a437476 Binary files /dev/null and b/examples/greenlake.jpg differ diff --git a/examples/greenlake.json b/examples/greenlake.json new file mode 100644 index 0000000000000000000000000000000000000000..5de5282b9608ada567cd696e8e4846e0906088da --- /dev/null +++ b/examples/greenlake.json @@ -0,0 +1,7 @@ +{ + "name": "greenlake", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Describe the image.", + "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)" +} \ No newline at end of file diff --git a/examples/howto.jpg b/examples/howto.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5f079c6751730ab16008835765e6296c8fcc2d8c Binary files /dev/null and b/examples/howto.jpg differ diff --git a/examples/howto.json b/examples/howto.json new file mode 100644 index 0000000000000000000000000000000000000000..2b44aae0878af6ff9abf1628ccedbb932179d19d --- /dev/null +++ b/examples/howto.json @@ -0,0 +1,7 @@ +{ + "name": "howto", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "What does this image show?", + "license": "CC-BY [How to train your ViT?](https://arxiv.org/abs/2106.10270)" +} \ No newline at end of file diff --git a/examples/markers.jpg b/examples/markers.jpg new file mode 100644 index 0000000000000000000000000000000000000000..756537b93cf074ebfd32a45a7438b46914d335c3 Binary files /dev/null and b/examples/markers.jpg differ diff --git a/examples/markers.json b/examples/markers.json new file mode 100644 index 0000000000000000000000000000000000000000..9093a2c9a468dd7995039cae415394f182c35e89 --- /dev/null +++ b/examples/markers.json @@ -0,0 +1,7 @@ +{ + "name": "markers", + 
"comment": "answer en How many cups are there?", + "model": "paligemma-3b-mix-224", + "prompt": "How many cups are there?", + "license": "CC0" +} \ No newline at end of file diff --git a/examples/mcair.jpg b/examples/mcair.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e965dde07d103114690bfc086d24ab3fdb054e65 Binary files /dev/null and b/examples/mcair.jpg differ diff --git a/examples/mcair.json b/examples/mcair.json new file mode 100644 index 0000000000000000000000000000000000000000..0f50b7f96253821cb70eb5aac40760d140252ffa --- /dev/null +++ b/examples/mcair.json @@ -0,0 +1,7 @@ +{ + "name": "mcair", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Can you board this airplane?", + "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)" +} \ No newline at end of file diff --git a/examples/mcair_.json b/examples/mcair_.json new file mode 100644 index 0000000000000000000000000000000000000000..7ae3353a0ed5109a9cc9f26dc179ec7a86357b8c --- /dev/null +++ b/examples/mcair_.json @@ -0,0 +1,7 @@ +{ + "name": "mcair", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Is this a restaurant?", + "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)" +} \ No newline at end of file diff --git a/examples/minergie.jpg b/examples/minergie.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8372f189fa3549042e752931b2577fc7384be0fc Binary files /dev/null and b/examples/minergie.jpg differ diff --git a/examples/minergie.json b/examples/minergie.json new file mode 100644 index 0000000000000000000000000000000000000000..cb292ed5e6eafc30ccecb9d7b2569ee370ab3b06 --- /dev/null +++ b/examples/minergie.json @@ -0,0 +1,7 @@ +{ + "name": "minergie", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "ocr", + "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)" +} \ No newline at end of file diff --git a/examples/morel.jpg b/examples/morel.jpg new file mode 100644 
index 0000000000000000000000000000000000000000..e1498f0c98d6a2a7187ec74b809ca9f8fdc776c2 Binary files /dev/null and b/examples/morel.jpg differ diff --git a/examples/morel.json b/examples/morel.json new file mode 100644 index 0000000000000000000000000000000000000000..c4fb09a89a268cae5cbdff0810feea34177da7c8 --- /dev/null +++ b/examples/morel.json @@ -0,0 +1,7 @@ +{ + "name": "morel", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "detect morel", + "license": "CC0 by [andsteing@](https://huggingface.co/andsteing)" +} \ No newline at end of file diff --git a/examples/motorcyclists.jpg b/examples/motorcyclists.jpg new file mode 100644 index 0000000000000000000000000000000000000000..91fdffa020ea0e2ba5ef1a9be7dd68bdb7a081ce Binary files /dev/null and b/examples/motorcyclists.jpg differ diff --git a/examples/motorcyclists.json b/examples/motorcyclists.json new file mode 100644 index 0000000000000000000000000000000000000000..f4a0d1e8b7207ac55ed90d09cbfc68ea065c901c --- /dev/null +++ b/examples/motorcyclists.json @@ -0,0 +1,7 @@ +{ + "name": "motorcyclists", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "What does the image show?", + "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)" +} \ No newline at end of file diff --git a/examples/parking.jpg b/examples/parking.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3b3c6d3ebb5057b228f04a85a83184c5a1c8aaba Binary files /dev/null and b/examples/parking.jpg differ diff --git a/examples/parking.json b/examples/parking.json new file mode 100644 index 0000000000000000000000000000000000000000..9964ba3acfadec3e0165377bb182ec416672b49a --- /dev/null +++ b/examples/parking.json @@ -0,0 +1,7 @@ +{ + "name": "parking", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Describe the image.", + "license": "CC0 by [xiaohuazhai@](https://huggingface.co/xiaohuazhai)" +} \ No newline at end of file diff --git a/examples/password.jpg 
b/examples/password.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c7804dfa42aa3cad23089087bffca604a8c3507e Binary files /dev/null and b/examples/password.jpg differ diff --git a/examples/password.json b/examples/password.json new file mode 100644 index 0000000000000000000000000000000000000000..070f3f8c992a948b177f2cc647a7f9260c0d6c38 --- /dev/null +++ b/examples/password.json @@ -0,0 +1,7 @@ +{ + "name": "password", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "What is the password?", + "license": "CC0 by [akolesnikoff@](https://github.com/akolesnikoff)" +} \ No newline at end of file diff --git a/examples/preservationhall.jpg b/examples/preservationhall.jpg new file mode 100644 index 0000000000000000000000000000000000000000..adab242566923b4fa3ac97b179c358b07697f221 Binary files /dev/null and b/examples/preservationhall.jpg differ diff --git a/examples/preservationhall.json b/examples/preservationhall.json new file mode 100644 index 0000000000000000000000000000000000000000..6f9be7e1169ea9269ee0e9ebe58d32b424b4c21f --- /dev/null +++ b/examples/preservationhall.json @@ -0,0 +1,7 @@ +{ + "name": "preservationhall", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Describe the image.", + "license": "CC0 by [mitscha@](https://github.com/mitscha)" +} \ No newline at end of file diff --git a/examples/preservationhall_.json b/examples/preservationhall_.json new file mode 100644 index 0000000000000000000000000000000000000000..5571c5272f91d36f2d67a55aac2d61715e3e5f26 --- /dev/null +++ b/examples/preservationhall_.json @@ -0,0 +1,7 @@ +{ + "name": "preservationhall", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "What's the name of the place?", + "license": "CC0 by [mitscha@](https://github.com/mitscha)" +} \ No newline at end of file diff --git a/examples/ulges.jpg b/examples/ulges.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e91d86083e3dbfecc2688735d380a441c0dee227 
Binary files /dev/null and b/examples/ulges.jpg differ diff --git a/examples/ulges.json b/examples/ulges.json new file mode 100644 index 0000000000000000000000000000000000000000..d22ee5806c716238cd83fcbad2597dcf31dc6e04 --- /dev/null +++ b/examples/ulges.json @@ -0,0 +1,7 @@ +{ + "name": "ulges", + "comment": "", + "model": "paligemma-3b-mix-224", + "prompt": "Who is the author of this book?", + "license": "CC0" +} \ No newline at end of file diff --git a/howto.jpg b/howto.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5f079c6751730ab16008835765e6296c8fcc2d8c Binary files /dev/null and b/howto.jpg differ diff --git a/password.jpg b/password.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c7804dfa42aa3cad23089087bffca604a8c3507e Binary files /dev/null and b/password.jpg differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6b184898a1cf71fbc709bffaf3188bca0040aa5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +git+https://github.com/huggingface/transformers.git@add_palma +torch +jax +flax +spaces \ No newline at end of file diff --git a/vae-oid.npz b/vae-oid.npz new file mode 100644 index 0000000000000000000000000000000000000000..e30bd245fc0b67df063c5bd49d83c7130bba2637 --- /dev/null +++ b/vae-oid.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5586010257b8536dddefab65e7755077f21d5672d5674dacf911f73ae95a4447 +size 8479556