Spaces:

Antoine245
/

classification

Runtime error

App Files Files Community

Antoine245 committed on Jun 8, 2023

Commit

fa423dd

•

1 Parent(s): c631412

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -15

app.py CHANGED Viewed

@@ -1,24 +1,53 @@
-import gradio as gr
-import requests
 import torch
-from PIL import Image
 from transformers import AlignProcessor, AlignModel
 processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
-model = AlignModel.from_pretrained("kakaobrain/align-base")
-pipe = pipeline(model="kakaobrain/align-base")
-def image_classifier(image):
-    outputs = pipe(image)
-    results = {}
-    for result in outputs:
-        results[result['label']] = result['score']
-    return results
-title = "Is it a dog"
 description = """
-This app is not finished
 """
-demo = gr.Interface(fn=image_classifier, inputs=gr.Image(type="pil"), outputs="label", title=title, description=description)
-demo.launch(show_api=False)

 import torch
+import gradio as gr
 from transformers import AlignProcessor, AlignModel
+# Run on the GPU when torch can see one, otherwise stay on the CPU.
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
 processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
+# Load the ALIGN weights, move them to the chosen device, then put the
+# module in evaluation mode.
+model = AlignModel.from_pretrained("kakaobrain/align-base")
+model = model.to(device)
+model.eval()
+def _parse_labels(labels):
+    """Split a comma-separated label string into clean, unique labels.
+
+    Whitespace around each label is stripped, empty entries are dropped and
+    duplicates are removed while keeping first-seen order.
+    """
+    stripped = (part.strip() for part in (labels or "").split(","))
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    return list(dict.fromkeys(part for part in stripped if part))
+
+
+def predict(image, labels):
+    """Score ``image`` against each candidate label with ALIGN.
+
+    Args:
+        image: Image handed over by the Gradio image input.
+        labels: Candidate labels as one comma-separated string.
+
+    Returns:
+        Dict mapping each candidate label to its softmax probability.
+
+    Raises:
+        gr.Error: If no image or no usable label was supplied.
+    """
+    if image is None:
+        raise gr.Error("Please upload an image to classify.")
+    candidates = _parse_labels(labels)
+    if not candidates:
+        raise gr.Error("Please enter at least one candidate label.")
+    inputs = processor(images=image, text=candidates, return_tensors="pt").to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # Softmax across the label axis; row 0 belongs to the single input image.
+    probs = outputs.logits_per_image.softmax(dim=1)[0].tolist()
+    return dict(zip(candidates, probs))
 description = """
+<div class="container" style="display:flex;">
+  <div class="image">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/132_vit_align/align.png" alt="ALIGN performance" />
+  </div>
+  <div class="text">
+  <p>Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/align">ALIGN</a>,
+    as introduced in <a href="https://arxiv.org/abs/2102.05918"></a><i>"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"</i>. ALIGN features a dual-encoder architecture with EfficientNet and BERT as its text and vision encoders, and learns to align visual and text representations with contrastive learning.
+    Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe.
+    \n\nALIGN is not open-sourced and the `kakaobrain/align-base` model used for this demo is based on the Kakao Brain implementation that follows the original paper. The model is trained on the open source [COYO](https://github.com/kakaobrain/coyo-dataset) dataset by the Kakao Brain team. To perform zero-shot image classification with ALIGN, upload an image and enter your candidate labels as free-form text separated by a comma followed by a space.</p>
+  </div>
+</div>
 """
+gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.inputs.Image(label="Image to classify", type="pil"),
+        gr.inputs.Textbox(lines=1, label="Comma separated candidate labels", placeholder="Enter labels separated by ', '",)
+    ],
+    theme="grass",
+    outputs="label",
+    examples=[
+        ["assets/cartoon.jpeg", "dinosaur, drawing, forest",],
+        ["assets/painting.jpeg", "watercolor painting, oil painting, boats",],
+    ],
+    title="Zero-Shot Image Classification with ALIGN",
+    description=description
+).launch()