adirik committed on
Commit
855a8a4
1 Parent(s): f83bc77
Files changed (5) hide show
  1. app.py +56 -0
  2. assets/.DS_Store +0 -0
  3. assets/cartoon.jpeg +0 -0
  4. assets/painting.jpeg +0 -0
  5. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from transformers import AlignProcessor, AlignModel
4
+
5
+
6
# Use the GPU whenever PyTorch reports one; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The processor and the model are both loaded from the same
# "kakaobrain/align-base" checkpoint.
processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base")
model = model.to(device)
# This script only runs inference, so put the model in evaluation mode.
model.eval()
11
+
12
+
13
def _parse_labels(labels):
    """Split a comma-separated string into a clean list of candidate labels.

    Whitespace around each label is stripped, empty entries are dropped and
    duplicates are removed while keeping first-seen order, so ``"cat,dog"``
    and ``"cat, dog, , cat"`` both yield ``["cat", "dog"]``.
    """
    stripped = (part.strip() for part in (labels or "").split(","))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(part for part in stripped if part))


def predict(image, labels):
    """Score an image against free-form candidate labels with ALIGN.

    Args:
        image: Image to classify; forwarded to the processor as ``images=``.
        labels: Candidate labels as a single comma-separated string. The
            space after each comma is optional.

    Returns:
        A dict mapping each distinct label to its softmax probability
        (a Python float) for the given image.

    Raises:
        ValueError: If no image is supplied or ``labels`` contains no
            non-blank label.
    """
    if image is None:
        raise ValueError("Please provide an image to classify.")

    candidates = _parse_labels(labels)
    if not candidates:
        raise ValueError(
            "Please enter at least one candidate label, separated by commas."
        )

    inputs = processor(images=image, text=candidates, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the label axis; a single image is passed in, so its
    # scores are in row 0.
    probs = outputs.logits_per_image.softmax(dim=1)[0].tolist()
    return dict(zip(candidates, probs))
23
+
24
+
25
# Blurb passed to gr.Interface as `description`; it mixes HTML with a little
# Markdown. Lines are kept unindented on purpose.
# NOTE(review): the blank line produced by the literal \n\n below appears to be
# what lets the Markdown (backticks, [COYO](...) link) after it render inside
# the surrounding HTML - confirm before reflowing this text.
description = """
<div class="container" style="display:flex;">
<div class="image">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/132_vit_align/align.png" alt="ALIGN performance" />
</div>
<div class="text">
<p>Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/align">ALIGN</a>,
as introduced in <a href="https://arxiv.org/abs/2102.05918"><i>"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"</i></a>.
ALIGN features a dual-encoder architecture with EfficientNet and BERT as its vision and text encoders, and learns to align visual and text representations with contrastive learning.
Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe.
\n\nALIGN is not open-sourced and the `kakaobrain/align-base` model used for this demo is based on the Kakao Brain implementation that follows the original paper.
The model is trained on the open source [COYO](https://github.com/kakaobrain/coyo-dataset) dataset by the Kakao Brain team.
To perform zero-shot image classification with ALIGN, upload an image and enter your candidate labels as free-form text separated by a comma followed by a space.</p>
</div>
</div>
"""
41
+
42
# Build the demo UI: an image plus a comma-separated label string go into
# `predict`, and the returned label -> probability dict is shown by the
# "label" output.
# Components come from the top-level `gr.Image` / `gr.Textbox` classes: the
# legacy `gr.inputs.*` namespace is deprecated and absent from newer Gradio
# releases.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(label="Image to classify", type="pil"),
        gr.Textbox(
            lines=1,
            label="Comma separated candidate labels",
            placeholder="Enter labels separated by ', '",
        ),
    ],
    # NOTE(review): string themes such as "grass" may be ignored (with a
    # warning) by newer Gradio versions - confirm against the pinned version.
    theme="grass",
    outputs="label",
    examples=[
        ["assets/cartoon.jpeg", "dinosaur, drawing, forest"],
        ["assets/painting.jpeg", "watercolor painting, oil painting, boats"],
    ],
    title="Zero-Shot Image Classification with ALIGN",
    description=description,
)
demo.launch()
assets/.DS_Store ADDED
Binary file (6.15 kB). View file
assets/cartoon.jpeg ADDED
assets/painting.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
1
+ torch
2
+ opencv-python
3
+ git+https://github.com/huggingface/transformers
4
+
5
+