Antoine245 committed on
Commit
fa423dd
1 Parent(s): c631412

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -15
app.py CHANGED
@@ -1,24 +1,53 @@
1
- import gradio as gr
2
- import requests
3
  import torch
4
- from PIL import Image
5
  from transformers import AlignProcessor, AlignModel
6
 
 
 
 
7
  processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
8
- model = AlignModel.from_pretrained("kakaobrain/align-base")
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- pipe = pipeline(model="kakaobrain/align-base")
11
 
12
- def image_classifier(image):
13
- outputs = pipe(image)
14
- results = {}
15
- for result in outputs:
16
- results[result['label']] = result['score']
17
- return results
18
- title = "Is it a dog"
19
  description = """
20
- This app is not finished
 
 
 
 
 
 
 
 
 
 
21
  """
22
 
23
- demo = gr.Interface(fn=image_classifier, inputs=gr.Image(type="pil"), outputs="label", title=title, description=description)
24
- demo.launch(show_api=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
+ import gradio as gr
3
  from transformers import AlignProcessor, AlignModel
4
 
5
+
6
+ device = "cuda" if torch.cuda.is_available() else "cpu"
7
+
8
  processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
9
+ model = AlignModel.from_pretrained("kakaobrain/align-base").to(device)
10
+ model.eval()
11
+
12
+
13
+ def predict(image, labels):
14
+ labels = labels.split(', ')
15
+ inputs = processor(images=image, text=labels, return_tensors="pt").to(device)
16
+
17
+ with torch.no_grad():
18
+ outputs = model(**inputs)
19
+
20
+ logits_per_image = outputs.logits_per_image
21
+ probs = logits_per_image.softmax(dim=1).cpu().numpy()
22
+ return {k: float(v) for k, v in zip(labels, probs[0])}
23
 
 
24
 
 
 
 
 
 
 
 
25
  description = """
26
+ <div class="container" style="display:flex;">
27
+ <div class="image">
28
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/132_vit_align/align.png" alt="ALIGN performance" />
29
+ </div>
30
+ <div class="text">
31
+ <p>Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/align">ALIGN</a>,
32
+ as introduced in <a href="https://arxiv.org/abs/2102.05918"><i>"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"</i></a>. ALIGN features a dual-encoder architecture with EfficientNet and BERT as its vision and text encoders, respectively, and learns to align visual and text representations with contrastive learning.
33
+ Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe.
34
+ \n\nALIGN is not open-sourced and the `kakaobrain/align-base` model used for this demo is based on the Kakao Brain implementation that follows the original paper. The model is trained on the open source [COYO](https://github.com/kakaobrain/coyo-dataset) dataset by the Kakao Brain team. To perform zero-shot image classification with ALIGN, upload an image and enter your candidate labels as free-form text separated by a comma followed by a space.</p>
35
+ </div>
36
+ </div>
37
  """
38
 
39
+ gr.Interface(
40
+ fn=predict,
41
+ inputs=[
42
+ gr.inputs.Image(label="Image to classify", type="pil"),
43
+ gr.inputs.Textbox(lines=1, label="Comma separated candidate labels", placeholder="Enter labels separated by ', '",)
44
+ ],
45
+ theme="grass",
46
+ outputs="label",
47
+ examples=[
48
+ ["assets/cartoon.jpeg", "dinosaur, drawing, forest",],
49
+ ["assets/painting.jpeg", "watercolor painting, oil painting, boats",],
50
+ ],
51
+ title="Zero-Shot Image Classification with ALIGN",
52
+ description=description
53
+ ).launch()