akdeniz27 committed
Commit bb4261b
1 Parent(s): 5718ae8

Create app.py

Files changed (1)
  1. app.py +32 -0
app.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+
+ # Run on GPU when available, otherwise fall back to CPU
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(device)
+ processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
+
+ def process_document(image, question):
+     # The image input is configured as a file path below, so open it with PIL
+     image = Image.open(image)
+     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
+     predictions = model.generate(**inputs)
+     return processor.decode(predictions[0], skip_special_tokens=True)
+
+ description = "Demo for Pix2Struct fine-tuned on DocVQA (document visual question answering). To use it, upload your document image, type a question, and click 'Submit', or click one of the examples to load them. Read more at the link below."
+ article = "<p style='text-align: center'><a href='https://arxiv.org/pdf/2210.03347.pdf' target='_blank'>Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding</a></p>"
+
+ demo = gr.Interface(
+     fn=process_document,
+     inputs=[gr.Image(type="filepath"), "text"],
+     outputs="json",
+     title="Demo: pix2struct for DocVQA",
+     description=description,
+     article=article,
+     enable_queue=True,
+     examples=[["example_1.png", "When is the coffee break?"], ["example_2.jpeg", "What's the population of Stoddard?"]],
+     cache_examples=False)
+
+ demo.launch()
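
For reference, a minimal sketch of querying the launched interface programmatically with gradio_client. This is an assumption-laden example, not part of the commit: the URL is a placeholder for wherever the app is running (or a Hugging Face Space id could be passed instead), "/predict" is the default endpoint name exposed by gr.Interface, and the handle_file wrapper assumes a recent Gradio/gradio_client pairing.

# Hypothetical client-side call; adjust the URL/Space id to your deployment.
from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860")  # placeholder address of the running demo
answer = client.predict(
    handle_file("example_1.png"),   # document image
    "When is the coffee break?",    # question
    api_name="/predict",            # default endpoint for a gr.Interface
)
print(answer)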