Andrew.Thang commited on
Commit
35d3d26
·
1 Parent(s): bc1c0d6
Files changed (2) hide show
  1. app.py +40 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoProcessor, AutoModelForVision2Seq
3
+ from PIL import Image
4
+ import torch
5
+
6
+ device = "cuda" if torch.cuda.is_available() else "cpu"
7
+
8
+ # Load model + processor
9
+ model_id = "prithivMLmods/Doc-VLMs-v2-Localization"
10
+ processor = AutoProcessor.from_pretrained(model_id)
11
+ model = AutoModelForVision2Seq.from_pretrained(model_id).to(device)
12
+
13
+ # Giao diện
14
+ def predict(image, text_input, system_prompt="Trích thông tin, không cần diễn giải"):
15
+ image = image.convert("RGB")
16
+ inputs = processor(images=image, text=text_input, return_tensors="pt").to(device)
17
+ generated_ids = model.generate(
18
+ **inputs,
19
+ max_new_tokens=512,
20
+ do_sample=False,
21
+ eos_token_id=processor.tokenizer.eos_token_id,
22
+ pad_token_id=processor.tokenizer.pad_token_id
23
+ )
24
+ result = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
25
+ return result
26
+
27
+ # Gradio app
28
+ demo = gr.Interface(
29
+ fn=predict,
30
+ inputs=[
31
+ gr.Image(type="pil", label="Upload ảnh tài liệu"),
32
+ gr.Textbox(label="Câu hỏi muốn hỏi mô hình"),
33
+ gr.Textbox(label="System prompt (tuỳ chọn)", value="Trích thông tin, không cần diễn giải")
34
+ ],
35
+ outputs="text",
36
+ title="Doc-VLMs v2 - Vision Document QA"
37
+ )
38
+
39
+ if __name__ == "__main__":
40
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers>=4.39
2
+ torch
3
+ gradio
4
+ Pillow