dhairyashah committed · Commit 2c8b539 · Parent(s): a0e2927

update

Files changed:
- app.py (+101, -4)
- requirements.txt (+13, -0)

app.py
CHANGED
@@ -1,7 +1,104 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import tqdm
+from PIL import Image
+import torch
+import fitz
 import gradio as gr
-demo.launch()
+import spaces
+import os
+from transformers import AutoModel
+from transformers import AutoTokenizer
+import numpy as np
+
+cache_dir = 'pdf_cache'
+os.makedirs(cache_dir, exist_ok=True)
+
+device = 'cuda'
+
+print("Embedding model loading...")
+model_path = 'RhapsodyAI/minicpm-visual-embedding-v0'
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+model.eval()
+model.to(device)
+print("Embedding model loaded successfully!")
+
+print("Generation model loading...")
+gen_model_path = 'openbmb/MiniCPM-V-2_6'
+gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_path, trust_remote_code=True)
+gen_model = AutoModel.from_pretrained(gen_model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+gen_model.eval()
+gen_model.to(device)
+print("Generation model loaded successfully!")
+
+@spaces.GPU(duration=100)
+def process_pdf(pdf_file, max_pages, progress=gr.Progress()):
+    # pdf_file arrives as raw bytes (gr.File with type="binary"); when a
+    # stream is passed, PyMuPDF treats the first argument as the filetype.
+    doc = fitz.open("pdf", pdf_file)
+    # gr.Number returns a float, so cast before handing it to range().
+    num_pages = min(int(max_pages), len(doc))
+
+    images = []
+    for page_num in progress.tqdm(range(num_pages)):
+        page = doc[page_num]
+        pix = page.get_pixmap(dpi=200)
+        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        images.append(image)
+
+    return images
+
+@spaces.GPU(duration=50)
+def answer_question(images, question):
+    global gen_model, gen_tokenizer
+    # A Gallery used as an input may deliver (image, caption) pairs, and the
+    # image may be a file path rather than a PIL.Image; normalize both cases.
+    images_ = []
+    for item in images:
+        img = item[0] if isinstance(item, (tuple, list)) else item
+        if isinstance(img, str):
+            img = Image.open(img)
+        images_.append(img.convert('RGB'))
+    msgs = [{'role': 'user', 'content': [question, *images_]}]
+    answer = gen_model.chat(
+        image=None,
+        msgs=msgs,
+        tokenizer=gen_tokenizer
+    )
+    print(answer)
+    return answer
+
+with gr.Blocks() as app:
+    gr.Markdown("# PDF Question Answering with Vision Language Model")
+
+    gr.Markdown("""
+    This application uses a Vision Language Model to answer questions about PDF documents.
+
+    1. Upload a PDF file
+    2. Set the maximum number of pages to process
+    3. Click "Process PDF" to extract the pages
+    4. Enter your question about the PDF content
+    5. Click "Answer Question" to get the model's response
+    """)
+
+    with gr.Row():
+        file_input = gr.File(type="binary", label="Upload PDF")
+        max_pages = gr.Number(value=10, minimum=1, maximum=50, step=1, label="Maximum number of pages to process")
+        process_button = gr.Button("Process PDF")
+
+    with gr.Row():
+        query_input = gr.Text(label="Your Question")
+        answer_button = gr.Button("Answer Question")
+
+    images_output = gr.Gallery(label="Processed PDF Pages", visible=False)
+    gen_model_response = gr.Textbox(label="Model's Answer")
+
+    def process_and_show(pdf_file, max_pages):
+        images = process_pdf(pdf_file, max_pages)
+        # gr.Gallery.update() was removed in Gradio 4.x; gr.update() works in both 3.x and 4.x.
+        return gr.update(value=images, visible=True)
+
+    process_button.click(
+        process_and_show,
+        inputs=[file_input, max_pages],
+        outputs=images_output
+    )
+
+    answer_button.click(
+        answer_question,
+        inputs=[images_output, query_input],
+        outputs=gen_model_response
+    )
+
+app.launch()
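Note: app.py creates cache_dir = 'pdf_cache' but never writes to it. Below is a minimal sketch of how rendered pages could be persisted there so a re-uploaded PDF skips re-rendering; the helper name cache_pages and the hashing scheme are assumptions, not part of this commit:

    import hashlib
    import os

    def cache_pages(pdf_bytes, images, cache_dir='pdf_cache'):
        # Key the cache on the raw document bytes so identical uploads
        # map to the same directory.
        key = hashlib.sha256(pdf_bytes).hexdigest()[:16]
        doc_dir = os.path.join(cache_dir, key)
        os.makedirs(doc_dir, exist_ok=True)
        for i, img in enumerate(images):  # img: PIL.Image from process_pdf
            img.save(os.path.join(doc_dir, f'page_{i:03d}.png'))
        return doc_dir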
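Likewise, the embedding model (RhapsodyAI/minicpm-visual-embedding-v0) is loaded but never queried; presumably the intent is to retrieve the most relevant pages before generation. How the page and query embeddings are produced is model-specific and not shown here, but given such embeddings, retrieval reduces to cosine-similarity ranking. The rank_pages helper below is hypothetical:

    import torch
    import torch.nn.functional as F

    def rank_pages(page_embs, query_emb, top_k=3):
        # page_embs: (num_pages, dim) page embeddings; query_emb: (dim,).
        page_embs = F.normalize(page_embs, dim=-1)
        query_emb = F.normalize(query_emb, dim=-1)
        scores = page_embs @ query_emb        # cosine similarity per page
        k = min(top_k, page_embs.shape[0])
        return torch.topk(scores, k=k)        # (scores, page indices)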
requirements.txt
ADDED
@@ -0,0 +1,13 @@
+PyMuPDF
+tqdm
+gradio
+Pillow==10.1.0
+sentencepiece==0.1.99
+numpy==1.26.0
+transformers==4.40.2
+timm
+
+torch==2.1.2
+torchvision==0.16.2
+
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
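The pinned flash-attn wheel is built for CPython 3.10, torch 2.1, and CUDA 12.3 (the cp310 / torch2.1 / cu123 tags in its filename), so it only installs and imports cleanly when the Space's runtime matches those tags. A small sanity check, offered as a sketch rather than part of the commit:

    import platform
    import torch

    # The wheel filename encodes cp310 / torch2.1 / cu123; confirm the runtime agrees.
    assert platform.python_version_tuple()[:2] == ('3', '10'), 'wheel targets CPython 3.10'
    assert torch.__version__.startswith('2.1'), 'wheel targets torch 2.1'
    print('torch CUDA build:', torch.version.cuda, '| CUDA available:', torch.cuda.is_available())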