dhairyashah committed
Commit 2c8b539
1 Parent(s): a0e2927
Files changed (2)
  1. app.py +101 -4
  2. requirements.txt +13 -0
app.py CHANGED
@@ -1,7 +1,104 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ import tqdm
+ from PIL import Image
+ import torch
+ import fitz
  import gradio as gr
+ import spaces
+ import os
+ from transformers import AutoModel
+ from transformers import AutoTokenizer
+ import numpy as np
+
+ cache_dir = 'pdf_cache'
+ os.makedirs(cache_dir, exist_ok=True)
+
+ device = 'cuda'
+
+ print("Embedding model loading...")
+ model_path = 'RhapsodyAI/minicpm-visual-embedding-v0'
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+ model.eval()
+ model.to(device)
+ print("Embedding model loaded successfully!")
+
+ print("Generation model loading...")
+ gen_model_path = 'openbmb/MiniCPM-V-2_6'
+ gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_path, trust_remote_code=True)
+ gen_model = AutoModel.from_pretrained(gen_model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+ gen_model.eval()
+ gen_model.to(device)
+ print("Generation model loaded successfully!")
+
+ @spaces.GPU(duration=100)
+ def process_pdf(pdf_file, max_pages, progress=gr.Progress()):
+     doc = fitz.open("pdf", pdf_file)
+     num_pages = min(max_pages, len(doc))
+
+     images = []
+     for page_num in progress.tqdm(range(num_pages)):
+         page = doc[page_num]
+         pix = page.get_pixmap(dpi=200)
+         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+         images.append(image)
+
+     return images
+
+ @spaces.GPU(duration=50)
+ def answer_question(images, question):
+     global gen_model, gen_tokenizer
+     images_ = [img.convert('RGB') for img in images]
+     msgs = [{'role': 'user', 'content': [question, *images_]}]
+     answer = gen_model.chat(
+         image=None,
+         msgs=msgs,
+         tokenizer=gen_tokenizer
+     )
+     print(answer)
+     return answer
+
+ with gr.Blocks() as app:
+     gr.Markdown("# PDF Question Answering with Vision Language Model")
+
+     gr.Markdown("""
+     This application uses a Vision Language Model to answer questions about PDF documents.
+
+     1. Upload a PDF file
+     2. Set the maximum number of pages to process
+     3. Click "Process PDF" to extract the pages
+     4. Enter your question about the PDF content
+     5. Click "Answer Question" to get the model's response
+     """)
+
+     with gr.Row():
+         file_input = gr.File(type="binary", label="Upload PDF")
+         max_pages = gr.Number(value=10, minimum=1, maximum=50, step=1, label="Maximum number of pages to process")
+         process_button = gr.Button("Process PDF")
+
+     with gr.Row():
+         query_input = gr.Text(label="Your Question")
+         answer_button = gr.Button("Answer Question")
+
+     images_output = gr.Gallery(label="Processed PDF Pages", visible=False)
+     gen_model_response = gr.Textbox(label="Model's Answer")
+
+     def process_and_show(pdf_file, max_pages):
+         images = process_pdf(pdf_file, max_pages)
+         return gr.Gallery.update(value=images, visible=True)
+
+     process_button.click(
+         process_and_show,
+         inputs=[file_input, max_pages],
+         outputs=images_output
+     )
 
- def greet(name):
-     return "Hello " + name + "!!"
+     answer_button.click(
+         answer_question,
+         inputs=[images_output, query_input],
+         outputs=gen_model_response
+     )
 
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ app.launch()
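For reference, the page-rendering step inside process_pdf can be exercised on its own. A minimal sketch, assuming the PyMuPDF and Pillow pins from requirements.txt; sample.pdf is a placeholder path of my choosing (the app itself opens uploaded bytes via fitz.open("pdf", pdf_file)):

    import fitz  # PyMuPDF
    from PIL import Image

    doc = fitz.open("sample.pdf")  # placeholder path, not from the commit
    images = []
    for page_num in range(min(10, len(doc))):  # cap the page count, as the UI default does
        pix = doc[page_num].get_pixmap(dpi=200)  # rasterize one page at 200 DPI
        images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    print(f"Rendered {len(images)} page(s)")

PIL images produced this way are what answer_question interleaves with the question in a single user message for gen_model.chat.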
 
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ PyMuPDF
+ tqdm
+ gradio
+ Pillow==10.1.0
+ sentencepiece==0.1.99
+ numpy==1.26.0
+ transformers==4.40.2
+ timm
+
+ torch==2.1.2
+ torchvision==0.16.2
+
+ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
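The flash-attn wheel on the last line is prebuilt for CPython 3.10, CUDA 12.3, torch 2.1, and cxx11abi=FALSE (all encoded in its filename), so the pinned torch==2.1.2 and a matching Python/CUDA runtime are required for it to install. Assuming such an environment, the whole set installs with:

    pip install -r requirements.txt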