os1187 committed on
Commit
8953fd3
1 Parent(s): c9a369e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -0
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import os
3
+ import gradio as gr
4
+ from transformers import ViTFeatureExtractor, ViTModel
5
+ from PIL import Image
6
+ from transformers import AutoTokenizer, AutoModel
7
+ import torch
8
+
9
+ # Function to get image embeddings using ViT
10
# Cache of (feature_extractor, model) pairs keyed by checkpoint name, so the
# weights are loaded once per process instead of once per image.
_VIT_CACHE = {}


def _load_vit(model_name):
    """Return a cached ``(feature_extractor, model)`` pair for *model_name*."""
    if model_name not in _VIT_CACHE:
        _VIT_CACHE[model_name] = (
            ViTFeatureExtractor.from_pretrained(model_name),
            ViTModel.from_pretrained(model_name),
        )
    return _VIT_CACHE[model_name]


def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
    """Compute a mean-pooled ViT embedding for a single image file.

    Args:
        image_path: Path of the image to embed.
        model_name: Hugging Face checkpoint used for both the feature
            extractor and the ViT model.

    Returns:
        A tensor holding the mean of the model's ``last_hidden_state`` over
        dimension 1, i.e. one pooled row per input image. The tensor is
        produced under ``torch.no_grad()`` and carries no autograd graph.
    """
    feature_extractor, model = _load_vit(model_name)

    # The context manager releases the file handle promptly; ``convert``
    # returns an independent, fully loaded copy so it is safe to use after
    # the file is closed. RGB is forced because the checkpoint's
    # preprocessing is defined for 3-channel input.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: skip gradient bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1)  # Mean pooling
19
+
20
+ # Function to convert PDF to images
21
def pdf_to_images(pdf_file, img_dir):
    """Render every page of a PDF to a PNG file inside *img_dir*.

    Pages are written as ``<img_dir>/page_<n>.png`` where ``n`` starts at 1.
    The directory is created if it does not exist. A summary line is printed
    once all pages have been saved.

    Args:
        pdf_file: The PDF to open; passed straight to ``fitz.open``.
        img_dir: Directory that receives the rendered page images.
    """
    # The context manager closes the document even if rendering or saving
    # raises, so the underlying file handle is never leaked.
    with fitz.open(pdf_file) as doc:
        os.makedirs(img_dir, exist_ok=True)

        page_count = len(doc)
        for page_num in range(page_count):
            # Render the page at the default resolution and save it under a
            # 1-based page number.
            pix = doc.load_page(page_num).get_pixmap()
            pix.save(f"{img_dir}/page_{page_num + 1}.png")

    print(f"Converted {page_count} pages to images and saved in {img_dir}")
42
+
43
+ # Function to get text embeddings using a transformer model
44
# Cache of (tokenizer, model) pairs keyed by checkpoint name, so the weights
# are loaded once per process instead of once per call.
_TEXT_MODEL_CACHE = {}


def _load_text_model(model_name):
    """Return a cached ``(tokenizer, model)`` pair for *model_name*."""
    if model_name not in _TEXT_MODEL_CACHE:
        _TEXT_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _TEXT_MODEL_CACHE[model_name]


def get_text_embeddings(text, model_name='bert-base-uncased'):
    """Compute mean-pooled transformer embeddings for *text*.

    Args:
        text: The input handed to the tokenizer (a string, or a batch of
            strings which the tokenizer pads to a common length).
        model_name: Hugging Face checkpoint used for tokenizer and model.

    Returns:
        A tensor with one pooled row per input text, computed as the mean of
        ``last_hidden_state`` over the token dimension. Padding positions are
        excluded from the mean when the tokenizer provides an attention mask.
        The tensor is produced under ``torch.no_grad()``.
    """
    tokenizer, model = _load_text_model(model_name)

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Inference only: skip gradient bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state

    if "attention_mask" not in inputs:
        # No mask available: fall back to a plain mean over all positions.
        return hidden.mean(dim=1)

    # Masked mean pooling: padded positions must not dilute the average when
    # several texts of different lengths are embedded together. For a single
    # unpadded text this is the same as a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (hidden * mask).sum(dim=1) / token_counts
52
+
53
+ # Function to process PDF and generate a response
54
def process_pdf_and_generate_response(pdf_file):
    """Render an uploaded PDF to page images, embed them, and return a reply.

    Args:
        pdf_file: The uploaded PDF. May be a filesystem path (``str`` or
            ``os.PathLike``) or a file-like upload object exposing the path
            through a ``name`` attribute. ``None`` means nothing was uploaded.

    Returns:
        The response string. The text analysis and the response generation
        are still placeholders, so the string does not yet depend on the
        computed embeddings.
    """
    import tempfile

    if pdf_file is None:
        return "Please upload a PDF file."

    # Upload widgets may hand over a temp-file wrapper rather than a path;
    # resolve it to the on-disk path before opening the document.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # A fresh temporary directory per call keeps page images from earlier
    # uploads (or concurrent requests) out of this document's embeddings,
    # and is removed automatically afterwards.
    with tempfile.TemporaryDirectory(prefix="pdf_images_") as img_dir:
        pdf_to_images(pdf_path, img_dir)

        def page_number(filename):
            # Files are named "page_<n>.png"; sort numerically so that
            # page 10 does not come before page 2.
            return int(os.path.splitext(filename)[0].rsplit("_", 1)[-1])

        page_files = sorted(
            (name for name in os.listdir(img_dir) if name.endswith(".png")),
            key=page_number,
        )

        # Generate embeddings for each page image, in page order.
        image_embeddings = [
            get_image_embeddings(os.path.join(img_dir, name))
            for name in page_files
        ]

    # Perform some text analysis on the PDF content (replace with your logic)
    pdf_text = "PDF content analysis placeholder"
    text_embeddings = get_text_embeddings(pdf_text)

    # Combine image and text embeddings and generate a response (replace with your logic)
    combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
    response = "Response based on the processed PDF"
    return response
74
+
75
+ # Gradio interface
76
# Use the top-level component classes: the ``gr.inputs`` / ``gr.outputs``
# namespaces are deprecated in Gradio 3 and no longer exist in Gradio 4.
# The File component's ``type`` is left at its default so the same code runs
# on both major versions.
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(),
    title="Talk2Deck - Interact with your PDFs",
    description="Upload a PDF and receive insights based on its content.",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()