raghuv-aditya committed on
Commit
e46379d
1 Parent(s): 79457ed

Upload 18 files

Browse files
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Datasets/mini_wiki_collection.json filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Datasets/mini_wiki_collection.json filter=lfs diff=lfs merge=lfs -text
37
+ Retrieval/savedModels/document-vision-embeddings.json filter=lfs diff=lfs merge=lfs -text
Retrieval/.DS_Store ADDED
Binary file (8.2 kB). View file
 
Retrieval/__pycache__/bm25.cpython-311.pyc ADDED
Binary file (1.24 kB). View file
 
Retrieval/__pycache__/tf_idf.cpython-311.pyc ADDED
Binary file (4.45 kB). View file
 
Retrieval/__pycache__/vision.cpython-311.pyc ADDED
Binary file (9.7 kB). View file
 
Retrieval/bm25.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import joblib
3
+ from gensim.utils import simple_preprocess
4
+ from rank_bm25 import BM25Okapi
5
+
6
def bm25_pipeline(query, bm25_path="Retrieval/savedModels/bm25-1_0.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
    """Return the ids of the ``k`` documents scored highest for ``query``.

    Args:
        query: Raw query string; tokenised with ``simple_preprocess``.
        bm25_path: Path to the joblib-serialised scoring model (must expose
            ``get_scores(tokens)``).
        ids_path: Path to the joblib-serialised sequence that maps a corpus
            position to its document id.
        k: Maximum number of results to return.

    Returns:
        A NumPy array of document ids, best match first.
    """
    # NOTE(review): joblib.load unpickles its input; only load trusted files.
    bm25 = joblib.load(bm25_path)
    ids = joblib.load(ids_path)

    scores = np.asarray(bm25.get_scores(simple_preprocess(query)))
    top_positions = np.argsort(scores)[::-1][:k]
    if top_positions.size == 0:
        return top_positions

    # Build a new array instead of writing ids back into the integer index
    # array: in-place assignment raises for string ids and silently
    # truncates non-integer numeric ids.
    return np.array([ids[position] for position in top_positions])
Retrieval/openSource.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm import tqdm
2
+ import joblib
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer, util
5
+
6
+ # Load the sentence-embedding model once at import time; the functions below share this instance.
7
+ model = SentenceTransformer('all-MiniLM-L6-v2')
8
+
9
def get_documents_from_scores(scores):
    """Return the first element (the document index) of every entry in ``scores``.

    The input order is preserved, so a list of ``(index, score)`` pairs that
    is already sorted by score yields the indices in ranked order.
    """
    return [entry[0] for entry in scores]
14
+
15
def cosine_similarity(v1, v2):
    """Compute the cosine similarity between two vectors.

    Both inputs are converted with ``np.array``. If either vector has zero
    Euclidean norm the similarity is defined as ``0`` so no division by zero
    occurs.
    """
    a = np.array(v1)
    b = np.array(v2)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0
    return np.dot(a, b) / (norm_a * norm_b)
23
+
24
def get_open_source_embeddings(documents):
    """Encode every document with the module-level ``model``.

    Progress is reported through ``tqdm``. Returns a list holding one
    ``model.encode`` result per document, in input order.
    """
    return [model.encode(text) for text in tqdm(documents)]
29
+
30
def open_source_rankings(query, document_embeddings, k):
    """Rank documents by cosine similarity to the encoded query.

    Args:
        query: Text passed to the module-level ``model.encode``.
        document_embeddings: Iterable of per-document embedding vectors.
        k: Number of top results to keep.

    Returns:
        A ``(rankings, scores)`` tuple: ``rankings`` is the list of the top-k
        document positions and ``scores`` the matching ``(position,
        similarity)`` pairs, both ordered by descending similarity.
    """
    query_embedding = model.encode(query)
    scored = [
        (position, cosine_similarity(query_embedding, doc_embedding))
        for position, doc_embedding in enumerate(document_embeddings)
    ]
    # list.sort is stable, exactly like sorted(), so ties keep corpus order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_scored = scored[:k]
    return get_documents_from_scores(top_scored), top_scored
39
+
40
+
41
def open_source_pipeline(query, documents_embeddings_path="Retrieval/savedModels/open_source_embeddings.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
    """Return the ids of the top-``k`` documents for ``query``.

    Loads the document embeddings and the position-to-id sequence with
    joblib, ranks the documents with ``open_source_rankings`` and maps every
    ranked position to its id (with a ``tqdm`` progress bar).
    """
    document_embeddings = joblib.load(documents_embeddings_path)
    ids = joblib.load(ids_path)
    top_positions, _ = open_source_rankings(query, document_embeddings, k)
    return [ids[position] for position in tqdm(top_positions)]
Retrieval/savedModels/.DS_Store ADDED
Binary file (6.15 kB). View file
 
Retrieval/savedModels/bm25-1_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ece3c19027cd35ca6dde2d4aac8412f726715b9ac135ab28ab84bdd480451c09
3
+ size 9361012
Retrieval/savedModels/document-vision-embeddings.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c73ac57ca7de5276aef16fc2c1ccbd47ac2aea133784264239152ef4d4820274
3
+ size 16544464
Retrieval/savedModels/document_matrix.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bd045763d2222b592255289eb9f269d1cba3a45ec6f73507dca3bd70a7da7ec
3
+ size 625240225
Retrieval/savedModels/document_matrix.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d377da907541907f1da87e18f02bf84f621f8337a2e63004c120ba049c1bc1a4
3
+ size 5911195
Retrieval/savedModels/idf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f76f99e75d4b35f2e9aa06825f92f961d1a867061e242db347cfb45563c2e4f
3
+ size 1533535
Retrieval/savedModels/ids.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b724a3d8820d865881b964a130948e1d780f8d6bdcb0e027f9e84bd4bba8480
3
+ size 10071
Retrieval/savedModels/open_source_embeddings.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3588adcbde10e19ffd96ae65ea2c0d799f9a86889bdf642c1607613951c3257
3
+ size 1584194
Retrieval/savedModels/tf_idf_dict.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:765eed596ae38d7a54c78ecf7f60ab1e25c0da09bbf4e4e5ccbad10aa1438c6c
3
+ size 13293122
Retrieval/savedModels/vocab.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0cf1aa0710b6b11ecded1a4fe90e55c5502f223109713d02a4c580ea16583e6
3
+ size 986100
Retrieval/tf_idf.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from collections import defaultdict
3
+ from gensim.utils import simple_preprocess
4
+ from tqdm import tqdm
5
+ import joblib
6
+
7
+
8
def get_tf_query(query):
    """Compute the relative term frequency of every token in ``query``.

    Args:
        query: Sequence of tokens.

    Returns:
        A ``defaultdict`` (default ``0``) mapping each token to its count
        divided by the total number of tokens.
    """
    total = len(query)
    tf_query = defaultdict(lambda: 0)
    for token in query:
        tf_query[token] += 1
    # Only existing keys are reassigned, so the dict size is stable while iterating.
    for token, count in tf_query.items():
        tf_query[token] = count / total
    return tf_query
16
+
17
def get_tf_idf_query(query, idf_dict):
    """Build the TF-IDF weights of a raw query string.

    Args:
        query: Raw query text; tokenised with ``simple_preprocess``.
        idf_dict: Mapping from token to its inverse document frequency.

    Returns:
        A ``defaultdict`` (default ``0``) mapping each query token that has
        an IDF entry to ``tf * idf``.
    """
    tokens = simple_preprocess(query)
    tf_query = get_tf_query(tokens)
    tf_idf_query = defaultdict(lambda: 0)
    for token, term_frequency in tf_query.items():
        try:
            idf = idf_dict[token]
        except KeyError:
            # Out-of-vocabulary query tokens carry no weight; skipping them
            # avoids a crash when idf_dict is a plain dict.
            continue
        tf_idf_query[token] = term_frequency * idf
    return tf_idf_query
24
+
25
def get_tf_idf_vector(tf_idf_instance, vocab):
    """Lay out TF-IDF weights as a list ordered by the keys of ``vocab``.

    Each entry is ``tf_idf_instance[key]`` for the corresponding vocabulary
    key, so the lookup behaviour for missing keys is whatever
    ``tf_idf_instance`` provides.
    """
    return [tf_idf_instance[term] for term in vocab.keys()]
30
+
31
+
32
def tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k):
    """Rank documents by cosine similarity between TF-IDF vectors.

    Args:
        query: Raw query text.
        idf_dict: Mapping from token to inverse document frequency.
        tf_idf_dict: Unused; kept so existing callers keep working.
        vocab: Mapping whose key order defines the vector layout.
        document_matrix: 2-D array with one TF-IDF row per document
            (assumed to be a dense NumPy array — TODO confirm).
        k: Number of top results to keep.

    Returns:
        A ``(rankings, scores)`` tuple: ``rankings`` is an array of the top-k
        row indices, ``scores`` the list of their cosine similarities, both
        in descending order of similarity.
    """
    query_weights = get_tf_idf_vector(get_tf_idf_query(query, idf_dict), vocab)
    query_vector = np.asarray(query_weights, dtype=float).reshape(1, -1)

    dot_products = document_matrix @ query_vector.T
    query_norm = np.linalg.norm(query_vector)
    doc_norms = np.linalg.norm(document_matrix, axis=1, keepdims=True)
    denominators = doc_norms * query_norm

    # A zero-norm query (empty or fully out-of-vocabulary) or an all-zero
    # document row would divide by zero and yield NaN, which argsort then
    # places at the top of the ranking. Score those pairs as 0 instead.
    cosine_similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros(np.shape(dot_products), dtype=float),
        where=denominators != 0,
    ).flatten()

    rankings = np.argsort(cosine_similarities)[::-1][:k]
    scores = list(cosine_similarities[rankings])
    return rankings, scores
50
+
51
def tf_idf_pipeline(query, idf_dict_path="Retrieval/savedModels/idf.pkl", tf_idf_dict_path="Retrieval/savedModels/tf_idf_dict.pkl", vocab_path="Retrieval/savedModels/vocab.pkl", document_matrix_path="Retrieval/savedModels/document_matrix.pkl", ids_path="Retrieval/savedModels/ids.pkl", k=100):
    """Return the ids of the top-``k`` TF-IDF matches for ``query``.

    Loads the five joblib artifacts in turn (printing a progress message
    after each one), ranks the documents with ``tf_idf_rankings`` and maps
    every ranked row index to its document id.
    """
    load_plan = (
        (idf_dict_path, "idf loaded..."),
        (tf_idf_dict_path, "tf-idf loaded..."),
        (vocab_path, "vocab loaded..."),
        (document_matrix_path, "document_matrix loaded..."),
        (ids_path, "ids loaded"),
    )
    artifacts = []
    for artifact_path, message in load_plan:
        artifacts.append(joblib.load(artifact_path))
        print(message)
    idf_dict, tf_idf_dict, vocab, document_matrix, ids = artifacts

    top_rows, _ = tf_idf_rankings(query, idf_dict, tf_idf_dict, vocab, document_matrix, k)
    return [ids[row] for row in tqdm(top_rows)]
Retrieval/vision.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tqdm import tqdm
3
+ import numpy as np
4
+ from transformers import ViTModel, ViTFeatureExtractor, ViTImageProcessor
5
+ from PIL import Image
6
+ import re
7
+ from fpdf import FPDF
8
+ from datetime import datetime
9
+ import fitz
10
+ import joblib
11
+ import json
12
+
13
# Vision Transformer backbone and its matching image processor, loaded once at import time and shared by the embedding functions below.
+ model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
14
+ processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
15
+
16
def create_pdf(input_text):
    """Render ``input_text`` into a new PDF under ``temp/PDFs`` and return its path.

    Args:
        input_text: Text to write into the PDF.

    Returns:
        The path of the PDF file that was written.
    """
    import uuid  # local import: only needed to make the file name unique

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=10)

    # multi_cell wraps the text to the page width and continues onto
    # further pages when needed.
    pdf.multi_cell(0, 5, txt=input_text)

    # A timestamp with one-second resolution collides when several PDFs are
    # created within the same second (e.g. when converting a batch of
    # documents), silently overwriting earlier files. Microseconds plus a
    # random suffix keep every file name distinct.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    file_name = f"temp/PDFs/{timestamp}_{uuid.uuid4().hex[:8]}.pdf"

    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    pdf.output(file_name)
    return file_name
42
+
43
def pdf_to_image(pdf_path, zoom=2.0):
    """Render every page of a PDF to a PNG under ``temp/Images``.

    Args:
        pdf_path: Path of the PDF to render.
        zoom: Scale factor applied on both axes when rendering; larger
            values give higher-resolution images.

    Returns:
        A list with the path of one PNG per page, in page order.
    """
    os.makedirs("temp/Images", exist_ok=True)

    # The transformation matrix is the same for every page, so build it once.
    render_matrix = fitz.Matrix(zoom, zoom)
    pdf_name = os.path.basename(pdf_path)
    image_paths = []

    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(matrix=render_matrix)
            image_file = f'temp/Images/{pdf_name}_page_{page_num}.png'
            pix.save(image_file)
            image_paths.append(image_file)
    finally:
        # Release the underlying file handle even if rendering fails.
        pdf_document.close()

    return image_paths
67
+
68
def sanitize_text(text):
    """
    Strip everything except ASCII letters, digits and whitespace from text.

    Runs of whitespace are then collapsed into a single space and the result
    is trimmed. Values that are not ``str`` are returned unchanged.

    Args:
        text (str): Text to sanitize.
    Returns:
        str: Sanitized text.
    """
    if not isinstance(text, str):
        return text
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', alphanumeric_only).strip()
82
+
83
def text_to_images(text):
    """Sanitize ``text``, render it to a PDF and return the paths of its page images."""
    return pdf_to_image(create_pdf(sanitize_text(text)))
88
+
89
def documents_to_images(path):
    """Render every regular file directly inside ``path`` to page images.

    Args:
        path: Directory holding one text document per file.

    Returns:
        A list with one entry per document; each entry is the list of image
        paths produced by ``text_to_images`` for that document. Documents
        are processed in sorted file-name order.
    """
    document_image_paths = []
    # os.listdir returns entries in arbitrary order; sorting makes the output
    # order reproducible, so positions can be matched to documents across runs.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue
        # Explicit encoding so the result does not depend on the platform locale.
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        document_image_paths.append(text_to_images(content))
    return document_image_paths
102
+
103
def single_unit_embedding(text):
    """Embed a piece of text by rendering it to page images and averaging.

    The text is converted to page images with ``text_to_images``; each page
    is embedded with ``single_image_embedding`` and the per-page vectors are
    averaged element-wise.

    Args:
        text: Text to embed.

    Returns:
        The mean of the per-page embedding arrays (same shape as a single
        page embedding).
    """
    page_vectors = []
    for image_path in text_to_images(text):
        # Context manager closes the image file once it has been embedded;
        # the original left every file handle open.
        with Image.open(image_path) as image:
            page_vectors.append(single_image_embedding(image))
    return np.mean(np.array(page_vectors), axis=0)
113
+
114
def single_image_embedding(image):
    """Embed one image with the module-level ``processor`` and ``model``.

    The model's ``last_hidden_state`` is averaged over dimension 1 and
    returned as a detached NumPy array.
    """
    model_inputs = processor(images=image, return_tensors="pt")
    hidden_states = model(**model_inputs).last_hidden_state
    return hidden_states.mean(dim=1).detach().numpy()
119
+
120
def documents_to_vision_embeddings(documents):
    """Return ``single_unit_embedding`` of every document, in input order.

    Progress is reported through ``tqdm``.
    """
    return [single_unit_embedding(document) for document in tqdm(documents)]
126
+
127
def queries_to_vision_embeddings(queries):
    """Return ``single_unit_embedding`` of every query, in input order.

    Progress is reported through ``tqdm``.
    """
    return [single_unit_embedding(query) for query in tqdm(queries)]
133
+
134
def get_documents_from_scores(scores):
    """Return the first element (the document index) of every entry in ``scores``.

    The input order is preserved, so a list of ``(index, score)`` pairs that
    is already sorted by score yields the indices in ranked order.
    """
    return [entry[0] for entry in scores]
139
+
140
def cosine_similarity(v1, v2):
    """Compute the cosine similarity between two vectors.

    Both inputs are converted with ``np.array``. If either vector has zero
    Euclidean norm the similarity is defined as ``0`` so no division by zero
    occurs.
    """
    a = np.array(v1)
    b = np.array(v2)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0
    return np.dot(a, b) / (norm_a * norm_b)
148
+
149
def vision_rankings(query_embedding, document_embeddings, k):
    """Rank documents by cosine similarity of their vision embeddings.

    Element ``[0]`` of the query embedding is compared against element
    ``[0]`` of every document embedding (presumably each is a ``(1, dim)``
    array — confirm against the embedding producers).

    Args:
        query_embedding: Embedding of the query.
        document_embeddings: Iterable of document embeddings.
        k: Number of top results to keep.

    Returns:
        A ``(rankings, scores)`` tuple: ``rankings`` is the list of the top-k
        document positions and ``scores`` the matching ``(position,
        similarity)`` pairs, both ordered by descending similarity.
    """
    scored = [
        (position, cosine_similarity(query_embedding[0], doc_embedding[0]))
        for position, doc_embedding in enumerate(document_embeddings)
    ]
    # list.sort is stable, exactly like sorted(), so ties keep corpus order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_scored = scored[:k]
    return get_documents_from_scores(top_scored), top_scored
158
+
159
+
160
def vision_pipeline(query, document_embeddings_path="Retrieval/savedModels/document-vision-embeddings.json", ids_path="Retrieval/savedModels/ids.pkl", k=100):
    """Return the ids of the top-``k`` documents for ``query`` using vision embeddings.

    Loads the position-to-id sequence with joblib and the document
    embeddings from a JSON file, embeds the query with
    ``single_unit_embedding``, ranks the documents with ``vision_rankings``
    and maps every ranked position to its id.
    """
    ids = joblib.load(ids_path)
    with open(document_embeddings_path, "r") as f:
        raw_embeddings = json.load(f)
    # JSON yields nested lists; turn each one back into a NumPy array.
    document_vision_embeddings = [np.array(raw) for raw in tqdm(raw_embeddings)]
    print("loaded embeddings")

    query_embedding = single_unit_embedding(query)
    top_positions, _ = vision_rankings(query_embedding, document_vision_embeddings, k)
    return [ids[position] for position in top_positions]