That1BrainCell
committed on
Upload 3 files
- embedding.py +101 -15
- preprocess.py +9 -11
- search.py +1 -6
embedding.py
CHANGED
@@ -11,6 +11,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 import google.generativeai as genai
 from langchain_core.messages import HumanMessage
 from io import BytesIO
+import numpy as np
+import re
+import torch
+from transformers import AutoTokenizer, AutoModel

 from search import search_images

@@ -19,6 +23,16 @@ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIza
 gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
 gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)

+vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
+vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
+vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
+vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
+
+tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
+model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
+model.to('cpu')  # Ensure the model is on the CPU
+
+
 genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")

 def pdf_extractor(link):
@@ -42,7 +56,7 @@ def pdf_extractor(link):
     except Exception as e:
         print(f'An error occurred: {e}')

-    return
+    return text

 def web_extractor(link):
     text = ''
@@ -56,8 +70,22 @@ def web_extractor(link):
     except:
         pass

-    return
+    return text
+
+def imporve_text(text):
+
+    prompt = f'''
+    Please rewrite the following text to make it short, concise, and of high quality.
+    Ensure that all essential information and key points are retained.
+    Focus on improving clarity, coherence, and word choice without altering the original meaning.
+
+    text = {text}
+    '''

+    model = random.choice([gemini,gemini1,gemini2,gemini3])
+    result = model.invoke(prompt)
+
+    return result.content

 def feature_extraction(tag, history , context):

@@ -81,18 +109,23 @@ def feature_extraction(tag, history , context):

     return result.content

-def feature_extraction_image(url
+def feature_extraction_image(url):

     vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
     # result = gemini.invoke('''Hello''')
     # Markdown(result.content)
     # print(result)

+    text = 'None'
     message = HumanMessage(content=[
         {"type": "text", "text": "Please, Describe this image in detail"},
         {"type": "image_url", "image_url": url}
     ])
-
+    try:
+        model = random.choice([vision,vision1,vision2,vision3])
+        text = model.invoke([message])
+    except:
+        return text
     return text.content

 def detailed_feature_extraction(find, context):
@@ -196,7 +229,7 @@ def detailed_history(history):

 def get_embeddings(link,tag_option):

-    print(f"\
+    print(f"\n--> Creating Embeddings - {link}")

     if tag_option=='Complete Document Similarity':
         history = { "Details": "" }
@@ -261,18 +294,65 @@ def get_embeddings(link,tag_option):

     return history,genai_embeddings

+def get_embed_chroma(link):
+
+    print(f"\n--> Creating Embeddings - {link}")
+
+    # Extract Text -----------------------------
+    if link[-3:] == '.md' or link[8:11] == 'en.':
+        text = web_extractor(link)
+    else:
+        text = pdf_extractor(link)
+    print("\u2713 Extracting Text")
+
+    # Create Chunks ----------------------------
+
+    text = re.sub(r'\.{2,}', '.', text)
+    text = re.sub(r'\s{2,}', ' ', text)
+    text = [re.sub(r'\n{2,}', '\n', text)]
+
+    chunks = text_splitter_small.create_documents(text)
+    print("\u2713 Writing Tag Data")
+
+    # Creating Vector
+    embedding_vectors=[]
+    textual_data = []
+    print("\u2713 Creating Vectors")
+
+
+    for text in chunks:
+
+        inputs = tokenizer(text.page_content, return_tensors="pt", padding=True, truncation=True)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+
+        # Get the model's outputs
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+        embedding_vectors.append(embeddings.squeeze().cpu().numpy().tolist())
+        textual_data.append(text.page_content)
+
+    return textual_data , embedding_vectors
+
+
+
 def get_image_embeddings(Product):
     image_embeddings = []

-    links = search_images(Product)
-
-
-    result = genai.embed_content(
-            model="models/embedding-001",
-            content=description,
-            task_type="retrieval_document")
+    links = search_images(Product)
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        descriptions = list(executor.map(feature_extraction_image, links))

-
+    for description in descriptions:
+        result = genai.embed_content(
+                model="models/embedding-001",
+                content=description,
+                task_type="retrieval_document")
+
+        image_embeddings.append(result['embedding'])
+        # print(image_embeddings)
+    return image_embeddings



@@ -287,6 +367,12 @@ text_splitter = RecursiveCharacterTextSplitter(
     separators = ["",''," "]
 )

+text_splitter_small = RecursiveCharacterTextSplitter(
+    chunk_size = 2000,
+    chunk_overlap = 100,
+    separators = ["",''," "]
+)
+
 if __name__ == '__main__':
-
-    print(get_image_embeddings(Product='Samsung Galaxy S24'))
+    print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
+    # print(get_image_embeddings(Product='Samsung Galaxy S24'))
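The biggest functional addition above is get_embed_chroma(), which embeds document chunks locally with the Alibaba-NLP/gte-base-en-v1.5 encoder instead of a Gemini embedding call. Below is a minimal, self-contained sketch of just that embedding step; the model name and the mean pooling over last_hidden_state come from the diff, while the embed() helper name and the sample sentence are illustrative assumptions.

```python
# Sketch (not the committed code): the sentence-embedding step used by
# get_embed_chroma(), isolated from the PDF/web extraction and chunking.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
model.to('cpu')  # the diff pins the model to CPU

def embed(text):
    # Tokenize, run the encoder without gradients, then mean-pool the last
    # hidden layer over tokens to get one vector per input string.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

if __name__ == '__main__':
    vec = embed("The Galaxy S24 supports 45 W wired charging.")  # sample text, not from the repo
    print(len(vec))  # gte-base-en-v1.5 has a 768-dimensional hidden size
```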
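get_image_embeddings() now fans the image links out over a ThreadPoolExecutor, has a Gemini vision model describe each image, and embeds each description with genai.embed_content. The fragment below isolates that embedding call; the model name and task_type are taken from the diff, while the sample description and the environment-variable key lookup are assumptions.

```python
# Sketch (not the committed code): the per-description embedding call made
# inside get_image_embeddings().
import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])  # assumes a key is set in the environment

description = "A titanium-gray smartphone with a triple rear camera."  # sample text
result = genai.embed_content(
    model="models/embedding-001",
    content=description,
    task_type="retrieval_document")

print(len(result['embedding']))  # embedding-001 returns a 768-dimensional vector
```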
preprocess.py
CHANGED
@@ -15,11 +15,11 @@ from pymongo import MongoClient


 # Mongo Connections
-srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"
+# srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"

-client = MongoClient(srv_connection_uri)
-db = client['embeddings']
-collection = db['data']
+# client = MongoClient(srv_connection_uri)
+# db = client['embeddings']
+# collection = db['data']


 # API Urls -----
@@ -42,12 +42,11 @@ headers = {"Authorization": "Bearer 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"}
 logging.basicConfig(level=logging.INFO)


-
 # Global Var --------

 data = False
 seen = set()
-existing_products_urls = set(
+existing_products_urls = set()



@@ -122,8 +121,7 @@ def extract_text_from_pdf(pdf_file, pages):
                 page = reader.pages[page_num]
                 extracted_text += page.extract_text() + "\n"
             else:
-
-
+                pass
         return extracted_text

     except:
@@ -155,11 +153,11 @@ def process_link(link, main_product, similar_product):

         if language_preprocess(text):
             if relevant(main_product, similar_product, text):
-                print("Accepted",link)
+                print("Accepted -",link)
                 return link
     except:
         pass
-    print("
+    print("Rejected -",link)
     return None

 def filtering(urls, main_product, similar_product, link_count):
@@ -178,7 +176,7 @@ def filtering(urls, main_product, similar_product, link_count):

     count = 0

-    print(f"Filtering Links of
+    print(f"--> Filtering Links of - {similar_product}")

     for link in urls:

search.py
CHANGED
@@ -12,7 +12,6 @@ import re

 # Function to search DuckDuckGo
 def search_duckduckgo(query):
-    print("Fetching Duckduckgo Links -----")
     try:
         results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
         return [res['href'] for res in results]
@@ -21,7 +20,6 @@ def search_duckduckgo(query):

 # Function to search Google
 def search_google(query):
-    print("Fetching Google Links -----")

     links = []
     try:
@@ -55,7 +53,6 @@ def search_google(query):

 # Function to search Internet Archive
 def search_archive(query):
-    print("Fetching Archive Links -----")

     try:
         url = "https://archive.org/advancedsearch.php"
@@ -132,7 +129,6 @@ def search_archive(query):
     return []

 def search_github(query):
-    print("Fetching Github Links -----")

     try:
         # GitHub Search API endpoint
@@ -153,7 +149,6 @@ def search_github(query):
     return []

 def search_wikipedia(product):
-    print("Fetching Wikipedia Links -----")

     api_url = "https://en.wikipedia.org/w/api.php"
     params = {
@@ -223,7 +218,7 @@ def search_images(product):
 # Similarity Check -------------------------------------->

 def extract_similar_products(query):
-    print(f"\
+    print(f"\n--> Fetching similar items of - {query}")
     results = DDGS().chat(f'{query} Similar Products')

     pattern = r'^\d+\.\s(.+)$'
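The only addition to extract_similar_products() is the progress print, but for context the function parses the numbered list that DDGS().chat() returns using the pattern shown above. A small demonstration of that pattern on a made-up reply, applying it with re.findall in multiline mode (which may differ from how the rest of the function uses it):

```python
# Sketch (not the committed code): how the pattern from extract_similar_products()
# pulls product names out of a numbered list.
import re

pattern = r'^\d+\.\s(.+)$'
reply = """1. Apple iPhone 15
2. Google Pixel 8
3. OnePlus 12"""  # made-up chat reply

print(re.findall(pattern, reply, flags=re.MULTILINE))
# ['Apple iPhone 15', 'Google Pixel 8', 'OnePlus 12']
```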