That1BrainCell committed
Commit 05fdf5e · verified · 1 Parent(s): 026c1cf

Upload 3 files

Files changed (3):
  1. embedding.py +101 -15
  2. preprocess.py +9 -11
  3. search.py +1 -6
embedding.py CHANGED
@@ -11,6 +11,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 import google.generativeai as genai
 from langchain_core.messages import HumanMessage
 from io import BytesIO
+import numpy as np
+import re
+import torch
+from transformers import AutoTokenizer, AutoModel
 
 from search import search_images
 
@@ -19,6 +23,16 @@ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIza
 gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
 gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
 
+vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
+vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
+vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
+vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
+
+tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
+model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
+model.to('cpu') # Ensure the model is on the CPU
+
+
 genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
 
 def pdf_extractor(link):
@@ -42,7 +56,7 @@ def pdf_extractor(link):
     except Exception as e:
         print(f'An error occurred: {e}')
 
-    return [text]
+    return text
 
 def web_extractor(link):
     text = ''
@@ -56,8 +70,22 @@ def web_extractor(link):
     except:
         pass
 
-    return [text]
+    return text
+
+def imporve_text(text):
+
+    prompt = f'''
+    Please rewrite the following text to make it short, concise, and of high quality.
+    Ensure that all essential information and key points are retained.
+    Focus on improving clarity, coherence, and word choice without altering the original meaning.
+
+    text = {text}
+    '''
 
+    model = random.choice([gemini,gemini1,gemini2,gemini3])
+    result = model.invoke(prompt)
+
+    return result.content
 
 def feature_extraction(tag, history , context):
 
@@ -81,18 +109,23 @@ def feature_extraction(tag, history , context):
 
     return result.content
 
-def feature_extraction_image(url,):
+def feature_extraction_image(url):
 
     vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
     # result = gemini.invoke('''Hello''')
     # Markdown(result.content)
     # print(result)
 
+    text = 'None'
     message = HumanMessage(content=[
         {"type": "text", "text": "Please, Describe this image in detail"},
         {"type": "image_url", "image_url": url}
     ])
-    text = vision.invoke([message])
+    try:
+        model = random.choice([vision,vision1,vision2,vision3])
+        text = model.invoke([message])
+    except:
+        return text
     return text.content
 
 def detailed_feature_extraction(find, context):
@@ -196,7 +229,7 @@ def detailed_history(history):
 
 def get_embeddings(link,tag_option):
 
-    print(f"\nCreating Embeddings ----- {link}")
+    print(f"\n--> Creating Embeddings - {link}")
 
     if tag_option=='Complete Document Similarity':
         history = { "Details": "" }
@@ -261,18 +294,65 @@ def get_embeddings(link,tag_option):
 
     return history,genai_embeddings
 
+def get_embed_chroma(link):
+
+    print(f"\n--> Creating Embeddings - {link}")
+
+    # Extract Text -----------------------------
+    if link[-3:] == '.md' or link[8:11] == 'en.':
+        text = web_extractor(link)
+    else:
+        text = pdf_extractor(link)
+    print("\u2713 Extracting Text")
+
+    # Create Chunks ----------------------------
+    text = re.sub(r'\.{2,}', '.', text)
+    text = re.sub(r'\s{2,}', ' ', text)
+    text = [re.sub(r'\n{2,}', '\n', text)]
+
+    chunks = text_splitter_small.create_documents(text)
+    print("\u2713 Writing Tag Data")
+
+    # Creating Vector
+    embedding_vectors=[]
+    textual_data = []
+    print("\u2713 Creating Vectors")
+
+    for text in chunks:
+
+        inputs = tokenizer(text.page_content, return_tensors="pt", padding=True, truncation=True)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+
+        # Get the model's outputs
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+        embedding_vectors.append(embeddings.squeeze().cpu().numpy().tolist())
+        textual_data.append(text.page_content)
+
+    return textual_data , embedding_vectors
+
+
 def get_image_embeddings(Product):
     image_embeddings = []
 
-    links = search_images(Product)[0]
-    description = feature_extraction_image(links)
-
-    result = genai.embed_content(
-        model="models/embedding-001",
-        content=description,
-        task_type="retrieval_document")
+    links = search_images(Product)
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        descriptions = list(executor.map(feature_extraction_image, links))
 
-    return result
+    for description in descriptions:
+        result = genai.embed_content(
+            model="models/embedding-001",
+            content=description,
+            task_type="retrieval_document")
+
+        image_embeddings.append(result['embedding'])
+    # print(image_embeddings)
+    return image_embeddings
 
 
 
@@ -287,6 +367,12 @@ text_splitter = RecursiveCharacterTextSplitter(
     separators = ["",''," "]
 )
 
+text_splitter_small = RecursiveCharacterTextSplitter(
+    chunk_size = 2000,
+    chunk_overlap = 100,
+    separators = ["",''," "]
+)
+
 if __name__ == '__main__':
-    # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf',"Complete Document Similarity"))
-    print(get_image_embeddings(Product='Samsung Galaxy S24'))
+    print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
+    # print(get_image_embeddings(Product='Samsung Galaxy S24'))
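Review note: the new get_embed_chroma embeds each chunk by mean-pooling the GTE model's last hidden state. Retrieval only works if queries are embedded the same way; below is a minimal sketch of a matching query-side helper (the embed_query and cosine names are illustrative, not part of this commit):

    import numpy as np
    import torch
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
    model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

    def embed_query(query):
        # Tokenize and mean-pool over tokens, mirroring get_embed_chroma
        inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def cosine(a, b):
        # Rank stored chunk vectors against the query vector
        a, b = np.asarray(a), np.asarray(b)
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))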
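Review note: the name get_embed_chroma suggests the returned (texts, vectors) pair is destined for a Chroma collection, though no Chroma code appears in this commit. A sketch of storing and querying the output, assuming the chromadb package (the collection name and URL are placeholders):

    import chromadb
    from embedding import get_embed_chroma

    docs, vecs = get_embed_chroma('https://example.com/manual.pdf')  # placeholder URL

    client = chromadb.Client()  # in-memory; PersistentClient would keep the data on disk
    col = client.get_or_create_collection('manual_chunks')  # assumed name
    col.add(ids=[str(i) for i in range(len(docs))], documents=docs, embeddings=vecs)

    # Nearest chunks to the first vector, as a smoke test
    hits = col.query(query_embeddings=[vecs[0]], n_results=3)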
preprocess.py CHANGED
@@ -15,11 +15,11 @@ from pymongo import MongoClient
 
 
 # Mongo Connections
-srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"
+# srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"
 
-client = MongoClient(srv_connection_uri)
-db = client['embeddings']
-collection = db['data']
+# client = MongoClient(srv_connection_uri)
+# db = client['embeddings']
+# collection = db['data']
 
 
 # API Urls -----
@@ -42,12 +42,11 @@ headers = {"Authorization": "Bearer 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"}
 logging.basicConfig(level=logging.INFO)
 
 
-
 # Global Var --------
 
 data = False
 seen = set()
-existing_products_urls = set(collection.distinct('url'))
+existing_products_urls = set()
 
 
@@ -122,8 +121,7 @@ def extract_text_from_pdf(pdf_file, pages):
                 page = reader.pages[page_num]
                 extracted_text += page.extract_text() + "\n"
             else:
-                print(f"Page {page_num} does not exist in the document.")
-
+                pass
         return extracted_text
 
     except:
@@ -155,11 +153,11 @@ def process_link(link, main_product, similar_product):
 
         if language_preprocess(text):
             if relevant(main_product, similar_product, text):
-                print("Accepted",link)
+                print("Accepted -",link)
                 return link
     except:
         pass
-    print("NOT Accepted",link)
+    print("Rejected -",link)
    return None
 
 def filtering(urls, main_product, similar_product, link_count):
@@ -178,7 +176,7 @@ def filtering(urls, main_product, similar_product, link_count):
 
     count = 0
 
-    print(f"Filtering Links of ---- {similar_product}")
+    print(f"--> Filtering Links of - {similar_product}")
 
     for link in urls:
 
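Review note: with the Mongo connection commented out, existing_products_urls now always starts empty, so URLs handled in earlier runs are re-processed. If the cache is wanted back without hard-coding credentials, one option is to guard it behind an environment variable (the MONGO_URI name is an assumption):

    import os
    from pymongo import MongoClient

    existing_products_urls = set()
    mongo_uri = os.environ.get('MONGO_URI')  # assumed variable name
    if mongo_uri:
        collection = MongoClient(mongo_uri)['embeddings']['data']
        existing_products_urls = set(collection.distinct('url'))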
search.py CHANGED
@@ -12,7 +12,6 @@ import re
 
 # Function to search DuckDuckGo
 def search_duckduckgo(query):
-    print("Fetching Duckduckgo Links -----")
     try:
         results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
         return [res['href'] for res in results]
@@ -21,7 +20,6 @@ def search_duckduckgo(query):
 
 # Function to search Google
 def search_google(query):
-    print("Fetching Google Links -----")
 
     links = []
     try:
@@ -55,7 +53,6 @@ def search_google(query):
 
 # Function to search Internet Archive
 def search_archive(query):
-    print("Fetching Archive Links -----")
 
     try:
         url = "https://archive.org/advancedsearch.php"
@@ -132,7 +129,6 @@ def search_archive(query):
     return []
 
 def search_github(query):
-    print("Fetching Github Links -----")
 
     try:
         # GitHub Search API endpoint
@@ -153,7 +149,6 @@ def search_github(query):
     return []
 
 def search_wikipedia(product):
-    print("Fetching Wikipedia Links -----")
 
     api_url = "https://en.wikipedia.org/w/api.php"
     params = {
@@ -223,7 +218,7 @@ def search_images(product):
 
 # Similarity Check -------------------------------------->
 
 def extract_similar_products(query):
-    print(f"\nFetching similar items of -----> {query}")
+    print(f"\n--> Fetching similar items of - {query}")
     results = DDGS().chat(f'{query} Similar Products')
 
     pattern = r'^\d+\.\s(.+)$'
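Review note: this commit drops the per-function "Fetching ... Links" prints outright. If that trace is still useful for debugging, it could come back as debug-level logging instead of prints; a sketch for search_duckduckgo only (the except branch returning an empty list is an assumption, matching the other search helpers):

    import logging
    from duckduckgo_search import DDGS

    logger = logging.getLogger(__name__)

    def search_duckduckgo(query):
        logger.debug("Fetching DuckDuckGo links for %s", query)  # was a print
        try:
            results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
            return [res['href'] for res in results]
        except Exception:
            return []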