That1BrainCell committed (verified)
Commit 140a96c · Parent(s): f7cf778

New API + MongoDB

Files changed (3):
  1. embedding.py +40 -11
  2. preprocess.py +32 -10
  3. search.py +234 -227
embedding.py CHANGED
@@ -9,17 +9,17 @@ from langchain_community.document_loaders import WebBaseLoader
 from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import google.generativeai as genai
+from langchain_core.messages import HumanMessage
 from io import BytesIO
 
+from search import search_images
 
+gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
+gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
+gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
+gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
 
-gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
-gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
-gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
-gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
-
-genai.configure(api_key="AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA")
-
+genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
 
 def pdf_extractor(link):
     text = ''
@@ -76,11 +76,25 @@ def feature_extraction(tag, history , context):
     Respond with the updated Tag_History.
     '''
 
-    # model = random.choice([gemini,gemini1,gemini2,gemini3])
-    result = gemini1.invoke(prompt)
+    model = random.choice([gemini,gemini1,gemini2,gemini3])
+    result = model.invoke(prompt)
 
     return result.content
 
+def feature_extraction_image(url,):
+
+    vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
+    # result = gemini.invoke('''Hello''')
+    # Markdown(result.content)
+    # print(result)
+
+    message = HumanMessage(content=[
+        {"type": "text", "text": "Please, Describe this image in detail"},
+        {"type": "image_url", "image_url": url}
+    ])
+    text = vision.invoke([message])
+    return text.content
+
 def detailed_feature_extraction(find, context):
 
     prompt = f'''
@@ -246,6 +260,21 @@ def get_embeddings(link,tag_option):
 
 
     return history,genai_embeddings
+
+def get_image_embeddings(Product):
+    image_embeddings = []
+
+    links = search_images(Product)[0]
+    description = feature_extraction_image(links)
+
+    result = genai.embed_content(
+        model="models/embedding-001",
+        content=description,
+        task_type="retrieval_document")
+
+    return result
+
+
 
 global text_splitter
 global data
@@ -259,5 +288,5 @@ text_splitter = RecursiveCharacterTextSplitter(
 )
 
 if __name__ == '__main__':
-    # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf',"Single"))
-    pass
+    # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf',"Complete Document Similarity"))
+    print(get_image_embeddings(Product='Samsung Galaxy S24'))
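A minimal sketch of how the image-embedding path added above could be exercised, assuming embedding.py imports cleanly with valid keys; the result shape follows google.generativeai's embed_content, which returns a dict carrying an 'embedding' list, and the printed slice is purely illustrative:

    # hypothetical driver script, not part of the commit
    from embedding import get_image_embeddings

    result = get_image_embeddings(Product='Samsung Galaxy S24')
    # embed_content returns a dict with an 'embedding' vector for the image description
    print(len(result['embedding']), result['embedding'][:5])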
preprocess.py CHANGED
@@ -11,9 +11,16 @@ from io import BytesIO
 from langchain_community.document_loaders import WebBaseLoader
 from langchain_google_genai import ChatGoogleGenerativeAI
 import logging
+from pymongo import MongoClient
+
+
+# Mongo Connections
+srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"
+
+client = MongoClient(srv_connection_uri)
+db = client['embeddings']
+collection = db['data']
 
-data = False
-seen = set()
 
 # API Urls -----
 
@@ -22,10 +29,10 @@ main_url = "http://127.0.0.1:8000/search/all"
 # main_product = "Samsung Galaxy s23 ultra"
 
 # Revelevance Checking Models -----
-gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
-gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
-gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
-gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
+gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
+gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
+gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
+gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
 
 
 API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
@@ -35,6 +42,15 @@ headers = {"Authorization": "Bearer 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"}
 logging.basicConfig(level=logging.INFO)
 
 
+
+# Global Var --------
+
+data = False
+seen = set()
+existing_products_urls = set(collection.distinct('url'))
+
+
+
 def get_links(main_product,api_key):
     params = {
         "API_KEY": f"{api_key}",
@@ -165,11 +181,17 @@ def filtering(urls, main_product, similar_product, link_count):
     print(f"Filtering Links of ---- {similar_product}")
 
     for link in urls:
-        result = process_link(link, main_product, similar_product)
+
+        if link in existing_products_urls:
+            res.append((link,1))
+            count+=1
+
+        else:
+            result = process_link(link, main_product, similar_product)
 
-        if result is not None:
-            res.append(result)
-            count += 1
+            if result is not None:
+                res.append((result,0))
+                count += 1
 
         if count == link_count:
             break
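For context, a minimal sketch of the MongoDB cache that filtering() now consults; the cluster URI, database ('embeddings'), collection ('data') and 'url' field come from the diff above, while cache_url is a hypothetical writer showing how the cache might be populated, not something this commit adds:

    from pymongo import MongoClient

    srv_connection_uri = "mongodb+srv://..."   # the Patseer URI from the diff above
    collection = MongoClient(srv_connection_uri)['embeddings']['data']

    # filtering() short-circuits any link whose 'url' is already cached,
    # appending (link, 1) instead of re-running process_link
    existing_products_urls = set(collection.distinct('url'))

    # hypothetical writer that would add a newly processed link to the cache
    def cache_url(url):
        collection.insert_one({'url': url})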
search.py CHANGED
@@ -1,227 +1,234 @@
-# Library Imports
-import requests
-from bs4 import BeautifulSoup
-from googlesearch import search
-from duckduckgo_search import DDGS
-import concurrent.futures
-import re
-
-
-
-# Search Functions -------------------------------------------------------------->
-
-# Function to search DuckDuckGo
-def search_duckduckgo(query):
-    print("Fetching Duckduckgo Links -----")
-    try:
-        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
-        return [res['href'] for res in results]
-    except:
-        return []
-
-# Function to search Google
-def search_google(query):
-    print("Fetching Google Links -----")
-
-    links = []
-    try:
-        api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
-        search_engine_id = 'c4ca951b9fc6949cb'
-
-        url = f"https://www.googleapis.com/customsearch/v1"
-        params = {
-            "key": api_key,
-            "cx": search_engine_id,
-            "q": query + " manual filetype:pdf"
-        }
-
-        response = requests.get(url, params=params)
-        results = response.json()
-
-        for item in results.get('items', []):
-            links.append(item['link'])
-    except:
-        pass
-
-    try:
-        extension = "ext:pdf"
-        for result in search(query + " manual " + extension, num_results=5):
-            if result.endswith('.pdf'):
-                links.append(result)
-    except:
-        pass
-
-    return links
-
-# Function to search Internet Archive
-def search_archive(query):
-    print("Fetching Archive Links -----")
-
-    try:
-        url = "https://archive.org/advancedsearch.php"
-        params = {
-            'q': f'{query} manual',
-            'fl[]': ['identifier', 'title', 'format'],
-            'rows': 50,
-            'page': 1,
-            'output': 'json'
-        }
-
-        # Make the request
-        response = requests.get(url, params=params)
-        data = response.json()
-
-        # Function to extract hyperlinks from a webpage
-        def extract_hyperlinks(url):
-            # Send a GET request to the URL
-            response = requests.get(url)
-
-            # Check if the request was successful
-            if response.status_code == 200:
-                # Parse the HTML content of the page
-                soup = BeautifulSoup(response.text, 'html.parser')
-
-                # Find all <a> tags (hyperlinks)
-                for link in soup.find_all('a', href=True):
-                    href = link['href']
-                    if href.endswith('.pdf'):
-                        pdf_files.append(url+'/'+href)
-                    if href.endswith('.iso'):
-                        # If the link ends with .iso, follow the link and extract .pdf hyperlinks
-                        extract_pdf_from_iso(url+'/'+href+'/')
-
-        # Function to extract .pdf hyperlinks from an .iso file
-        def extract_pdf_from_iso(iso_url):
-            # Send a GET request to the ISO URL
-            iso_response = requests.get(iso_url)
-
-            # Check if the request was successful
-            if iso_response.status_code == 200:
-                # Parse the HTML content of the ISO page
-                iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
-
-                # Find all <a> tags (hyperlinks) in the ISO page
-                for link in iso_soup.find_all('a', href=True):
-                    href = link['href']
-                    if href.endswith('.pdf'):
-                        pdf_files.append('https:'+href)
-
-        pdf_files = []
-
-        def process_doc(doc):
-            identifier = doc.get('identifier', 'N/A')
-            # title = doc.get('title', 'N/A')
-            # format = doc.get('format', 'N/A')
-            pdf_link = f"https://archive.org/download/{identifier}"
-            extract_hyperlinks(pdf_link)
-
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
-
-            # Optionally, wait for all futures to complete and handle any exceptions
-            for future in concurrent.futures.as_completed(futures):
-                try:
-                    future.result()  # This will raise an exception if the function call raised
-                except Exception as exc:
-                    print(f'Generated an exception: {exc}')
-
-
-        return pdf_files
-
-    except:
-        return []
-
-def search_github(query):
-    print("Fetching Github Links -----")
-
-    try:
-        # GitHub Search API endpoint
-        url = f"https://api.github.com/search/code?q={query}+extension:md"
-
-        headers = {
-            'Authorization': 'Token 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'
-        }
-
-        # Make the request
-        response = requests.get(url,headers=headers)
-        data = response.json()
-        links = [item['html_url'] for item in data['items']]
-
-        return links
-
-    except:
-        return []
-
-def search_wikipedia(product):
-    print("Fetching Wikipedia Links -----")
-
-    api_url = "https://en.wikipedia.org/w/api.php"
-    params = {
-        "action": "opensearch",
-        "search": product,
-        "limit": 5,
-        "namespace": 0,
-        "format": "json"
-    }
-
-    try:
-        response = requests.get(api_url, params=params)
-        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
-        data = response.json()
-
-        if data and len(data) > 3 and len(data[3]) > 0:
-            return data[3]  # The URL is in the fourth element of the response array
-        else:
-            return []
-
-    except requests.RequestException as e:
-        print(f"An error occurred: {e}")
-        return []
-
-# def search_all(product,num):
-
-#     similar_products = extract_similar_products(product)[num]
-
-#     # results = {
-#     #     product : [{'duckduckgo': duckduckgo_search(product)},{'google': google_search(product)},{'github': github_search(product)},{'archive': archive_search(product)}]
-#     # }
-
-#     results = {}
-
-#     def search_product(p):
-#         return {
-#             'product': p,
-#             'duckduckgo': duckduckgo_search(p),
-#             'google': google_search(p),
-#             'github': github_search(p),
-#             'archive': archive_search(p),
-#             'wikipedia': wikipedia_search(p)
-#         }
-
-#     with concurrent.futures.ThreadPoolExecutor() as executor:
-#         future_to_product = {executor.submit(search_product, p): p for p in similar_products}
-
-#         for future in concurrent.futures.as_completed(future_to_product):
-#             result = future.result()
-#             product = result['product']
-#             results[product] = [
-#                 {'duckduckgo': result['duckduckgo']},
-#                 {'google': result['google']},
-#                 {'github': result['github']},
-#                 {'archive': result['archive']},
-#                 {'wikipedia': result['wikipedia']}
-#             ]
-
-#     return results
-
-# Similarity Check -------------------------------------->
-
-def extract_similar_products(query):
-    print(f"\nFetching similar items of -----> {query}")
-    results = DDGS().chat(f'{query} Similar Products')
-
-    pattern = r'^\d+\.\s(.+)$'
-    matches = re.findall(pattern, results, re.MULTILINE)
-    matches = [item.split(': ')[0] for item in matches]
-    return matches
-
+# Library Imports
+import requests
+from bs4 import BeautifulSoup
+from googlesearch import search
+from duckduckgo_search import DDGS
+import concurrent.futures
+import re
+
+
+
+# Search Functions -------------------------------------------------------------->
+
+# Function to search DuckDuckGo
+def search_duckduckgo(query):
+    print("Fetching Duckduckgo Links -----")
+    try:
+        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
+        return [res['href'] for res in results]
+    except:
+        return []
+
+# Function to search Google
+def search_google(query):
+    print("Fetching Google Links -----")
+
+    links = []
+    try:
+        api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
+        search_engine_id = 'c4ca951b9fc6949cb'
+
+        url = f"https://www.googleapis.com/customsearch/v1"
+        params = {
+            "key": api_key,
+            "cx": search_engine_id,
+            "q": query + " manual filetype:pdf"
+        }
+
+        response = requests.get(url, params=params)
+        results = response.json()
+
+        for item in results.get('items', []):
+            links.append(item['link'])
+    except:
+        pass
+
+    try:
+        extension = "ext:pdf"
+        for result in search(query + " manual " + extension, num_results=5):
+            if result.endswith('.pdf'):
+                links.append(result)
+    except:
+        pass
+
+    return links
+
+# Function to search Internet Archive
+def search_archive(query):
+    print("Fetching Archive Links -----")
+
+    try:
+        url = "https://archive.org/advancedsearch.php"
+        params = {
+            'q': f'{query} manual',
+            'fl[]': ['identifier', 'title', 'format'],
+            'rows': 50,
+            'page': 1,
+            'output': 'json'
+        }
+
+        # Make the request
+        response = requests.get(url, params=params)
+        data = response.json()
+
+        # Function to extract hyperlinks from a webpage
+        def extract_hyperlinks(url):
+            # Send a GET request to the URL
+            response = requests.get(url)
+
+            # Check if the request was successful
+            if response.status_code == 200:
+                # Parse the HTML content of the page
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Find all <a> tags (hyperlinks)
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    if href.endswith('.pdf'):
+                        pdf_files.append(url+'/'+href)
+                    if href.endswith('.iso'):
+                        # If the link ends with .iso, follow the link and extract .pdf hyperlinks
+                        extract_pdf_from_iso(url+'/'+href+'/')
+
+        # Function to extract .pdf hyperlinks from an .iso file
+        def extract_pdf_from_iso(iso_url):
+            # Send a GET request to the ISO URL
+            iso_response = requests.get(iso_url)
+
+            # Check if the request was successful
+            if iso_response.status_code == 200:
+                # Parse the HTML content of the ISO page
+                iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
+
+                # Find all <a> tags (hyperlinks) in the ISO page
+                for link in iso_soup.find_all('a', href=True):
+                    href = link['href']
+                    if href.endswith('.pdf'):
+                        pdf_files.append('https:'+href)
+
+        pdf_files = []
+
+        def process_doc(doc):
+            identifier = doc.get('identifier', 'N/A')
+            # title = doc.get('title', 'N/A')
+            # format = doc.get('format', 'N/A')
+            pdf_link = f"https://archive.org/download/{identifier}"
+            extract_hyperlinks(pdf_link)
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
+
+            # Optionally, wait for all futures to complete and handle any exceptions
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    future.result()  # This will raise an exception if the function call raised
+                except Exception as exc:
+                    print(f'Generated an exception: {exc}')
+
+
+        return pdf_files
+
+    except:
+        return []
+
+def search_github(query):
+    print("Fetching Github Links -----")
+
+    try:
+        # GitHub Search API endpoint
+        url = f"https://api.github.com/search/code?q={query}+extension:md"
+
+        headers = {
+            'Authorization': 'Token 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'
+        }
+
+        # Make the request
+        response = requests.get(url,headers=headers)
+        data = response.json()
+        links = [item['html_url'] for item in data['items']]
+
+        return links
+
+    except:
+        return []
+
+def search_wikipedia(product):
+    print("Fetching Wikipedia Links -----")
+
+    api_url = "https://en.wikipedia.org/w/api.php"
+    params = {
+        "action": "opensearch",
+        "search": product,
+        "limit": 5,
+        "namespace": 0,
+        "format": "json"
+    }
+
+    try:
+        response = requests.get(api_url, params=params)
+        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
+        data = response.json()
+
+        if data and len(data) > 3 and len(data[3]) > 0:
+            return data[3]  # The URL is in the fourth element of the response array
+        else:
+            return []
+
+    except requests.RequestException as e:
+        print(f"An error occurred: {e}")
+        return []
+
+# def search_all(product,num):
+
+#     similar_products = extract_similar_products(product)[num]
+
+#     # results = {
+#     #     product : [{'duckduckgo': duckduckgo_search(product)},{'google': google_search(product)},{'github': github_search(product)},{'archive': archive_search(product)}]
+#     # }
+
+#     results = {}
+
+#     def search_product(p):
+#         return {
+#             'product': p,
+#             'duckduckgo': duckduckgo_search(p),
+#             'google': google_search(p),
+#             'github': github_search(p),
+#             'archive': archive_search(p),
+#             'wikipedia': wikipedia_search(p)
+#         }
+
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         future_to_product = {executor.submit(search_product, p): p for p in similar_products}
+
+#         for future in concurrent.futures.as_completed(future_to_product):
+#             result = future.result()
+#             product = result['product']
+#             results[product] = [
+#                 {'duckduckgo': result['duckduckgo']},
+#                 {'google': result['google']},
+#                 {'github': result['github']},
+#                 {'archive': result['archive']},
+#                 {'wikipedia': result['wikipedia']}
+#             ]
+
+#     return results
+
+def search_images(product):
+    results = DDGS().images(f"{product}", max_results=5)
+    # print(results)
+    return [r['image'] for r in results]
+
+
+# Similarity Check -------------------------------------->
+
+def extract_similar_products(query):
+    print(f"\nFetching similar items of -----> {query}")
+    results = DDGS().chat(f'{query} Similar Products')
+
+    pattern = r'^\d+\.\s(.+)$'
+    matches = re.findall(pattern, results, re.MULTILINE)
+    matches = [item.split(': ')[0] for item in matches]
+    return matches
+
+
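A quick usage sketch for the new search_images helper; the 'image' key is what the code above reads from duckduckgo_search's DDGS().images() results, and the try/except mirrors the defensive style used elsewhere in this file rather than anything added by the commit:

    # hypothetical caller, not part of the commit
    from search import search_images

    try:
        urls = search_images('Samsung Galaxy S24')   # up to 5 image URLs
        first_image = urls[0]                        # what get_image_embeddings() consumes
    except Exception as exc:
        print(f'Image search failed: {exc}')
        urls = []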