Prathmesh48 committed on
Commit
9ba9756
1 Parent(s): 9de950b

Upload 8 files

Files changed (7)
  1. api_fast.py +226 -0
  2. app.py +402 -326
  3. embedding.py +425 -370
  4. github_storage.py +77 -0
  5. preprocess.py +2 -3
  6. requirements.txt +32 -28
  7. tokenizer.json +0 -0
api_fast.py ADDED
@@ -0,0 +1,226 @@
1
+ from fastapi import FastAPI, Request, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from pydantic import BaseModel
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ from googlesearch import search
7
+ from duckduckgo_search import DDGS
8
+ import concurrent.futures
9
+ import re
10
+
11
+ app = FastAPI()
12
+
13
+ API_KEY_DEFAULT = '12345'
14
+
15
+ class SearchRequest(BaseModel):
16
+ API_KEY: str
17
+ product: str
18
+
19
+ # Function to search DuckDuckGo
20
+ def duckduckgo_search(query):
21
+ try:
22
+ results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
23
+ return [res['href'] for res in results]
24
+ except:
25
+ return []
26
+
27
+ # Function to search Google
28
+ def google_search(query):
29
+ links = []
30
+ try:
31
+ api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
32
+ search_engine_id = 'c4ca951b9fc6949cb'
33
+
34
+ url = f"https://www.googleapis.com/customsearch/v1"
35
+ params = {
36
+ "key": api_key,
37
+ "cx": search_engine_id,
38
+ "q": query + " manual filetype:pdf"
39
+ }
40
+
41
+ response = requests.get(url, params=params)
42
+ results = response.json()
43
+
44
+ for item in results.get('items', []):
45
+ links.append(item['link'])
46
+ except:
47
+ pass
48
+
49
+ try:
50
+ extension = "ext:pdf"
51
+ for result in search(query + " manual " + extension, num_results=5):
52
+ if result.endswith('.pdf'):
53
+ links.append(result)
54
+ except:
55
+ pass
56
+
57
+ return links
58
+
59
+ # Function to search Internet Archive
60
+ def archive_search(query):
61
+ try:
62
+ url = "https://archive.org/advancedsearch.php"
63
+ params = {
64
+ 'q': f'{query} manual',
65
+ 'fl[]': ['identifier', 'title', 'format'],
66
+ 'rows': 50,
67
+ 'page': 1,
68
+ 'output': 'json'
69
+ }
70
+
71
+ response = requests.get(url, params=params)
72
+ data = response.json()
73
+
74
+ def extract_hyperlinks(url):
75
+ response = requests.get(url)
76
+ if response.status_code == 200:
77
+ soup = BeautifulSoup(response.text, 'html.parser')
78
+ for link in soup.find_all('a', href=True):
79
+ href = link['href']
80
+ if href.endswith('.pdf'):
81
+ pdf_files.append(url + '/' + href)
82
+ if href.endswith('.iso'):
83
+ extract_pdf_from_iso(url + '/' + href + '/')
84
+
85
+ def extract_pdf_from_iso(iso_url):
86
+ iso_response = requests.get(iso_url)
87
+ if iso_response.status_code == 200:
88
+ iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
89
+ for link in iso_soup.find_all('a', href=True):
90
+ href = link['href']
91
+ if href.endswith('.pdf'):
92
+ pdf_files.append('https:' + href)
93
+
94
+ pdf_files = []
95
+
96
+ def process_doc(doc):
97
+ identifier = doc.get('identifier', 'N/A')
98
+ pdf_link = f"https://archive.org/download/{identifier}"
99
+ extract_hyperlinks(pdf_link)
100
+
101
+ with concurrent.futures.ThreadPoolExecutor() as executor:
102
+ futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
103
+ for future in concurrent.futures.as_completed(futures):
104
+ try:
105
+ future.result()
106
+ except Exception as exc:
107
+ print(f'Generated an exception: {exc}')
108
+
109
+ return pdf_files
110
+
111
+ except:
112
+ return []
113
+
114
+ def github_search(query):
115
+ try:
116
+ url = f"https://api.github.com/search/code?q={query}+extension:md"
117
+ headers = {
118
+ 'Authorization': 'Token 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'
119
+ }
120
+ response = requests.get(url, headers=headers)
121
+ data = response.json()
122
+ links = [item['html_url'].replace('/blob','').replace('//github','//raw.github') for item in data['items']]
123
+ return links
124
+
125
+ except:
126
+ return []
127
+
128
+ def extract_similar_products(query):
129
+ results = DDGS().chat(f'{query} Similar Products')
130
+ pattern = r'^\d+\.\s(.+)$'
131
+ matches = re.findall(pattern, results, re.MULTILINE)
132
+ matches = [item.split(': ')[0] for item in matches]
133
+ return matches[:5] if matches else []
134
+
135
+ @app.get('/')
136
+ def read_root():
137
+ return {"message": "Welcome to the search API"}
138
+
139
+ @app.post('/search/google')
140
+ async def search_google(request: SearchRequest):
141
+ if request.API_KEY == API_KEY_DEFAULT:
142
+ results = {request.product: google_search(request.product)}
143
+ similar_products = extract_similar_products(request.product)
144
+ for p in similar_products:
145
+ results[p] = google_search(p)
146
+ return results
147
+ else:
148
+ raise HTTPException(status_code=401, detail="Invalid API key")
149
+
150
+ @app.post('/search/duckduckgo')
151
+ async def search_duckduckgo(request: SearchRequest):
152
+ if request.API_KEY == API_KEY_DEFAULT:
153
+ results = {request.product: duckduckgo_search(request.product)}
154
+ similar_products = extract_similar_products(request.product)
155
+ for p in similar_products:
156
+ results[p] = duckduckgo_search(p)
157
+ return results
158
+ else:
159
+ raise HTTPException(status_code=401, detail="Invalid API key")
160
+
161
+ @app.post('/search/archive')
162
+ async def search_archive(request: SearchRequest):
163
+ if request.API_KEY == API_KEY_DEFAULT:
164
+ results = {request.product: archive_search(request.product)}
165
+ similar_products = extract_similar_products(request.product)
166
+
167
+ def process_product(product):
168
+ return product, archive_search(product)
169
+
170
+ with concurrent.futures.ThreadPoolExecutor() as executor:
171
+ future_to_product = {executor.submit(process_product, p): p for p in similar_products}
172
+ for future in concurrent.futures.as_completed(future_to_product):
173
+ product, result = future.result()
174
+ results[product] = result
175
+
176
+ return results
177
+ else:
178
+ raise HTTPException(status_code=401, detail="Invalid API key")
179
+
180
+ @app.post('/search/github')
181
+ async def search_github(request: SearchRequest):
182
+ if request.API_KEY == API_KEY_DEFAULT:
183
+ results = {request.product: github_search(request.product)}
184
+ similar_products = extract_similar_products(request.product)
185
+ for p in similar_products:
186
+ results[p] = github_search(p)
187
+ return results
188
+ else:
189
+ raise HTTPException(status_code=401, detail="Invalid API key")
190
+
191
+ @app.post('/search/all')
192
+ async def search_all(request: SearchRequest):
193
+ if request.API_KEY == API_KEY_DEFAULT:
194
+ results = {
195
+ request.product: [
196
+ {'duckduckgo': duckduckgo_search(request.product)},
197
+ {'google': google_search(request.product)},
198
+ {'github': github_search(request.product)},
199
+ {'archive': archive_search(request.product)}
200
+ ]
201
+ }
202
+
203
+ def search_product(p):
204
+ return {
205
+ 'product': p,
206
+ 'duckduckgo': duckduckgo_search(p),
207
+ 'google': google_search(p),
208
+ 'github': github_search(p),
209
+ 'archive': archive_search(p)
210
+ }
211
+
212
+ with concurrent.futures.ThreadPoolExecutor() as executor:
213
+ future_to_product = {executor.submit(search_product, p): p for p in extract_similar_products(request.product)}
214
+ for future in concurrent.futures.as_completed(future_to_product):
215
+ result = future.result()
216
+ product = result['product']
217
+ results[product] = [
218
+ {'duckduckgo': result['duckduckgo']},
219
+ {'google': result['google']},
220
+ {'github': result['github']},
221
+ {'archive': result['archive']}
222
+ ]
223
+
224
+ return results
225
+ else:
226
+ raise HTTPException(status_code=401, detail="Invalid API key")
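Note: a minimal client sketch for the endpoints added above, assuming the app is served locally with uvicorn; the base URL, port, and timeout values are placeholders and not part of the commit. Each endpoint expands the query with extract_similar_products, so the response maps the main product and each suggested similar product to its list of links.

```python
# Minimal client sketch for the endpoints defined in api_fast.py.
# Assumes the service is running locally, e.g.:  uvicorn api_fast:app --port 8000
import requests

BASE_URL = "http://localhost:8000"   # placeholder host/port
payload = {"API_KEY": "12345", "product": "Philips led 7w bulb"}

# Single-engine search: returns {product_name: [manual/PDF links], ...}
resp = requests.post(f"{BASE_URL}/search/duckduckgo", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json())

# Fan-out across every engine at once (slower, since archive scraping is included)
resp = requests.post(f"{BASE_URL}/search/all", json=payload, timeout=600)
print(resp.json())
```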
app.py CHANGED
@@ -1,326 +1,402 @@
1
- import streamlit as st
2
- import concurrent.futures
3
- from concurrent.futures import ThreadPoolExecutor, as_completed
4
- from functools import partial
5
- import numpy as np
6
- from io import StringIO
7
- import sys
8
- import time
9
- import pandas as pd
10
- from pymongo import MongoClient
11
- import plotly.express as px
12
- from pinecone import Pinecone, ServerlessSpec
13
- import chromadb
14
- import requests
15
- from io import BytesIO
16
- from PyPDF2 import PdfReader
17
- import hashlib
18
- import os
19
- import shutil
20
-
21
- # File Imports
22
- from embedding import get_embeddings, get_image_embeddings, get_embed_chroma , imporve_text # Ensure this file/module is available
23
- from preprocess import filtering # Ensure this file/module is available
24
- from search import *
25
-
26
-
27
- # Chroma Connections
28
- client = chromadb.PersistentClient(path="embeddings")
29
- collection = client.get_or_create_collection(name="data", metadata={"hnsw:space": "l2"})
30
-
31
-
32
- def zip_folder(folder_path, zip_name):
33
- # Create a zip file from the folder
34
- shutil.make_archive(zip_name, 'zip', folder_path)
35
- return zip_name + '.zip'
36
-
37
- folder_path = '/home/user/app/embeddings'
38
- zip_name = 'embedding'
39
-
40
- # st.title("Download Embedding Folder")
41
-
42
-
43
- def generate_hash(content):
44
- return hashlib.sha256(content.encode('utf-8')).hexdigest()
45
-
46
-
47
- def get_key(link):
48
- text = ''
49
- try:
50
- # Fetch the PDF file from the URL
51
- response = requests.get(link)
52
- response.raise_for_status() # Raise an error for bad status codes
53
-
54
- # Use BytesIO to handle the PDF content in memory
55
- pdf_file = BytesIO(response.content)
56
-
57
- # Load the PDF file
58
- reader = PdfReader(pdf_file)
59
- num_pages = len(reader.pages)
60
-
61
- first_page_text = reader.pages[0].extract_text()
62
- if first_page_text:
63
- text += first_page_text
64
-
65
- last_page_text = reader.pages[-1].extract_text()
66
- if last_page_text:
67
- text += last_page_text
68
-
69
- except requests.exceptions.HTTPError as e:
70
- print(f'HTTP error occurred: {e}')
71
- except Exception as e:
72
- print(f'An error occurred: {e}')
73
-
74
- unique_key = generate_hash(text)
75
-
76
- return unique_key
77
-
78
-
79
- # Cosine Similarity Function
80
- def cosine_similarity(vec1, vec2):
81
- vec1 = np.array(vec1)
82
- vec2 = np.array(vec2)
83
-
84
- dot_product = np.dot(vec1, vec2.T)
85
- magnitude_vec1 = np.linalg.norm(vec1)
86
- magnitude_vec2 = np.linalg.norm(vec2)
87
-
88
- if magnitude_vec1 == 0 or magnitude_vec2 == 0:
89
- return 0.0
90
-
91
- cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
92
- return cosine_sim
93
-
94
-
95
- def update_chroma(product_name, url, key, text, vector, log_area):
96
- id_list = [key + str(i) for i in range(len(text))]
97
-
98
- metadata_list = [
99
- {'key': key,
100
- 'product_name': product_name,
101
- 'url': url,
102
- 'text': item
103
- }
104
- for item in text
105
- ]
106
-
107
- collection.upsert(
108
- ids=id_list,
109
- embeddings=vector,
110
- metadatas=metadata_list
111
- )
112
-
113
- logger.write(f"\n\u2713 Updated DB - {url}\n\n")
114
- log_area.text(logger.getvalue())
115
-
116
-
117
- # Logger class to capture output
118
- class StreamCapture:
119
- def __init__(self):
120
- self.output = StringIO()
121
- self._stdout = sys.stdout
122
-
123
- def __enter__(self):
124
- sys.stdout = self.output
125
- return self.output
126
-
127
- def __exit__(self, exc_type, exc_val, exc_tb):
128
- sys.stdout = self._stdout
129
-
130
-
131
- # Main Function
132
- def score(main_product, main_url, product_count, link_count, search, logger, log_area):
133
- data = {}
134
- similar_products = extract_similar_products(main_product)[:product_count]
135
-
136
- print("--> Fetching Manual Links")
137
- # Normal Filtering + Embedding -----------------------------------------------
138
- if search == 'All':
139
-
140
- def process_product(product, search_function, main_product):
141
- search_result = search_function(product)
142
- return filtering(search_result, main_product, product, link_count)
143
-
144
- search_functions = {
145
- 'google': search_google,
146
- 'duckduckgo': search_duckduckgo,
147
- 'github': search_github,
148
- 'wikipedia': search_wikipedia
149
- }
150
-
151
- with ThreadPoolExecutor() as executor:
152
- future_to_product_search = {
153
- executor.submit(process_product, product, search_function, main_product): (product, search_name)
154
- for product in similar_products
155
- for search_name, search_function in search_functions.items()
156
- }
157
-
158
- for future in as_completed(future_to_product_search):
159
- product, search_name = future_to_product_search[future]
160
- try:
161
- if product not in data:
162
- data[product] = {}
163
- data[product] = future.result()
164
- except Exception as e:
165
- print(f"Error processing product {product} with {search_name}: {e}")
166
-
167
- else:
168
-
169
- for product in similar_products:
170
-
171
- if search == 'google':
172
- data[product] = filtering(search_google(product), main_product, product, link_count)
173
- elif search == 'duckduckgo':
174
- data[product] = filtering(search_duckduckgo(product), main_product, product, link_count)
175
- elif search == 'archive':
176
- data[product] = filtering(search_archive(product), main_product, product, link_count)
177
- elif search == 'github':
178
- data[product] = filtering(search_github(product), main_product, product, link_count)
179
- elif search == 'wikipedia':
180
- data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
181
-
182
- # Filtered Link -----------------------------------------
183
- logger.write("\n\n\u2713 Filtered Links\n")
184
- log_area.text(logger.getvalue())
185
-
186
- # Main product Embeddings ---------------------------------
187
- logger.write("\n\n--> Creating Main product Embeddings\n")
188
-
189
- main_key = get_key(main_url)
190
- main_text, main_vector = get_embed_chroma(main_url)
191
-
192
- update_chroma(main_product, main_url, main_key, main_text, main_vector, log_area)
193
-
194
- # log_area.text(logger.getvalue())
195
- print("\n\n\u2713 Main Product embeddings Created")
196
-
197
- logger.write("\n\n--> Creating Similar product Embeddings\n")
198
- log_area.text(logger.getvalue())
199
- test_embedding = [0] * 768
200
-
201
- for product in data:
202
- for link in data[product]:
203
-
204
- url, _ = link
205
- similar_key = get_key(url)
206
-
207
- res = collection.query(
208
- query_embeddings=[test_embedding],
209
- n_results=1,
210
- where={"key": similar_key},
211
- )
212
-
213
- if not res['distances'][0]:
214
- similar_text, similar_vector = get_embed_chroma(url)
215
- update_chroma(product, url, similar_key, similar_text, similar_vector, log_area)
216
-
217
- logger.write("\n\n\u2713 Similar Product embeddings Created\n")
218
- log_area.text(logger.getvalue())
219
-
220
- top_similar = []
221
-
222
- for idx, chunk in enumerate(main_vector):
223
- res = collection.query(
224
- query_embeddings=[chunk],
225
- n_results=1,
226
- where={"key": {'$ne': main_key}},
227
- include=['metadatas', 'embeddings', 'distances']
228
- )
229
-
230
- top_similar.append((main_text[idx], chunk, res, res['distances'][0]))
231
-
232
- most_similar_items = sorted(top_similar, key=lambda x: x[3])[:top_similar_count]
233
-
234
- logger.write("--------------- DONE -----------------\n")
235
- log_area.text(logger.getvalue())
236
-
237
- return most_similar_items
238
-
239
-
240
- # Streamlit Interface
241
- st.title("Check Infringement")
242
-
243
- # Inputs
244
- with st.sidebar:
245
- st.header("Product Information")
246
- main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
247
- main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
248
-
249
- st.header("Search Settings")
250
- search_method = st.selectbox('Choose Search Engine', ['All', 'duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
251
-
252
- product_count = st.number_input("Number of Similar Products", min_value=1, step=1, format="%i")
253
- link_count = st.number_input("Number of Links per Product", min_value=1, step=1, format="%i")
254
- need_image = st.selectbox("Process Images", ['True', 'False'])
255
-
256
- top_similar_count = st.number_input("Top Similarities to be Displayed", value=3, min_value=1, step=1, format="%i")
257
-
258
- if st.button("Download"):
259
- zip_file = zip_folder(folder_path, zip_name)
260
- with open(zip_file, "rb") as f:
261
- st.download_button(
262
- label="Download ZIP",
263
- data=f,
264
- file_name=zip_file,
265
- mime="application/zip"
266
- )
267
- if st.button('Check for Infringement'):
268
- global log_output # Placeholder for log output
269
-
270
- tab1, tab2 = st.tabs(["Output", "Console"])
271
-
272
- with tab2:
273
- log_output = st.empty()
274
-
275
- with tab1:
276
- with st.spinner('Processing...'):
277
- with StreamCapture() as logger:
278
- top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
279
-
280
- st.success('Processing complete!')
281
-
282
- st.subheader("Cosine Similarity Scores")
283
-
284
- for main_text, main_vector, response, _ in top_similar_values:
285
- product_name = response['metadatas'][0][0]['product_name']
286
- link = response['metadatas'][0][0]['url']
287
- similar_text = response['metadatas'][0][0]['text']
288
-
289
- cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
290
-
291
- # Display the product information
292
- with st.container():
293
- st.markdown(f"### [Product: {product_name}]({link})")
294
- st.markdown(f"#### Cosine Score: {cosine_score:.4f}")
295
- col1, col2 = st.columns(2)
296
- with col1:
297
- st.markdown(f"**Main Text:** {imporve_text(main_text)}")
298
- with col2:
299
- st.markdown(f"**Similar Text:** {imporve_text(similar_text)}")
300
-
301
- st.markdown("---")
302
-
303
- if need_image == 'True':
304
- with st.spinner('Processing Images...'):
305
- emb_main = get_image_embeddings(main_product)
306
- similar_prod = extract_similar_products(main_product)[0]
307
- emb_similar = get_image_embeddings(similar_prod)
308
-
309
- similarity_matrix = np.zeros((5, 5))
310
- for i in range(5):
311
- for j in range(5):
312
- similarity_matrix[i][j] = cosine_similarity([emb_main[i]], [emb_similar[j]])[0][0]
313
-
314
- st.subheader("Image Similarity")
315
- # Create an interactive heatmap
316
- fig = px.imshow(similarity_matrix,
317
- labels=dict(x=f"{similar_prod} Images", y=f"{main_product} Images", color="Similarity"),
318
- x=[f"Image {i+1}" for i in range(5)],
319
- y=[f"Image {i+1}" for i in range(5)],
320
- color_continuous_scale="Viridis")
321
-
322
- # Add title to the heatmap
323
- fig.update_layout(title="Image Similarity Heatmap")
324
-
325
- # Display the interactive heatmap
326
- st.plotly_chart(fig)
1
+ import streamlit as st
2
+ import concurrent.futures
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from functools import partial
5
+ import numpy as np
6
+ from io import StringIO
7
+ import sys
8
+ import time
9
+ import pandas as pd
10
+ from pymongo import MongoClient
11
+ import plotly.express as px
12
+ from pinecone import Pinecone, ServerlessSpec
13
+ import chromadb
14
+ import requests
15
+ from io import BytesIO
16
+ from PyPDF2 import PdfReader
17
+ import hashlib
18
+ import os
19
+ from plotly.subplots import make_subplots
20
+ import plotly.graph_objects as go
21
+ from PIL import Image
22
+ import shutil
23
+
24
+
25
+ # File Imports
26
+ from embedding import get_embeddings, get_image_embeddings, get_embed_chroma,imporve_text # Ensure this file/module is available
27
+ from preprocess import filtering # Ensure this file/module is available
28
+ from github_storage import update_db,download_db
29
+ from search import *
30
+
31
+
32
+ # Chroma Connections
33
+ try:
34
+ client = chromadb.PersistentClient(path="embeddings")
35
+ collection = client.get_or_create_collection(name="data", metadata={"hnsw:space": "l2"})
36
+ except:
37
+ pass
38
+
39
+
40
+
41
+ def generate_hash(content):
42
+ return hashlib.sha256(content.encode('utf-8')).hexdigest()
43
+
44
+
45
+ def get_key(link):
46
+ text = ''
47
+ try:
48
+ # Fetch the PDF file from the URL
49
+ response = requests.get(link)
50
+ response.raise_for_status() # Raise an error for bad status codes
51
+
52
+ # Use BytesIO to handle the PDF content in memory
53
+ pdf_file = BytesIO(response.content)
54
+
55
+ # Load the PDF file
56
+ reader = PdfReader(pdf_file)
57
+ num_pages = len(reader.pages)
58
+
59
+ first_page_text = reader.pages[0].extract_text()
60
+ if first_page_text:
61
+ text += first_page_text
62
+
63
+ last_page_text = reader.pages[-1].extract_text()
64
+ if last_page_text:
65
+ text += last_page_text
66
+
67
+ except requests.exceptions.HTTPError as e:
68
+ print(f'HTTP error occurred: {e}')
69
+ except Exception as e:
70
+ print(f'An error occurred: {e}')
71
+
72
+ unique_key = generate_hash(text)
73
+
74
+ return unique_key
75
+
76
+
77
+ # Cosine Similarity Function
78
+ def cosine_similarity(vec1, vec2):
79
+ vec1 = np.array(vec1)
80
+ vec2 = np.array(vec2)
81
+
82
+ dot_product = np.dot(vec1, vec2.T)
83
+ magnitude_vec1 = np.linalg.norm(vec1)
84
+ magnitude_vec2 = np.linalg.norm(vec2)
85
+
86
+ if magnitude_vec1 == 0 or magnitude_vec2 == 0:
87
+ return 0.0
88
+
89
+ cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
90
+ return cosine_sim
91
+
92
+
93
+ def update_chroma(product_name, url, key, text, vector, log_area):
94
+ if len(text) > 0:
95
+ id_list = [key + str(i) for i in range(len(text))]
96
+
97
+ metadata_list = [
98
+ {'key': key,
99
+ 'product_name': product_name,
100
+ 'url': url,
101
+ 'text': item
102
+ }
103
+ for item in text
104
+ ]
105
+
106
+ collection.upsert(
107
+ ids=id_list,
108
+ embeddings=vector,
109
+ metadatas=metadata_list
110
+ )
111
+
112
+ logger.write(f"\n\u2713 Updated DB - {url}\n\n")
113
+ log_area.text(logger.getvalue())
114
+
115
+ return True
116
+
117
+ return False
118
+
119
+
120
+ # Logger class to capture output
121
+ class StreamCapture:
122
+ def __init__(self):
123
+ self.output = StringIO()
124
+ self._stdout = sys.stdout
125
+
126
+ def __enter__(self):
127
+ sys.stdout = self.output
128
+ return self.output
129
+
130
+ def __exit__(self, exc_type, exc_val, exc_tb):
131
+ sys.stdout = self._stdout
132
+
133
+
134
+ # Main Function
135
+ def score(main_product, main_url, product_count, link_count, search, logger, log_area):
136
+ data = {}
137
+ similar_products = extract_similar_products(main_product)[:product_count]
138
+
139
+ if len(similar_products) < 1:
140
+ st.warning(f'No Similar Products Found for {main_product}. Please Be More Specific With Product Name')
141
+
142
+
143
+ print("--> Fetching Manual Links")
144
+ # Normal Filtering + Embedding -----------------------------------------------
145
+ if search == 'All':
146
+
147
+ def process_product(product, search_function, main_product):
148
+ search_result = search_function(product)
149
+ return filtering(search_result, main_product, product, link_count)
150
+
151
+ search_functions = {
152
+ 'google': search_google,
153
+ 'duckduckgo': search_duckduckgo,
154
+ 'github': search_github,
155
+ 'wikipedia': search_wikipedia
156
+ }
157
+
158
+ with ThreadPoolExecutor() as executor:
159
+ future_to_product_search = {
160
+ executor.submit(process_product, product, search_function, main_product): (product, search_name)
161
+ for product in similar_products
162
+ for search_name, search_function in search_functions.items()
163
+ }
164
+
165
+ for future in as_completed(future_to_product_search):
166
+ product, search_name = future_to_product_search[future]
167
+ try:
168
+ if product not in data:
169
+ data[product] = {}
170
+ data[product] = future.result()
171
+ except Exception as e:
172
+ print(f"Error processing product {product} with {search_name}: {e}")
173
+
174
+ else:
175
+
176
+ for product in similar_products:
177
+
178
+ if search == 'google':
179
+ data[product] = filtering(search_google(product), main_product, product, link_count)
180
+ elif search == 'duckduckgo':
181
+ data[product] = filtering(search_duckduckgo(product), main_product, product, link_count)
182
+ elif search == 'archive':
183
+ data[product] = filtering(search_archive(product), main_product, product, link_count)
184
+ elif search == 'github':
185
+ data[product] = filtering(search_github(product), main_product, product, link_count)
186
+ elif search == 'wikipedia':
187
+ data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
188
+
189
+ # Filtered Link -----------------------------------------
190
+ logger.write("\n\n\u2713 Filtered Links\n")
191
+ log_area.text(logger.getvalue())
192
+
193
+ # Main product Embeddings ---------------------------------
194
+ logger.write("\n\n--> Creating Main product Embeddings\n")
195
+
196
+ main_key = get_key(main_url)
197
+ main_text, main_vector = get_embed_chroma(main_url)
198
+
199
+ readable = update_chroma(main_product, main_url, main_key, main_text, main_vector, log_area)
200
+
201
+ if readable:
202
+ # log_area.text(logger.getvalue())
203
+ print("\n\n\u2713 Main Product embeddings Created")
204
+
205
+ logger.write("\n\n--> Creating Similar product Embeddings\n")
206
+ log_area.text(logger.getvalue())
207
+ test_embedding = [0] * 768
208
+
209
+ for product in data:
210
+ for link in data[product]:
211
+
212
+ url, _ = link
213
+ similar_key = get_key(url)
214
+
215
+ res = collection.query(
216
+ query_embeddings=[test_embedding],
217
+ n_results=1,
218
+ where={"key": similar_key},
219
+ )
220
+
221
+ if not res['distances'][0]:
222
+ similar_text, similar_vector = get_embed_chroma(url)
223
+ update_chroma(product, url, similar_key, similar_text, similar_vector, log_area)
224
+
225
+ logger.write("\n\n\u2713 Similar Product embeddings Created\n")
226
+ log_area.text(logger.getvalue())
227
+
228
+ top_similar = []
229
+
230
+ for idx, chunk in enumerate(main_vector):
231
+ res = collection.query(
232
+ query_embeddings=[chunk],
233
+ n_results=1,
234
+ where={"key": {'$ne': main_key}},
235
+ include=['metadatas', 'embeddings', 'distances']
236
+ )
237
+
238
+ top_similar.append((main_text[idx], chunk, res, res['distances'][0]))
239
+
240
+ most_similar_items = sorted(top_similar, key=lambda x: x[3])[:top_similar_count]
241
+
242
+ logger.write("--------------- DONE -----------------\n")
243
+ log_area.text(logger.getvalue())
244
+
245
+ return most_similar_items
246
+
247
+ return []
248
+
249
+
250
+ # Streamlit Interface
251
+
252
+ st.title("🔍 Infringement Checker")
253
+
254
+ # Inputs
255
+ with st.sidebar:
256
+ st.header("📋 Product Information")
257
+ main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
258
+ main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
259
+
260
+ st.header("🔎 Search Settings")
261
+ search_method = st.selectbox('Choose Search Engine', ['All', 'duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
262
+
263
+ product_count = st.number_input("Number of Similar Products", min_value=1, step=1, format="%i")
264
+ link_count = st.number_input("Number of Links per Product", min_value=1, step=1, format="%i")
265
+ need_image = st.selectbox("Process Images", ['True', 'False'])
266
+
267
+ top_similar_count = st.number_input("Top Similarities to be Displayed", value=3, min_value=1, step=1, format="%i")
268
+
269
+
270
+ col1_main,col2_main = st.columns([7,3])
271
+
272
+ with col1_main:
273
+ run_streamlit = st.button('Check for Infringement')
274
+
275
+
276
+ if run_streamlit:
277
+ global log_output
278
+
279
+ tab1, tab2 = st.tabs(["📊 Output", "🖥️ Console"])
280
+
281
+ with tab2:
282
+ log_output = st.empty()
283
+
284
+ with tab1:
285
+ with st.spinner('Processing...'):
286
+
287
+ if len(os.listdir('/home/user/app/embeddings'))<2:
288
+ download_db()
289
+ print("\u2713 Downloaded Database\n\n")
290
+
291
+ with StreamCapture() as logger:
292
+ top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
293
+
294
+ st.success('✅ Processing complete!')
295
+
296
+ st.subheader("📈 Cosine Similarity Scores")
297
+
298
+ if len(top_similar_values) > 0:
299
+
300
+ for main_text, main_vector, response, _ in top_similar_values:
301
+ product_name = response['metadatas'][0][0]['product_name']
302
+ link = response['metadatas'][0][0]['url']
303
+ similar_text = response['metadatas'][0][0]['text']
304
+ # similar_text_refined = imporve_text(similar_text)
305
+ # main_text_refined = imporve_text(main_text)
306
+
307
+ cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
308
+
309
+ # Display the product information
310
+ with st.expander(f"### Product: {product_name} - Score: {cosine_score:.4f}"):
311
+ link = link.replace(" ","%20")
312
+ st.markdown(f"[View Product Manual]({link})")
313
+ tab1, tab2 = st.tabs(["Raw Text", "Refined Text"])
314
+ with tab2:
315
+ col1, col2 = st.columns(2)
316
+ with col1:
317
+ st.markdown(f"*Main Text:\n* {imporve_text(main_text)}")
318
+ with col2:
319
+ st.markdown(f"*Similar Text\n:* {imporve_text(similar_text)}")
320
+
321
+ with tab1:
322
+ col1, col2 = st.columns(2)
323
+ with col1:
324
+ st.markdown(f"*Main Text:* {main_text}")
325
+ with col2:
326
+ st.markdown(f"*Similar Text:* {similar_text}")
327
+
328
+ else:
329
+ st.warning("Main Product Document isn't Readable!")
330
+
331
+ if need_image == 'True':
332
+ with st.spinner('Processing Images...'):
333
+ emb_main , main_prod_imgs = get_image_embeddings(main_product)
334
+ similar_prod = extract_similar_products(main_product)[0]
335
+ emb_similar , similar_prod_imgs = get_image_embeddings(similar_prod)
336
+ if similar_prod:
337
+ similarity_matrix = np.zeros((5, 5))
338
+ for i in range(5):
339
+ for j in range(5):
340
+ similarity_matrix[i][j] = cosine_similarity([emb_main[i]], [emb_similar[j]])[0][0]
341
+
342
+ st.subheader("Image Similarity")
343
+ # Create an interactive heatmap
344
+ fig = px.imshow(similarity_matrix,
345
+ labels=dict(x=f"{similar_prod} Images", y=f"{main_product} Images", color="Similarity"),
346
+ x=[f"Image {i+1}" for i in range(5)],
347
+ y=[f"Image {i+1}" for i in range(5)],
348
+ color_continuous_scale="Viridis")
349
+
350
+ # Add title to the heatmap
351
+ fig.update_layout(title="Image Similarity Heatmap")
352
+
353
+ # Display the interactive heatmap
354
+ st.plotly_chart(fig)
355
+
356
+
357
+
358
+ @st.experimental_fragment
359
+ def image_viewer():
360
+ # Form to handle image selection
361
+
362
+ st.subheader("Image Viewer")
363
+
364
+ selected_row = st.selectbox('Select a row (Main Product Image)', [f'Image {i+1}' for i in range(5)])
365
+ selected_col = st.selectbox('Select a column (Similar Product Image)', [f'Image {i+1}' for i in range(5)])
366
+
367
+ # Get the selected indices from session state
368
+ row_idx = int(selected_row.split()[1]) - 1
369
+ col_idx = int(selected_col.split()[1]) - 1
370
+
371
+ col1, col2 = st.columns(2)
372
+
373
+ with col1:
374
+ st.image(main_prod_imgs[row_idx], caption=f'Main Product Image {row_idx+1}', use_column_width=True)
375
+ with col2:
376
+ st.image(similar_prod_imgs[col_idx], caption=f'Similar Product Image {col_idx+1}', use_column_width=True)
377
+
378
+ # Call the fragment
379
+ image_viewer()
380
+
381
+
382
+ @st.experimental_dialog("Confirm Database Backup")
383
+ def update():
384
+ st.write("Do you want to backup the new changes in the database?")
385
+ if st.button("Confirm",type="primary"):
386
+ st.write("Updating Database....")
387
+ st.session_state.update = {"Done": True}
388
+
389
+ update_db()
390
+
391
+ st.success('Backup Complete!', icon="✅")
392
+ time.sleep(2)
393
+ st.rerun()
394
+
395
+ if "update" not in st.session_state:
396
+ with col2_main:
397
+ update_button = st.button("Update Database",type="primary")
398
+ if update_button:
399
+ update()
400
+
401
+
402
+
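Note on the similarity math used in app.py: cosine_similarity is called as cosine_similarity([main_vector], response['embeddings'][0])[0][0], i.e. a one-element query list against the retrieved embedding matrix, giving a 1×n array whose [0][0] entry is the scalar score. Below is a standalone NumPy sketch of that shape convention, with made-up 3-d vectors standing in for the real 768-d chunk embeddings.

```python
# Standalone sketch of the cosine_similarity shape convention used in app.py.
# Vectors here are illustrative; in the app they are 768-d chunk embeddings.
import numpy as np

def cosine_similarity(vec1, vec2):
    vec1, vec2 = np.array(vec1), np.array(vec2)
    dot_product = np.dot(vec1, vec2.T)               # (1, d) @ (d, n) -> (1, n)
    m1, m2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    return 0.0 if m1 == 0 or m2 == 0 else dot_product / (m1 * m2)

query = [0.1, 0.3, 0.5]
retrieved = [[0.1, 0.3, 0.5]]                         # one retrieved embedding
score = cosine_similarity([query], retrieved)[0][0]   # scalar; ~1.0 for identical vectors
print(round(float(score), 4))
```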
embedding.py CHANGED
@@ -1,370 +1,425 @@
1
- from PyPDF2 import PdfReader
2
- import requests
3
- import json
4
- import os
5
- import concurrent.futures
6
- import random
7
- from langchain_google_genai import ChatGoogleGenerativeAI
8
- from langchain_community.document_loaders import WebBaseLoader
9
- from langchain_community.document_loaders import PyPDFLoader
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- import google.generativeai as genai
12
- from langchain_core.messages import HumanMessage
13
- from io import BytesIO
14
- import numpy as np
15
- import re
16
- import torch
17
- from transformers import AutoTokenizer, AutoModel
18
-
19
- from search import search_images
20
-
21
- gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
22
- gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
23
- gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
24
- gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
25
-
26
- vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
27
- vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
28
- vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
29
- vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
30
-
31
- tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
32
- model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
33
- model.to('cpu') # Ensure the model is on the CPU
34
-
35
-
36
- genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
37
-
38
- def pdf_extractor(link):
39
- text = ''
40
-
41
- try:
42
- # Fetch the PDF file from the URL
43
- response = requests.get(link)
44
- response.raise_for_status() # Raise an error for bad status codes
45
-
46
- # Use BytesIO to handle the PDF content in memory
47
- pdf_file = BytesIO(response.content)
48
-
49
- # Load the PDF file
50
- reader = PdfReader(pdf_file)
51
- for page in reader.pages:
52
- text += page.extract_text() # Extract text from each page
53
-
54
- except requests.exceptions.HTTPError as e:
55
- print(f'HTTP error occurred: {e}')
56
- except Exception as e:
57
- print(f'An error occurred: {e}')
58
-
59
- return text
60
-
61
- def web_extractor(link):
62
- text = ''
63
-
64
- try:
65
- loader = WebBaseLoader(link)
66
- pages = loader.load_and_split()
67
-
68
- for page in pages:
69
- text+=page.page_content
70
- except:
71
- pass
72
-
73
- return text
74
-
75
- def imporve_text(text):
76
-
77
- prompt = f'''
78
- Please rewrite the following text to make it short, concise, and of high quality.
79
- Ensure that all essential information and key points are retained.
80
- Focus on improving clarity, coherence, and word choice without altering the original meaning.
81
-
82
- text = {text}
83
- '''
84
-
85
- model = random.choice([gemini,gemini1,gemini2,gemini3])
86
- result = model.invoke(prompt)
87
-
88
- return result.content
89
-
90
- def feature_extraction(tag, history , context):
91
-
92
- prompt = f'''
93
- You are an intelligent assistant tasked with updating product information. You have two data sources:
94
- 1. Tag_History: Previously gathered information about the product.
95
- 2. Tag_Context: New data that might contain additional details.
96
- Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
97
- Guidelines:
98
- - Only add new details that are relevant to the {tag} FIELD.
99
- - Do not add or modify any other fields in the Tag_History.
100
- - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
101
- Here is the data:
102
- Tag_Context: {str(context)}
103
- Tag_History: {history}
104
- Respond with the updated Tag_History.
105
- '''
106
-
107
- model = random.choice([gemini,gemini1,gemini2,gemini3])
108
- result = model.invoke(prompt)
109
-
110
- return result.content
111
-
112
- def feature_extraction_image(url):
113
- text = ' '
114
- model = genai.GenerativeModel('gemini-1.5-flash-001')
115
- try:
116
- res = model.generate_content(['Describe this image to me',url])
117
- text = res.text
118
-
119
- except:
120
- pass
121
- return text
122
-
123
- def detailed_feature_extraction(find, context):
124
-
125
- prompt = f'''
126
- You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
127
- 1. Context: The gathered information about the product.
128
- 2. Format: Details which need to be filled based on Context.
129
- Your job is to read the Context and update the relevant field in Format using Context.
130
- Guidelines:
131
- - Only add details that are relevant to the individual FIELD.
132
- - Do not add or modify any other fields in the Format.
133
- - If nothing found return None.
134
- Here is the data:
135
- The Context is {str(context)}
136
- The Format is {str(find)}
137
- '''
138
-
139
- model = random.choice([gemini,gemini1,gemini2,gemini3])
140
- result = model.invoke(prompt)
141
-
142
- return result.content
143
-
144
- def detailed_history(history):
145
-
146
- details = {
147
- "Introduction": {
148
- "Product Name": None,
149
- "Overview of the product": None,
150
- "Purpose of the manual": None,
151
- "Audience": None,
152
- "Additional Details": None
153
- },
154
- "Specifications": {
155
- "Technical specifications": None,
156
- "Performance metrics": None,
157
- "Additional Details": None
158
- },
159
- "Product Overview": {
160
- "Product features": None,
161
- "Key components and parts": None,
162
- "Additional Details": None
163
- },
164
- "Safety Information": {
165
- "Safety warnings and precautions": None,
166
- "Compliance and certification information": None,
167
- "Additional Details": None
168
- },
169
- "Installation Instructions": {
170
- "Unboxing and inventory checklist": None,
171
- "Step-by-step installation guide": None,
172
- "Required tools and materials": None,
173
- "Additional Details": None
174
- },
175
- "Setup and Configuration": {
176
- "Initial setup procedures": None,
177
- "Configuration settings": None,
178
- "Troubleshooting setup issues": None,
179
- "Additional Details": None
180
- },
181
- "Operation Instructions": {
182
- "How to use the product": None,
183
- "Detailed instructions for different functionalities": None,
184
- "User interface guide": None,
185
- "Additional Details": None
186
- },
187
- "Maintenance and Care": {
188
- "Cleaning instructions": None,
189
- "Maintenance schedule": None,
190
- "Replacement parts and accessories": None,
191
- "Additional Details": None
192
- },
193
- "Troubleshooting": {
194
- "Common issues and solutions": None,
195
- "Error messages and their meanings": None,
196
- "Support Information": None,
197
- "Additional Details": None
198
- },
199
- "Warranty Information": {
200
- "Terms and Conditions": None,
201
- "Service and repair information": None,
202
- "Additional Details": None
203
- },
204
- "Legal Information": {
205
- "Copyright information": None,
206
- "Trademarks and patents": None,
207
- "Disclaimers": None,
208
- "Additional Details": None
209
-
210
- }
211
- }
212
-
213
- for key,val in history.items():
214
-
215
- find = details[key]
216
-
217
- details[key] = str(detailed_feature_extraction(find,val))
218
-
219
- return details
220
-
221
-
222
- def get_embeddings(link,tag_option):
223
-
224
- print(f"\n--> Creating Embeddings - {link}")
225
-
226
- if tag_option=='Complete Document Similarity':
227
- history = { "Details": "" }
228
-
229
- else:
230
- history = {
231
- "Introduction": "",
232
- "Specifications": "",
233
- "Product Overview": "",
234
- "Safety Information": "",
235
- "Installation Instructions": "",
236
- "Setup and Configuration": "",
237
- "Operation Instructions": "",
238
- "Maintenance and Care": "",
239
- "Troubleshooting": "",
240
- "Warranty Information": "",
241
- "Legal Information": ""
242
- }
243
-
244
- # Extract Text -----------------------------
245
- print("Extracting Text")
246
- if link[-3:] == '.md' or link[8:11] == 'en.':
247
- text = web_extractor(link)
248
- else:
249
- text = pdf_extractor(link)
250
-
251
- # Create Chunks ----------------------------
252
- print("Writing Tag Data")
253
-
254
- if tag_option=="Complete Document Similarity":
255
- history["Details"] = feature_extraction("Details", history["Details"], text[0][:50000])
256
-
257
- else:
258
- chunks = text_splitter.create_documents(text)
259
-
260
- for chunk in chunks:
261
-
262
- with concurrent.futures.ThreadPoolExecutor() as executor:
263
- future_to_key = {
264
- executor.submit(
265
- feature_extraction, f"Product {key}", history[key], chunk.page_content
266
- ): key for key in history
267
- }
268
- for future in concurrent.futures.as_completed(future_to_key):
269
- key = future_to_key[future]
270
- try:
271
- response = future.result()
272
- history[key] = response
273
- except Exception as e:
274
- print(f"Error processing {key}: {e}")
275
-
276
- print("Creating Vectors")
277
- genai_embeddings=[]
278
-
279
- for tag in history:
280
- result = genai.embed_content(
281
- model="models/embedding-001",
282
- content=history[tag],
283
- task_type="retrieval_document")
284
- genai_embeddings.append(result['embedding'])
285
-
286
-
287
- return history,genai_embeddings
288
-
289
- def get_embed_chroma(link):
290
-
291
- print(f"\n--> Creating Embeddings - {link}")
292
-
293
- # Extract Text -----------------------------
294
- if link[-3:] == '.md' or link[8:11] == 'en.':
295
- text = web_extractor(link)
296
- else:
297
- text = pdf_extractor(link)
298
- print("\u2713 Extracting Text")
299
-
300
- # Create Chunks ----------------------------
301
-
302
- text = re.sub(r'\.{2,}', '.', text)
303
- text = re.sub(r'\s{2,}', ' ', text)
304
- text = [re.sub(r'\n{2,}', '\n', text)]
305
-
306
- chunks = text_splitter_small.create_documents(text)
307
- print("\u2713 Writing Tag Data")
308
-
309
- # Creating Vector
310
- embedding_vectors=[]
311
- textual_data = []
312
- print("\u2713 Creating Vectors")
313
-
314
-
315
- for text in chunks:
316
-
317
- inputs = tokenizer(text.page_content, return_tensors="pt", padding=True, truncation=True)
318
- inputs = {k: v.to('cpu') for k, v in inputs.items()}
319
-
320
- # Get the model's outputs
321
- with torch.no_grad():
322
- outputs = model(**inputs)
323
-
324
- embeddings = outputs.last_hidden_state.mean(dim=1)
325
- embedding_vectors.append(embeddings.squeeze().cpu().numpy().tolist())
326
- textual_data.append(text.page_content)
327
-
328
- return textual_data , embedding_vectors
329
-
330
-
331
-
332
- def get_image_embeddings(Product):
333
- image_embeddings = []
334
-
335
- links = search_images(Product)
336
- with concurrent.futures.ThreadPoolExecutor() as executor:
337
- descriptions = list(executor.map(feature_extraction_image, links))
338
-
339
- for description in descriptions:
340
- result = genai.embed_content(
341
- model="models/embedding-001",
342
- content=description,
343
- task_type="retrieval_document")
344
-
345
- image_embeddings.append(result['embedding'])
346
- # print(image_embeddings)
347
- return image_embeddings
348
-
349
-
350
-
351
- global text_splitter
352
- global data
353
- global history
354
-
355
-
356
- text_splitter = RecursiveCharacterTextSplitter(
357
- chunk_size = 10000,
358
- chunk_overlap = 100,
359
- separators = ["",''," "]
360
- )
361
-
362
- text_splitter_small = RecursiveCharacterTextSplitter(
363
- chunk_size = 2000,
364
- chunk_overlap = 100,
365
- separators = ["",''," "]
366
- )
367
-
368
- if __name__ == '__main__':
369
- # print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
370
- print(get_image_embeddings(Product='Samsung Galaxy S24'))
1
+ from PyPDF2 import PdfReader
2
+ import requests
3
+ import json
4
+ import os
5
+ import concurrent.futures
6
+ import random
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+ from langchain_community.document_loaders import WebBaseLoader
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ import google.generativeai as genai
12
+ from langchain_core.messages import HumanMessage
13
+ from io import BytesIO
14
+ import numpy as np
15
+ import re
16
+ import torch
17
+ from transformers import AutoTokenizer, AutoModel
18
+ import numpy as np
19
+ import onnxruntime as ort
20
+ # import torch._dynamo
21
+ import time
22
+ # torch._dynamo.config.suppress_errors = True
23
+
24
+ from search import search_images
25
+
26
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
27
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
28
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
29
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
30
+
31
+ vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
32
+ vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
33
+ vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
34
+ vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
35
+
36
+ tokenizer = AutoTokenizer.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
37
+ # model = AutoModel.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
38
+ model_path = "model_opt2_QInt8.onnx"
39
+
40
+ session = ort.InferenceSession(model_path)
41
+ # model = torch.compile(model)
42
+ # model.to('cpu') # Ensure the model is on the CPU
43
+
44
+ from transformers import PreTrainedTokenizerFast
45
+
46
+ class TokenBasedTextSplitter:
47
+ def __init__(self, tokenizer_path='tokenizer.json', chunk_size=2000, chunk_overlap=50):
48
+ self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
49
+ self.chunk_size = chunk_size
50
+ self.chunk_overlap = chunk_overlap
51
+
52
+ def split_text(self, text):
53
+ tokens = self.tokenizer.tokenize(text)
54
+ chunks = []
55
+
56
+ for i in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
57
+ chunk = tokens[i:i + self.chunk_size]
58
+ chunks.append(self.tokenizer.convert_tokens_to_string(chunk))
59
+
60
+ return chunks
61
+
62
+
63
+
64
+ genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
65
+
66
+ def pdf_extractor(link):
67
+ text = ''
68
+
69
+ try:
70
+ # Fetch the PDF file from the URL
71
+ response = requests.get(link)
72
+ response.raise_for_status() # Raise an error for bad status codes
73
+
74
+ # Use BytesIO to handle the PDF content in memory
75
+ pdf_file = BytesIO(response.content)
76
+
77
+ # Load the PDF file
78
+ reader = PdfReader(pdf_file)
79
+ for page in reader.pages:
80
+ text += page.extract_text() # Extract text from each page
81
+
82
+ except requests.exceptions.HTTPError as e:
83
+ print(f'HTTP error occurred: {e}')
84
+ except Exception as e:
85
+ print(f'An error occurred: {e}')
86
+
87
+ return text
88
+
89
+ def web_extractor(link):
90
+ text = ''
91
+
92
+ try:
93
+ loader = WebBaseLoader(link)
94
+ pages = loader.load_and_split()
95
+
96
+ for page in pages:
97
+ text+=page.page_content
98
+ except:
99
+ pass
100
+
101
+ return text
102
+
103
+ def imporve_text(text):
104
+
105
+ prompt = f'''
106
+ Please rewrite the following text to make it short, descriptive, concise, and of high quality.
107
+ Ensure that all essential information is retained.
108
+ Focus on improving clarity, coherence, and word choice without altering the original meaning.
109
+
110
+ text = {text}
111
+ '''
112
+
113
+ model = random.choice([gemini,gemini1,gemini2,gemini3])
114
+ result = model.invoke(prompt)
115
+
116
+ return result.content
117
+
118
+ def feature_extraction(tag, history , context):
119
+
120
+ prompt = f'''
121
+ You are an intelligent assistant tasked with updating product information. You have two data sources:
122
+ 1. Tag_History: Previously gathered information about the product.
123
+ 2. Tag_Context: New data that might contain additional details.
124
+ Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
125
+ Guidelines:
126
+ - Only add new details that are relevant to the {tag} FIELD.
127
+ - Do not add or modify any other fields in the Tag_History.
128
+ - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
129
+ Here is the data:
130
+ Tag_Context: {str(context)}
131
+ Tag_History: {history}
132
+ Respond with the updated Tag_History.
133
+ '''
134
+
135
+ model = random.choice([gemini,gemini1,gemini2,gemini3])
136
+ result = model.invoke(prompt)
137
+
138
+ return result.content
139
+
140
+ def feature_extraction_image(url):
141
+ text = ' '
142
+ model = genai.GenerativeModel('gemini-1.5-flash-001')
143
+ try:
144
+ res = model.generate_content(['Describe this image to me',url])
145
+ text = res.text
146
+
147
+ except:
148
+ pass
149
+ return text
150
+
151
+ def detailed_feature_extraction(find, context):
152
+
153
+ prompt = f'''
154
+ You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
155
+ 1. Context: The gathered information about the product.
156
+ 2. Format: Details which need to be filled based on Context.
157
+ Your job is to read the Context and update the relevant field in Format using Context.
158
+ Guidelines:
159
+ - Only add details that are relevant to the individual FIELD.
160
+ - Do not add or modify any other fields in the Format.
161
+ - If nothing found return None.
162
+ Here is the data:
163
+ The Context is {str(context)}
164
+ The Format is {str(find)}
165
+ '''
166
+
167
+ model = random.choice([gemini,gemini1,gemini2,gemini3])
168
+ result = model.invoke(prompt)
169
+
170
+ return result.content
171
+
172
+ def detailed_history(history):
173
+
174
+ details = {
175
+ "Introduction": {
176
+ "Product Name": None,
177
+ "Overview of the product": None,
178
+ "Purpose of the manual": None,
179
+ "Audience": None,
180
+ "Additional Details": None
181
+ },
182
+ "Specifications": {
183
+ "Technical specifications": None,
184
+ "Performance metrics": None,
185
+ "Additional Details": None
186
+ },
187
+ "Product Overview": {
188
+ "Product features": None,
189
+ "Key components and parts": None,
190
+ "Additional Details": None
191
+ },
192
+ "Safety Information": {
193
+ "Safety warnings and precautions": None,
194
+ "Compliance and certification information": None,
195
+ "Additional Details": None
196
+ },
197
+ "Installation Instructions": {
198
+ "Unboxing and inventory checklist": None,
199
+ "Step-by-step installation guide": None,
200
+ "Required tools and materials": None,
201
+ "Additional Details": None
202
+ },
203
+ "Setup and Configuration": {
204
+ "Initial setup procedures": None,
205
+ "Configuration settings": None,
206
+ "Troubleshooting setup issues": None,
207
+ "Additional Details": None
208
+ },
209
+ "Operation Instructions": {
210
+ "How to use the product": None,
211
+ "Detailed instructions for different functionalities": None,
212
+ "User interface guide": None,
213
+ "Additional Details": None
214
+ },
215
+ "Maintenance and Care": {
216
+ "Cleaning instructions": None,
217
+ "Maintenance schedule": None,
218
+ "Replacement parts and accessories": None,
219
+ "Additional Details": None
220
+ },
221
+ "Troubleshooting": {
222
+ "Common issues and solutions": None,
223
+ "Error messages and their meanings": None,
224
+ "Support Information": None,
225
+ "Additional Details": None
226
+ },
227
+ "Warranty Information": {
228
+ "Terms and Conditions": None,
229
+ "Service and repair information": None,
230
+ "Additional Details": None
231
+ },
232
+ "Legal Information": {
233
+ "Copyright information": None,
234
+ "Trademarks and patents": None,
235
+ "Disclaimers": None,
236
+ "Additional Details": None
237
+
238
+ }
239
+ }
240
+
241
+ for key,val in history.items():
242
+
243
+ find = details[key]
244
+
245
+ details[key] = str(detailed_feature_extraction(find,val))
246
+
247
+ return details
248
+
249
+
250
+ def get_embeddings(link,tag_option):
251
+
252
+ print(f"\n--> Creating Embeddings - {link}")
253
+
254
+ if tag_option=='Complete Document Similarity':
255
+ history = { "Details": "" }
256
+
257
+ else:
258
+ history = {
259
+ "Introduction": "",
260
+ "Specifications": "",
261
+ "Product Overview": "",
262
+ "Safety Information": "",
263
+ "Installation Instructions": "",
264
+ "Setup and Configuration": "",
265
+ "Operation Instructions": "",
266
+ "Maintenance and Care": "",
267
+ "Troubleshooting": "",
268
+ "Warranty Information": "",
269
+ "Legal Information": ""
270
+ }
271
+
272
+ # Extract Text -----------------------------
273
+ print("Extracting Text")
274
+ if link[-3:] == '.md' or link[8:11] == 'en.':
275
+ text = web_extractor(link)
276
+ else:
277
+ text = pdf_extractor(link)
278
+
279
+ # Create Chunks ----------------------------
280
+ print("Writing Tag Data")
281
+
282
+
283
+ if tag_option=="Complete Document Similarity":
284
+ history["Details"] = feature_extraction("Details", history["Details"], text[0][:50000])
285
+
286
+ else:
287
+ chunks = text_splitter.create_documents(text)
288
+
289
+ for chunk in chunks:
290
+
291
+ with concurrent.futures.ThreadPoolExecutor() as executor:
292
+ future_to_key = {
293
+ executor.submit(
294
+ feature_extraction, f"Product {key}", history[key], chunk.page_content
295
+ ): key for key in history
296
+ }
297
+ for future in concurrent.futures.as_completed(future_to_key):
298
+ key = future_to_key[future]
299
+ try:
300
+ response = future.result()
301
+ history[key] = response
302
+ except Exception as e:
303
+ print(f"Error processing {key}: {e}")
304
+
305
+ print("Creating Vectors")
306
+ genai_embeddings=[]
307
+
308
+ for tag in history:
309
+ result = genai.embed_content(
310
+ model="models/embedding-001",
311
+ content=history[tag],
312
+ task_type="retrieval_document")
313
+ genai_embeddings.append(result['embedding'])
314
+
315
+
316
+ return history,genai_embeddings
317
+
318
+ def get_embed_chroma(link):
319
+
320
+ print(f"\n--> Creating Embeddings - {link}")
321
+
322
+ # Extract Text -----------------------------
323
+ if link[-3:] == '.md' or link[8:11] == 'en.':
324
+ text = web_extractor(link)
325
+ else:
326
+ text = pdf_extractor(link)
327
+ print("\u2713 Extracting Text")
328
+
329
+ # Create Chunks ----------------------------
330
+
331
+ text = re.sub(r'\.{2,}', '.', text)
332
+ text = re.sub(r'\s{2,}', ' ', text)
333
+ text = re.sub(r'\d{7,}', '', text)
334
+
335
+ text = re.sub(r'\n{2,}', '\n', text)
336
+
337
+
338
+ chunks = text_splitter_small.split_text(text)
339
+ # print(chunks[:2])
340
+ print("\u2713 Writing Tag Data")
341
+
342
+ # Creating Vector
343
+ embedding_vectors=[]
344
+ # textual_data = []
345
+ print("\u2713 Creating Vectors")
346
+
347
+
348
+ # batch_size = 1
349
+ # # Process chunks in batches
350
+ # for i in range(0, len(chunks), batch_size):
351
+ # batch = chunks[i:i + batch_size]
352
+
353
+ # # texts = [text for text in batch]
354
+ # # print(texts)
355
+
356
+ t1 = time.time()
357
+ for chunk in chunks:
358
+ # Tokenize the input text
359
+ inputs = tokenizer(chunk, return_tensors="np", padding=True, truncation=True)
360
+
361
+ # Convert inputs to int64
362
+ input_ids = inputs['input_ids'].astype(np.int64)
363
+ attention_mask = inputs['attention_mask'].astype(np.int64)
364
+ token_type_ids = inputs.get('token_type_ids', np.zeros_like(input_ids)).astype(np.int64) # Some models might not use token_type_ids
365
+
366
+ # Create the input feed dictionary
367
+ input_feed = {
368
+ 'input_ids': input_ids,
369
+ 'attention_mask': attention_mask,
370
+ 'token_type_ids': token_type_ids
371
+ }
372
+
373
+ # Get the model's outputs
374
+ outputs = session.run(None, input_feed)
375
+
376
+ # Convert the outputs to numpy and process as needed
377
+ last_hidden_state = np.array(outputs[0])
378
+ embeddings = last_hidden_state.mean(axis=1).tolist()
379
+ embedding_vectors.append(embeddings)
380
+ # textual_data.a(text)
381
+
382
+ t2 = time.time()
383
+ print(t2-t1)
384
+ return chunks , embedding_vectors
385
+
386
+
387
+ def get_image_embeddings(Product):
+     image_embeddings = []
+ 
+     links = search_images(Product)
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         descriptions = list(executor.map(feature_extraction_image, links))
+ 
+     for description in descriptions:
+         result = genai.embed_content(
+             model="models/embedding-001",
+             content=description,
+             task_type="retrieval_document")
+         image_embeddings.append(result['embedding'])
+ 
+     # print(image_embeddings)
+     return image_embeddings, links
+ 
+ 
+ # Module-level splitters ----------------------------------
+ global text_splitter
+ global data
+ global history
+ 
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=10000,
+     chunk_overlap=100,
+     separators=["", " "]
+ )
+ 
+ # text_splitter_small = RecursiveCharacterTextSplitter(
+ #     chunk_size=2000,
+ #     chunk_overlap=100,
+ #     separators=["", " "]
+ # )
+ 
+ text_splitter_small = TokenBasedTextSplitter(chunk_size=500, chunk_overlap=50)
+ # chunks = splitter.split_text(text)
+ 
+ 
+ if __name__ == '__main__':
+     print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
+     # print(get_image_embeddings(Product='Samsung Galaxy S24'))
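Note: get_embed_chroma returns the raw chunks and their mean-pooled vectors but does not itself write anything to Chroma. A minimal usage sketch of the caller side follows, assuming the results are persisted with chromadb (pinned in requirements.txt); the persist path, collection name, and id scheme are illustrative assumptions, not part of this commit.

```python
# Hypothetical caller -- persist path, collection name and id scheme are assumptions.
import chromadb

from embedding import get_embed_chroma

client = chromadb.PersistentClient(path="embeddings")                # assumed persist directory
collection = client.get_or_create_collection(name="manual_chunks")   # assumed collection name

link = "https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf"
chunks, vectors = get_embed_chroma(link)

# get_embed_chroma appends each vector as a [1 x hidden_size] batch, so unwrap the batch dimension.
collection.add(
    documents=chunks,
    embeddings=[vec[0] for vec in vectors],
    ids=[f"{link}-{i}" for i in range(len(chunks))],
)
```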
github_storage.py ADDED
@@ -0,0 +1,77 @@
+ import os
+ from github import Github
+ import base64
+ import shutil
+ import zipfile
+ from io import BytesIO
+ 
+ 
+ # Global Variables
+ 
+ # HF ------------
+ hf_folder_path = '/home/user/app/embeddings'
+ zip_name = 'embeddings'
+ 
+ # Github -------
+ github_token = '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'  # Replace with your GitHub token
+ repo_name = 'AdityaMetkar/Patseer-Database'  # Replace with your repository, e.g., 'octocat/Hello-World'
+ folder_path = 'Manual Database/embeddings.zip'  # Path to the zip file inside the repository
+ 
+ # Authenticate to GitHub
+ g = Github(github_token)
+ repo = g.get_repo(repo_name)
+ 
+ 
+ # Functions -------------------------------
+ def zip_folder():
+     shutil.make_archive(zip_name, 'zip', hf_folder_path)
+     return zip_name + '.zip'
+ 
+ 
+ def update_db():
+     try:
+         # Check if the file already exists in the repository
+         existing_file = repo.get_contents(folder_path)
+ 
+         compressed_zip = zip_folder()
+         with open(compressed_zip, 'rb') as file:
+             file_content = file.read()
+ 
+         # Update the existing file
+         repo.update_file(existing_file.path, "New DB Update", file_content, existing_file.sha)
+         print(f"Updated {folder_path} in GitHub repository.")
+ 
+     except Exception as e:
+         print(f"Error: {e}")
+ 
+ 
+ def download_db():
+     if not os.path.exists(hf_folder_path):
+         os.makedirs(hf_folder_path)
+ 
+     try:
+         # Download the zip file content from GitHub
+         file_content = repo.get_contents(folder_path)
+         zip_data = base64.b64decode(file_content.content)
+ 
+         # Extract the downloaded zip directly into hf_folder_path
+         with zipfile.ZipFile(BytesIO(zip_data)) as zip_ref:
+             for file in zip_ref.namelist():
+                 zip_ref.extract(file, hf_folder_path)
+ 
+         print(f"Successfully unzipped files to {hf_folder_path}")
+ 
+     except Exception as e:
+         print(f"Error: {e}")
+ 
+ 
+ # Download the folder
+ # download_folder()
+ # update_db()
+ 
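github_storage.py treats a zip stored in a GitHub repository as the persistent copy of the Space's local embeddings folder: download_db() restores it, update_db() pushes it back. How it is wired into the app is not shown in this commit; below is a minimal sketch, assuming download_db() runs once at startup and update_db() is scheduled on a fixed interval with APScheduler (newly added to requirements.txt). The interval and entry point are assumptions.

```python
# Hypothetical startup wiring -- the 6-hour interval and this entry point are assumptions.
from apscheduler.schedulers.background import BackgroundScheduler

from github_storage import download_db, update_db

download_db()  # restore /home/user/app/embeddings from the zip stored on GitHub

scheduler = BackgroundScheduler()
scheduler.add_job(update_db, "interval", hours=6)  # periodically re-zip and push the folder
scheduler.start()
```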
preprocess.py CHANGED
@@ -46,7 +46,7 @@ logging.basicConfig(level=logging.INFO)
 
 data = False
 seen = set()
-existing_products_urls = set('123')
+existing_products_urls = set()
 
 
 
@@ -121,8 +121,7 @@ def extract_text_from_pdf(pdf_file, pages):
             page = reader.pages[page_num]
             extracted_text += page.extract_text() + "\n"
         else:
-            print(f"Page {page_num} does not exist in the document.")
-
+            pass
         return extracted_text
 
     except:
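The first hunk fixes a subtle initialization bug: set('123') iterates the string, so the "empty" set actually starts with three single-character entries, which would interfere with the check for already-seen product URLs. For illustration:

```python
>>> set('123')   # a string is iterable, so each character becomes an element (order may vary)
{'1', '2', '3'}
>>> set()        # the empty set the code actually wants
set()
```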
requirements.txt CHANGED
@@ -1,28 +1,32 @@
-beautifulsoup4==4.12.3
-chromadb
-duckduckgo_search==6.1.0
-faiss_cpu==1.8.0
-fastapi==0.111.0
-fitz==0.0.1.dev2
-Flask==3.0.3
-googlesearch_python==1.2.4
-langchain==0.2.3
-langchain_community==0.2.4
-langchain_google_genai==1.0.6
-langchain_text_splitters==0.2.1
-langdetect==1.0.9
-numpy==1.26.4
-pdfplumber==0.11.1
-Pillow==10.3.0
-pinecone
-plotly
-protobuf==4.25.0
-pydantic==2.7.4
-pymongo
-PyPDF2==3.0.1
-Requests==2.32.3
-spacy==3.7.5
-streamlit==1.35.0
-transformers
-torch
-tqdm==4.66.4
+APScheduler
+beautifulsoup4==4.11.1
+chromadb==0.5.3
+duckduckgo_search==6.1.0
+fastapi==0.111.0
+fitz==0.0.1.dev2
+Flask==2.3.1
+googlesearch_python==1.2.4
+langchain==0.2.6
+langchain_community==0.2.6
+langchain_core==0.2.10
+langchain_google_genai==1.0.7
+langdetect==1.0.9
+numpy
+onnx
+onnxruntime
+pandas==1.5.2
+pdfplumber==0.11.0
+Pillow==10.3.0
+pinecone==4.0.0
+plotly==5.22.0
+protobuf<5
+pydantic==1.10.9
+pymongo
+PyPDF2==3.0.1
+pygithub
+Requests==2.32.3
+streamlit==1.36.0
+torch==2.2.0
+tqdm==4.66.4
+transformers==4.41.2
+zipfile36
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff