Prathmesh48 committed on
Commit
9ba9756
1 Parent(s): 9de950b

Upload 8 files

Files changed (7)
  1. api_fast.py +226 -0
  2. app.py +402 -326
  3. embedding.py +425 -370
  4. github_storage.py +77 -0
  5. preprocess.py +2 -3
  6. requirements.txt +32 -28
  7. tokenizer.json +0 -0
api_fast.py ADDED
@@ -0,0 +1,226 @@
1
+ from fastapi import FastAPI, Request, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from pydantic import BaseModel
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ from googlesearch import search
7
+ from duckduckgo_search import DDGS
8
+ import concurrent.futures
9
+ import re
10
+
11
+ app = FastAPI()
12
+
13
+ API_KEY_DEFAULT = '12345'
14
+
15
+ class SearchRequest(BaseModel):
16
+ API_KEY: str
17
+ product: str
18
+
19
+ # Function to search DuckDuckGo
20
+ def duckduckgo_search(query):
21
+ try:
22
+ results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
23
+ return [res['href'] for res in results]
24
+ except:
25
+ return []
26
+
27
+ # Function to search Google
28
+ def google_search(query):
29
+ links = []
30
+ try:
31
+ api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
32
+ search_engine_id = 'c4ca951b9fc6949cb'
33
+
34
+ url = f"https://www.googleapis.com/customsearch/v1"
35
+ params = {
36
+ "key": api_key,
37
+ "cx": search_engine_id,
38
+ "q": query + " manual filetype:pdf"
39
+ }
40
+
41
+ response = requests.get(url, params=params)
42
+ results = response.json()
43
+
44
+ for item in results.get('items', []):
45
+ links.append(item['link'])
46
+ except:
47
+ pass
48
+
49
+ try:
50
+ extension = "ext:pdf"
51
+ for result in search(query + " manual " + extension, num_results=5):
52
+ if result.endswith('.pdf'):
53
+ links.append(result)
54
+ except:
55
+ pass
56
+
57
+ return links
58
+
59
+ # Function to search Internet Archive
60
+ def archive_search(query):
61
+ try:
62
+ url = "https://archive.org/advancedsearch.php"
63
+ params = {
64
+ 'q': f'{query} manual',
65
+ 'fl[]': ['identifier', 'title', 'format'],
66
+ 'rows': 50,
67
+ 'page': 1,
68
+ 'output': 'json'
69
+ }
70
+
71
+ response = requests.get(url, params=params)
72
+ data = response.json()
73
+
74
+ def extract_hyperlinks(url):
75
+ response = requests.get(url)
76
+ if response.status_code == 200:
77
+ soup = BeautifulSoup(response.text, 'html.parser')
78
+ for link in soup.find_all('a', href=True):
79
+ href = link['href']
80
+ if href.endswith('.pdf'):
81
+ pdf_files.append(url + '/' + href)
82
+ if href.endswith('.iso'):
83
+ extract_pdf_from_iso(url + '/' + href + '/')
84
+
85
+ def extract_pdf_from_iso(iso_url):
86
+ iso_response = requests.get(iso_url)
87
+ if iso_response.status_code == 200:
88
+ iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
89
+ for link in iso_soup.find_all('a', href=True):
90
+ href = link['href']
91
+ if href.endswith('.pdf'):
92
+ pdf_files.append('https:' + href)
93
+
94
+ pdf_files = []
95
+
96
+ def process_doc(doc):
97
+ identifier = doc.get('identifier', 'N/A')
98
+ pdf_link = f"https://archive.org/download/{identifier}"
99
+ extract_hyperlinks(pdf_link)
100
+
101
+ with concurrent.futures.ThreadPoolExecutor() as executor:
102
+ futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
103
+ for future in concurrent.futures.as_completed(futures):
104
+ try:
105
+ future.result()
106
+ except Exception as exc:
107
+ print(f'Generated an exception: {exc}')
108
+
109
+ return pdf_files
110
+
111
+ except:
112
+ return []
113
+
114
+ def github_search(query):
115
+ try:
116
+ url = f"https://api.github.com/search/code?q={query}+extension:md"
117
+ headers = {
118
+ 'Authorization': 'Token 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'
119
+ }
120
+ response = requests.get(url, headers=headers)
121
+ data = response.json()
122
+ links = [item['html_url'].replace('/blob','').replace('//github','//raw.github') for item in data['items']]
123
+ return links
124
+
125
+ except:
126
+ return []
127
+
128
+ def extract_similar_products(query):
129
+ results = DDGS().chat(f'{query} Similar Products')
130
+ pattern = r'^\d+\.\s(.+)$'
131
+ matches = re.findall(pattern, results, re.MULTILINE)
132
+ matches = [item.split(': ')[0] for item in matches]
133
+ return matches[:5] if matches else []
134
+
135
+ @app.get('/')
136
+ def read_root():
137
+ return {"message": "Welcome to the search API"}
138
+
139
+ @app.post('/search/google')
140
+ async def search_google(request: SearchRequest):
141
+ if request.API_KEY == API_KEY_DEFAULT:
142
+ results = {request.product: google_search(request.product)}
143
+ similar_products = extract_similar_products(request.product)
144
+ for p in similar_products:
145
+ results[p] = google_search(p)
146
+ return results
147
+ else:
148
+ raise HTTPException(status_code=401, detail="Invalid API key")
149
+
150
+ @app.post('/search/duckduckgo')
151
+ async def search_duckduckgo(request: SearchRequest):
152
+ if request.API_KEY == API_KEY_DEFAULT:
153
+ results = {request.product: duckduckgo_search(request.product)}
154
+ similar_products = extract_similar_products(request.product)
155
+ for p in similar_products:
156
+ results[p] = duckduckgo_search(p)
157
+ return results
158
+ else:
159
+ raise HTTPException(status_code=401, detail="Invalid API key")
160
+
161
+ @app.post('/search/archive')
162
+ async def search_archive(request: SearchRequest):
163
+ if request.API_KEY == API_KEY_DEFAULT:
164
+ results = {request.product: archive_search(request.product)}
165
+ similar_products = extract_similar_products(request.product)
166
+
167
+ def process_product(product):
168
+ return product, archive_search(product)
169
+
170
+ with concurrent.futures.ThreadPoolExecutor() as executor:
171
+ future_to_product = {executor.submit(process_product, p): p for p in similar_products}
172
+ for future in concurrent.futures.as_completed(future_to_product):
173
+ product, result = future.result()
174
+ results[product] = result
175
+
176
+ return results
177
+ else:
178
+ raise HTTPException(status_code=401, detail="Invalid API key")
179
+
180
+ @app.post('/search/github')
181
+ async def search_github(request: SearchRequest):
182
+ if request.API_KEY == API_KEY_DEFAULT:
183
+ results = {request.product: github_search(request.product)}
184
+ similar_products = extract_similar_products(request.product)
185
+ for p in similar_products:
186
+ results[p] = github_search(p)
187
+ return results
188
+ else:
189
+ raise HTTPException(status_code=401, detail="Invalid API key")
190
+
191
+ @app.post('/search/all')
192
+ async def search_all(request: SearchRequest):
193
+ if request.API_KEY == API_KEY_DEFAULT:
194
+ results = {
195
+ request.product: [
196
+ {'duckduckgo': duckduckgo_search(request.product)},
197
+ {'google': google_search(request.product)},
198
+ {'github': github_search(request.product)},
199
+ {'archive': archive_search(request.product)}
200
+ ]
201
+ }
202
+
203
+ def search_product(p):
204
+ return {
205
+ 'product': p,
206
+ 'duckduckgo': duckduckgo_search(p),
207
+ 'google': google_search(p),
208
+ 'github': github_search(p),
209
+ 'archive': archive_search(p)
210
+ }
211
+
212
+ with concurrent.futures.ThreadPoolExecutor() as executor:
213
+ future_to_product = {executor.submit(search_product, p): p for p in extract_similar_products(request.product)}
214
+ for future in concurrent.futures.as_completed(future_to_product):
215
+ result = future.result()
216
+ product = result['product']
217
+ results[product] = [
218
+ {'duckduckgo': result['duckduckgo']},
219
+ {'google': result['google']},
220
+ {'github': result['github']},
221
+ {'archive': result['archive']}
222
+ ]
223
+
224
+ return results
225
+ else:
226
+ raise HTTPException(status_code=401, detail="Invalid API key")
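Note: a minimal client sketch for the endpoints added above, assuming the app is served locally with uvicorn; the base URL, port, and timeout values are placeholders and not part of the commit. Each endpoint expands the query with extract_similar_products, so the response maps the main product and each suggested similar product to its list of links.

```python
# Minimal client sketch for the endpoints defined in api_fast.py.
# Assumes the service is running locally, e.g.:  uvicorn api_fast:app --port 8000
import requests

BASE_URL = "http://localhost:8000"   # placeholder host/port
payload = {"API_KEY": "12345", "product": "Philips led 7w bulb"}

# Single-engine search: returns {product_name: [manual/PDF links], ...}
resp = requests.post(f"{BASE_URL}/search/duckduckgo", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json())

# Fan-out across every engine at once (slower, since archive scraping is included)
resp = requests.post(f"{BASE_URL}/search/all", json=payload, timeout=600)
print(resp.json())
```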
app.py CHANGED
@@ -1,326 +1,402 @@
1
- import streamlit as st
2
- import concurrent.futures
3
- from concurrent.futures import ThreadPoolExecutor, as_completed
4
- from functools import partial
5
- import numpy as np
6
- from io import StringIO
7
- import sys
8
- import time
9
- import pandas as pd
10
- from pymongo import MongoClient
11
- import plotly.express as px
12
- from pinecone import Pinecone, ServerlessSpec
13
- import chromadb
14
- import requests
15
- from io import BytesIO
16
- from PyPDF2 import PdfReader
17
- import hashlib
18
- import os
19
- import shutil
20
-
21
- # File Imports
22
- from embedding import get_embeddings, get_image_embeddings, get_embed_chroma , imporve_text # Ensure this file/module is available
23
- from preprocess import filtering # Ensure this file/module is available
24
- from search import *
25
-
26
-
27
- # Chroma Connections
28
- client = chromadb.PersistentClient(path="embeddings")
29
- collection = client.get_or_create_collection(name="data", metadata={"hnsw:space": "l2"})
30
-
31
-
32
- def zip_folder(folder_path, zip_name):
33
- # Create a zip file from the folder
34
- shutil.make_archive(zip_name, 'zip', folder_path)
35
- return zip_name + '.zip'
36
-
37
- folder_path = '/home/user/app/embeddings'
38
- zip_name = 'embedding'
39
-
40
- # st.title("Download Embedding Folder")
41
-
42
-
43
- def generate_hash(content):
44
- return hashlib.sha256(content.encode('utf-8')).hexdigest()
45
-
46
-
47
- def get_key(link):
48
- text = ''
49
- try:
50
- # Fetch the PDF file from the URL
51
- response = requests.get(link)
52
- response.raise_for_status() # Raise an error for bad status codes
53
-
54
- # Use BytesIO to handle the PDF content in memory
55
- pdf_file = BytesIO(response.content)
56
-
57
- # Load the PDF file
58
- reader = PdfReader(pdf_file)
59
- num_pages = len(reader.pages)
60
-
61
- first_page_text = reader.pages[0].extract_text()
62
- if first_page_text:
63
- text += first_page_text
64
-
65
- last_page_text = reader.pages[-1].extract_text()
66
- if last_page_text:
67
- text += last_page_text
68
-
69
- except requests.exceptions.HTTPError as e:
70
- print(f'HTTP error occurred: {e}')
71
- except Exception as e:
72
- print(f'An error occurred: {e}')
73
-
74
- unique_key = generate_hash(text)
75
-
76
- return unique_key
77
-
78
-
79
- # Cosine Similarity Function
80
- def cosine_similarity(vec1, vec2):
81
- vec1 = np.array(vec1)
82
- vec2 = np.array(vec2)
83
-
84
- dot_product = np.dot(vec1, vec2.T)
85
- magnitude_vec1 = np.linalg.norm(vec1)
86
- magnitude_vec2 = np.linalg.norm(vec2)
87
-
88
- if magnitude_vec1 == 0 or magnitude_vec2 == 0:
89
- return 0.0
90
-
91
- cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
92
- return cosine_sim
93
-
94
-
95
- def update_chroma(product_name, url, key, text, vector, log_area):
96
- id_list = [key + str(i) for i in range(len(text))]
97
-
98
- metadata_list = [
99
- {'key': key,
100
- 'product_name': product_name,
101
- 'url': url,
102
- 'text': item
103
- }
104
- for item in text
105
- ]
106
-
107
- collection.upsert(
108
- ids=id_list,
109
- embeddings=vector,
110
- metadatas=metadata_list
111
- )
112
-
113
- logger.write(f"\n\u2713 Updated DB - {url}\n\n")
114
- log_area.text(logger.getvalue())
115
-
116
-
117
- # Logger class to capture output
118
- class StreamCapture:
119
- def __init__(self):
120
- self.output = StringIO()
121
- self._stdout = sys.stdout
122
-
123
- def __enter__(self):
124
- sys.stdout = self.output
125
- return self.output
126
-
127
- def __exit__(self, exc_type, exc_val, exc_tb):
128
- sys.stdout = self._stdout
129
-
130
-
131
- # Main Function
132
- def score(main_product, main_url, product_count, link_count, search, logger, log_area):
133
- data = {}
134
- similar_products = extract_similar_products(main_product)[:product_count]
135
-
136
- print("--> Fetching Manual Links")
137
- # Normal Filtering + Embedding -----------------------------------------------
138
- if search == 'All':
139
-
140
- def process_product(product, search_function, main_product):
141
- search_result = search_function(product)
142
- return filtering(search_result, main_product, product, link_count)
143
-
144
- search_functions = {
145
- 'google': search_google,
146
- 'duckduckgo': search_duckduckgo,
147
- 'github': search_github,
148
- 'wikipedia': search_wikipedia
149
- }
150
-
151
- with ThreadPoolExecutor() as executor:
152
- future_to_product_search = {
153
- executor.submit(process_product, product, search_function, main_product): (product, search_name)
154
- for product in similar_products
155
- for search_name, search_function in search_functions.items()
156
- }
157
-
158
- for future in as_completed(future_to_product_search):
159
- product, search_name = future_to_product_search[future]
160
- try:
161
- if product not in data:
162
- data[product] = {}
163
- data[product] = future.result()
164
- except Exception as e:
165
- print(f"Error processing product {product} with {search_name}: {e}")
166
-
167
- else:
168
-
169
- for product in similar_products:
170
-
171
- if search == 'google':
172
- data[product] = filtering(search_google(product), main_product, product, link_count)
173
- elif search == 'duckduckgo':
174
- data[product] = filtering(search_duckduckgo(product), main_product, product, link_count)
175
- elif search == 'archive':
176
- data[product] = filtering(search_archive(product), main_product, product, link_count)
177
- elif search == 'github':
178
- data[product] = filtering(search_github(product), main_product, product, link_count)
179
- elif search == 'wikipedia':
180
- data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
181
-
182
- # Filtered Link -----------------------------------------
183
- logger.write("\n\n\u2713 Filtered Links\n")
184
- log_area.text(logger.getvalue())
185
-
186
- # Main product Embeddings ---------------------------------
187
- logger.write("\n\n--> Creating Main product Embeddings\n")
188
-
189
- main_key = get_key(main_url)
190
- main_text, main_vector = get_embed_chroma(main_url)
191
-
192
- update_chroma(main_product, main_url, main_key, main_text, main_vector, log_area)
193
-
194
- # log_area.text(logger.getvalue())
195
- print("\n\n\u2713 Main Product embeddings Created")
196
-
197
- logger.write("\n\n--> Creating Similar product Embeddings\n")
198
- log_area.text(logger.getvalue())
199
- test_embedding = [0] * 768
200
-
201
- for product in data:
202
- for link in data[product]:
203
-
204
- url, _ = link
205
- similar_key = get_key(url)
206
-
207
- res = collection.query(
208
- query_embeddings=[test_embedding],
209
- n_results=1,
210
- where={"key": similar_key},
211
- )
212
-
213
- if not res['distances'][0]:
214
- similar_text, similar_vector = get_embed_chroma(url)
215
- update_chroma(product, url, similar_key, similar_text, similar_vector, log_area)
216
-
217
- logger.write("\n\n\u2713 Similar Product embeddings Created\n")
218
- log_area.text(logger.getvalue())
219
-
220
- top_similar = []
221
-
222
- for idx, chunk in enumerate(main_vector):
223
- res = collection.query(
224
- query_embeddings=[chunk],
225
- n_results=1,
226
- where={"key": {'$ne': main_key}},
227
- include=['metadatas', 'embeddings', 'distances']
228
- )
229
-
230
- top_similar.append((main_text[idx], chunk, res, res['distances'][0]))
231
-
232
- most_similar_items = sorted(top_similar, key=lambda x: x[3])[:top_similar_count]
233
-
234
- logger.write("--------------- DONE -----------------\n")
235
- log_area.text(logger.getvalue())
236
-
237
- return most_similar_items
238
-
239
-
240
- # Streamlit Interface
241
- st.title("Check Infringement")
242
-
243
- # Inputs
244
- with st.sidebar:
245
- st.header("Product Information")
246
- main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
247
- main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
248
-
249
- st.header("Search Settings")
250
- search_method = st.selectbox('Choose Search Engine', ['All', 'duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
251
-
252
- product_count = st.number_input("Number of Similar Products", min_value=1, step=1, format="%i")
253
- link_count = st.number_input("Number of Links per Product", min_value=1, step=1, format="%i")
254
- need_image = st.selectbox("Process Images", ['True', 'False'])
255
-
256
- top_similar_count = st.number_input("Top Similarities to be Displayed", value=3, min_value=1, step=1, format="%i")
257
-
258
- if st.button("Download"):
259
- zip_file = zip_folder(folder_path, zip_name)
260
- with open(zip_file, "rb") as f:
261
- st.download_button(
262
- label="Download ZIP",
263
- data=f,
264
- file_name=zip_file,
265
- mime="application/zip"
266
- )
267
- if st.button('Check for Infringement'):
268
- global log_output # Placeholder for log output
269
-
270
- tab1, tab2 = st.tabs(["Output", "Console"])
271
-
272
- with tab2:
273
- log_output = st.empty()
274
-
275
- with tab1:
276
- with st.spinner('Processing...'):
277
- with StreamCapture() as logger:
278
- top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
279
-
280
- st.success('Processing complete!')
281
-
282
- st.subheader("Cosine Similarity Scores")
283
-
284
- for main_text, main_vector, response, _ in top_similar_values:
285
- product_name = response['metadatas'][0][0]['product_name']
286
- link = response['metadatas'][0][0]['url']
287
- similar_text = response['metadatas'][0][0]['text']
288
-
289
- cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
290
-
291
- # Display the product information
292
- with st.container():
293
- st.markdown(f"### [Product: {product_name}]({link})")
294
- st.markdown(f"#### Cosine Score: {cosine_score:.4f}")
295
- col1, col2 = st.columns(2)
296
- with col1:
297
- st.markdown(f"**Main Text:** {imporve_text(main_text)}")
298
- with col2:
299
- st.markdown(f"**Similar Text:** {imporve_text(similar_text)}")
300
-
301
- st.markdown("---")
302
-
303
- if need_image == 'True':
304
- with st.spinner('Processing Images...'):
305
- emb_main = get_image_embeddings(main_product)
306
- similar_prod = extract_similar_products(main_product)[0]
307
- emb_similar = get_image_embeddings(similar_prod)
308
-
309
- similarity_matrix = np.zeros((5, 5))
310
- for i in range(5):
311
- for j in range(5):
312
- similarity_matrix[i][j] = cosine_similarity([emb_main[i]], [emb_similar[j]])[0][0]
313
-
314
- st.subheader("Image Similarity")
315
- # Create an interactive heatmap
316
- fig = px.imshow(similarity_matrix,
317
- labels=dict(x=f"{similar_prod} Images", y=f"{main_product} Images", color="Similarity"),
318
- x=[f"Image {i+1}" for i in range(5)],
319
- y=[f"Image {i+1}" for i in range(5)],
320
- color_continuous_scale="Viridis")
321
-
322
- # Add title to the heatmap
323
- fig.update_layout(title="Image Similarity Heatmap")
324
-
325
- # Display the interactive heatmap
326
- st.plotly_chart(fig)
1
+ import streamlit as st
2
+ import concurrent.futures
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ from functools import partial
5
+ import numpy as np
6
+ from io import StringIO
7
+ import sys
8
+ import time
9
+ import pandas as pd
10
+ from pymongo import MongoClient
11
+ import plotly.express as px
12
+ from pinecone import Pinecone, ServerlessSpec
13
+ import chromadb
14
+ import requests
15
+ from io import BytesIO
16
+ from PyPDF2 import PdfReader
17
+ import hashlib
18
+ import os
19
+ from plotly.subplots import make_subplots
20
+ import plotly.graph_objects as go
21
+ from PIL import Image
22
+ import shutil
23
+
24
+
25
+ # File Imports
26
+ from embedding import get_embeddings, get_image_embeddings, get_embed_chroma,imporve_text # Ensure this file/module is available
27
+ from preprocess import filtering # Ensure this file/module is available
28
+ from github_storage import update_db,download_db
29
+ from search import *
30
+
31
+
32
+ # Chroma Connections
33
+ try:
34
+ client = chromadb.PersistentClient(path="embeddings")
35
+ collection = client.get_or_create_collection(name="data", metadata={"hnsw:space": "l2"})
36
+ except:
37
+ pass
38
+
39
+
40
+
41
+ def generate_hash(content):
42
+ return hashlib.sha256(content.encode('utf-8')).hexdigest()
43
+
44
+
45
+ def get_key(link):
46
+ text = ''
47
+ try:
48
+ # Fetch the PDF file from the URL
49
+ response = requests.get(link)
50
+ response.raise_for_status() # Raise an error for bad status codes
51
+
52
+ # Use BytesIO to handle the PDF content in memory
53
+ pdf_file = BytesIO(response.content)
54
+
55
+ # Load the PDF file
56
+ reader = PdfReader(pdf_file)
57
+ num_pages = len(reader.pages)
58
+
59
+ first_page_text = reader.pages[0].extract_text()
60
+ if first_page_text:
61
+ text += first_page_text
62
+
63
+ last_page_text = reader.pages[-1].extract_text()
64
+ if last_page_text:
65
+ text += last_page_text
66
+
67
+ except requests.exceptions.HTTPError as e:
68
+ print(f'HTTP error occurred: {e}')
69
+ except Exception as e:
70
+ print(f'An error occurred: {e}')
71
+
72
+ unique_key = generate_hash(text)
73
+
74
+ return unique_key
75
+
76
+
77
+ # Cosine Similarity Function
78
+ def cosine_similarity(vec1, vec2):
79
+ vec1 = np.array(vec1)
80
+ vec2 = np.array(vec2)
81
+
82
+ dot_product = np.dot(vec1, vec2.T)
83
+ magnitude_vec1 = np.linalg.norm(vec1)
84
+ magnitude_vec2 = np.linalg.norm(vec2)
85
+
86
+ if magnitude_vec1 == 0 or magnitude_vec2 == 0:
87
+ return 0.0
88
+
89
+ cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
90
+ return cosine_sim
91
+
92
+
93
+ def update_chroma(product_name, url, key, text, vector, log_area):
94
+ if len(text) > 0:
95
+ id_list = [key + str(i) for i in range(len(text))]
96
+
97
+ metadata_list = [
98
+ {'key': key,
99
+ 'product_name': product_name,
100
+ 'url': url,
101
+ 'text': item
102
+ }
103
+ for item in text
104
+ ]
105
+
106
+ collection.upsert(
107
+ ids=id_list,
108
+ embeddings=vector,
109
+ metadatas=metadata_list
110
+ )
111
+
112
+ logger.write(f"\n\u2713 Updated DB - {url}\n\n")
113
+ log_area.text(logger.getvalue())
114
+
115
+ return True
116
+
117
+ return False
118
+
119
+
120
+ # Logger class to capture output
121
+ class StreamCapture:
122
+ def __init__(self):
123
+ self.output = StringIO()
124
+ self._stdout = sys.stdout
125
+
126
+ def __enter__(self):
127
+ sys.stdout = self.output
128
+ return self.output
129
+
130
+ def __exit__(self, exc_type, exc_val, exc_tb):
131
+ sys.stdout = self._stdout
132
+
133
+
134
+ # Main Function
135
+ def score(main_product, main_url, product_count, link_count, search, logger, log_area):
136
+ data = {}
137
+ similar_products = extract_similar_products(main_product)[:product_count]
138
+
139
+ if len(similar_products) < 1:
140
+ st.warning(f'No Similar Products Found for {main_product}. Please Be More Specific With Product Name')
141
+
142
+
143
+ print("--> Fetching Manual Links")
144
+ # Normal Filtering + Embedding -----------------------------------------------
145
+ if search == 'All':
146
+
147
+ def process_product(product, search_function, main_product):
148
+ search_result = search_function(product)
149
+ return filtering(search_result, main_product, product, link_count)
150
+
151
+ search_functions = {
152
+ 'google': search_google,
153
+ 'duckduckgo': search_duckduckgo,
154
+ 'github': search_github,
155
+ 'wikipedia': search_wikipedia
156
+ }
157
+
158
+ with ThreadPoolExecutor() as executor:
159
+ future_to_product_search = {
160
+ executor.submit(process_product, product, search_function, main_product): (product, search_name)
161
+ for product in similar_products
162
+ for search_name, search_function in search_functions.items()
163
+ }
164
+
165
+ for future in as_completed(future_to_product_search):
166
+ product, search_name = future_to_product_search[future]
167
+ try:
168
+ if product not in data:
169
+ data[product] = {}
170
+ data[product] = future.result()
171
+ except Exception as e:
172
+ print(f"Error processing product {product} with {search_name}: {e}")
173
+
174
+ else:
175
+
176
+ for product in similar_products:
177
+
178
+ if search == 'google':
179
+ data[product] = filtering(search_google(product), main_product, product, link_count)
180
+ elif search == 'duckduckgo':
181
+ data[product] = filtering(search_duckduckgo(product), main_product, product, link_count)
182
+ elif search == 'archive':
183
+ data[product] = filtering(search_archive(product), main_product, product, link_count)
184
+ elif search == 'github':
185
+ data[product] = filtering(search_github(product), main_product, product, link_count)
186
+ elif search == 'wikipedia':
187
+ data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
188
+
189
+ # Filtered Link -----------------------------------------
190
+ logger.write("\n\n\u2713 Filtered Links\n")
191
+ log_area.text(logger.getvalue())
192
+
193
+ # Main product Embeddings ---------------------------------
194
+ logger.write("\n\n--> Creating Main product Embeddings\n")
195
+
196
+ main_key = get_key(main_url)
197
+ main_text, main_vector = get_embed_chroma(main_url)
198
+
199
+ readable = update_chroma(main_product, main_url, main_key, main_text, main_vector, log_area)
200
+
201
+ if readable:
202
+ # log_area.text(logger.getvalue())
203
+ print("\n\n\u2713 Main Product embeddings Created")
204
+
205
+ logger.write("\n\n--> Creating Similar product Embeddings\n")
206
+ log_area.text(logger.getvalue())
207
+ test_embedding = [0] * 768
208
+
209
+ for product in data:
210
+ for link in data[product]:
211
+
212
+ url, _ = link
213
+ similar_key = get_key(url)
214
+
215
+ res = collection.query(
216
+ query_embeddings=[test_embedding],
217
+ n_results=1,
218
+ where={"key": similar_key},
219
+ )
220
+
221
+ if not res['distances'][0]:
222
+ similar_text, similar_vector = get_embed_chroma(url)
223
+ update_chroma(product, url, similar_key, similar_text, similar_vector, log_area)
224
+
225
+ logger.write("\n\n\u2713 Similar Product embeddings Created\n")
226
+ log_area.text(logger.getvalue())
227
+
228
+ top_similar = []
229
+
230
+ for idx, chunk in enumerate(main_vector):
231
+ res = collection.query(
232
+ query_embeddings=[chunk],
233
+ n_results=1,
234
+ where={"key": {'$ne': main_key}},
235
+ include=['metadatas', 'embeddings', 'distances']
236
+ )
237
+
238
+ top_similar.append((main_text[idx], chunk, res, res['distances'][0]))
239
+
240
+ most_similar_items = sorted(top_similar, key=lambda x: x[3])[:top_similar_count]
241
+
242
+ logger.write("--------------- DONE -----------------\n")
243
+ log_area.text(logger.getvalue())
244
+
245
+ return most_similar_items
246
+
247
+ return []
248
+
249
+
250
+ # Streamlit Interface
251
+
252
+ st.title("🔍 Infringement Checker")
253
+
254
+ # Inputs
255
+ with st.sidebar:
256
+ st.header("📋 Product Information")
257
+ main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
258
+ main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
259
+
260
+ st.header("🔎 Search Settings")
261
+ search_method = st.selectbox('Choose Search Engine', ['All', 'duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
262
+
263
+ product_count = st.number_input("Number of Similar Products", min_value=1, step=1, format="%i")
264
+ link_count = st.number_input("Number of Links per Product", min_value=1, step=1, format="%i")
265
+ need_image = st.selectbox("Process Images", ['True', 'False'])
266
+
267
+ top_similar_count = st.number_input("Top Similarities to be Displayed", value=3, min_value=1, step=1, format="%i")
268
+
269
+
270
+ col1_main,col2_main = st.columns([7,3])
271
+
272
+ with col1_main:
273
+ run_streamlit = st.button('Check for Infringement')
274
+
275
+
276
+ if run_streamlit:
277
+ global log_output
278
+
279
+ tab1, tab2 = st.tabs(["📊 Output", "🖥️ Console"])
280
+
281
+ with tab2:
282
+ log_output = st.empty()
283
+
284
+ with tab1:
285
+ with st.spinner('Processing...'):
286
+
287
+ if len(os.listdir('/home/user/app/embeddings'))<2:
288
+ download_db()
289
+ print("\u2713 Downloaded Database\n\n")
290
+
291
+ with StreamCapture() as logger:
292
+ top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
293
+
294
+ st.success('✅ Processing complete!')
295
+
296
+ st.subheader("📈 Cosine Similarity Scores")
297
+
298
+ if len(top_similar_values) > 0:
299
+
300
+ for main_text, main_vector, response, _ in top_similar_values:
301
+ product_name = response['metadatas'][0][0]['product_name']
302
+ link = response['metadatas'][0][0]['url']
303
+ similar_text = response['metadatas'][0][0]['text']
304
+ # similar_text_refined = imporve_text(similar_text)
305
+ # main_text_refined = imporve_text(main_text)
306
+
307
+ cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
308
+
309
+ # Display the product information
310
+ with st.expander(f"### Product: {product_name} - Score: {cosine_score:.4f}"):
311
+ link = link.replace(" ","%20")
312
+ st.markdown(f"[View Product Manual]({link})")
313
+ tab1, tab2 = st.tabs(["Raw Text", "Refined Text"])
314
+ with tab2:
315
+ col1, col2 = st.columns(2)
316
+ with col1:
317
+ st.markdown(f"*Main Text:\n* {imporve_text(main_text)}")
318
+ with col2:
319
+ st.markdown(f"*Similar Text\n:* {imporve_text(similar_text)}")
320
+
321
+ with tab1:
322
+ col1, col2 = st.columns(2)
323
+ with col1:
324
+ st.markdown(f"*Main Text:* {main_text}")
325
+ with col2:
326
+ st.markdown(f"*Similar Text:* {similar_text}")
327
+
328
+ else:
329
+ st.warning("Main Product Document isn't Readable!")
330
+
331
+ if need_image == 'True':
332
+ with st.spinner('Processing Images...'):
333
+ emb_main , main_prod_imgs = get_image_embeddings(main_product)
334
+ similar_prod = extract_similar_products(main_product)[0]
335
+ emb_similar , similar_prod_imgs = get_image_embeddings(similar_prod)
336
+ if similar_prod:
337
+ similarity_matrix = np.zeros((5, 5))
338
+ for i in range(5):
339
+ for j in range(5):
340
+ similarity_matrix[i][j] = cosine_similarity([emb_main[i]], [emb_similar[j]])[0][0]
341
+
342
+ st.subheader("Image Similarity")
343
+ # Create an interactive heatmap
344
+ fig = px.imshow(similarity_matrix,
345
+ labels=dict(x=f"{similar_prod} Images", y=f"{main_product} Images", color="Similarity"),
346
+ x=[f"Image {i+1}" for i in range(5)],
347
+ y=[f"Image {i+1}" for i in range(5)],
348
+ color_continuous_scale="Viridis")
349
+
350
+ # Add title to the heatmap
351
+ fig.update_layout(title="Image Similarity Heatmap")
352
+
353
+ # Display the interactive heatmap
354
+ st.plotly_chart(fig)
355
+
356
+
357
+
358
+ @st.experimental_fragment
359
+ def image_viewer():
360
+ # Form to handle image selection
361
+
362
+ st.subheader("Image Viewer")
363
+
364
+ selected_row = st.selectbox('Select a row (Main Product Image)', [f'Image {i+1}' for i in range(5)])
365
+ selected_col = st.selectbox('Select a column (Similar Product Image)', [f'Image {i+1}' for i in range(5)])
366
+
367
+ # Get the selected indices from session state
368
+ row_idx = int(selected_row.split()[1]) - 1
369
+ col_idx = int(selected_col.split()[1]) - 1
370
+
371
+ col1, col2 = st.columns(2)
372
+
373
+ with col1:
374
+ st.image(main_prod_imgs[row_idx], caption=f'Main Product Image {row_idx+1}', use_column_width=True)
375
+ with col2:
376
+ st.image(similar_prod_imgs[col_idx], caption=f'Similar Product Image {col_idx+1}', use_column_width=True)
377
+
378
+ # Call the fragment
379
+ image_viewer()
380
+
381
+
382
+ @st.experimental_dialog("Confirm Database Backup")
383
+ def update():
384
+ st.write("Do you want to backup the new changes in the database?")
385
+ if st.button("Confirm",type="primary"):
386
+ st.write("Updating Database....")
387
+ st.session_state.update = {"Done": True}
388
+
389
+ update_db()
390
+
391
+ st.success('Backup Complete!', icon="✅")
392
+ time.sleep(2)
393
+ st.rerun()
394
+
395
+ if "update" not in st.session_state:
396
+ with col2_main:
397
+ update_button = st.button("Update Database",type="primary")
398
+ if update_button:
399
+ update()
400
+
401
+
402
+
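Note on the similarity math used in app.py: cosine_similarity is called as cosine_similarity([main_vector], response['embeddings'][0])[0][0], i.e. a one-element query list against the retrieved embedding matrix, giving a 1×n array whose [0][0] entry is the scalar score. Below is a standalone NumPy sketch of that shape convention, with made-up 3-d vectors standing in for the real 768-d chunk embeddings.

```python
# Standalone sketch of the cosine_similarity shape convention used in app.py.
# Vectors here are illustrative; in the app they are 768-d chunk embeddings.
import numpy as np

def cosine_similarity(vec1, vec2):
    vec1, vec2 = np.array(vec1), np.array(vec2)
    dot_product = np.dot(vec1, vec2.T)               # (1, d) @ (d, n) -> (1, n)
    m1, m2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    return 0.0 if m1 == 0 or m2 == 0 else dot_product / (m1 * m2)

query = [0.1, 0.3, 0.5]
retrieved = [[0.1, 0.3, 0.5]]                         # one retrieved embedding
score = cosine_similarity([query], retrieved)[0][0]   # scalar; ~1.0 for identical vectors
print(round(float(score), 4))
```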
embedding.py CHANGED
@@ -1,370 +1,425 @@
1
- from PyPDF2 import PdfReader
2
- import requests
3
- import json
4
- import os
5
- import concurrent.futures
6
- import random
7
- from langchain_google_genai import ChatGoogleGenerativeAI
8
- from langchain_community.document_loaders import WebBaseLoader
9
- from langchain_community.document_loaders import PyPDFLoader
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- import google.generativeai as genai
12
- from langchain_core.messages import HumanMessage
13
- from io import BytesIO
14
- import numpy as np
15
- import re
16
- import torch
17
- from transformers import AutoTokenizer, AutoModel
18
-
19
- from search import search_images
20
-
21
- gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
22
- gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
23
- gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
24
- gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
25
-
26
- vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
27
- vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
28
- vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
29
- vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
30
-
31
- tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
32
- model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
33
- model.to('cpu') # Ensure the model is on the CPU
34
-
35
-
36
- genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
37
-
38
- def pdf_extractor(link):
39
- text = ''
40
-
41
- try:
42
- # Fetch the PDF file from the URL
43
- response = requests.get(link)
44
- response.raise_for_status() # Raise an error for bad status codes
45
-
46
- # Use BytesIO to handle the PDF content in memory
47
- pdf_file = BytesIO(response.content)
48
-
49
- # Load the PDF file
50
- reader = PdfReader(pdf_file)
51
- for page in reader.pages:
52
- text += page.extract_text() # Extract text from each page
53
-
54
- except requests.exceptions.HTTPError as e:
55
- print(f'HTTP error occurred: {e}')
56
- except Exception as e:
57
- print(f'An error occurred: {e}')
58
-
59
- return text
60
-
61
- def web_extractor(link):
62
- text = ''
63
-
64
- try:
65
- loader = WebBaseLoader(link)
66
- pages = loader.load_and_split()
67
-
68
- for page in pages:
69
- text+=page.page_content
70
- except:
71
- pass
72
-
73
- return text
74
-
75
- def imporve_text(text):
76
-
77
- prompt = f'''
78
- Please rewrite the following text to make it short, concise, and of high quality.
79
- Ensure that all essential information and key points are retained.
80
- Focus on improving clarity, coherence, and word choice without altering the original meaning.
81
-
82
- text = {text}
83
- '''
84
-
85
- model = random.choice([gemini,gemini1,gemini2,gemini3])
86
- result = model.invoke(prompt)
87
-
88
- return result.content
89
-
90
- def feature_extraction(tag, history , context):
91
-
92
- prompt = f'''
93
- You are an intelligent assistant tasked with updating product information. You have two data sources:
94
- 1. Tag_History: Previously gathered information about the product.
95
- 2. Tag_Context: New data that might contain additional details.
96
- Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
97
- Guidelines:
98
- - Only add new details that are relevant to the {tag} FIELD.
99
- - Do not add or modify any other fields in the Tag_History.
100
- - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
101
- Here is the data:
102
- Tag_Context: {str(context)}
103
- Tag_History: {history}
104
- Respond with the updated Tag_History.
105
- '''
106
-
107
- model = random.choice([gemini,gemini1,gemini2,gemini3])
108
- result = model.invoke(prompt)
109
-
110
- return result.content
111
-
112
- def feature_extraction_image(url):
113
- text = ' '
114
- model = genai.GenerativeModel('gemini-1.5-flash-001')
115
- try:
116
- res = model.generate_content(['Describe this image to me',url])
117
- text = res.text
118
-
119
- except:
120
- pass
121
- return text
122
-
123
- def detailed_feature_extraction(find, context):
124
-
125
- prompt = f'''
126
- You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
127
- 1. Context: The gathered information about the product.
128
- 2. Format: Details which need to be filled based on Context.
129
- Your job is to read the Context and update the relevant field in Format using Context.
130
- Guidelines:
131
- - Only add details that are relevant to the individual FIELD.
132
- - Do not add or modify any other fields in the Format.
133
- - If nothing found return None.
134
- Here is the data:
135
- The Context is {str(context)}
136
- The Format is {str(find)}
137
- '''
138
-
139
- model = random.choice([gemini,gemini1,gemini2,gemini3])
140
- result = model.invoke(prompt)
141
-
142
- return result.content
143
-
144
- def detailed_history(history):
145
-
146
- details = {
147
- "Introduction": {
148
- "Product Name": None,
149
- "Overview of the product": None,
150
- "Purpose of the manual": None,
151
- "Audience": None,
152
- "Additional Details": None
153
- },
154
- "Specifications": {
155
- "Technical specifications": None,
156
- "Performance metrics": None,
157
- "Additional Details": None
158
- },
159
- "Product Overview": {
160
- "Product features": None,
161
- "Key components and parts": None,
162
- "Additional Details": None
163
- },
164
- "Safety Information": {
165
- "Safety warnings and precautions": None,
166
- "Compliance and certification information": None,
167
- "Additional Details": None
168
- },
169
- "Installation Instructions": {
170
- "Unboxing and inventory checklist": None,
171
- "Step-by-step installation guide": None,
172
- "Required tools and materials": None,
173
- "Additional Details": None
174
- },
175
- "Setup and Configuration": {
176
- "Initial setup procedures": None,
177
- "Configuration settings": None,
178
- "Troubleshooting setup issues": None,
179
- "Additional Details": None
180
- },
181
- "Operation Instructions": {
182
- "How to use the product": None,
183
- "Detailed instructions for different functionalities": None,
184
- "User interface guide": None,
185
- "Additional Details": None
186
- },
187
- "Maintenance and Care": {
188
- "Cleaning instructions": None,
189
- "Maintenance schedule": None,
190
- "Replacement parts and accessories": None,
191
- "Additional Details": None
192
- },
193
- "Troubleshooting": {
194
- "Common issues and solutions": None,
195
- "Error messages and their meanings": None,
196
- "Support Information": None,
197
- "Additional Details": None
198
- },
199
- "Warranty Information": {
200
- "Terms and Conditions": None,
201
- "Service and repair information": None,
202
- "Additional Details": None
203
- },
204
- "Legal Information": {
205
- "Copyright information": None,
206
- "Trademarks and patents": None,
207
- "Disclaimers": None,
208
- "Additional Details": None
209
-
210
- }
211
- }
212
-
213
- for key,val in history.items():
214
-
215
- find = details[key]
216
-
217
- details[key] = str(detailed_feature_extraction(find,val))
218
-
219
- return details
220
-
221
-
222
- def get_embeddings(link,tag_option):
223
-
224
- print(f"\n--> Creating Embeddings - {link}")
225
-
226
- if tag_option=='Complete Document Similarity':
227
- history = { "Details": "" }
228
-
229
- else:
230
- history = {
231
- "Introduction": "",
232
- "Specifications": "",
233
- "Product Overview": "",
234
- "Safety Information": "",
235
- "Installation Instructions": "",
236
- "Setup and Configuration": "",
237
- "Operation Instructions": "",
238
- "Maintenance and Care": "",
239
- "Troubleshooting": "",
240
- "Warranty Information": "",
241
- "Legal Information": ""
242
- }
243
-
244
- # Extract Text -----------------------------
245
- print("Extracting Text")
246
- if link[-3:] == '.md' or link[8:11] == 'en.':
247
- text = web_extractor(link)
248
- else:
249
- text = pdf_extractor(link)
250
-
251
- # Create Chunks ----------------------------
252
- print("Writing Tag Data")
253
-
254
- if tag_option=="Complete Document Similarity":
255
- history["Details"] = feature_extraction("Details", history["Details"], text[0][:50000])
256
-
257
- else:
258
- chunks = text_splitter.create_documents(text)
259
-
260
- for chunk in chunks:
261
-
262
- with concurrent.futures.ThreadPoolExecutor() as executor:
263
- future_to_key = {
264
- executor.submit(
265
- feature_extraction, f"Product {key}", history[key], chunk.page_content
266
- ): key for key in history
267
- }
268
- for future in concurrent.futures.as_completed(future_to_key):
269
- key = future_to_key[future]
270
- try:
271
- response = future.result()
272
- history[key] = response
273
- except Exception as e:
274
- print(f"Error processing {key}: {e}")
275
-
276
- print("Creating Vectors")
277
- genai_embeddings=[]
278
-
279
- for tag in history:
280
- result = genai.embed_content(
281
- model="models/embedding-001",
282
- content=history[tag],
283
- task_type="retrieval_document")
284
- genai_embeddings.append(result['embedding'])
285
-
286
-
287
- return history,genai_embeddings
288
-
289
- def get_embed_chroma(link):
290
-
291
- print(f"\n--> Creating Embeddings - {link}")
292
-
293
- # Extract Text -----------------------------
294
- if link[-3:] == '.md' or link[8:11] == 'en.':
295
- text = web_extractor(link)
296
- else:
297
- text = pdf_extractor(link)
298
- print("\u2713 Extracting Text")
299
-
300
- # Create Chunks ----------------------------
301
-
302
- text = re.sub(r'\.{2,}', '.', text)
303
- text = re.sub(r'\s{2,}', ' ', text)
304
- text = [re.sub(r'\n{2,}', '\n', text)]
305
-
306
- chunks = text_splitter_small.create_documents(text)
307
- print("\u2713 Writing Tag Data")
308
-
309
- # Creating Vector
310
- embedding_vectors=[]
311
- textual_data = []
312
- print("\u2713 Creating Vectors")
313
-
314
-
315
- for text in chunks:
316
-
317
- inputs = tokenizer(text.page_content, return_tensors="pt", padding=True, truncation=True)
318
- inputs = {k: v.to('cpu') for k, v in inputs.items()}
319
-
320
- # Get the model's outputs
321
- with torch.no_grad():
322
- outputs = model(**inputs)
323
-
324
- embeddings = outputs.last_hidden_state.mean(dim=1)
325
- embedding_vectors.append(embeddings.squeeze().cpu().numpy().tolist())
326
- textual_data.append(text.page_content)
327
-
328
- return textual_data , embedding_vectors
329
-
330
-
331
-
332
- def get_image_embeddings(Product):
333
- image_embeddings = []
334
-
335
- links = search_images(Product)
336
- with concurrent.futures.ThreadPoolExecutor() as executor:
337
- descriptions = list(executor.map(feature_extraction_image, links))
338
-
339
- for description in descriptions:
340
- result = genai.embed_content(
341
- model="models/embedding-001",
342
- content=description,
343
- task_type="retrieval_document")
344
-
345
- image_embeddings.append(result['embedding'])
346
- # print(image_embeddings)
347
- return image_embeddings
348
-
349
-
350
-
351
- global text_splitter
352
- global data
353
- global history
354
-
355
-
356
- text_splitter = RecursiveCharacterTextSplitter(
357
- chunk_size = 10000,
358
- chunk_overlap = 100,
359
- separators = ["",''," "]
360
- )
361
-
362
- text_splitter_small = RecursiveCharacterTextSplitter(
363
- chunk_size = 2000,
364
- chunk_overlap = 100,
365
- separators = ["",''," "]
366
- )
367
-
368
- if __name__ == '__main__':
369
- # print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
370
- print(get_image_embeddings(Product='Samsung Galaxy S24'))
1
+ from PyPDF2 import PdfReader
2
+ import requests
3
+ import json
4
+ import os
5
+ import concurrent.futures
6
+ import random
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+ from langchain_community.document_loaders import WebBaseLoader
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ import google.generativeai as genai
12
+ from langchain_core.messages import HumanMessage
13
+ from io import BytesIO
14
+ import numpy as np
15
+ import re
16
+ import torch
17
+ from transformers import AutoTokenizer, AutoModel
18
+ import numpy as np
19
+ import onnxruntime as ort
20
+ # import torch._dynamo
21
+ import time
22
+ # torch._dynamo.config.suppress_errors = True
23
+
24
+ from search import search_images
25
+
26
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
27
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
28
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
29
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
30
+
31
+ vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
32
+ vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
33
+ vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
34
+ vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
35
+
36
+ tokenizer = AutoTokenizer.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
37
+ # model = AutoModel.from_pretrained('dwzhu/e5-base-4k',trust_remote_code = True)
38
+ model_path = "model_opt2_QInt8.onnx"
39
+
40
+ session = ort.InferenceSession(model_path)
41
+ # model = torch.compile(model)
42
+ # model.to('cpu') # Ensure the model is on the CPU
43
+
44
+ from transformers import PreTrainedTokenizerFast
45
+
46
+ class TokenBasedTextSplitter:
47
+ def __init__(self, tokenizer_path='tokenizer.json', chunk_size=2000, chunk_overlap=50):
48
+ self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
49
+ self.chunk_size = chunk_size
50
+ self.chunk_overlap = chunk_overlap
51
+
52
+ def split_text(self, text):
53
+ tokens = self.tokenizer.tokenize(text)
54
+ chunks = []
55
+
56
+ for i in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
57
+ chunk = tokens[i:i + self.chunk_size]
58
+ chunks.append(self.tokenizer.convert_tokens_to_string(chunk))
59
+
60
+ return chunks
61
+
62
+
63
+
64
+ genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
65
+
66
+ def pdf_extractor(link):
67
+ text = ''
68
+
69
+ try:
70
+ # Fetch the PDF file from the URL
71
+ response = requests.get(link)
72
+ response.raise_for_status() # Raise an error for bad status codes
73
+
74
+ # Use BytesIO to handle the PDF content in memory
75
+ pdf_file = BytesIO(response.content)
76
+
77
+ # Load the PDF file
78
+ reader = PdfReader(pdf_file)
79
+ for page in reader.pages:
80
+ text += page.extract_text() # Extract text from each page
81
+
82
+ except requests.exceptions.HTTPError as e:
83
+ print(f'HTTP error occurred: {e}')
84
+ except Exception as e:
85
+ print(f'An error occurred: {e}')
86
+
87
+ return text
88
+
89
+ def web_extractor(link):
90
+ text = ''
91
+
92
+ try:
93
+ loader = WebBaseLoader(link)
94
+ pages = loader.load_and_split()
95
+
96
+ for page in pages:
97
+ text+=page.page_content
98
+ except:
99
+ pass
100
+
101
+ return text
102
+
103
+ def imporve_text(text):
104
+
105
+ prompt = f'''
106
+ Please rewrite the following text to make it short, descriptive, concise, and of high quality.
107
+ Ensure that all essential information is retained.
108
+ Focus on improving clarity, coherence, and word choice without altering the original meaning.
109
+
110
+ text = {text}
111
+ '''
112
+
113
+ model = random.choice([gemini,gemini1,gemini2,gemini3])
114
+ result = model.invoke(prompt)
115
+
116
+ return result.content
117
+
118
+ def feature_extraction(tag, history , context):
119
+
120
+ prompt = f'''
121
+ You are an intelligent assistant tasked with updating product information. You have two data sources:
122
+ 1. Tag_History: Previously gathered information about the product.
123
+ 2. Tag_Context: New data that might contain additional details.
124
+ Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
125
+ Guidelines:
126
+ - Only add new details that are relevant to the {tag} FIELD.
127
+ - Do not add or modify any other fields in the Tag_History.
128
+ - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
129
+ Here is the data:
130
+ Tag_Context: {str(context)}
131
+ Tag_History: {history}
132
+ Respond with the updated Tag_History.
133
+ '''
134
+
135
+ model = random.choice([gemini,gemini1,gemini2,gemini3])
136
+ result = model.invoke(prompt)
137
+
138
+ return result.content
139
+
140
+ def feature_extraction_image(url):
141
+ text = ' '
142
+ model = genai.GenerativeModel('gemini-1.5-flash-001')
143
+ try:
144
+ res = model.generate_content(['Describe this image to me',url])
145
+ text = res.text
146
+
147
+ except:
148
+ pass
149
+ return text
150
+
151
+ def detailed_feature_extraction(find, context):
152
+
153
+ prompt = f'''
154
+ You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
155
+ 1. Context: The gathered information about the product.
156
+ 2. Format: Details which need to be filled based on Context.
157
+ Your job is to read the Context and update the relevant field in Format using Context.
158
+ Guidelines:
159
+ - Only add details that are relevant to the individual FIELD.
160
+ - Do not add or modify any other fields in the Format.
161
+ - If nothing found return None.
162
+ Here is the data:
163
+ The Context is {str(context)}
164
+ The Format is {str(find)}
165
+ '''
166
+
167
+ model = random.choice([gemini,gemini1,gemini2,gemini3])
168
+ result = model.invoke(prompt)
169
+
170
+ return result.content
171
+
172
+ def detailed_history(history):
173
+
174
+ details = {
175
+ "Introduction": {
176
+ "Product Name": None,
177
+ "Overview of the product": None,
178
+ "Purpose of the manual": None,
179
+ "Audience": None,
180
+ "Additional Details": None
181
+ },
182
+ "Specifications": {
183
+ "Technical specifications": None,
184
+ "Performance metrics": None,
185
+ "Additional Details": None
186
+ },
187
+ "Product Overview": {
188
+ "Product features": None,
189
+ "Key components and parts": None,
190
+ "Additional Details": None
191
+ },
192
+ "Safety Information": {
193
+ "Safety warnings and precautions": None,
194
+ "Compliance and certification information": None,
195
+ "Additional Details": None
196
+ },
197
+ "Installation Instructions": {
198
+ "Unboxing and inventory checklist": None,
199
+ "Step-by-step installation guide": None,
200
+ "Required tools and materials": None,
201
+ "Additional Details": None
202
+ },
203
+ "Setup and Configuration": {
204
+ "Initial setup procedures": None,
205
+ "Configuration settings": None,
206
+ "Troubleshooting setup issues": None,
207
+ "Additional Details": None
208
+ },
209
+ "Operation Instructions": {
210
+ "How to use the product": None,
211
+ "Detailed instructions for different functionalities": None,
212
+ "User interface guide": None,
213
+ "Additional Details": None
214
+ },
215
+ "Maintenance and Care": {
216
+ "Cleaning instructions": None,
217
+ "Maintenance schedule": None,
218
+ "Replacement parts and accessories": None,
219
+ "Additional Details": None
220
+ },
221
+ "Troubleshooting": {
222
+ "Common issues and solutions": None,
223
+ "Error messages and their meanings": None,
224
+ "Support Information": None,
225
+ "Additional Details": None
226
+ },
227
+ "Warranty Information": {
228
+ "Terms and Conditions": None,
229
+ "Service and repair information": None,
230
+ "Additional Details": None
231
+ },
232
+ "Legal Information": {
233
+ "Copyright information": None,
234
+ "Trademarks and patents": None,
235
+ "Disclaimers": None,
236
+ "Additional Details": None
237
+
238
+ }
239
+ }
240
+
241
+ for key,val in history.items():
242
+
243
+ find = details[key]
244
+
245
+ details[key] = str(detailed_feature_extraction(find,val))
246
+
247
+ return details
248
+
249
+
250
+ def get_embeddings(link,tag_option):
251
+
252
+ print(f"\n--> Creating Embeddings - {link}")
253
+
254
+ if tag_option=='Complete Document Similarity':
255
+ history = { "Details": "" }
256
+
257
+ else:
258
+ history = {
259
+ "Introduction": "",
260
+ "Specifications": "",
261
+ "Product Overview": "",
262
+ "Safety Information": "",
263
+ "Installation Instructions": "",
264
+ "Setup and Configuration": "",
265
+ "Operation Instructions": "",
266
+ "Maintenance and Care": "",
267
+ "Troubleshooting": "",
268
+ "Warranty Information": "",
269
+ "Legal Information": ""
270
+ }
271
+
272
+ # Extract Text -----------------------------
273
+ print("Extracting Text")
274
+ if link[-3:] == '.md' or link[8:11] == 'en.':
275
+ text = web_extractor(link)
276
+ else:
277
+ text = pdf_extractor(link)
278
+
279
+ # Create Chunks ----------------------------
280
+ print("Writing Tag Data")
281
+
282
+
283
+ if tag_option=="Complete Document Similarity":
284
+ history["Details"] = feature_extraction("Details", history["Details"], text[0][:50000])
285
+
286
+ else:
287
+ chunks = text_splitter.create_documents(text)
288
+
289
+ for chunk in chunks:
290
+
291
+ with concurrent.futures.ThreadPoolExecutor() as executor:
292
+ future_to_key = {
293
+ executor.submit(
294
+ feature_extraction, f"Product {key}", history[key], chunk.page_content
295
+ ): key for key in history
296
+ }
297
+ for future in concurrent.futures.as_completed(future_to_key):
298
+ key = future_to_key[future]
299
+ try:
300
+ response = future.result()
301
+ history[key] = response
302
+ except Exception as e:
303
+ print(f"Error processing {key}: {e}")
304
+
305
+ print("Creating Vectors")
306
+ genai_embeddings=[]
307
+
308
+ for tag in history:
309
+ result = genai.embed_content(
310
+ model="models/embedding-001",
311
+ content=history[tag],
312
+ task_type="retrieval_document")
313
+ genai_embeddings.append(result['embedding'])
314
+
315
+
316
+ return history,genai_embeddings
317
+
318
+ def get_embed_chroma(link):
319
+
320
+ print(f"\n--> Creating Embeddings - {link}")
321
+
322
+ # Extract Text -----------------------------
323
+ if link[-3:] == '.md' or link[8:11] == 'en.':
324
+ text = web_extractor(link)
325
+ else:
326
+ text = pdf_extractor(link)
327
+ print("\u2713 Extracting Text")
328
+
329
+ # Create Chunks ----------------------------
330
+
331
+ text = re.sub(r'\.{2,}', '.', text)
332
+ text = re.sub(r'\s{2,}', ' ', text)
333
+ text = re.sub(r'\d{7,}', '', text)
334
+
335
+ text = re.sub(r'\n{2,}', '\n', text)
336
+
337
+
338
+ chunks = text_splitter_small.split_text(text)
339
+ # print(chunks[:2])
340
+ print("\u2713 Writing Tag Data")
341
+
342
+ # Creating Vector
343
+ embedding_vectors=[]
344
+ # textual_data = []
345
+ print("\u2713 Creating Vectors")
346
+
347
+
348
+ # batch_size = 1
349
+ # # Process chunks in batches
350
+ # for i in range(0, len(chunks), batch_size):
351
+ # batch = chunks[i:i + batch_size]
352
+
353
+ # # texts = [text for text in batch]
354
+ # # print(texts)
355
+
356
+ t1 = time.time()
357
+ for chunk in chunks:
358
+ # Tokenize the input text
359
+ inputs = tokenizer(chunk, return_tensors="np", padding=True, truncation=True)
360
+
361
+ # Convert inputs to int64
362
+ input_ids = inputs['input_ids'].astype(np.int64)
363
+ attention_mask = inputs['attention_mask'].astype(np.int64)
364
+ token_type_ids = inputs.get('token_type_ids', np.zeros_like(input_ids)).astype(np.int64) # Some models might not use token_type_ids
365
+
366
+ # Create the input feed dictionary
367
+ input_feed = {
368
+ 'input_ids': input_ids,
369
+ 'attention_mask': attention_mask,
370
+ 'token_type_ids': token_type_ids
371
+ }
372
+
373
+ # Get the model's outputs
374
+ outputs = session.run(None, input_feed)
375
+
376
+ # Convert the outputs to numpy and process as needed
377
+ last_hidden_state = np.array(outputs[0])
378
+ embeddings = last_hidden_state.mean(axis=1).tolist()
379
+ embedding_vectors.append(embeddings)
380
+ # textual_data.a(text)
381
+
382
+ t2 = time.time()
383
+ print(t2-t1)
384
+ return chunks , embedding_vectors
385
+
386
+
387
+ def get_image_embeddings(Product):
+     image_embeddings = []
+ 
+     links = search_images(Product)
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         descriptions = list(executor.map(feature_extraction_image, links))
+ 
+     for description in descriptions:
+         result = genai.embed_content(
+             model="models/embedding-001",
+             content=description,
+             task_type="retrieval_document")
+         image_embeddings.append(result['embedding'])
+ 
+     # print(image_embeddings)
+     return image_embeddings, links
+ 
+ 
+ # Module-level splitters ----------------------------------
+ global text_splitter
+ global data
+ global history
+ 
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size=10000,
+     chunk_overlap=100,
+     separators=["", " "]
+ )
+ 
+ # text_splitter_small = RecursiveCharacterTextSplitter(
+ #     chunk_size=2000,
+ #     chunk_overlap=100,
+ #     separators=["", " "]
+ # )
+ 
+ text_splitter_small = TokenBasedTextSplitter(chunk_size=500, chunk_overlap=50)
+ # chunks = splitter.split_text(text)
+ 
+ 
+ if __name__ == '__main__':
+     print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
+     # print(get_image_embeddings(Product='Samsung Galaxy S24'))
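Note: get_embed_chroma returns the raw chunks and their mean-pooled vectors but does not itself write anything to Chroma. A minimal usage sketch of the caller side follows, assuming the results are persisted with chromadb (pinned in requirements.txt); the persist path, collection name, and id scheme are illustrative assumptions, not part of this commit.

```python
# Hypothetical caller -- persist path, collection name and id scheme are assumptions.
import chromadb

from embedding import get_embed_chroma

client = chromadb.PersistentClient(path="embeddings")                # assumed persist directory
collection = client.get_or_create_collection(name="manual_chunks")   # assumed collection name

link = "https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf"
chunks, vectors = get_embed_chroma(link)

# get_embed_chroma appends each vector as a [1 x hidden_size] batch, so unwrap the batch dimension.
collection.add(
    documents=chunks,
    embeddings=[vec[0] for vec in vectors],
    ids=[f"{link}-{i}" for i in range(len(chunks))],
)
```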
github_storage.py ADDED
@@ -0,0 +1,77 @@
+ import os
+ from github import Github
+ import base64
+ import shutil
+ import zipfile
+ from io import BytesIO
+ 
+ 
+ # Global Variables
+ 
+ # HF ------------
+ hf_folder_path = '/home/user/app/embeddings'
+ zip_name = 'embeddings'
+ 
+ # Github -------
+ github_token = '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'  # Replace with your GitHub token
+ repo_name = 'AdityaMetkar/Patseer-Database'  # Replace with your repository, e.g., 'octocat/Hello-World'
+ folder_path = 'Manual Database/embeddings.zip'  # Path to the zip file inside the repository
+ 
+ # Authenticate to GitHub
+ g = Github(github_token)
+ repo = g.get_repo(repo_name)
+ 
+ 
+ # Functions -------------------------------
+ def zip_folder():
+     shutil.make_archive(zip_name, 'zip', hf_folder_path)
+     return zip_name + '.zip'
+ 
+ 
+ def update_db():
+     try:
+         # Check if the file already exists in the repository
+         existing_file = repo.get_contents(folder_path)
+ 
+         compressed_zip = zip_folder()
+         with open(compressed_zip, 'rb') as file:
+             file_content = file.read()
+ 
+         # Update the existing file
+         repo.update_file(existing_file.path, "New DB Update", file_content, existing_file.sha)
+         print(f"Updated {folder_path} in GitHub repository.")
+ 
+     except Exception as e:
+         print(f"Error: {e}")
+ 
+ 
+ def download_db():
+     if not os.path.exists(hf_folder_path):
+         os.makedirs(hf_folder_path)
+ 
+     try:
+         # Download the zip file content from GitHub
+         file_content = repo.get_contents(folder_path)
+         zip_data = base64.b64decode(file_content.content)
+ 
+         # Extract the downloaded zip directly into hf_folder_path
+         with zipfile.ZipFile(BytesIO(zip_data)) as zip_ref:
+             for file in zip_ref.namelist():
+                 zip_ref.extract(file, hf_folder_path)
+ 
+         print(f"Successfully unzipped files to {hf_folder_path}")
+ 
+     except Exception as e:
+         print(f"Error: {e}")
+ 
+ 
+ # Download the folder
+ # download_folder()
+ # update_db()
+ 
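github_storage.py treats a zip stored in a GitHub repository as the persistent copy of the Space's local embeddings folder: download_db() restores it, update_db() pushes it back. How it is wired into the app is not shown in this commit; below is a minimal sketch, assuming download_db() runs once at startup and update_db() is scheduled on a fixed interval with APScheduler (newly added to requirements.txt). The interval and entry point are assumptions.

```python
# Hypothetical startup wiring -- the 6-hour interval and this entry point are assumptions.
from apscheduler.schedulers.background import BackgroundScheduler

from github_storage import download_db, update_db

download_db()  # restore /home/user/app/embeddings from the zip stored on GitHub

scheduler = BackgroundScheduler()
scheduler.add_job(update_db, "interval", hours=6)  # periodically re-zip and push the folder
scheduler.start()
```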
preprocess.py CHANGED
@@ -46,7 +46,7 @@ logging.basicConfig(level=logging.INFO)
 
 data = False
 seen = set()
-existing_products_urls = set('123')
+existing_products_urls = set()
 
 
 
@@ -121,8 +121,7 @@ def extract_text_from_pdf(pdf_file, pages):
             page = reader.pages[page_num]
             extracted_text += page.extract_text() + "\n"
         else:
-            print(f"Page {page_num} does not exist in the document.")
-
+            pass
         return extracted_text
 
     except:
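The first hunk fixes a subtle initialization bug: set('123') iterates the string, so the "empty" set actually starts with three single-character entries, which would interfere with the check for already-seen product URLs. For illustration:

```python
>>> set('123')   # a string is iterable, so each character becomes an element (order may vary)
{'1', '2', '3'}
>>> set()        # the empty set the code actually wants
set()
```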
requirements.txt CHANGED
@@ -1,28 +1,32 @@
-beautifulsoup4==4.12.3
-chromadb
-duckduckgo_search==6.1.0
-faiss_cpu==1.8.0
-fastapi==0.111.0
-fitz==0.0.1.dev2
-Flask==3.0.3
-googlesearch_python==1.2.4
-langchain==0.2.3
-langchain_community==0.2.4
-langchain_google_genai==1.0.6
-langchain_text_splitters==0.2.1
-langdetect==1.0.9
-numpy==1.26.4
-pdfplumber==0.11.1
-Pillow==10.3.0
-pinecone
-plotly
-protobuf==4.25.0
-pydantic==2.7.4
-pymongo
-PyPDF2==3.0.1
-Requests==2.32.3
-spacy==3.7.5
-streamlit==1.35.0
-transformers
-torch
-tqdm==4.66.4
+APScheduler
+beautifulsoup4==4.11.1
+chromadb==0.5.3
+duckduckgo_search==6.1.0
+fastapi==0.111.0
+fitz==0.0.1.dev2
+Flask==2.3.1
+googlesearch_python==1.2.4
+langchain==0.2.6
+langchain_community==0.2.6
+langchain_core==0.2.10
+langchain_google_genai==1.0.7
+langdetect==1.0.9
+numpy
+onnx
+onnxruntime
+pandas==1.5.2
+pdfplumber==0.11.0
+Pillow==10.3.0
+pinecone==4.0.0
+plotly==5.22.0
+protobuf<5
+pydantic==1.10.9
+pymongo
+PyPDF2==3.0.1
+pygithub
+Requests==2.32.3
+streamlit==1.36.0
+torch==2.2.0
+tqdm==4.66.4
+transformers==4.41.2
+zipfile36
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff