Spaces:

That1BrainCell
/

Infringement-Checker

Sleeping

App Files Files Community

That1BrainCell committed on Jun 19, 2024

Commit

140a96c

verified ·

1 Parent(s): f7cf778

New API + MongoDB

Browse files

Files changed (3) hide show

embedding.py +40 -11
preprocess.py +32 -10
search.py +234 -227

embedding.py CHANGED Viewed

@@ -9,17 +9,17 @@ from langchain_community.document_loaders import WebBaseLoader
 from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import google.generativeai as genai
 from io import BytesIO
-gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
-gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
-gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
-gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
-genai.configure(api_key="AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA")
 def pdf_extractor(link):
     text = ''
@@ -76,11 +76,25 @@ def feature_extraction(tag, history , context):
     Respond with the updated Tag_History.
     '''
-    # model = random.choice([gemini,gemini1,gemini2,gemini3])
-    result = gemini1.invoke(prompt)
     return result.content
 def detailed_feature_extraction(find, context):
     prompt = f'''
@@ -246,6 +260,21 @@ def get_embeddings(link,tag_option):
         return history,genai_embeddings
 global text_splitter
 global data
@@ -259,5 +288,5 @@ text_splitter = RecursiveCharacterTextSplitter(
 )
 if __name__ == '__main__':
-    # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf',"Single"))
-    pass

 from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import google.generativeai as genai
+from langchain_core.messages import HumanMessage
 from io import BytesIO
+from search import search_images
+import os
+# Gemini credentials are read from the environment so that no secret is
+# committed with the code. A missing variable raises KeyError at import time
+# rather than failing later on the first model call.
+gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY"],temperature = 0.1)
+gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY_1"],temperature = 0.1)
+gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY_2"],temperature = 0.1)
+gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY_3"],temperature = 0.1)
+# The google.generativeai client is configured with the same credential as gemini1.
+genai.configure(api_key=os.environ["GEMINI_API_KEY_1"])
 def pdf_extractor(link):
     text = ''
     Respond with the updated Tag_History.
     '''
+    model = random.choice([gemini,gemini1,gemini2,gemini3])
+    result = model.invoke(prompt)
     return result.content
+def feature_extraction_image(url,):
+    """Ask a Gemini vision model to describe the image at *url*.
+
+    Args:
+        url: Location of the image; it is passed to the model unchanged as
+            the ``image_url`` part of the message.
+
+    Returns:
+        The ``content`` attribute of the model's response.
+
+    Raises:
+        KeyError: If the ``GEMINI_API_KEY_2`` environment variable is unset.
+    """
+    import os
+    # The key is read from the environment so it is not committed with the code.
+    vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=os.environ["GEMINI_API_KEY_2"],temperature = 0.1)
+    # One multimodal message: the text instruction followed by the image.
+    message = HumanMessage(content=[
+                    {"type": "text", "text": "Please, Describe this image in detail"},
+                    {"type": "image_url", "image_url": url}
+                ])
+    text = vision.invoke([message])
+    return text.content
 def detailed_feature_extraction(find, context):
     prompt = f'''
         return history,genai_embeddings
+def get_image_embeddings(Product):
+    """Embed a generated description of the first image found for *Product*.
+
+    The first URL returned by ``search_images`` is described by
+    ``feature_extraction_image`` and that description is embedded with the
+    ``models/embedding-001`` model.
+
+    Args:
+        Product: Search text handed to ``search_images``.
+
+    Returns:
+        The value returned by ``genai.embed_content``, unchanged.
+
+    Raises:
+        IndexError: If the image search returns no results.
+    """
+    image_urls = search_images(Product)
+    if not image_urls:
+        # Same exception type the bare [0] lookup produced, but with a
+        # message that says what actually went wrong.
+        raise IndexError(f"No images found for product: {Product!r}")
+    description = feature_extraction_image(image_urls[0])
+    result = genai.embed_content(
+            model="models/embedding-001",
+            content=description,
+            task_type="retrieval_document")
+    return result
 global text_splitter
 global data
 )
 if __name__ == '__main__':
+    # Manual check when the module is run directly: prints the embedding result
+    # for one hard-coded product. This goes through the image search and the
+    # Gemini calls made by get_image_embeddings.
+    # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf',"Complete Document Similarity"))
+    print(get_image_embeddings(Product='Samsung Galaxy S24'))

preprocess.py CHANGED Viewed

@@ -11,9 +11,16 @@ from io import BytesIO
 from langchain_community.document_loaders import WebBaseLoader
 from langchain_google_genai import ChatGoogleGenerativeAI
 import logging
-data = False
-seen = set()
 # API Urls -----
@@ -22,10 +29,10 @@ main_url = "http://127.0.0.1:8000/search/all"
 # main_product = "Samsung Galaxy s23 ultra"
 # Revelevance Checking Models -----
-gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
-gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
-gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
-gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
 API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
@@ -35,6 +42,15 @@ headers = {"Authorization": "Bearer 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"}
 logging.basicConfig(level=logging.INFO)
 def get_links(main_product,api_key):
     params = {
         "API_KEY": f"{api_key}",
@@ -165,11 +181,17 @@ def filtering(urls, main_product, similar_product, link_count):
     print(f"Filtering Links of ---- {similar_product}")
     for link in urls:
-        result = process_link(link, main_product, similar_product)
-        if result is not None:
-            res.append(result)
-            count += 1
         if count == link_count:
             break

 from langchain_community.document_loaders import WebBaseLoader
 from langchain_google_genai import ChatGoogleGenerativeAI
 import logging
+from pymongo import MongoClient
+# Mongo Connections
+# The SRV URI carries the database user and password, so it is read from the
+# MONGODB_URI environment variable instead of being committed with the code.
+# A missing variable raises KeyError at import time.
+import os
+srv_connection_uri = os.environ["MONGODB_URI"]
+client = MongoClient(srv_connection_uri)
+# Database 'embeddings', collection 'data'.
+db = client['embeddings']
+collection = db['data']
 # API Urls -----
 # main_product = "Samsung Galaxy s23 ultra"
 # Revelevance Checking Models -----
+import os
+# Gemini credentials are read from the environment so that no secret is
+# committed with the code. A missing variable raises KeyError at import time
+# rather than failing later on the first model call.
+gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY"],temperature = 0.1)
+gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY_1"],temperature = 0.1)
+gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY_2"],temperature = 0.1)
+gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key=os.environ["GEMINI_API_KEY_3"],temperature = 0.1)
 API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
 logging.basicConfig(level=logging.INFO)
+# Global Var --------
+# Module-level state, initialised once when the module is imported.
+data = False
+seen = set()
+# Distinct 'url' values currently stored in the Mongo collection, fetched with
+# a single query at import time; documents added later are not reflected here.
+existing_products_urls = set(collection.distinct('url'))
 def get_links(main_product,api_key):
     params = {
         "API_KEY": f"{api_key}",
     print(f"Filtering Links of ---- {similar_product}")
     for link in urls:
+        if link in existing_products_urls:
+            res.append((link,1))
+            count+=1
+        else:
+            result = process_link(link, main_product, similar_product)
+            if result is not None:
+                res.append((result,0))
+                count += 1
         if count == link_count:
             break

search.py CHANGED Viewed

@@ -1,227 +1,234 @@
-# Library Imports
-import requests
-from bs4 import BeautifulSoup
-from googlesearch import search
-from duckduckgo_search import DDGS
-import concurrent.futures
-import re
-# Search Functions -------------------------------------------------------------->
-# Function to search DuckDuckGo
-def search_duckduckgo(query):
-    print("Fetching Duckduckgo Links -----")
-    try:
-        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
-        return [res['href'] for res in results]
-    except:
-        return []
-# Function to search Google
-def search_google(query):
-    print("Fetching Google Links -----")
-    links = []
-    try:
-        api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
-        search_engine_id = 'c4ca951b9fc6949cb'
-        url = f"https://www.googleapis.com/customsearch/v1"
-        params = {
-            "key": api_key,
-            "cx": search_engine_id,
-            "q": query + " manual filetype:pdf"
-        }
-        response = requests.get(url, params=params)
-        results = response.json()
-        for item in results.get('items', []):
-            links.append(item['link'])
-    except:
-        pass
-    try:
-        extension = "ext:pdf"
-        for result in search(query + " manual " + extension, num_results=5):
-            if result.endswith('.pdf'):
-                links.append(result)
-    except:
-        pass
-    return links
-# Function to search Internet Archive
-def search_archive(query):
-    print("Fetching Archive Links -----")
-    try:
-        url = "https://archive.org/advancedsearch.php"
-        params = {
-            'q': f'{query} manual',
-            'fl[]': ['identifier', 'title', 'format'],
-            'rows': 50,
-            'page': 1,
-            'output': 'json'
-        }
-        # Make the request
-        response = requests.get(url, params=params)
-        data = response.json()
-        # Function to extract hyperlinks from a webpage
-        def extract_hyperlinks(url):
-            # Send a GET request to the URL
-            response = requests.get(url)
-            # Check if the request was successful
-            if response.status_code == 200:
-                # Parse the HTML content of the page
-                soup = BeautifulSoup(response.text, 'html.parser')
-                # Find all <a> tags (hyperlinks)
-                for link in soup.find_all('a', href=True):
-                    href = link['href']
-                    if href.endswith('.pdf'):
-                        pdf_files.append(url+'/'+href)
-                    if href.endswith('.iso'):
-                        # If the link ends with .iso, follow the link and extract .pdf hyperlinks
-                        extract_pdf_from_iso(url+'/'+href+'/')
-        # Function to extract .pdf hyperlinks from an .iso file
-        def extract_pdf_from_iso(iso_url):
-            # Send a GET request to the ISO URL
-            iso_response = requests.get(iso_url)
-            # Check if the request was successful
-            if iso_response.status_code == 200:
-                # Parse the HTML content of the ISO page
-                iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
-                # Find all <a> tags (hyperlinks) in the ISO page
-                for link in iso_soup.find_all('a', href=True):
-                    href = link['href']
-                    if href.endswith('.pdf'):
-                        pdf_files.append('https:'+href)
-        pdf_files = []
-        def process_doc(doc):
-            identifier = doc.get('identifier', 'N/A')
-            # title = doc.get('title', 'N/A')
-            # format = doc.get('format', 'N/A')
-            pdf_link = f"https://archive.org/download/{identifier}"
-            extract_hyperlinks(pdf_link)
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
-            # Optionally, wait for all futures to complete and handle any exceptions
-            for future in concurrent.futures.as_completed(futures):
-                try:
-                    future.result()  # This will raise an exception if the function call raised
-                except Exception as exc:
-                    print(f'Generated an exception: {exc}')
-        return pdf_files
-    except:
-        return []
-def search_github(query):
-    print("Fetching Github Links -----")
-    try:
-        # GitHub Search API endpoint
-        url = f"https://api.github.com/search/code?q={query}+extension:md"
-        headers = {
-        'Authorization': 'Token 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'
-        }
-        # Make the request
-        response = requests.get(url,headers=headers)
-        data = response.json()
-        links = [item['html_url'] for item in data['items']]
-        return links
-    except:
-        return []
-def search_wikipedia(product):
-    print("Fetching Wikipedia Links -----")
-    api_url = "https://en.wikipedia.org/w/api.php"
-    params = {
-        "action": "opensearch",
-        "search": product,
-        "limit": 5,
-        "namespace": 0,
-        "format": "json"
-    }
-    try:
-        response = requests.get(api_url, params=params)
-        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
-        data = response.json()
-        if data and len(data) > 3 and len(data[3]) > 0:
-            return data[3]  # The URL is in the fourth element of the response array
-        else:
-            return []
-    except requests.RequestException as e:
-        print(f"An error occurred: {e}")
-        return []
-# def search_all(product,num):
-#     similar_products = extract_similar_products(product)[num]
-#     # results = {
-#     #         product : [{'duckduckgo': duckduckgo_search(product)},{'google': google_search(product)},{'github': github_search(product)},{'archive': archive_search(product)}]
-#     #     }
-#     results = {}
-#     def search_product(p):
-#             return {
-#                 'product': p,
-#                 'duckduckgo': duckduckgo_search(p),
-#                 'google': google_search(p),
-#                 'github': github_search(p),
-#                 'archive': archive_search(p),
-#                 'wikipedia': wikipedia_search(p)
-#             }
-#     with concurrent.futures.ThreadPoolExecutor() as executor:
-#             future_to_product = {executor.submit(search_product, p): p for p in similar_products}
-#             for future in concurrent.futures.as_completed(future_to_product):
-#                 result = future.result()
-#                 product = result['product']
-#                 results[product] = [
-#                     {'duckduckgo': result['duckduckgo']},
-#                     {'google': result['google']},
-#                     {'github': result['github']},
-#                     {'archive': result['archive']},
-#                     {'wikipedia': result['wikipedia']}
-#                 ]
-#     return results
-# Similarity Check  -------------------------------------->
-def extract_similar_products(query):
-    print(f"\nFetching similar items of  -----> {query}")
-    results = DDGS().chat(f'{query} Similar Products')
-    pattern = r'^\d+\.\s(.+)$'
-    matches = re.findall(pattern, results, re.MULTILINE)
-    matches = [item.split(': ')[0] for item in matches]
-    return matches

+# Library Imports
+import requests
+from bs4 import BeautifulSoup
+from googlesearch import search
+from duckduckgo_search import DDGS
+import concurrent.futures
+import re
+# Search Functions -------------------------------------------------------------->
+# Function to search DuckDuckGo
+def search_duckduckgo(query):
+    """Return DuckDuckGo result URLs for PDF manuals matching *query*.
+
+    Args:
+        query: Product text; " manual filetype:pdf" is appended to it.
+
+    Returns:
+        The 'href' of each result (at most 5 are requested), or an empty
+        list if the search fails.
+    """
+    print("Fetching Duckduckgo Links -----")
+    try:
+        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
+        return [res['href'] for res in results]
+    except Exception as e:
+        # Best-effort source: report the failure and return no links. Unlike a
+        # bare except, this lets KeyboardInterrupt and SystemExit propagate.
+        print(f"An error occurred: {e}")
+        return []
+# Function to search Google
+def search_google(query):
+    """Collect PDF manual links for *query* from two Google-backed sources.
+
+    The Custom Search JSON API is queried first, then the ``googlesearch``
+    package. Each source is best-effort: a failure is reported and the other
+    source still contributes.
+
+    Args:
+        query: Product text; a manual/PDF suffix is appended for each source.
+
+    Returns:
+        A list of result URLs (possibly empty). Results from the
+        ``googlesearch`` source are kept only when they end in '.pdf'.
+    """
+    import os
+    print("Fetching Google Links -----")
+    links = []
+    # The API key is read from the environment so it is not committed with the
+    # code. When it is absent, the Custom Search call is skipped.
+    api_key = os.environ.get("GOOGLE_CSE_API_KEY")
+    search_engine_id = os.environ.get("GOOGLE_CSE_ID", 'c4ca951b9fc6949cb')
+    if api_key:
+        try:
+            url = "https://www.googleapis.com/customsearch/v1"
+            params = {
+                "key": api_key,
+                "cx": search_engine_id,
+                "q": query + " manual filetype:pdf"
+            }
+            # requests has no default timeout; bound the wait explicitly.
+            response = requests.get(url, params=params, timeout=30)
+            results = response.json()
+            for item in results.get('items', []):
+                links.append(item['link'])
+        except Exception as e:
+            print(f"An error occurred: {e}")
+    else:
+        print("GOOGLE_CSE_API_KEY is not set; skipping Custom Search")
+    try:
+        extension = "ext:pdf"
+        for result in search(query + " manual " + extension, num_results=5):
+            if result.endswith('.pdf'):
+                links.append(result)
+    except Exception as e:
+        print(f"An error occurred: {e}")
+    return links
+# Function to search Internet Archive
+def search_archive(query):
+    """Find PDF links in Internet Archive items matching "<query> manual".
+
+    The advanced-search endpoint is asked for up to 50 items. For each item,
+    its download listing page is fetched and scanned for links ending in
+    '.pdf'. Links ending in '.iso' are followed one level and scanned the
+    same way. Items are processed concurrently in a thread pool.
+
+    Args:
+        query: Product text; " manual" is appended for the search.
+
+    Returns:
+        A list of PDF URLs, or an empty list if the search request or the
+        overall processing fails.
+    """
+    from urllib.parse import urljoin
+    print("Fetching Archive Links -----")
+    # Shared by the worker threads below; they only ever append to it.
+    pdf_files = []
+    # Collect .pdf links from the listing page of an .iso entry.
+    def extract_pdf_from_iso(iso_url):
+        iso_response = requests.get(iso_url, timeout=30)
+        if iso_response.status_code != 200:
+            return
+        iso_soup = BeautifulSoup(iso_response.text, 'html.parser')
+        for link in iso_soup.find_all('a', href=True):
+            href = link['href']
+            if href.endswith('.pdf'):
+                # urljoin resolves protocol-relative hrefs the same way the
+                # former 'https:' + href did, and also handles relative and
+                # absolute hrefs.
+                pdf_files.append(urljoin(iso_url, href))
+    # Collect .pdf links from an item's download listing, following .iso entries.
+    def extract_hyperlinks(url):
+        response = requests.get(url, timeout=30)
+        if response.status_code != 200:
+            return
+        # Trailing slash makes relative hrefs resolve beneath the item.
+        base = url + '/'
+        soup = BeautifulSoup(response.text, 'html.parser')
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            if href.endswith('.pdf'):
+                pdf_files.append(urljoin(base, href))
+            if href.endswith('.iso'):
+                extract_pdf_from_iso(urljoin(base, href) + '/')
+    # Scan one search hit; hits without an identifier have no download page.
+    def process_doc(doc):
+        identifier = doc.get('identifier')
+        if not identifier:
+            return
+        extract_hyperlinks(f"https://archive.org/download/{identifier}")
+    try:
+        url = "https://archive.org/advancedsearch.php"
+        params = {
+            'q': f'{query} manual',
+            'fl[]': ['identifier', 'title', 'format'],
+            'rows': 50,
+            'page': 1,
+            'output': 'json'
+        }
+        response = requests.get(url, params=params, timeout=30)
+        data = response.json()
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    future.result()  # Re-raises whatever the worker raised
+                except Exception as exc:
+                    # One failing item must not discard the other items' links.
+                    print(f'Generated an exception: {exc}')
+        return pdf_files
+    except Exception as exc:
+        # Best-effort source: report and return nothing. Unlike a bare except,
+        # this lets KeyboardInterrupt and SystemExit propagate.
+        print(f"An error occurred: {exc}")
+        return []
+def search_github(query):
+    """Search GitHub code for Markdown files matching *query*.
+
+    Args:
+        query: Search text; the ``extension:md`` qualifier is appended.
+
+    Returns:
+        The 'html_url' of each returned item, or an empty list when no token
+        is configured, the request fails, or the response has no 'items'.
+    """
+    import os
+    print("Fetching Github Links -----")
+    # The token is read from the environment so it is not committed with the code.
+    token = os.environ.get("GITHUB_TOKEN")
+    if not token:
+        print("GITHUB_TOKEN is not set; skipping GitHub search")
+        return []
+    try:
+        # GitHub Search API endpoint. Passing the query through params lets
+        # requests URL-encode it, so spaces, '&' and '#' in the query no
+        # longer corrupt the URL.
+        url = "https://api.github.com/search/code"
+        params = {'q': f"{query} extension:md"}
+        headers = {
+        'Authorization': f'Token {token}'
+        }
+        response = requests.get(url, params=params, headers=headers, timeout=30)
+        data = response.json()
+        # Error responses carry no 'items' key; treat them as "no results".
+        return [item['html_url'] for item in data.get('items', [])]
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return []
+def search_wikipedia(product):
+    """Look up English Wikipedia article URLs for *product* via opensearch.
+
+    Args:
+        product: Text sent as the ``search`` parameter.
+
+    Returns:
+        The fourth element of the opensearch response (the URL list) when it
+        is present and non-empty; otherwise an empty list. A request failure
+        is printed and also yields an empty list.
+    """
+    print("Fetching Wikipedia Links -----")
+    endpoint = "https://en.wikipedia.org/w/api.php"
+    query_args = dict(action="opensearch", search=product, limit=5, namespace=0, format="json")
+    try:
+        reply = requests.get(endpoint, params=query_args)
+        reply.raise_for_status()  # HTTP 4xx/5xx become exceptions
+        payload = reply.json()
+    except requests.RequestException as e:
+        print(f"An error occurred: {e}")
+        return []
+    # Guard: the URL list lives at index 3 and must not be empty.
+    if not payload or len(payload) <= 3 or len(payload[3]) == 0:
+        return []
+    return payload[3]
+# def search_all(product,num):
+#     similar_products = extract_similar_products(product)[num]
+#     # results = {
+#     #         product : [{'duckduckgo': duckduckgo_search(product)},{'google': google_search(product)},{'github': github_search(product)},{'archive': archive_search(product)}]
+#     #     }
+#     results = {}
+#     def search_product(p):
+#             return {
+#                 'product': p,
+#                 'duckduckgo': duckduckgo_search(p),
+#                 'google': google_search(p),
+#                 'github': github_search(p),
+#                 'archive': archive_search(p),
+#                 'wikipedia': wikipedia_search(p)
+#             }
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#             future_to_product = {executor.submit(search_product, p): p for p in similar_products}
+#             for future in concurrent.futures.as_completed(future_to_product):
+#                 result = future.result()
+#                 product = result['product']
+#                 results[product] = [
+#                     {'duckduckgo': result['duckduckgo']},
+#                     {'google': result['google']},
+#                     {'github': result['github']},
+#                     {'archive': result['archive']},
+#                     {'wikipedia': result['wikipedia']}
+#                 ]
+#     return results
+def search_images(product):
+    """Return the 'image' URL of each DuckDuckGo image hit for *product*.
+
+    At most 5 results are requested from the search.
+    """
+    hits = DDGS().images(f"{product}", max_results=5)
+    image_urls = []
+    for hit in hits:
+        image_urls.append(hit['image'])
+    return image_urls
+# Similarity Check  -------------------------------------->
+def extract_similar_products(query):
+    """Ask DuckDuckGo chat for products similar to *query* and parse the names.
+
+    Only lines of the reply shaped like "<number>. <text>" are used. From
+    each such line, the text before the first ': ' is kept.
+
+    Returns:
+        A list of the extracted names, in the order they appear in the reply.
+    """
+    print(f"\nFetching similar items of  -----> {query}")
+    reply = DDGS().chat(f'{query} Similar Products')
+    numbered_line = re.compile(r'^\d+\.\s(.+)$', re.MULTILINE)
+    names = []
+    for hit in numbered_line.finditer(reply):
+        # Drop any ": description" tail that follows the product name.
+        names.append(hit.group(1).partition(': ')[0])
+    return names