That1BrainCell committed
Commit 05fdf5e · verified · 1 Parent(s): 026c1cf

Upload 3 files

Files changed (3):
  1. embedding.py +101 -15
  2. preprocess.py +9 -11
  3. search.py +1 -6
embedding.py CHANGED
@@ -11,6 +11,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 import google.generativeai as genai
 from langchain_core.messages import HumanMessage
 from io import BytesIO
+import numpy as np
+import re
+import torch
+from transformers import AutoTokenizer, AutoModel
 
 from search import search_images
 
@@ -19,6 +23,16 @@ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIza
 gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
 gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
 
+vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyCo-TeDp0Ou--UwhlTgMwCoTEZxg6-v7wA',temperature = 0.1)
+vision1 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI',temperature = 0.1)
+vision2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
+vision3 = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBNN4VDMAOB2gSZha6HjsTuH71PVV69FLM',temperature = 0.1)
+
+tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
+model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5',trust_remote_code = True)
+model.to('cpu') # Ensure the model is on the CPU
+
+
 genai.configure(api_key="AIzaSyAtnUk8QKSUoJd3uOBpmeBNN-t8WXBt0zI")
 
 def pdf_extractor(link):
@@ -42,7 +56,7 @@ def pdf_extractor(link):
     except Exception as e:
         print(f'An error occurred: {e}')
 
-    return [text]
+    return text
 
 def web_extractor(link):
     text = ''
@@ -56,8 +70,22 @@ def web_extractor(link):
     except:
         pass
 
-    return [text]
+    return text
+
+def imporve_text(text):
+
+    prompt = f'''
+    Please rewrite the following text to make it short, concise, and of high quality.
+    Ensure that all essential information and key points are retained.
+    Focus on improving clarity, coherence, and word choice without altering the original meaning.
+
+    text = {text}
+    '''
 
+    model = random.choice([gemini,gemini1,gemini2,gemini3])
+    result = model.invoke(prompt)
+
+    return result.content
 
 def feature_extraction(tag, history , context):
 
@@ -81,18 +109,23 @@ def feature_extraction(tag, history , context):
 
     return result.content
 
-def feature_extraction_image(url,):
+def feature_extraction_image(url):
 
     vision = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key='AIzaSyBzbZQBffHFK3N-gWnhDDNbQ9yZnZtaS2E',temperature = 0.1)
     # result = gemini.invoke('''Hello''')
     # Markdown(result.content)
     # print(result)
 
+    text = 'None'
     message = HumanMessage(content=[
         {"type": "text", "text": "Please, Describe this image in detail"},
         {"type": "image_url", "image_url": url}
     ])
-    text = vision.invoke([message])
+    try:
+        model = random.choice([vision,vision1,vision2,vision3])
+        text = model.invoke([message])
+    except:
+        return text
     return text.content
 
 def detailed_feature_extraction(find, context):
@@ -196,7 +229,7 @@ def detailed_history(history):
 
 def get_embeddings(link,tag_option):
 
-    print(f"\nCreating Embeddings ----- {link}")
+    print(f"\n--> Creating Embeddings - {link}")
 
     if tag_option=='Complete Document Similarity':
         history = { "Details": "" }
@@ -261,18 +294,65 @@ def get_embeddings(link,tag_option):
 
     return history,genai_embeddings
 
+def get_embed_chroma(link):
+
+    print(f"\n--> Creating Embeddings - {link}")
+
+    # Extract Text -----------------------------
+    if link[-3:] == '.md' or link[8:11] == 'en.':
+        text = web_extractor(link)
+    else:
+        text = pdf_extractor(link)
+    print("\u2713 Extracting Text")
+
+    # Create Chunks ----------------------------
+    text = re.sub(r'\.{2,}', '.', text)
+    text = re.sub(r'\s{2,}', ' ', text)
+    text = [re.sub(r'\n{2,}', '\n', text)]
+
+    chunks = text_splitter_small.create_documents(text)
+    print("\u2713 Writing Tag Data")
+
+    # Creating Vector
+    embedding_vectors=[]
+    textual_data = []
+    print("\u2713 Creating Vectors")
+
+    for text in chunks:
+
+        inputs = tokenizer(text.page_content, return_tensors="pt", padding=True, truncation=True)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+
+        # Get the model's outputs
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+        embedding_vectors.append(embeddings.squeeze().cpu().numpy().tolist())
+        textual_data.append(text.page_content)
+
+    return textual_data , embedding_vectors
+
+
 def get_image_embeddings(Product):
     image_embeddings = []
 
-    links = search_images(Product)[0]
-    description = feature_extraction_image(links)
-
-    result = genai.embed_content(
-        model="models/embedding-001",
-        content=description,
-        task_type="retrieval_document")
+    links = search_images(Product)
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        descriptions = list(executor.map(feature_extraction_image, links))
 
-    return result
+    for description in descriptions:
+        result = genai.embed_content(
+            model="models/embedding-001",
+            content=description,
+            task_type="retrieval_document")
+
+        image_embeddings.append(result['embedding'])
+    # print(image_embeddings)
+    return image_embeddings
 
 
 
@@ -287,6 +367,12 @@ text_splitter = RecursiveCharacterTextSplitter(
     separators = ["",''," "]
 )
 
+text_splitter_small = RecursiveCharacterTextSplitter(
+    chunk_size = 2000,
+    chunk_overlap = 100,
+    separators = ["",''," "]
+)
+
 if __name__ == '__main__':
-    # print(get_embeddings('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf',"Complete Document Similarity"))
-    print(get_image_embeddings(Product='Samsung Galaxy S24'))
+    print(get_embed_chroma('https://www.galaxys24manual.com/wp-content/uploads/pdf/galaxy-s24-manual-SAM-S921-S926-S928-OS14-011824-FINAL-US-English.pdf'))
+    # print(get_image_embeddings(Product='Samsung Galaxy S24'))
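Review note: the new get_embed_chroma embeds each chunk by mean-pooling the GTE model's last hidden state. Retrieval only works if queries are embedded the same way; below is a minimal sketch of a matching query-side helper (the embed_query and cosine names are illustrative, not part of this commit):

    import numpy as np
    import torch
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
    model = AutoModel.from_pretrained('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

    def embed_query(query):
        # Tokenize and mean-pool over tokens, mirroring get_embed_chroma
        inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def cosine(a, b):
        # Rank stored chunk vectors against the query vector
        a, b = np.asarray(a), np.asarray(b)
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))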
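Review note: the name get_embed_chroma suggests the returned (texts, vectors) pair is destined for a Chroma collection, though no Chroma code appears in this commit. A sketch of storing and querying the output, assuming the chromadb package (the collection name and URL are placeholders):

    import chromadb
    from embedding import get_embed_chroma

    docs, vecs = get_embed_chroma('https://example.com/manual.pdf')  # placeholder URL

    client = chromadb.Client()  # in-memory; PersistentClient would keep the data on disk
    col = client.get_or_create_collection('manual_chunks')  # assumed name
    col.add(ids=[str(i) for i in range(len(docs))], documents=docs, embeddings=vecs)

    # Nearest chunks to the first vector, as a smoke test
    hits = col.query(query_embeddings=[vecs[0]], n_results=3)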
preprocess.py CHANGED
@@ -15,11 +15,11 @@ from pymongo import MongoClient
 
 
 # Mongo Connections
-srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"
+# srv_connection_uri = "mongodb+srv://adityasm1410:uOh6i11AYFeKp4wd@patseer.5xilhld.mongodb.net/?retryWrites=true&w=majority&appName=Patseer"
 
-client = MongoClient(srv_connection_uri)
-db = client['embeddings']
-collection = db['data']
+# client = MongoClient(srv_connection_uri)
+# db = client['embeddings']
+# collection = db['data']
 
 
 # API Urls -----
@@ -42,12 +42,11 @@ headers = {"Authorization": "Bearer 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"}
 logging.basicConfig(level=logging.INFO)
 
 
-
 # Global Var --------
 
 data = False
 seen = set()
-existing_products_urls = set(collection.distinct('url'))
+existing_products_urls = set()
 
 
@@ -122,8 +121,7 @@ def extract_text_from_pdf(pdf_file, pages):
                 page = reader.pages[page_num]
                 extracted_text += page.extract_text() + "\n"
             else:
-                print(f"Page {page_num} does not exist in the document.")
-
+                pass
         return extracted_text
 
     except:
@@ -155,11 +153,11 @@ def process_link(link, main_product, similar_product):
 
         if language_preprocess(text):
             if relevant(main_product, similar_product, text):
-                print("Accepted",link)
+                print("Accepted -",link)
                 return link
     except:
         pass
-    print("NOT Accepted",link)
+    print("Rejected -",link)
    return None
 
 def filtering(urls, main_product, similar_product, link_count):
@@ -178,7 +176,7 @@ def filtering(urls, main_product, similar_product, link_count):
 
     count = 0
 
-    print(f"Filtering Links of ---- {similar_product}")
+    print(f"--> Filtering Links of - {similar_product}")
 
     for link in urls:
 
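Review note: with the Mongo connection commented out, existing_products_urls now always starts empty, so URLs handled in earlier runs are re-processed. If the cache is wanted back without hard-coding credentials, one option is to guard it behind an environment variable (the MONGO_URI name is an assumption):

    import os
    from pymongo import MongoClient

    existing_products_urls = set()
    mongo_uri = os.environ.get('MONGO_URI')  # assumed variable name
    if mongo_uri:
        collection = MongoClient(mongo_uri)['embeddings']['data']
        existing_products_urls = set(collection.distinct('url'))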
search.py CHANGED
@@ -12,7 +12,6 @@ import re
 
 # Function to search DuckDuckGo
 def search_duckduckgo(query):
-    print("Fetching Duckduckgo Links -----")
     try:
         results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
         return [res['href'] for res in results]
@@ -21,7 +20,6 @@ def search_duckduckgo(query):
 
 # Function to search Google
 def search_google(query):
-    print("Fetching Google Links -----")
 
     links = []
     try:
@@ -55,7 +53,6 @@ def search_google(query):
 
 # Function to search Internet Archive
 def search_archive(query):
-    print("Fetching Archive Links -----")
 
     try:
         url = "https://archive.org/advancedsearch.php"
@@ -132,7 +129,6 @@ def search_archive(query):
     return []
 
 def search_github(query):
-    print("Fetching Github Links -----")
 
     try:
         # GitHub Search API endpoint
@@ -153,7 +149,6 @@ def search_github(query):
     return []
 
 def search_wikipedia(product):
-    print("Fetching Wikipedia Links -----")
 
     api_url = "https://en.wikipedia.org/w/api.php"
     params = {
@@ -223,7 +218,7 @@ def search_images(product):
 
 # Similarity Check -------------------------------------->
 
 def extract_similar_products(query):
-    print(f"\nFetching similar items of -----> {query}")
+    print(f"\n--> Fetching similar items of - {query}")
     results = DDGS().chat(f'{query} Similar Products')
 
     pattern = r'^\d+\.\s(.+)$'
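Review note: this commit drops the per-function "Fetching ... Links" prints outright. If that trace is still useful for debugging, it could come back as debug-level logging instead of prints; a sketch for search_duckduckgo only (the except branch returning an empty list is an assumption, matching the other search helpers):

    import logging
    from duckduckgo_search import DDGS

    logger = logging.getLogger(__name__)

    def search_duckduckgo(query):
        logger.debug("Fetching DuckDuckGo links for %s", query)  # was a print
        try:
            results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
            return [res['href'] for res in results]
        except Exception:
            return []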