Vitomir Jovanović committed on
Commit 591de4e
Parent: 348df3a

Glancing + new data

Procfile.yaml DELETED
@@ -1 +0,0 @@
- web: gunicorn -w 1 -k uvicorn.workers.UvicornWorker main:app --bind 0.0.0.0:8084 & streamlit run app.py --server.port 7860
 
 
README.md CHANGED
@@ -24,7 +24,7 @@ Script creates swagger app with endpoints on [localhost:8084](http://127.0.0.1:8
  data_reader.py
  ```
  creates data of various prompts for encoding into vector database, from prompt-picture dataset.
- Local database encoded only 6000 prompts.
+ Local database encoded only 11000 prompts.
  Faiss index that is used is small and not optimized, used for experimental datasets. Search is brute force, not optimised.

  ### Streamlit
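
For context on the README's claim that search is brute force: below is a minimal sketch (not code from this repo) of what a flat, exhaustive FAISS search over sentence-transformer embeddings looks like. The prompts, the IndexFlatL2 choice, and the top-k value are illustrative assumptions; only the model name is taken from prompt_search_engine.py.

```
import faiss
from sentence_transformers import SentenceTransformer

# Illustrative corpus; the real app loads ~11000 prompts from data/prompts_data.jsonl.
prompts = ["a cat in a spacesuit", "a watercolor mountain landscape", "a neon cyberpunk street"]

model = SentenceTransformer("bert-base-nli-mean-tokens")  # model name from prompt_search_engine.py
embeddings = model.encode(prompts).astype("float32")

# A flat index performs exact, brute-force comparison of the query against every stored vector.
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 metric is an assumption here
index.add(embeddings)

query = model.encode(["an astronaut cat"]).astype("float32")
distances, indices = index.search(query, 2)  # exhaustive search, no approximation
for rank, (i, d) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f"{rank}. {prompts[i]} (distance {d:.3f})")
```

Because a flat index compares the query against every vector, it stays exact but scales linearly with the number of prompts, which is why the README flags it as suitable only for experimental datasets.
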
app.py CHANGED
@@ -1,15 +1,11 @@
  import streamlit as st
- from models.vectorizer import Vectorizer
  from models.prompt_search_engine import PromptSearchEngine
  from models.data_reader import load_prompts_from_jsonl
- from models.Query import Query, SimilarPrompt, SearchResponse, PromptVector, VectorResponse
- from sentence_transformers import SentenceTransformer
- import os

  # Cache the prompts data to avoid reloading every time
  @st.cache_data
  def load_prompts():
-     prompt_path = "models/prompts_data.jsonl"
+     prompt_path = "data/prompts_data.jsonl"
      return load_prompts_from_jsonl(prompt_path)

  # Cache the search engine initialization
@@ -36,12 +32,15 @@ k = st.number_input("Number of similar prompts to retrieve:", min_value=1, max_v
  # Button to trigger search
  if st.button("Search Prompts"):
      if query_input:
+         print(f'Search engine is searching the most similar prompts for query {query_input}')
          similar_prompts, distances = search_engine.most_similar(query_input, top_k=k)
+         print(f'Those are: {similar_prompts}, {distances}')

          # Format and display search results
          st.write(f"Search Results: ")
          for i, (prompt, distance) in enumerate(zip(similar_prompts, distances)):
              st.write(f"{i+1}. Prompt: {prompt}, Distance: {distance}")
+             print(f'Those are: {prompt}, {distance}')
      else:
          st.error("Please enter a prompt.")

{models → data}/prompts_data.jsonl RENAMED
The diff for this file is too large to render. See raw diff
 
fast_api.py CHANGED
@@ -5,7 +5,6 @@ import uvicorn
  import socket
  import logging
  import datetime
- from models.vectorizer import Vectorizer
  from models.prompt_search_engine import PromptSearchEngine
  from models.data_reader import load_prompts_from_jsonl
  from models.Query import Query, Query_Multiple, SearchResponse, SimilarPrompt, PromptVector, VectorResponse
@@ -15,7 +14,7 @@ from sentence_transformers import SentenceTransformer



- prompt_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\models\prompts_data.jsonl"
+ prompt_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\data\prompts_data.jsonl"


  app = FastAPI(title="Search Prompt Engine", description="API for prompt search", version="1.0")
models/__pycache__/data_reader.cpython-312.pyc CHANGED
Binary files a/models/__pycache__/data_reader.cpython-312.pyc and b/models/__pycache__/data_reader.cpython-312.pyc differ
 
models/__pycache__/prompt_search_engine.cpython-312.pyc CHANGED
Binary files a/models/__pycache__/prompt_search_engine.cpython-312.pyc and b/models/__pycache__/prompt_search_engine.cpython-312.pyc differ
 
models/data_reader.py CHANGED
@@ -1,5 +1,6 @@
  from datasets import load_dataset
  import json
+ from tqdm import tqdm


  # Load the dataset
@@ -10,21 +11,28 @@ num_shards = 46 # Number of webdataset tar files

  def download_data(base_url, num_shards):
      # Download the data
+     print("Downloading data...")
      urls = [base_url.format(i=i) for i in range(num_shards)]
      dataset = load_dataset("webdataset", data_files={"train": urls}, split="train", streaming=True)
      return dataset

- def extract_prompts(dataset, json_file_path):
+
+
+ def extract_prompts(dataset, jsonl_file_path):
      # Write data to the jsonl file
      prompts = {}
+     print('Extracting data to:', jsonl_file_path)
+
      with open(jsonl_file_path, 'w') as f:
-         for index, row in enumerate(dataset):
-             prompts[index] = row['json']['prompt']
-             f.write(json.dumps(prompts[index]) + '\n')
+         with tqdm(desc="Processing prompts", unit=" prompt") as pbar:
+             for index, row in enumerate(dataset):
+                 prompts[index] = row['json']['prompt']
+                 f.write(json.dumps(prompts[index]) + '\n')
+
+                 pbar.update(1)


  def read_data(jsonl_file_path):
-
      # Read data from the jsonl file
      with open(jsonl_file_path, 'r') as f:
          for line in f:
@@ -36,15 +44,15 @@ def load_prompts_from_jsonl(file_path):
      prompts = []
      with open(file_path, 'r') as f:
          for line in f:
-             data = json.loads(line) # Each line is a JSON object
-             prompts.append(data) # Extract the 'prompt' field
+             data = json.loads(line)
+             prompts.append(data)
      print("Data loaded successfully.")
      return prompts


  if __name__ == "__main__":
-     jsonl_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\models\prompts_data.jsonl"
+     jsonl_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\data\prompts_data_new.jsonl"
      num_shards = 1
-     dataset = download_data(num_shards, base_url)
+     dataset = download_data(base_url, num_shards)
      extract_prompts(dataset, jsonl_file_path)
      read_data(jsonl_file_path)
models/prompt_search_engine.py CHANGED
@@ -1,10 +1,10 @@
  from typing import Sequence, List, Tuple
- from models.vectorizer import Vectorizer
  import numpy as np
  from sentence_transformers import SentenceTransformer
  import faiss

  class PromptSearchEngine:
+     '''Instantiate the language model and index for searching the most similar prompts. Performs the semantic search.'''
      def __init__(self, model_name='bert-base-nli-mean-tokens'):
          print("Search engine started!")
          self.model = SentenceTransformer(model_name)
@@ -27,7 +27,7 @@ class PromptSearchEngine:
          print('Finding the most similar vectors')
          query_embedding = self.model.encode([query]).astype('float32')

-         # Optimized search, but we have to change the index type
+         # Optimized search, but we have to change the index type for real production use
          distances, indices = self.index.search(query_embedding, top_k)

          # Retrieve the corresponding prompts for the found indices
@@ -47,12 +47,8 @@ class PromptSearchEngine:

          # Get all vectors from FAISS
          index_vectors = index.reconstruct_n(0, index.ntotal)  # Reconstruct all vectors in the index
-
-
          index_norms = np.linalg.norm(index_vectors, axis=1, keepdims=True)
          normalized_index_vectors = index_vectors / index_norms
-
-
          cosine_similarities = np.dot(normalized_index_vectors, query_norm.T)

          return cosine_similarities
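
The comment in most_similar (translated above) notes that the index type would have to change for real production use. As a hedged illustration of that kind of change, not something present in this commit, the sketch below swaps the flat index for an IVF index; the cluster count, nprobe value, and random stand-in vectors are placeholder assumptions.

```
import faiss
import numpy as np

d = 768                     # embedding size of bert-base-nli-mean-tokens
nlist = 128                 # number of coarse clusters (placeholder, tune per dataset)
embeddings = np.random.rand(11000, d).astype("float32")  # stand-in for the encoded prompts

# IVF partitions vectors into nlist clusters and scans only a few of them per query,
# trading a little recall for much faster search than the flat, brute-force index.
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)
index.train(embeddings)     # unlike IndexFlat*, IVF indexes need a training pass
index.add(embeddings)

index.nprobe = 8            # clusters visited per query: the speed/recall knob
distances, indices = index.search(embeddings[:1], 5)
print(indices, distances)
```
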
models/vectorizer.py DELETED
@@ -1,33 +0,0 @@
-
- from sentence_transformers import SentenceTransformer
- import numpy as np
- from typing import Sequence
- import faiss
-
-
-
-
-
- class Vectorizer:
-     def __init__(self, model) -> None:
-         """Initialize the vectorizer with a pre-trained embedding model.
-         Args: model: The pre-trained embedding model to use for transforming prompts.
-         """
-         self.model = model
-         self.index_size = 50000
-         self.index = faiss.IndexFlatIP(self.index_size)
-         self.cached_index_idx_to_retrieval_db_idx = []
-
-
-     def transform_and_add_to_index(self, prompts: Sequence[str]) -> np.ndarray:
-         """Transform texts into numerical vectors using the specified model.
-         Args: prompts: The sequence of raw corpus prompts. Returns: Vectorized prompts
-         """
-         embeddings = self.model.encode(prompts)
-         embedding_dimension = embeddings.shape[1]
-         print('Embedding dimension:', embedding_dimension)
-
-         self.index.add(np.array(embeddings))
-
-
-