Vitomir Jovanović committed on
Commit
e620120
1 Parent(s): 43e9781

Add all vector similarity feature

Browse files
Procfile.yaml CHANGED
@@ -1 +1 @@
1
- web: gunicorn -w 1 -k uvicorn.workers.UvicornWorker main.py:app --bind 0.0.0.0:8000 & streamlit run app.py --server.port 7860
 
1
+ web: gunicorn -w 1 -k uvicorn.workers.UvicornWorker main:app --bind 0.0.0.0:8000 & streamlit run app.py --server.port 7860
main.py CHANGED
@@ -8,14 +8,14 @@ import datetime
8
  from models.vectorizer import Vectorizer
9
  from models.prompt_search_engine import PromptSearchEngine
10
  from models.data_reader import load_prompts_from_jsonl
11
- from models.Query import Query, Query_Multiple, SearchResponse, SimilarPrompt
12
  from decouple import config
13
  from fastapi import FastAPI, HTTPException, Depends, Body
14
  from sentence_transformers import SentenceTransformer
15
 
16
 
17
 
18
- prompt_path = r"C:\Users\jov2bg\Desktop\PromptSearch\models\prompts_data.jsonl"
19
 
20
 
21
  app = FastAPI(title="Search Prompt Engine", description="API for prompt search", version="1.0")
@@ -46,16 +46,17 @@ async def search_prompts(query: Query, k: int = 3):
46
  @app.post("/all_vectors_similarities/")
47
  async def all_vectors(query: Query):
48
 
49
- all_similarities = search_engine.cosine_similarity(query.prompt, search_engine.index)
 
50
  response = [
51
- SimilarPrompt(prompt=prompt, distance=float(distance))
52
- for prompt, distance in all_similarities.items()
53
  ]
54
- return SearchResponse(results=response)
55
 
56
  if __name__ == "__main__":
57
  # Server Config
58
- Search_SERVER_HOST_IP = socket.gethostbyname(socket.gethostname())
59
- # Search_SERVER_HOST_IP = socket.gethostbyname("localhost") # for local deployment
60
- Search_SERVER_PORT = int(8084)
61
- uvicorn.run(app, host=Search_SERVER_HOST_IP, port=Search_SERVER_PORT)
 
8
  from models.vectorizer import Vectorizer
9
  from models.prompt_search_engine import PromptSearchEngine
10
  from models.data_reader import load_prompts_from_jsonl
11
+ from models.Query import Query, Query_Multiple, SearchResponse, SimilarPrompt, PromptVector, VectorResponse
12
  from decouple import config
13
  from fastapi import FastAPI, HTTPException, Depends, Body
14
  from sentence_transformers import SentenceTransformer
15
 
16
 
17
 
18
+ prompt_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\models\prompts_data.jsonl"
19
 
20
 
21
  app = FastAPI(title="Search Prompt Engine", description="API for prompt search", version="1.0")
 
46
  @app.post("/all_vectors_similarities/")
47
  async def all_vectors(query: Query):
48
 
49
+ query_embedding = search_engine.model.encode([query.prompt]) # Encode the prompt to a vector
50
+ all_similarities = search_engine.cosine_similarity(query_embedding, search_engine.index)
51
  response = [
52
+ PromptVector(vector=index, distance=float(distance))
53
+ for index, distance in enumerate(all_similarities)
54
  ]
55
+ return VectorResponse(results=response)
56
 
57
  if __name__ == "__main__":
58
  # Server Config
59
+ # Search_SERVER_HOST_IP = socket.gethostbyname(socket.gethostname())
60
+ SERVER_HOST_IP = socket.gethostbyname("localhost") # for local deployment
61
+ SERVER_PORT = int(8084)
62
+ uvicorn.run(app, host=SERVER_HOST_IP, port=SERVER_PORT)
models/Query.py CHANGED
@@ -17,4 +17,11 @@ class SimilarPrompt(BaseModel):
17
  distance: float
18
 
19
  class SearchResponse(BaseModel):
20
- results: List[SimilarPrompt]
 
 
 
 
 
 
 
 
17
  distance: float
18
 
19
  class SearchResponse(BaseModel):
20
+ results: List[SimilarPrompt]
21
+
22
+ class PromptVector(BaseModel):
23
+ vector: int
24
+ distance: float
25
+
26
+ class VectorResponse(BaseModel):
27
+ results: List[PromptVector]
models/__pycache__/Query.cpython-312.pyc CHANGED
Binary files a/models/__pycache__/Query.cpython-312.pyc and b/models/__pycache__/Query.cpython-312.pyc differ
 
models/__pycache__/data_reader.cpython-312.pyc CHANGED
Binary files a/models/__pycache__/data_reader.cpython-312.pyc and b/models/__pycache__/data_reader.cpython-312.pyc differ
 
models/__pycache__/prompt_search_engine.cpython-312.pyc CHANGED
Binary files a/models/__pycache__/prompt_search_engine.cpython-312.pyc and b/models/__pycache__/prompt_search_engine.cpython-312.pyc differ
 
models/data_reader.py CHANGED
@@ -41,7 +41,7 @@ def load_prompts_from_jsonl(file_path):
41
 
42
 
43
  if __name__ == "__main__":
44
- jsonl_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\models\prompts_data.jsonl"
45
  num_shards = 1
46
  dataset = download_data(num_shards, base_url)
47
  extract_prompts(dataset, jsonl_file_path)
 
41
 
42
 
43
  if __name__ == "__main__":
44
+ jsonl_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\models\prompts_data.jsonl"
45
  num_shards = 1
46
  dataset = download_data(num_shards, base_url)
47
  extract_prompts(dataset, jsonl_file_path)
models/prompt_search_engine.py CHANGED
@@ -32,17 +32,27 @@ class PromptSearchEngine:
32
  return similar_prompts, distances[0] # Return both the similar prompts and their distances
33
 
34
 
35
- def cosine_similarity(query_vector: np.ndarray, corpus_vectors: np.ndarray) -> np.ndarray:
36
  """Compute the cosine similarity between a query vector and a set of corpus vectors.
37
- Args: query_vector: The query vector to compare against the corpus vectors. corpus_vectors: The set of corpus vectors to compare against the query vector.
38
- Returns: The cosine similarity between the query vector and the corpus vectors.
39
- """
40
- similarities = {}
41
- for index, vector in enumerate(corpus_vectors):
42
- if np.linalg.norm(vector) == 0:
43
- raise ValueError("One of the corpus vectors has zero norm.")
44
- cos_similarity = np.dot(vector, query_vector) / (np.linalg.norm(vector) * np.linalg.norm(query_vector))
45
- similarities[index] = cos_similarity
46
- return similarities
 
 
 
 
 
 
 
 
 
 
47
 
48
 
 
32
  return similar_prompts, distances[0] # Return both the similar prompts and their distances
33
 
34
 
35
+ def cosine_similarity(self, query_vector, index):
36
  """Compute the cosine similarity between a query vector and a set of corpus vectors.
37
+ Args: query_vector: The query vector to compare against the indexed vectors. index: The FAISS index whose stored vectors are compared against the query vector.
38
+ Returns: The cosine similarity between the query vector and the corpus vectors.
39
+ """
40
+
41
+ query_vector = np.array(query_vector).astype('float32')
42
+ query_norm = query_vector / np.linalg.norm(query_vector)
43
+
44
+ # Get all vectors from FAISS
45
+ index_vectors = index.reconstruct_n(0, index.ntotal) # Reconstruct all vectors in the index
46
+
47
+
48
+ index_norms = np.linalg.norm(index_vectors, axis=1, keepdims=True)
49
+ normalized_index_vectors = index_vectors / index_norms
50
+
51
+
52
+ cosine_similarities = np.dot(normalized_index_vectors, query_norm.T)
53
+
54
+ return cosine_similarities
55
+
56
+
57
 
58