dinhquangson committed
Commit 3f50d8b · verified · 1 Parent(s): 4ed5309

Update app.py: replace the sentence-transformers + qdrant-client search stack with a Haystack hybrid (dense + sparse) pipeline over Qdrant

Files changed (1):
  1. app.py +58 -191
app.py CHANGED
@@ -1,25 +1,13 @@
 from fastapi import FastAPI, UploadFile, File
 from fastapi.responses import FileResponse
-
+from datasets import load_dataset
 from fastapi.middleware.cors import CORSMiddleware
 # Loading
 import os
 import shutil
 from os import makedirs,getcwd
 from os.path import join,exists,dirname
-from datasets import load_dataset
 import torch
-from tqdm import tqdm
-from sentence_transformers import SentenceTransformer
-import uuid
-from qdrant_client import models, QdrantClient
-from itertools import islice
-from tqdm import tqdm
-
-# The file where NeuralSearcher is stored
-from neural_searcher import NeuralSearcher
-# The file where HybridSearcher is stored
-from hybrid_searcher import HybridSearcher

 app = FastAPI()

@@ -31,7 +19,6 @@ app.add_middleware(
     allow_headers=["*"],
 )

-FILEPATH_PATTERN = "structured_data_doc.parquet"
 NUM_PROC = os.cpu_count()
 parent_path = dirname(getcwd())

@@ -42,171 +29,71 @@ if not exists(temp_path ):
 # Determine device based on GPU availability
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
-# Load the desired model
-model = SentenceTransformer(
-    'sentence-transformers/all-MiniLM-L6-v2',
-    device=device
-)
-
-# Create function to upsert embeddings in batches
-def batched(iterable, n):
-    iterator = iter(iterable)
-    while batch := list(islice(iterator, n)):
-        yield batch
-
-batch_size = 100
-# Create an in-memory Qdrant instance
-client2 = QdrantClient(path="database")
-
-# Create a Qdrant collection for the embeddings
-client2.create_collection(
-    collection_name="law",
-    vectors_config=models.VectorParams(
-        size=model.get_sentence_embedding_dimension(),
-        distance=models.Distance.COSINE,
-    ),
-)

-# Create function to generate embeddings (in batches) for a given dataset split
-def generate_embeddings(dataset, text_field, batch_size=32):
-    embeddings = []
+import logging

-    with tqdm(total=len(dataset), desc=f"Generating embeddings for dataset") as pbar:
-        for i in range(0, len(dataset), batch_size):
-            print(dataset)
-            batch_sentences = dataset[text_field][i:i+batch_size]
-            batch_embeddings = model.encode(batch_sentences)
-            embeddings.extend(batch_embeddings)
-            pbar.update(len(batch_sentences))
+logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
+logging.getLogger("haystack").setLevel(logging.INFO)

-    return embeddings
-
 @app.post("/uploadfile/")
 async def create_upload_file(text_field: str, file: UploadFile = File(...)):
+    # Imports
     import time
-
+    from haystack import Document, Pipeline
+    from haystack.components.writers import DocumentWriter
+    from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
+    from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
+    from haystack.document_stores.types import DuplicatePolicy
+    from haystack_integrations.components.embedders.fastembed import (
+        FastembedTextEmbedder,
+        FastembedDocumentEmbedder,
+        FastembedSparseTextEmbedder,
+        FastembedSparseDocumentEmbedder
+    )
+
     start_time = time.time()

     file_savePath = join(temp_path,file.filename)

     with open(file_savePath,'wb') as f:
         shutil.copyfileobj(file.file, f)
-    # Here you can save the file and do other operations as needed
-    if '.json' in file_savePath:
-        full_dataset = load_dataset('json',
-                                    data_files=file_savePath,
-                                    split="train",
-                                    cache_dir=temp_path,
-                                    keep_in_memory=True,
-                                    num_proc=NUM_PROC*2)
-    elif '.parquet' in file_savePath:
-        full_dataset = load_dataset("parquet",
-                                    data_files=file_savePath,
-                                    split="train",
-                                    cache_dir=temp_path,
-                                    keep_in_memory=True,
-                                    num_proc=NUM_PROC*2)
-    else:
-        raise NotImplementedError("This feature is not supported yet")
-    # Generate and append embeddings to the train split
-    law_embeddings = generate_embeddings(full_dataset, text_field)
-    full_dataset = full_dataset.add_column("embeddings", law_embeddings)
-
-    if not 'uuid' in full_dataset.column_names:
-        full_dataset = full_dataset.add_column('uuid', [str(uuid.uuid4()) for _ in range(len(full_dataset))])
-    # Upsert the embeddings in batches
-    for batch in batched(full_dataset, batch_size):
-        ids = [point.pop("uuid") for point in batch]
-        vectors = [point.pop("embeddings") for point in batch]

-        client2.upsert(
-            collection_name=collection_name,
-            points=models.Batch(
-                ids=ids,
-                vectors=vectors,
-                payloads=batch,
-            ),
-        )
-
-
-    end_time = time.time()
-
-    elapsed_time = end_time - start_time
-
-    return {"filename": file.filename, "message": "Done", "execution_time": elapsed_time}
-
-@app.post("/uploadfile4hypersearch/")
-async def upload_file_4_hyper_search(collection_name: str, text_field: str, file: UploadFile = File(...)):
-    import time
-
-    start_time = time.time()
-
-    file_savePath = join(temp_path,file.filename)
-    client2.set_model("sentence-transformers/all-MiniLM-L6-v2")
-
-    # comment this line to use dense vectors only
-    client2.set_sparse_model("prithivida/Splade_PP_en_v1")
-    with open(file_savePath,'wb') as f:
-        shutil.copyfileobj(file.file, f)
-
-    print(f"Uploaded complete!")
-
-    client2.recreate_collection(
-        collection_name=collection_name,
-        vectors_config=client2.get_fastembed_vector_params(),
-
-        # comment this line to use dense vectors only
-        sparse_vectors_config=client2.get_fastembed_sparse_vector_params(),
-    )
-
-    print(f"The collection is created complete!")
+    documents = []

     # Here you can save the file and do other operations as needed
     if '.json' in file_savePath:
-        import json
-        import uuid
-
-        # Define your batch size
-        batch_size = 100
-
-        metadata = []
-        documents = []
-
         with open(file_savePath) as fd:
             for line in fd:
                 obj = json.loads(line)
-                documents.append(obj.pop(text_field))
-                metadata.append(obj)
-
-        # Generate UUIDs for each document
-        document_ids = [str(uuid.uuid4()) for _ in range(len(documents))]
-
-        # Split documents and metadata into batches
-        for i in range(0, len(documents), batch_size):
-            batch_documents = documents[i:i + batch_size]
-            batch_metadata = metadata[i:i + batch_size]
-            batch_ids = document_ids[i:i + batch_size]
-
-            # Upsert the embeddings in batches
-            client2.add(
-                collection_name=collection_name,
-                documents=batch_documents,
-                metadata=batch_metadata,
-                ids=batch_ids,
-            )
-            print(f"The documents and metadata are parsed and upserted in batches with unique UUIDs: {batch_ids}!")
-
-        print(f"The documents and metadata are parsed and upserted in batches of {batch_size} with unique UUIDs!")
-
-        print(f"The documents and metadata is upserted complete!")
+                document = Document(content=obj[text_field], meta=obj)
+                documents.append(document)
+
     else:
         raise NotImplementedError("This feature is not supported yet")

+    # Indexing
+
+    document_store = QdrantDocumentStore(
+        path="database",
+        recreate_index=True,
+        use_sparse_embeddings=True,
+        embedding_dim=384
+    )
+
+    indexing = Pipeline()
+    indexing.add_component("sparse_doc_embedder", FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1"))
+    indexing.add_component("dense_doc_embedder", FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5"))
+    indexing.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))
+    indexing.connect("sparse_doc_embedder", "dense_doc_embedder")
+    indexing.connect("dense_doc_embedder", "writer")
+
+    indexing.run({"sparse_doc_embedder": {"documents": documents}})
     end_time = time.time()

     elapsed_time = end_time - start_time

     return {"filename": file.filename, "message": "Done", "execution_time": elapsed_time}
+

 @app.get("/search")
 def search(prompt: str):
@@ -214,15 +101,25 @@ def search(prompt: str):

     start_time = time.time()

-    # Let's see what senators are saying about immigration policy
-    hits = client2.search(
-        collection_name="law",
-        query_vector=model.encode(prompt).tolist(),
-        limit=5
-    )
+    # Querying
+
+    querying = Pipeline()
+    querying.add_component("sparse_text_embedder", FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1"))
+    querying.add_component("dense_text_embedder", FastembedTextEmbedder(
+        model="BAAI/bge-small-en-v1.5", prefix="Represent this sentence for searching relevant passages: ")
+    )
+    querying.add_component("retriever", QdrantHybridRetriever(document_store=document_store))
+
+    querying.connect("sparse_text_embedder.sparse_embedding", "retriever.query_sparse_embedding")
+    querying.connect("dense_text_embedder.embedding", "retriever.query_embedding")

-    for hit in hits:
-        print(hit.payload, "score:", hit.score)
+    question = "Cosa sono i marker tumorali?"
+
+    results = querying.run(
+        {"dense_text_embedder": {"text": question},
+         "sparse_text_embedder": {"text": question}}
+    )
+

     end_time = time.time()

@@ -230,7 +127,7 @@ def search(prompt: str):

     print(f"Execution time: {elapsed_time:.6f} seconds")

-    return hits
+    return results["retriever"]["documents"]

 @app.get("/download-database/")
 async def download_database():
@@ -254,36 +151,6 @@ async def download_database():

     # Return the zip file as a response for download
     return FileResponse(zip_path, media_type='application/zip', filename='database.zip')
-
-@app.get("/neural_search")
-def neural_search(q: str, city: str, collection_name: str):
-    import time
-
-    start_time = time.time()
-
-    # Create a neural searcher instance
-    neural_searcher = NeuralSearcher(collection_name=collection_name)
-
-    end_time = time.time()
-
-    elapsed_time = end_time - start_time
-
-    return {"result": neural_searcher.search(text=q, city=city), "execution_time": elapsed_time}
-
-@app.get("/hybrid_search")
-def hybrid_search(q: str, city: str, collection_name: str):
-    import time
-
-    start_time = time.time()
-
-    # Create a hybrid searcher instance
-    hybrid_searcher = HybridSearcher(collection_name=collection_name)
-
-    end_time = time.time()
-
-    elapsed_time = end_time - start_time
-
-    return {"result": hybrid_searcher.search(text=q, city=city), "execution_time": elapsed_time}

 @app.get("/")
 def api_home():
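
As committed, the new code has three loose ends worth noting: create_upload_file calls json.loads but the module never imports json; document_store and the Haystack imports are local to create_upload_file yet are referenced again inside search; and search embeds a hardcoded test question ("Cosa sono i marker tumorali?", i.e. "What are tumor markers?") instead of the caller's prompt. Below is a minimal consolidation sketch, not the committed code: the helper names (index_jsonl, run_hybrid_search) are illustrative, while the model names, store settings, and pipeline wiring are taken verbatim from the diff. The FastAPI route bodies would then reduce to calls to these helpers.

import json

from haystack import Document, Pipeline
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever
from haystack_integrations.components.embedders.fastembed import (
    FastembedTextEmbedder,
    FastembedDocumentEmbedder,
    FastembedSparseTextEmbedder,
    FastembedSparseDocumentEmbedder,
)

# One store shared by both endpoints. As in the diff, recreate_index=True
# discards any previously indexed data when the store is created.
document_store = QdrantDocumentStore(
    path="database",
    recreate_index=True,
    use_sparse_embeddings=True,  # required by QdrantHybridRetriever
    embedding_dim=384,           # dimension of BAAI/bge-small-en-v1.5
)

def index_jsonl(path: str, text_field: str) -> None:
    # One JSON object per line; text_field holds the passage text,
    # everything else is kept as document metadata.
    documents = []
    with open(path) as fd:
        for line in fd:
            obj = json.loads(line)
            documents.append(Document(content=obj[text_field], meta=obj))

    indexing = Pipeline()
    indexing.add_component("sparse_doc_embedder",
                           FastembedSparseDocumentEmbedder(model="prithvida/Splade_PP_en_v1"))
    indexing.add_component("dense_doc_embedder",
                           FastembedDocumentEmbedder(model="BAAI/bge-small-en-v1.5"))
    indexing.add_component("writer",
                           DocumentWriter(document_store=document_store,
                                          policy=DuplicatePolicy.OVERWRITE))
    indexing.connect("sparse_doc_embedder", "dense_doc_embedder")
    indexing.connect("dense_doc_embedder", "writer")
    indexing.run({"sparse_doc_embedder": {"documents": documents}})

def run_hybrid_search(prompt: str):
    # Embed the caller's prompt both densely and sparsely, then let the
    # hybrid retriever fuse the two result lists.
    querying = Pipeline()
    querying.add_component("sparse_text_embedder",
                           FastembedSparseTextEmbedder(model="prithvida/Splade_PP_en_v1"))
    querying.add_component("dense_text_embedder",
                           FastembedTextEmbedder(
                               model="BAAI/bge-small-en-v1.5",
                               prefix="Represent this sentence for searching relevant passages: "))
    querying.add_component("retriever", QdrantHybridRetriever(document_store=document_store))
    querying.connect("sparse_text_embedder.sparse_embedding", "retriever.query_sparse_embedding")
    querying.connect("dense_text_embedder.embedding", "retriever.query_embedding")

    results = querying.run({
        "dense_text_embedder": {"text": prompt},
        "sparse_text_embedder": {"text": prompt},
    })
    return results["retriever"]["documents"]

Building the query pipeline once at module load rather than per request would additionally avoid re-instantiating both embedders on every call to /search.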
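
For a quick end-to-end check, the endpoints can be exercised with the requests library. The server address, the laws.jsonl file name, and the content field below are placeholders for illustration; the example question is the English rendering of the hardcoded test string above.

import requests

# Index a JSONL file where each line carries the passage text in "content".
with open("laws.jsonl", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/uploadfile/",
        params={"text_field": "content"},
        files={"file": ("laws.jsonl", f)},
    )
print(resp.json())  # {"filename": "laws.jsonl", "message": "Done", "execution_time": ...}

# Query the hybrid index.
resp = requests.get(
    "http://localhost:8000/search",
    params={"prompt": "What are tumor markers?"},
)
print(resp.json())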