Spaces:

nazib61
/

qdarnt

Running

App Files Files Community

nazib61 committed on Oct 6

Commit

2d181ba

verified ·

1 Parent(s): 0ae57e5

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -39

app.py CHANGED Viewed

@@ -2,54 +2,53 @@ import gradio as gr
 from datasets import load_dataset
 from qdrant_client import QdrantClient, models
 from sentence_transformers import SentenceTransformer
 # --- Configuration ---
-QDRANT_HOST = "localhost"  # Or your Hugging Face Space Qdrant URL
-QDRANT_PORT = 6333
 COLLECTION_NAME = "my_text_collection"
 MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
-# --- Load Dataset and Model ---
-# Using a simple dataset from Hugging Face
-dataset = load_dataset("ag_news", split="test")
-data = [item['text'] for item in dataset]
-# Limiting the dataset for a quicker demo
-data = data[:1000]
-# Load a pre-trained sentence transformer model
-model = SentenceTransformer(MODEL_NAME)
 # --- Qdrant Client and Collection Setup ---
-# Initialize Qdrant client
-# In a Hugging Face Space, you might use a local in-memory instance or connect to a running Qdrant container.
-qdrant_client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
-# Create a Qdrant collection if it doesn't exist
 try:
-    qdrant_client.get_collection(collection_name=COLLECTION_NAME)
     print("Collection already exists.")
 except Exception as e:
-    print("Creating collection...")
-    qdrant_client.recreate_collection(
         collection_name=COLLECTION_NAME,
         vectors_config=models.VectorParams(size=model.get_sentence_embedding_dimension(), distance=models.Distance.COSINE),
     )
     # --- Generate and Index Embeddings ---
     print("Generating and indexing embeddings...")
-    batch_size = 128
-    for i in range(0, len(data), batch_size):
-        batch_texts = data[i:i+batch_size]
-        embeddings = model.encode(batch_texts, convert_to_tensor=True)
-        qdrant_client.upsert(
-            collection_name=COLLECTION_NAME,
-            points=models.Batch(
-                ids=list(range(i, i + len(batch_texts))),
-                vectors=[embedding.tolist() for embedding in embeddings],
-                payloads=[{"text": text} for text in batch_texts]
-            )
-        )
     print("Embeddings indexed successfully.")
@@ -61,18 +60,22 @@ def search_in_qdrant(query):
     if not query:
         return "Please enter a search query."
-    query_embedding = model.encode(query).tolist()
-    search_result = qdrant_client.search(
         collection_name=COLLECTION_NAME,
-        query_vector=query_embedding,
-        limit=5  # Return the top 5 most similar results
     )
     results_text = ""
-    for hit in search_result:
         results_text += f"**Score:** {hit.score:.4f}\n"
-        results_text += f"**Text:** {hit.payload['text']}\n\n"
     return results_text
@@ -82,7 +85,7 @@ with gr.Blocks() as demo:
     gr.Markdown("Enter a query to search for similar news articles from the AG News dataset.")
     with gr.Row():
-        search_input = gr.Textbox(label="Search Query")
     search_button = gr.Button("Search")
     search_output = gr.Markdown()

 from datasets import load_dataset
 from qdrant_client import QdrantClient, models
 from sentence_transformers import SentenceTransformer
+import torch # Ensure torch is imported
 # --- Configuration ---
+# Use ":memory:" for a temporary, in-memory database.
+# Or use a path like "./qdrant_db" to save the data to disk.
+# Using a path is better for Spaces: the index persists on disk, so it is rebuilt only when the collection cannot be found.
+QDRANT_PATH = "./qdrant_db"
 COLLECTION_NAME = "my_text_collection"
 MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
+# --- Load Model ---
+# Specify that the model should run on the CPU, which is standard for HF Spaces
+device = "cpu"
+model = SentenceTransformer(MODEL_NAME, device=device)
 # --- Qdrant Client and Collection Setup ---
+# Initialize the Qdrant client with local, on-disk storage
+# This avoids the need to run a separate Qdrant server
+qdrant_client = QdrantClient(path=QDRANT_PATH)
+# Check if the collection already exists
 try:
+    collection_info = qdrant_client.get_collection(collection_name=COLLECTION_NAME)
     print("Collection already exists.")
 except Exception as e:
+    print("Collection not found, creating a new one...")
+    # --- Load Dataset ---
+    # We only load the dataset and create embeddings if the collection doesn't exist
+    dataset = load_dataset("ag_news", split="test")
+    # Limiting the dataset for a quicker demo setup
+    data = [item['text'] for item in dataset][:1000]
+    # Create the collection
+    qdrant_client.create_collection(
         collection_name=COLLECTION_NAME,
         vectors_config=models.VectorParams(size=model.get_sentence_embedding_dimension(), distance=models.Distance.COSINE),
     )
     # --- Generate and Index Embeddings ---
     print("Generating and indexing embeddings...")
+    # This can take a moment on the first run
+    qdrant_client.add(
+        collection_name=COLLECTION_NAME,
+        documents=data,
+        ids=list(range(len(data))), # Simple sequential IDs
+        embedding_model=model
+    )
     print("Embeddings indexed successfully.")
     if not query:
         return "Please enter a search query."
+    # Pass the query text and the model straight to the client (NOTE(review): confirm that `search` accepts `query_text` / `embedding_model`)
+    hits = qdrant_client.search(
         collection_name=COLLECTION_NAME,
+        query_text=query,
+        query_filter=None, # No filters for now
+        limit=5, # Return the top 5 most similar results
+        embedding_model=model
     )
     results_text = ""
+    if not hits:
+        return "No results found."
+    for hit in hits:
         results_text += f"**Score:** {hit.score:.4f}\n"
+        results_text += f"**Text:** {hit.payload['document']}\n\n" # Payload key is 'document' when using .add()
     return results_text
     gr.Markdown("Enter a query to search for similar news articles from the AG News dataset.")
     with gr.Row():
+        search_input = gr.Textbox(label="Search Query", placeholder="e.g., 'Latest news on space exploration'")
     search_button = gr.Button("Search")
     search_output = gr.Markdown()