Spaces:

serverdaun
/

rag-w-binary-quant

Sleeping

App Files Community

serverdaun committed on Aug 7

Commit

b3de77b

1 Parent(s): 4ed0f41

Refactor imports and improve code formatting across multiple files for better readability and organization.

Browse files

Files changed (6) hide show

app.py +12 -13
src/config.py +1 -1
src/data_loader.py +5 -1
src/embedding_generator.py +6 -3
src/rag_pipeline.py +6 -3
src/vector_store.py +14 -8

app.py CHANGED Viewed

@@ -1,27 +1,24 @@
-import gradio as gr
-import os
 import atexit
 import glob
 import shutil
-from src.config import (
-    DOCS_DIR,
-    COLLECTION_NAME,
-    EMBEDDING_MODEL_NAME,
-    MILVUS_DB_PATH,
-)
 from src.data_loader import load_data
 from src.embedding_generator import (
     generate_document_embeddings,
     generate_query_embeddings,
 )
 from src.vector_store import (
-    get_milvus_client,
     create_collection_if_not_exists,
     insert_data,
     search,
 )
-from src.rag_pipeline import answer_question
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 # Initialize models and clients
 embedding_model = HuggingFaceEmbedding(
@@ -32,16 +29,18 @@ embedding_model = HuggingFaceEmbedding(
 milvus_client = get_milvus_client(MILVUS_DB_PATH)
 # --- Cleanup Function ---
 def cleanup_documents():
     """Remove all files from the documents directory."""
     print("Cleaning up uploaded documents...")
-    files = glob.glob(os.path.join(DOCS_DIR, '*'))
     for f in files:
         if os.path.isfile(f):
             os.remove(f)
     print("Cleanup complete.")
 # Register the cleanup function to run on exit
 atexit.register(cleanup_documents)
@@ -114,4 +113,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     # Ensure the documents directory exists from the start
     os.makedirs(DOCS_DIR, exist_ok=True)
-    demo.launch()

 import atexit
 import glob
+import os
 import shutil
+import gradio as gr
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from src.config import COLLECTION_NAME, DOCS_DIR, EMBEDDING_MODEL_NAME, MILVUS_DB_PATH
 from src.data_loader import load_data
 from src.embedding_generator import (
     generate_document_embeddings,
     generate_query_embeddings,
 )
+from src.rag_pipeline import answer_question
 from src.vector_store import (
     create_collection_if_not_exists,
+    get_milvus_client,
     insert_data,
     search,
 )
 # Initialize models and clients
 embedding_model = HuggingFaceEmbedding(
 milvus_client = get_milvus_client(MILVUS_DB_PATH)
 # --- Cleanup Function ---
 def cleanup_documents():
     """Remove all files from the documents directory."""
     print("Cleaning up uploaded documents...")
+    files = glob.glob(os.path.join(DOCS_DIR, "*"))
     for f in files:
         if os.path.isfile(f):
             os.remove(f)
     print("Cleanup complete.")
 # Register the cleanup function to run on exit
 atexit.register(cleanup_documents)
 if __name__ == "__main__":
     # Ensure the documents directory exists from the start
     os.makedirs(DOCS_DIR, exist_ok=True)
+    demo.launch()

src/config.py CHANGED Viewed

@@ -20,4 +20,4 @@ If the context information is not relevant to the user's query, say "I don't kno
 {query}
 # Answer
-"""

 {query}
 # Answer
+"""

src/data_loader.py CHANGED Viewed

@@ -12,7 +12,11 @@ def load_data(data_dir: str) -> list:
         A list of documents
     """
     try:
-        loader = SimpleDirectoryReader(input_dir=data_dir, required_exts=[".pdf", ".txt", ".md", ".docx", ".doc"], recursive=True)
         docs = loader.load_data()
         return docs
     except Exception as e:

         A list of documents
     """
     try:
+        loader = SimpleDirectoryReader(
+            input_dir=data_dir,
+            required_exts=[".pdf", ".txt", ".md", ".docx", ".doc"],
+            recursive=True,
+        )
         docs = loader.load_data()
         return docs
     except Exception as e:

src/embedding_generator.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from typing import Any, Generator
 import numpy as np
@@ -15,10 +16,12 @@ def batch_iterate(items: Any, batch_size: int) -> Generator[Any, None, None]:
         A generator of batches
     """
     for i in range(0, len(items), batch_size):
-        yield items[i:i + batch_size]
-def generate_document_embeddings(documents: list[str], embedding_model: Any) -> list[bytes]:
     """
     Generate document embeddings.
@@ -49,6 +52,7 @@ def generate_document_embeddings(documents: list[str], embedding_model: Any) ->
         print(f"Error generating document embeddings: {e}")
         return []
 def generate_query_embeddings(query: str, embdding_model: Any) -> bytes:
     """
     Generate query embeddings.
@@ -73,4 +77,3 @@ def generate_query_embeddings(query: str, embdding_model: Any) -> bytes:
     except Exception as e:
         print(f"Error generating query embeddings: {e}")
         return None

 from typing import Any, Generator
 import numpy as np
         A generator of batches
     """
     for i in range(0, len(items), batch_size):
+        yield items[i : i + batch_size]
+def generate_document_embeddings(
+    documents: list[str], embedding_model: Any
+) -> list[bytes]:
     """
     Generate document embeddings.
         print(f"Error generating document embeddings: {e}")
         return []
 def generate_query_embeddings(query: str, embdding_model: Any) -> bytes:
     """
     Generate query embeddings.
     except Exception as e:
         print(f"Error generating query embeddings: {e}")
         return None

src/rag_pipeline.py CHANGED Viewed

@@ -1,9 +1,12 @@
-from langchain_core.messages import HumanMessage
 from langchain.chat_models import init_chat_model
-from .config import PROMPT, MODEL_NAME, TEMPERATURE, MODEL_PROVIDER
-llm = init_chat_model(MODEL_NAME, model_provider=MODEL_PROVIDER, temperature=TEMPERATURE)
 def answer_question(query: str, contexts: list[str]) -> str:
     """

 from langchain.chat_models import init_chat_model
+from langchain_core.messages import HumanMessage
+from .config import MODEL_NAME, MODEL_PROVIDER, PROMPT, TEMPERATURE
+llm = init_chat_model(
+    MODEL_NAME, model_provider=MODEL_PROVIDER, temperature=TEMPERATURE
+)
 def answer_question(query: str, contexts: list[str]) -> str:
     """

src/vector_store.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from pymilvus import MilvusClient, DataType
 def get_milvus_client(db_path: str) -> MilvusClient:
@@ -14,12 +14,15 @@ def get_milvus_client(db_path: str) -> MilvusClient:
     try:
         client = MilvusClient(db_path)
         return client
     except Exception as e:
         print(f"Error getting Milvus client: {e}")
         return None
-def create_collection_if_not_exists(client: MilvusClient, collection_name: str, dim: int) -> None:
     """
     Create a collection in Milvus if it does not exist.
@@ -63,8 +66,8 @@ def create_collection_if_not_exists(client: MilvusClient, collection_name: str,
             index_params.add_index(
                 field_name="binary_vector",
                 index_name="binary_vector_index",
-                index_type="BIN_FLAT", # Exact search for binary vectors
-                metric_type="HAMMING", # Hamming distance for binary vectors
             )
             # Create collection with schema and index
             client.create_collection(
@@ -77,6 +80,7 @@ def create_collection_if_not_exists(client: MilvusClient, collection_name: str,
         print(f"Error creating collection: {e}")
         return None
 def insert_data(client: MilvusClient, collection_name: str, data: list[dict]):
     """
     Insert data into a collection in Milvus.
@@ -95,7 +99,9 @@ def insert_data(client: MilvusClient, collection_name: str, data: list[dict]):
         print(f"Error inserting data: {e}")
-def search(client: MilvusClient, collection_name: str, binary_query: bytes, limit: int = 5):
     """
     Search for data in a collection in Milvus.
     """
@@ -115,10 +121,10 @@ def search(client: MilvusClient, collection_name: str, binary_query: bytes, limi
         if not results:
             print("No search results found")
             return []
         contexts = [res.entity.context for res in results[0]]
         return contexts
     except Exception as e:
         print(f"Error searching for data: {e}")
-        return []

+from pymilvus import DataType, MilvusClient
 def get_milvus_client(db_path: str) -> MilvusClient:
     try:
         client = MilvusClient(db_path)
         return client
     except Exception as e:
         print(f"Error getting Milvus client: {e}")
         return None
+def create_collection_if_not_exists(
+    client: MilvusClient, collection_name: str, dim: int
+) -> None:
     """
     Create a collection in Milvus if it does not exist.
             index_params.add_index(
                 field_name="binary_vector",
                 index_name="binary_vector_index",
+                index_type="BIN_FLAT",  # Exact search for binary vectors
+                metric_type="HAMMING",  # Hamming distance for binary vectors
             )
             # Create collection with schema and index
             client.create_collection(
         print(f"Error creating collection: {e}")
         return None
 def insert_data(client: MilvusClient, collection_name: str, data: list[dict]):
     """
     Insert data into a collection in Milvus.
         print(f"Error inserting data: {e}")
+def search(
+    client: MilvusClient, collection_name: str, binary_query: bytes, limit: int = 5
+):
     """
     Search for data in a collection in Milvus.
     """
         if not results:
             print("No search results found")
             return []
         contexts = [res.entity.context for res in results[0]]
         return contexts
     except Exception as e:
         print(f"Error searching for data: {e}")
+        return []