Spaces:

ThaparInstituteOfTechnology
/

ThaparGPT

Build error

App Files Files Community

0504ankitsharma committed on Jul 15

Commit

c47212f

•

1 Parent(s): 0876a19

Upload 9 files

Browse files

Files changed (10) hide show

.gitattributes +1 -0
Dockerfile +16 -0
README.md +1 -10
data/Data.pdf +3 -0
get_embedding_function.py +10 -0
populate_database.py +109 -0
query_data.py +54 -0
requirements.txt +9 -0
server.py +176 -0
test_rag.py +49 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/Data.pdf filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.9
+# Create and switch to an unprivileged user (UID 1000) instead of running as root.
+RUN useradd -m -u 1000 user
+USER user
+# Put the user's local bin directory on PATH so user-level pip scripts are found.
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+# Install dependencies before copying the sources so this layer can be cached.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+# The FastAPI instance is the `app` object in server.py, hence the target server:app.
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1 @@
----
-title: ThaparGPT
-emoji: 👁
-colorFrom: green
-colorTo: blue
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+ # rag-tutorial-v2

data/Data.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ef945caf75b8219067ce06bd625f8581c60c54d58d071ef8355d9cba9294d84
+size 1378767

get_embedding_function.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from langchain_community.embeddings.ollama import OllamaEmbeddings
+from langchain_community.embeddings.bedrock import BedrockEmbeddings
+#
+def get_embedding_function():
+#     embeddings = BedrockEmbeddings(
+#         credentials_profile_name="default", region_name="us-east-1"
+#     )
+    embeddings = OllamaEmbeddings(model="nomic-embed-text")
+    return embeddings

populate_database.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import argparse
+import os
+import shutil
+from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain.schema.document import Document
+from get_embedding_function import get_embedding_function
+from langchain_community.vectorstores import Chroma
+CHROMA_PATH = "chroma"
+DATA_PATH = "data"
+def main():
+    # Check if the database should be cleared (using the --clear flag).
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--reset", action="store_true", help="Reset the database.")
+    args = parser.parse_args()
+    if args.reset:
+        print("✨ Clearing Database")
+        clear_database()
+    # Create (or update) the data store.
+    documents = load_documents()
+    chunks = split_documents(documents)
+    add_to_chroma(chunks)
+def load_documents():
+    document_loader = PyPDFDirectoryLoader(DATA_PATH)
+    return document_loader.load()
+def split_documents(documents: list[Document]):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=800,
+        chunk_overlap=80,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    return text_splitter.split_documents(documents)
+def add_to_chroma(chunks: list[Document]):
+    # Load the existing database.
+    db = Chroma(
+        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
+    )
+    # Calculate Page IDs.
+    chunks_with_ids = calculate_chunk_ids(chunks)
+    # Add or Update the documents.
+    existing_items = db.get(include=[])  # IDs are always included by default
+    existing_ids = set(existing_items["ids"])
+    print(f"Number of existing documents in DB: {len(existing_ids)}")
+    # Only add documents that don't exist in the DB.
+    new_chunks = []
+    for chunk in chunks_with_ids:
+        if chunk.metadata["id"] not in existing_ids:
+            new_chunks.append(chunk)
+    if len(new_chunks):
+        print(f"👉 Adding new documents: {len(new_chunks)}")
+        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
+        db.add_documents(new_chunks, ids=new_chunk_ids)
+        db.persist()
+    else:
+        print("✅ No new documents to add")
+def calculate_chunk_ids(chunks):
+    # This will create IDs like "data/monopoly.pdf:6:2"
+    # Page Source : Page Number : Chunk Index
+    last_page_id = None
+    current_chunk_index = 0
+    for chunk in chunks:
+        source = chunk.metadata.get("source")
+        page = chunk.metadata.get("page")
+        current_page_id = f"{source}:{page}"
+        # If the page ID is the same as the last one, increment the index.
+        if current_page_id == last_page_id:
+            current_chunk_index += 1
+        else:
+            current_chunk_index = 0
+        # Calculate the chunk ID.
+        chunk_id = f"{current_page_id}:{current_chunk_index}"
+        last_page_id = current_page_id
+        # Add it to the page meta-data.
+        chunk.metadata["id"] = chunk_id
+    return chunks
+def clear_database():
+    if os.path.exists(CHROMA_PATH):
+        shutil.rmtree(CHROMA_PATH)
+if __name__ == "__main__":
+    main()

query_data.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import argparse
+from langchain_community.vectorstores import Chroma
+from langchain.prompts import ChatPromptTemplate
+from langchain_community.llms.ollama import Ollama
+from get_embedding_function import get_embedding_function
+CHROMA_PATH = "chroma"
+PROMPT_TEMPLATE = """
+Answer the question based only on the following context:
+{context}
+---
+Answer the question based on the above context: {question}
+"""
+def main():
+    # Create CLI.
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument("query_text", type=str, help="The query text.")
+    # args = parser.parse_args()
+    # query_text = args.query_text
+    # query_rag(query_text)
+    query_rag(input( "Enter your query: "))
+def query_rag(query_text: str):
+    # Prepare the DB.
+    embedding_function = get_embedding_function()
+    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+    # Search the DB.
+    results = db.similarity_search_with_score(query_text, k=5)
+    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+    prompt = prompt_template.format(context=context_text, question=query_text)
+    # print(prompt)
+    model = Ollama(model="mistral")
+    response_text = model.invoke(prompt)
+    sources = [doc.metadata.get("id", None) for doc, _score in results]
+    formatted_response = f"Response: {response_text}\nSources: {sources}"
+    print(formatted_response)
+    return response_text
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+pypdf
+langchain
+chromadb # Vector storage
+pytest
+boto3
+langchain_community
+pyyaml
+fastapi
+uvicorn[standard]

server.py ADDED Viewed

	@@ -0,0 +1,176 @@

+CHROMA_PATH = "chroma"
+DATA_PATH = "data"
+from fastapi import FastAPI
+import argparse
+import os
+import shutil
+from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain.schema.document import Document
+from get_embedding_function import get_embedding_function
+from langchain_community.vectorstores import Chroma
+from langchain.prompts import ChatPromptTemplate
+from langchain_community.llms.ollama import Ollama
+PROMPT_TEMPLATE = """
+Answer the question based only on the following context:
+{context}
+---
+Answer the question based on the above context: {question}
+"""
+app = FastAPI()
+from langchain_community.embeddings.ollama import OllamaEmbeddings
+from langchain_community.embeddings.bedrock import BedrockEmbeddings
+#
+def get_embedding_function():
+#     embeddings = BedrockEmbeddings(
+#         credentials_profile_name="default", region_name="us-east-1"
+#     )
+    embeddings = OllamaEmbeddings(model="nomic-embed-text")
+    return embeddings
+@app.get("/")
+def greet_json():
+    return {"Hello": "World!"}
+@app.get("/train")
+def train():
+    # Check if the database should be cleared (using the --clear flag).
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--reset", action="store_true", help="Reset the database.")
+    args = parser.parse_args()
+    if args.reset:
+        print("✨ Clearing Database")
+        clear_database()
+    # Create (or update) the data store.
+    documents = load_documents()
+    chunks = split_documents(documents)
+    add_to_chroma(chunks)
+def load_documents():
+    document_loader = PyPDFDirectoryLoader(DATA_PATH)
+    return document_loader.load()
+def split_documents(documents: list[Document]):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=800,
+        chunk_overlap=80,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    return text_splitter.split_documents(documents)
+def add_to_chroma(chunks: list[Document]):
+    # Load the existing database.
+    db = Chroma(
+        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
+    )
+    # Calculate Page IDs.
+    chunks_with_ids = calculate_chunk_ids(chunks)
+    # Add or Update the documents.
+    existing_items = db.get(include=[])  # IDs are always included by default
+    existing_ids = set(existing_items["ids"])
+    print(f"Number of existing documents in DB: {len(existing_ids)}")
+    # Only add documents that don't exist in the DB.
+    new_chunks = []
+    for chunk in chunks_with_ids:
+        if chunk.metadata["id"] not in existing_ids:
+            new_chunks.append(chunk)
+    if len(new_chunks):
+        print(f"👉 Adding new documents: {len(new_chunks)}")
+        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
+        db.add_documents(new_chunks, ids=new_chunk_ids)
+        db.persist()
+    else:
+        print("✅ No new documents to add")
+def calculate_chunk_ids(chunks):
+    # This will create IDs like "data/monopoly.pdf:6:2"
+    # Page Source : Page Number : Chunk Index
+    last_page_id = None
+    current_chunk_index = 0
+    for chunk in chunks:
+        source = chunk.metadata.get("source")
+        page = chunk.metadata.get("page")
+        current_page_id = f"{source}:{page}"
+        # If the page ID is the same as the last one, increment the index.
+        if current_page_id == last_page_id:
+            current_chunk_index += 1
+        else:
+            current_chunk_index = 0
+        # Calculate the chunk ID.
+        chunk_id = f"{current_page_id}:{current_chunk_index}"
+        last_page_id = current_page_id
+        # Add it to the page meta-data.
+        chunk.metadata["id"] = chunk_id
+    return chunks
+def clear_database():
+    if os.path.exists(CHROMA_PATH):
+        shutil.rmtree(CHROMA_PATH)
+    return {""}
+@app.get("/query")
+def query(query_text: str):
+    # Prepare the DB.
+    embedding_function = get_embedding_function()
+    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+    # Search the DB.
+    results = db.similarity_search_with_score(query_text, k=5)
+    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+    prompt = prompt_template.format(context=context_text, question=query_text)
+    # print(prompt)
+    model = Ollama(model="mistral")
+    response_text = model.invoke(prompt)
+    sources = [doc.metadata.get("id", None) for doc, _score in results]
+    formatted_response = f"Response: {response_text}\nSources: {sources}"
+    print(formatted_response)
+    return response_text

test_rag.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from query_data import query_rag
+from langchain_community.llms.ollama import Ollama
+EVAL_PROMPT = """
+Expected Response: {expected_response}
+Actual Response: {actual_response}
+---
+(Answer with 'true' or 'false') Does the actual response match the expected response?
+"""
+def test_monopoly_rules():
+    assert query_and_validate(
+        question="How much total money does a player start with in Monopoly? (Answer with the number only)",
+        expected_response="$1500",
+    )
+def test_ticket_to_ride_rules():
+    assert query_and_validate(
+        question="How many points does the longest continuous train get in Ticket to Ride? (Answer with the number only)",
+        expected_response="10 points",
+    )
+def query_and_validate(question: str, expected_response: str):
+    response_text = query_rag(question)
+    prompt = EVAL_PROMPT.format(
+        expected_response=expected_response, actual_response=response_text
+    )
+    model = Ollama(model="mistral")
+    evaluation_results_str = model.invoke(prompt)
+    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()
+    print(prompt)
+    if "true" in evaluation_results_str_cleaned:
+        # Print response in Green if it is correct.
+        print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
+        return True
+    elif "false" in evaluation_results_str_cleaned:
+        # Print response in Red if it is incorrect.
+        print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
+        return False
+    else:
+        raise ValueError(
+            f"Invalid evaluation result. Cannot determine if 'true' or 'false'."
+        )