Bon-God committed on
Commit 64566a7
1 Parent(s): 9e38dce

Upload 4 files

Files changed (4)
  1. ChemboChat_V1.code-workspace +7 -0
  2. Project_Notes.txt +67 -0
  3. main.py +89 -0
  4. requirements.txt +9 -0
ChemboChat_V1.code-workspace ADDED
@@ -0,0 +1,7 @@
+ {
+     "folders": [
+         {
+             "path": "."
+         }
+     ]
+ }
Project_Notes.txt ADDED
@@ -0,0 +1,67 @@
+ ChemboChat - RAG Chat Application Project Notes
+ ##############################################
+ Shortcut: Ctrl + Space
+ Action: Manually triggers the IntelliSense menu to show code suggestions.
+
+ Step 1.
+ Create a venv and install all required project dependencies:
+ python -m venv .venv && source .venv/bin/activate
+ Install packages:
+ pip install -r requirements.txt
+
+ Step 2.
+ Download all libraries & dependencies for LlamaParse & LangChain.
+ Dependency tools required for splitting, chunking & vectorizing data:
+ a. Text Splitters
+ b. Embeddings
+ c. Vector Stores
+ d. Document Loaders
+
+ """
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_community.vectorstores import Qdrant
+ from langchain_community.document_loaders import DirectoryLoader
+ """
+
+ Step 3.
+ # Define a function to load parsed data if available, or parse if not
+ """LLM - parsingInstructionUber10k
+ parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructionUber10k)
+ llama_parse_documents = parser.load_data("./data/uber_10q_march_2022.pdf")"""
+
+ def load_or_parse_data():
+     data_file = "./data/parsed_data.pkl"
+
+     if os.path.exists(data_file):
+         # Load the parsed data from the file
+         with open(data_file, "rb") as f:
+             parsed_data = pickle.load(f)
+     else:
+         # Perform the parsing step and store the result in llama_parse_documents
+         parsingInstructionUber10k = """The provided document is a quarterly report filed by Uber Technologies,
+         Inc. with the Securities and Exchange Commission (SEC).
+         This form provides detailed financial information about the company's performance for a specific quarter.
+         It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
+         It contains many tables.
+         Try to be precise while answering the questions."""
+         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructionUber10k)
+         llama_parse_documents = parser.load_data("./data/uber_10q_march_2022.pdf")
+
+
+         # Save the parsed data to a file
+         with open(data_file, "wb") as f:
+             pickle.dump(llama_parse_documents, f)
+
+         # Set the parsed data to the variable
+         parsed_data = llama_parse_documents
+
+     return parsed_data
+
+ Step 4.
+ # Create vector database
+ Create a vector database using document loaders and embeddings.
+ This function loads the parsed data and splits it into chunks using LangChain's DirectoryLoader and RecursiveCharacterTextSplitter.
+ Transform the chunks into embeddings using FastEmbedEmbeddings.
+ Finally, persist the embeddings into the Qdrant vector database.
+
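The Step 4 notes describe the embedding stage only in prose; here is a minimal standalone sketch of that stage, assuming the default FastEmbed model. The sample chunk strings are illustrative, not from the project; the full pipeline lives in main.py below.

# Minimal sketch of the Step 4 embedding stage (default FastEmbed model assumed)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

embeddings = FastEmbedEmbeddings()

# embed_documents maps each text chunk to a dense vector
chunks = ["Revenue grew this quarter...", "Operating expenses were..."]  # illustrative chunks
vectors = embeddings.embed_documents(chunks)
print(len(vectors), len(vectors[0]))  # number of chunks, embedding dimension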
main.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import pickle
+
+ import nest_asyncio  # noqa: E402
+ nest_asyncio.apply()
+
+ # Bring in our LLAMA_CLOUD_API_KEY
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # LLAMAPARSE & LANGCHAIN Libraries
+ ##################################
+ from llama_parse import LlamaParse
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_community.vectorstores import Qdrant
+
+ llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+ qdrant_url = os.getenv("QDRANT_URL")
+ qdrant_api_key = os.getenv("QDRANT_API_KEY")
+
+ # PARSING Function
+ # to_parse_documents = ["./data/XXXk.pdf", "./data/suckballs.pdf"]
+
+ # Define a function to load parsed data if available, or parse if not
+ def load_or_parse_data():
+     data_file = "./data/parsed_data.pkl"
+
+     if os.path.exists(data_file):
+         # Load the parsed data from the file
+         with open(data_file, "rb") as f:
+             parsed_data = pickle.load(f)
+     else:
+         # Perform the parsing step and store the result in llama_parse_documents
+         parsingInstructionUber10k = """The provided document is a quarterly report filed by Uber Technologies,
+         Inc. with the Securities and Exchange Commission (SEC).
+         This form provides detailed financial information about the company's performance for a specific quarter.
+         It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
+         It contains many tables.
+         Try to be precise while answering the questions."""
+         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructionUber10k)
+         llama_parse_documents = parser.load_data("./data/uber_10q_march_2022.pdf")
+
+         # Save the parsed data to a file
+         with open(data_file, "wb") as f:
+             pickle.dump(llama_parse_documents, f)
+
+         # Set the parsed data to the variable
+         parsed_data = llama_parse_documents
+
+     return parsed_data
+
+ # Transform data to embeddings and persist them in the DB
+ def create_vector_database():
+     # Call the function to load or parse the documents
+     llama_parse_documents = load_or_parse_data()
+     print(llama_parse_documents[1].text[:100])
+
+     with open('data/output.md', 'a') as f:  # Open the file in append mode ('a')
+         for doc in llama_parse_documents:
+             f.write(doc.text + '\n')
+
+     loader = DirectoryLoader('data/', glob="**/*.md", show_progress=True)
+     documents = loader.load()
+
+     # Split loaded documents into chunks
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
+     docs = text_splitter.split_documents(documents)
+
+     # Initialize embeddings
+     embeddings = FastEmbedEmbeddings()
+
+     # Create and persist a Qdrant vector database from the chunked documents
+     qdrant = Qdrant.from_documents(
+         documents=docs,
+         embedding=embeddings,
+         url=qdrant_url,
+         collection_name="rag",
+         api_key=qdrant_api_key,
+     )
+
+     print('Vector DB created successfully!')
+
+ if __name__ == "__main__":
+     create_vector_database()
+     # len(docs)
+     # docs[0]
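Once create_vector_database() has run, the persisted "rag" collection can be queried back. Below is a minimal retrieval sketch, assuming the same QDRANT_URL / QDRANT_API_KEY variables and embedding model as main.py; the query string is illustrative.

import os
from dotenv import load_dotenv
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient

load_dotenv()

# Reconnect to the "rag" collection created by create_vector_database()
client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
vector_store = Qdrant(client=client, collection_name="rag", embeddings=FastEmbedEmbeddings())

# similarity_search embeds the query and returns the k closest chunks
for doc in vector_store.similarity_search("What was Uber's revenue for the quarter?", k=3):
    print(doc.page_content[:200])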
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ langchain
+ langchain-community
+ llama-parse
+ fastembed
+ qdrant_client
+ python-dotenv
+ langchain-groq
+ chainlit
+ unstructured[md]
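main.py reads three variables via python-dotenv, so a .env file is expected at the project root. A template with placeholder values (actual keys come from LlamaCloud and Qdrant Cloud; the URL format is a typical Qdrant Cloud endpoint, not taken from this repo):

# .env (placeholder values - do not commit real keys)
LLAMA_CLOUD_API_KEY=llx-...
QDRANT_URL=https://your-cluster.qdrant.io:6333
QDRANT_API_KEY=...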