Bon-God committed on
Commit 64566a7
1 Parent(s): 9e38dce

Upload 4 files

Files changed (4)
  1. ChemboChat_V1.code-workspace +7 -0
  2. Project_Notes.txt +67 -0
  3. main.py +89 -0
  4. requirements.txt +9 -0
ChemboChat_V1.code-workspace ADDED
@@ -0,0 +1,7 @@
+ {
+     "folders": [
+         {
+             "path": "."
+         }
+     ]
+ }
Project_Notes.txt ADDED
@@ -0,0 +1,67 @@
+ ChemboChat - RAG Chat Application Project Notes
+ ##############################################
+ Shortcut: Ctrl + Space
+ Action: Manually triggers the IntelliSense menu to show code suggestions.
+
+ Step 1.
+ Create a venv and install all required project dependencies:
+ python -m venv .venv && source .venv/bin/activate
+ Install packages:
+ pip install -r requirements.txt
+
+ Step 2.
+ Download all libraries & dependencies for LlamaParse & LangChain.
+ Dependency tools required for splitting, chunking & vectorizing data:
+ a. Text Splitters
+ b. Embeddings
+ c. Vector Stores
+ d. Document Loaders
+
+ """
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_community.vectorstores import Qdrant
+ from langchain_community.document_loaders import DirectoryLoader
+ """
+
+ Step 3.
+ # Define a function to load parsed data if available, or parse if not
+ """LLM - parsingInstructionUber10k
+ parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructionUber10k)
+ llama_parse_documents = parser.load_data("./data/uber_10q_march_2022.pdf")"""
+
+ def load_or_parse_data():
+     data_file = "./data/parsed_data.pkl"
+
+     if os.path.exists(data_file):
+         # Load the parsed data from the file
+         with open(data_file, "rb") as f:
+             parsed_data = pickle.load(f)
+     else:
+         # Perform the parsing step and store the result in llama_parse_documents
+         parsingInstructionUber10k = """The provided document is a quarterly report filed by Uber Technologies,
+         Inc. with the Securities and Exchange Commission (SEC).
+         This form provides detailed financial information about the company's performance for a specific quarter.
+         It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
+         It contains many tables.
+         Try to be precise while answering the questions."""
+         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructionUber10k)
+         llama_parse_documents = parser.load_data("./data/uber_10q_march_2022.pdf")
+
+
+         # Save the parsed data to a file
+         with open(data_file, "wb") as f:
+             pickle.dump(llama_parse_documents, f)
+
+         # Set the parsed data to the variable
+         parsed_data = llama_parse_documents
+
+     return parsed_data
+
+ Step 4.
+ # Create vector database
+ Create a vector database using document loaders and embeddings.
+ This function loads the parsed data and splits it into chunks using LangChain's DirectoryLoader and RecursiveCharacterTextSplitter.
+ Transform the chunks into embeddings using FastEmbedEmbeddings.
+ Finally, persist the embeddings into the Qdrant vector database.
+
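The Step 4 notes describe the embedding stage only in prose; here is a minimal standalone sketch of that stage, assuming the default FastEmbed model. The sample chunk strings are illustrative, not from the project; the full pipeline lives in main.py below.

# Minimal sketch of the Step 4 embedding stage (default FastEmbed model assumed)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

embeddings = FastEmbedEmbeddings()

# embed_documents maps each text chunk to a dense vector
chunks = ["Revenue grew this quarter...", "Operating expenses were..."]  # illustrative chunks
vectors = embeddings.embed_documents(chunks)
print(len(vectors), len(vectors[0]))  # number of chunks, embedding dimension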
main.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import pickle
+
+ import nest_asyncio  # noqa: E402
+ nest_asyncio.apply()
+
+ # Bring in our LLAMA_CLOUD_API_KEY
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # LLAMAPARSE & LANGCHAIN Libraries
+ ##################################
+ from llama_parse import LlamaParse
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import DirectoryLoader
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_community.vectorstores import Qdrant
+
+ llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+ qdrant_url = os.getenv("QDRANT_URL")
+ qdrant_api_key = os.getenv("QDRANT_API_KEY")
+
+ # PARSING Function
+ # to_parse_documents = ["./data/XXXk.pdf", "./data/suckballs.pdf"]
+
+ # Define a function to load parsed data if available, or parse if not
+ def load_or_parse_data():
+     data_file = "./data/parsed_data.pkl"
+
+     if os.path.exists(data_file):
+         # Load the parsed data from the file
+         with open(data_file, "rb") as f:
+             parsed_data = pickle.load(f)
+     else:
+         # Perform the parsing step and store the result in llama_parse_documents
+         parsingInstructionUber10k = """The provided document is a quarterly report filed by Uber Technologies,
+         Inc. with the Securities and Exchange Commission (SEC).
+         This form provides detailed financial information about the company's performance for a specific quarter.
+         It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
+         It contains many tables.
+         Try to be precise while answering the questions."""
+         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructionUber10k)
+         llama_parse_documents = parser.load_data("./data/uber_10q_march_2022.pdf")
+
+         # Save the parsed data to a file
+         with open(data_file, "wb") as f:
+             pickle.dump(llama_parse_documents, f)
+
+         # Set the parsed data to the variable
+         parsed_data = llama_parse_documents
+
+     return parsed_data
+
+ # Transform data to embeddings and persist them in the DB
+ def create_vector_database():
+     # Call the function to load or parse the documents
+     llama_parse_documents = load_or_parse_data()
+     print(llama_parse_documents[1].text[:100])
+
+     with open('data/output.md', 'a') as f:  # Open the file in append mode ('a')
+         for doc in llama_parse_documents:
+             f.write(doc.text + '\n')
+
+     loader = DirectoryLoader('data/', glob="**/*.md", show_progress=True)
+     documents = loader.load()
+
+     # Split loaded documents into chunks
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
+     docs = text_splitter.split_documents(documents)
+
+     # Initialize embeddings
+     embeddings = FastEmbedEmbeddings()
+
+     # Create and persist a Qdrant vector database from the chunked documents
+     qdrant = Qdrant.from_documents(
+         documents=docs,
+         embedding=embeddings,
+         url=qdrant_url,
+         collection_name="rag",
+         api_key=qdrant_api_key,
+     )
+
+     print('Vector DB created successfully!')
+
+ if __name__ == "__main__":
+     create_vector_database()
+     # len(docs)
+     # docs[0]
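Once create_vector_database() has run, the persisted "rag" collection can be queried back. Below is a minimal retrieval sketch, assuming the same QDRANT_URL / QDRANT_API_KEY variables and embedding model as main.py; the query string is illustrative.

import os
from dotenv import load_dotenv
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient

load_dotenv()

# Reconnect to the "rag" collection created by create_vector_database()
client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))
vector_store = Qdrant(client=client, collection_name="rag", embeddings=FastEmbedEmbeddings())

# similarity_search embeds the query and returns the k closest chunks
for doc in vector_store.similarity_search("What was Uber's revenue for the quarter?", k=3):
    print(doc.page_content[:200])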
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ langchain
+ langchain-community
+ llama-parse
+ fastembed
+ qdrant_client
+ python-dotenv
+ langchain-groq
+ chainlit
+ unstructured[md]
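main.py reads three variables via python-dotenv, so a .env file is expected at the project root. A template with placeholder values (actual keys come from LlamaCloud and Qdrant Cloud; the URL format is a typical Qdrant Cloud endpoint, not taken from this repo):

# .env (placeholder values - do not commit real keys)
LLAMA_CLOUD_API_KEY=llx-...
QDRANT_URL=https://your-cluster.qdrant.io:6333
QDRANT_API_KEY=...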