|
ChemboChat — RAG Chat Application Project Notes |
|
############################################## |
|
Shortcut: Ctrl + Space |
|
Action: This triggers the IntelliSense menu to show code suggestions manually. |
|
|
|
Step1. |
|
Create venv and install all required Project Dependencies |
|
python -m venv .venv && source .venv/bin/activate |
|
Install packages |
|
pip install -r requirements.txt |
|
|
|
Step2. |
|
Download all libraries & Dependencies for LlamaParse & Langchain. |
|
Dependency tools required for splitting & chunking data & vectorizing |
|
a. Text-Splitter |
|
b. Embeddings |
|
c. Vector Stores |
|
d. Document Loaders |
|
|
|
""" |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings |
|
from langchain_community.vectorstores import Qdrant |
|
from langchain_community.document_loaders import DirectoryLoader |
|
""" |
|
|
|
Step3. |
|
# Define a function to load parsed data if available, or parse if not |
|
"""LLM - parsingInstructionUber10k |
|
parser = LlamaParse(api_key=, result_type="", parsing_instruction=parsingInstructionUber10k) |
|
llama_parse_documents = parser.load_data("./data/uber_10q_march_2022.pdf")""" |
|
|
|
def load_or_parse_data():
    """Return the LlamaParse documents for the Uber 10-Q PDF, using a pickle cache.

    On a cache hit, deserialize and return ./data/parsed_data.pkl.
    On a miss, run the (slow) LlamaParse step against the PDF, pickle the
    result for later runs, and return it.
    """
    cache_path = "./data/parsed_data.pkl"

    if not os.path.exists(cache_path):
        # No cached result yet — parse the PDF with LlamaParse.
        parsingInstructionUber10k = """The provided document is a quarterly report filed by Uber Technologies,
Inc. with the Securities and Exchange Commission (SEC).
This form provides detailed financial information about the company's performance for a specific quarter.
It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
It contains many tables.
Try to be precise while answering the questions"""
        parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructionUber10k)
        documents = parser.load_data("./data/uber_10q_march_2022.pdf")

        # Persist the freshly parsed documents so subsequent runs skip parsing.
        with open(cache_path, "wb") as sink:
            pickle.dump(documents, sink)
        return documents

    # Cache hit: load the previously parsed documents from disk.
    with open(cache_path, "rb") as cached:
        return pickle.load(cached)
|
|
|
Step 4. |
|
# Create vector database |
|
Create a vector database using document loaders and embeddings. |
|
This function loads the data and splits it into chunks using Document_loaders in LlamaParse. |
|
Transform the chunks into embeddings using llama.FastEmbedEmbeddings |
|
Finally, persist the embeddings into vector database. |
|
|