|
import os |
|
import nest_asyncio |
|
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because LlamaParse's synchronous
# load_data() drives async code internally - confirm.
nest_asyncio.apply()
|
|
|
|
|
from dotenv import load_dotenv |
|
# Load variables from a local .env file (if present) into the process
# environment so the os.getenv() lookups further down can see them.
load_dotenv()
|
|
|
|
|
|
|
from llama_parse import LlamaParse |
|
|
|
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter |
|
from langchain_community.document_loaders import DirectoryLoader |
|
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings |
|
from langchain_community.vectorstores import qdrant |
|
|
|
# Credentials and endpoint for the external services, taken from the
# environment. Each value is None when its variable is not set.
llamaparse_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
|
|
|
|
|
|
|
|
|
import pickle |
|
|
|
def load_or_parse_data():
    """Return the parsed Uber 10-Q documents, using a pickle cache on disk.

    If ``./data/parsed_data.pkl`` exists, its unpickled content is returned.
    Otherwise ``./data/uber_10q_march_2022.pdf`` is parsed with LlamaParse
    (markdown output), the result is cached to that pickle file, and the
    parsed documents are returned.

    Returns:
        The list of documents produced by ``LlamaParse.load_data`` (or the
        object previously pickled into the cache file).
    """
    data_file = "./data/parsed_data.pkl"

    if os.path.exists(data_file):
        # NOTE: pickle.load can execute arbitrary code. This is acceptable
        # only because the cache file is written by this script itself;
        # never point data_file at an untrusted file.
        with open(data_file, "rb") as f:
            return pickle.load(f)

    parsing_instruction = """The provided document is a quarterly report filed by Uber Technologies,
Inc. with the Securities and Exchange Commission (SEC).
This form provides detailed financial information about the company's performance for a specific quarter.
It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
It contains many tables.
Try to be precise while answering the questions"""

    parser = LlamaParse(
        api_key=llamaparse_api_key,
        result_type="markdown",
        parsing_instruction=parsing_instruction,
    )
    parsed_data = parser.load_data("./data/uber_10q_march_2022.pdf")

    # Only cache a non-empty result. LlamaParse may hand back an empty list
    # when parsing fails (TODO confirm against its `ignore_errors` option);
    # pickling that would make every later run load the empty cache instead
    # of retrying the parse.
    if parsed_data:
        with open(data_file, "wb") as f:
            pickle.dump(parsed_data, f)

    return parsed_data
|
|
|
|
|
def create_vector_database():
    """Build the "rag" Qdrant collection from the parsed document.

    Steps:
      1. Obtain the parsed documents from ``load_or_parse_data()``.
      2. Write their text to ``data/output.md``.
      3. Load every ``*.md`` file under ``data/`` with ``DirectoryLoader``.
      4. Split into chunks (size 2000, overlap 100).
      5. Embed with ``FastEmbedEmbeddings`` and upload to the Qdrant
         instance at ``qdrant_url`` under the collection name ``"rag"``.

    Raises:
        RuntimeError: if no parsed documents are available.
    """
    llama_parse_documents = load_or_parse_data()
    if not llama_parse_documents:
        raise RuntimeError(
            "No parsed documents available; cannot build the vector database."
        )

    # Debug preview: show the second document when there is one, otherwise
    # the first, so a single-document parse does not raise IndexError.
    preview_doc = llama_parse_documents[1 if len(llama_parse_documents) > 1 else 0]
    print(preview_doc.text[:100])

    # Overwrite rather than append: appending duplicated the whole document
    # in output.md on every run, and the loader below would then ingest the
    # duplicated text.
    with open('data/output.md', 'w', encoding='utf-8') as f:
        f.writelines(doc.text + '\n' for doc in llama_parse_documents)

    loader = DirectoryLoader('data/', glob="**/*.md", show_progress=True)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    embeddings = FastEmbedEmbeddings()

    # `qdrant` is the imported vectorstore *module*; the constructor lives on
    # its `Qdrant` class. The result is deliberately not bound to a local
    # named `qdrant`: doing so makes the name function-local and the call
    # itself fails with UnboundLocalError.
    qdrant.Qdrant.from_documents(
        documents=docs,
        embedding=embeddings,
        url=qdrant_url,
        collection_name="rag",
        api_key=qdrant_api_key,
    )

    print('Vector DB created successfully !')
|
|
|
# Script entry point: build the vector database only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    create_vector_database()
|
|
|
|
|
|
|
|
|
|
|
|