|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores import Chroma |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.document_loaders import PyPDFLoader |
|
|
|
from .embeddings import EMBEDDING_MODEL_NAME |
|
from .vectorstore import get_vectorstore |
|
|
|
|
|
def load_data():
    """Parse the source documents and index them in the vector store.

    The chunks come from ``parse_data``; the store comes from
    ``get_vectorstore`` and is handed a ``HuggingFaceEmbeddings`` instance
    built from ``EMBEDDING_MODEL_NAME``.

    Returns:
        The store returned by ``get_vectorstore``, after the parsed chunks
        have been added to it.

    Raises:
        AssertionError: If ``get_vectorstore`` returns something other than
            a ``Chroma`` instance.
    """
    docs = parse_data()
    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vectorstore = get_vectorstore(embedding_function)

    # Type narrowing only: the store is expected to be Chroma-backed.
    assert isinstance(vectorstore, Chroma)

    # ``from_documents`` is a classmethod that builds and returns a brand-new
    # store, so calling it on this instance and ignoring the result never
    # populated the object returned below. ``add_documents`` indexes the
    # chunks into this very instance instead.
    # NOTE(review): persistence location is now whatever ``get_vectorstore``
    # configured -- presumably "./chroma_db"; confirm against that helper.
    vectorstore.add_documents(docs)
    return vectorstore
|
|
|
|
|
def parse_data(
    path: str = "data/daoism/tao-te-ching.pdf",
    chunk_size: int = 250,
    chunk_overlap: int = 0,
):
    """Load a PDF, split it into small chunks, and annotate each chunk.

    Args:
        path: Location of the PDF to load. Defaults to the Tao Te Ching file.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.

    Returns:
        The list of chunk documents produced by the splitter. Each chunk's
        metadata gains four keys: ``name`` and ``short_name`` (both from
        ``parse_name``), ``domain`` (from ``parse_domain``) and
        ``page_number`` (a copy of the existing ``page`` entry).

    Raises:
        KeyError: If a chunk's metadata lacks ``source`` or ``page``
            (presumably both are filled in by ``PyPDFLoader`` -- confirm).
    """
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(pages)

    for doc in docs:
        metadata = doc.metadata
        source = metadata["source"]
        # Computed once: "name" and "short_name" carry the same value.
        name = parse_name(source)
        metadata["name"] = name
        metadata["domain"] = parse_domain(source)
        metadata["page_number"] = metadata["page"]
        metadata["short_name"] = name
    return docs
|
|
|
|
|
def parse_name(source: str) -> str:
    """Return the file name in *source* up to (not including) its first dot.

    Only "/" is treated as a path separator. Everything after the first "."
    in the final path component is discarded, so ``"a/b.c.d"`` yields ``"b"``.
    """
    # rpartition leaves the whole string in slot 2 when there is no "/".
    _, _, filename = source.rpartition("/")
    # partition leaves the whole string in slot 0 when there is no ".".
    stem, _, _ = filename.partition(".")
    return stem
|
|
|
|
|
def parse_domain(source: str) -> str:
    """Return the name of the directory that directly contains *source*.

    For ``"data/daoism/tao-te-ching.pdf"`` this is ``"daoism"``. The
    component is taken relative to the end of the path, so a leading ``./``
    or extra parent directories do not shift the result onto the file name.
    Only "/" is treated as a path separator.

    Raises:
        IndexError: If *source* contains no "/" and so has no parent
            directory component.
    """
    # A fixed index from the front ([2]) selected the file name itself for
    # paths shaped like "data/<domain>/<file>"; the parent directory is
    # always the second-to-last component.
    return source.split("/")[-2]
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: build the index, then run one similarity search
    # against it and print whatever comes back.
    store = load_data()
    matches = store.similarity_search(
        "He who can bear the misfortune of a nation is called the ruler of the world."
    )
    print(matches)
|
|