|
from langchain_community.vectorstores.faiss import FAISS |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.document_loaders import text, PyPDFLoader, WebBaseLoader |
|
from langchain_community.document_transformers import Html2TextTransformer |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
|
|
|
class KnowledgeBase: |
|
def __init__(self) -> None: |
|
self.embeddings = HuggingFaceEmbeddings( |
|
model_name="sentence-transformers/all-mpnet-base-v2" |
|
) |
|
self.text_splitter = RecursiveCharacterTextSplitter( |
|
chunk_size=100, chunk_overlap=50 |
|
) |
|
self.retriever = None |
|
|
|
def load_txt(self, path): |
|
loader = text.TextLoader(file_path=path) |
|
documents = loader.load() |
|
chunked_docs = self.text_splitter.split_documents(documents=documents) |
|
print(chunked_docs.__len__()) |
|
db = FAISS.from_documents(embedding=self.embeddings, documents=chunked_docs) |
|
self.retriever = db.as_retriever() |
|
|
|
def load_pdf(self, path): |
|
loader = PyPDFLoader(file_path=path) |
|
documents = loader.load() |
|
chunked_docs = self.text_splitter.split_documents(documents=documents) |
|
db = FAISS.from_documents(embedding=self.embeddings, documents=chunked_docs) |
|
self.retriever = db.as_retriever() |
|
|
|
def load_url(self, path): |
|
loader = WebBaseLoader(web_path=path) |
|
docs = loader.load() |
|
html2text = Html2TextTransformer() |
|
docs_transformed = html2text.transform_documents(docs) |
|
chunked_docs = self.text_splitter.split_documents(docs_transformed) |
|
db = FAISS.from_documents(embedding=self.embeddings, documents=chunked_docs) |
|
self.retriever = db.as_retriever() |
|
|
|
def invoke(self, query): |
|
return self.retriever.invoke(query) |
|
|