|
from langchain_community.document_loaders import TextLoader |
|
from langchain_community.docstore.document import Document |
|
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import Chroma |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain_community.retrievers import BM25Retriever |
|
from langchain.llms import OpenAI |
|
from langchain_openai import ChatOpenAI |
|
from langchain.chains import RetrievalQA |
|
import os |
|
|
|
def split_with_source(text, source):
    """Split *text* into chunks and tag every chunk with its origin.

    Args:
        text: Raw document text to split.
        source: Identifier (e.g. the filename) stored in each chunk's
            ``source`` metadata so retrieval results can be traced back.

    Returns:
        list[Document]: newline-separated chunks of at most 256 characters,
        each carrying ``metadata["source"] = source`` and a start index.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=256,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,  # record each chunk's offset in the original text
    )
    documents = splitter.create_documents([text])
    # Tag every chunk so answers can cite the file they came from.
    for doc in documents:
        doc.metadata["source"] = source
    return documents
|
|
|
|
|
def count_files_in_folder(folder_path):
    """Count the regular files directly inside *folder_path*.

    Args:
        folder_path: Path of the directory to inspect.

    Returns:
        int | None: number of files (subdirectories are excluded), or
        ``None`` when *folder_path* is not an existing directory.
    """
    if not os.path.isdir(folder_path):
        print("Đường dẫn không hợp lệ.")  # "Invalid path."
        return None
    # os.listdir also returns subdirectories; count only actual files,
    # as the function name promises.
    return sum(
        1
        for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )
|
|
|
def get_document_from_raw_text():
    """Load every file under ./raw_data and split it into source-tagged chunks.

    Returns:
        list[Document]: chunks from all files, each with its filename stored
        in ``metadata["source"]``.
    """
    # Start from an empty list: the previous dummy ``Document(page_content="")``
    # seeded the corpus with an empty chunk (source 0) that polluted retrieval.
    documents = []
    raw_dir = os.path.join(os.getcwd(), "raw_data")
    for filename in os.listdir(raw_dir):
        with open(os.path.join(raw_dir, filename), 'r', encoding="utf-8") as file:
            # Collapse blank lines so the newline-based splitter produces
            # denser chunks.
            content = file.read().replace('\n\n', "\n")
        documents += split_with_source(content, filename)
    return documents
|
|
|
def load_the_embedding_retrieve(is_ready = False, k = 3, model= 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Build a Chroma vector-store retriever backed by HuggingFace embeddings.

    Args:
        is_ready: When True, reuse the index persisted under ./Data instead
            of re-embedding the raw documents.
        k: Number of documents returned per query.
        model: HuggingFace sentence-transformers model name.

    Returns:
        A Chroma retriever configured with ``search_kwargs={"k": k}``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model)
    if is_ready:
        # Reuse the already-built index persisted on disk.
        store = Chroma(
            persist_directory=os.path.join(os.getcwd(), "Data"),
            embedding_function=embeddings,
        )
    else:
        # Embed the raw text files from scratch into an in-memory index.
        store = Chroma.from_documents(get_document_from_raw_text(), embeddings)
    return store.as_retriever(search_kwargs={"k": k})
|
|
|
def load_the_bm25_retrieve(k = 3):
    """Create a BM25 (lexical) retriever over the raw-text document chunks.

    Args:
        k: Number of documents returned per query.

    Returns:
        A BM25Retriever configured to return the top *k* matches.
    """
    retriever = BM25Retriever.from_documents(get_document_from_raw_text())
    retriever.k = k
    return retriever
|
|
|
def get_qachain(llm_name = "gpt-3.5-turbo-0125", chain_type = "stuff", retriever = None, return_source_documents = True):
    """Assemble a RetrievalQA chain around an OpenAI chat model.

    Args:
        llm_name: OpenAI chat model identifier.
        chain_type: LangChain combine-documents strategy (e.g. "stuff").
        retriever: Retriever supplying context documents to the chain.
        return_source_documents: Whether the chain's output includes the
            retrieved source documents.

    Returns:
        A configured RetrievalQA chain.
    """
    chat_model = ChatOpenAI(temperature=0, model_name=llm_name)
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=return_source_documents,
    )
    return qa_chain
|
|
|
def process_llm_response(llm_response):
    """Print the chain's answer followed by the sources it was drawn from.

    Args:
        llm_response: Mapping holding a ``"result"`` string and a
            ``"source_documents"`` list of Documents whose metadata carries
            a ``"source"`` entry.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        print(doc.metadata['source'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|