Spaces:
Runtime error
Runtime error
import pandas as pd | |
from langchain_community.document_loaders import TextLoader | |
from langchain_community.docstore.document import Document | |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter | |
from langchain_community.vectorstores import Chroma | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
from langchain_community.retrievers import BM25Retriever | |
from langchain_community.llms import OpenAI | |
from langchain_openai import ChatOpenAI | |
from langchain.chains import RetrievalQA | |
from langchain.schema import AIMessage, HumanMessage | |
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder | |
import os | |
from langchain.retrievers import ParentDocumentRetriever | |
from langchain.storage import InMemoryStore | |
def split_with_source(text, source):
    """Split *text* into chunk documents and tag each one with *source*.

    The text is split on newline characters by a ``CharacterTextSplitter``
    configured with ``chunk_size=400``, no overlap, and ``add_start_index``
    enabled.

    Args:
        text: The raw text to split.
        source: Value stored under the ``"source"`` metadata key of every
            resulting document.

    Returns:
        The list of documents produced by the splitter.
    """
    chunker = CharacterTextSplitter(
        separator="\n",
        chunk_size=400,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    chunks = chunker.create_documents([text])
    for chunk in chunks:
        # Remember where each chunk came from so answers can cite it.
        chunk.metadata["source"] = source
    return chunks
def get_document_from_raw_text_each_line():
    """Build one Document per line of every file in ``./raw_data``.

    Each line is stripped of surrounding whitespace and stored with the
    file name as its ``"source"`` metadata. Files are read as UTF-8 from the
    ``raw_data`` folder under the current working directory.

    Returns:
        A list of documents. The first element is always an empty
        placeholder document whose source is ``0``.
    """
    # The returned list always starts with this empty placeholder entry.
    docs = [Document(page_content="", metadata={'source': 0})]
    raw_dir = os.path.join(os.getcwd(), "raw_data")
    for name in os.listdir(raw_dir):
        with open(os.path.join(raw_dir, name), 'r', encoding="utf-8") as handle:
            lines = handle.readlines()
        docs.extend(
            Document(page_content=line.strip(), metadata={"source": name})
            for line in lines
        )
    return docs
def count_files_in_folder(folder_path):
    """Return the number of entries directly inside *folder_path*.

    Every name returned by ``os.listdir`` is counted, so sub-directories
    are included alongside regular files.

    Args:
        folder_path: Path of the directory to inspect.

    Returns:
        The entry count, or ``None`` (after printing a message) when
        *folder_path* is not an existing directory.
    """
    if os.path.isdir(folder_path):
        return len(os.listdir(folder_path))
    # Not a directory: report it and signal failure with None.
    print("Đường dẫn không hợp lệ.")
    return None
def get_document_from_raw_text():
    """Load every file in ``./raw_data`` and split it into chunk documents.

    Each file is read as UTF-8, double newlines are replaced by single ones,
    and the result is passed to ``split_with_source`` with the file name as
    the source.

    Returns:
        A list of documents. The first element is always an empty
        placeholder document whose source is ``0``.
    """
    # The returned list always starts with this empty placeholder entry.
    collected = [Document(page_content="", metadata={'source': 0})]
    raw_dir = os.path.join(os.getcwd(), "raw_data")
    for name in os.listdir(raw_dir):
        with open(os.path.join(raw_dir, name), 'r', encoding="utf-8") as handle:
            # Text pre-processing: replace each blank-line pair before splitting.
            body = handle.read().replace('\n\n', "\n")
        collected.extend(split_with_source(body, name))
    return collected
def get_document_from_table():
    """Build one Document per CSV row found in ``./table_data``.

    Every file in the ``table_data`` folder under the current working
    directory is read with ``pandas.read_csv``; the value of each row's
    ``data`` column becomes the page content and the file name becomes the
    ``"source"`` metadata.

    Returns:
        A list of documents. The first element is always an empty
        placeholder document whose source is ``0``.
    """
    # The returned list always starts with this empty placeholder entry.
    docs = [Document(page_content="", metadata={'source': 0})]
    table_dir = os.path.join(os.getcwd(), "table_data")
    for name in os.listdir(table_dir):
        frame = pd.read_csv(os.path.join(table_dir, name))
        for _, row in frame.iterrows():
            docs.append(
                Document(page_content=row['data'], metadata={"source": name})
            )
    return docs
def load_the_embedding_retrieve(is_ready=False, k=3, model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Create a Chroma-backed retriever using HuggingFace embeddings.

    Args:
        is_ready: When true, open the Chroma store persisted in the ``Data``
            folder under the current working directory. Otherwise build a
            new store from the raw-text and table documents.
        k: Passed to the retriever as ``search_kwargs={"k": k}``.
        model: Model name handed to ``HuggingFaceEmbeddings``.

    Returns:
        The retriever obtained from the Chroma store.
    """
    embedder = HuggingFaceEmbeddings(model_name=model)
    if not is_ready:
        # Nothing persisted yet: embed the whole corpus now.
        corpus = get_document_from_raw_text() + get_document_from_table()
        store = Chroma.from_documents(corpus, embedder)
    else:
        store = Chroma(
            persist_directory=os.path.join(os.getcwd(), "Data"),
            embedding_function=embedder,
        )
    return store.as_retriever(search_kwargs={"k": k})
def load_the_bm25_retrieve(k=3):
    """Create a BM25 retriever over the raw-text and table documents.

    Args:
        k: Value assigned to the retriever's ``k`` attribute.

    Returns:
        The configured ``BM25Retriever``.
    """
    corpus = get_document_from_raw_text() + get_document_from_table()
    keyword_retriever = BM25Retriever.from_documents(corpus)
    keyword_retriever.k = k
    return keyword_retriever
def load_the_parent_document_retrieve(model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Create a ParentDocumentRetriever populated with the raw-text documents.

    Child chunks (``chunk_size=400``) are indexed in a Chroma collection
    named ``split_parents``; parent chunks (``chunk_size=1200``) are kept in
    an ``InMemoryStore``.

    Args:
        model: Model name handed to ``HuggingFaceEmbeddings``.

    Returns:
        The retriever after the raw-text documents have been added to it.
    """
    embedder = HuggingFaceEmbeddings(model_name=model)
    child_index = Chroma(
        collection_name="split_parents", embedding_function=embedder
    )
    parent_store = InMemoryStore()

    # Both splitters share every option except the chunk size.
    splitter_options = {
        "chunk_overlap": 0,
        "length_function": len,
        "add_start_index": True,
    }
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200, **splitter_options
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400, **splitter_options
    )

    parent_retriever = ParentDocumentRetriever(
        vectorstore=child_index,
        docstore=parent_store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    # NOTE(review): get_document_from_raw_text() already returns chunks made
    # with chunk_size=400, so the 1200-character parent splitter may rarely
    # have anything to split - confirm this is intended.
    parent_retriever.add_documents(get_document_from_raw_text())
    return parent_retriever
def get_qachain(llm_name="gpt-3.5-turbo-0125", chain_type="stuff", retriever=None, return_source_documents=True):
    """Build a ``RetrievalQA`` chain backed by a ``ChatOpenAI`` model.

    Args:
        llm_name: Model name passed to ``ChatOpenAI``.
        chain_type: Chain type passed to ``RetrievalQA.from_chain_type``.
        retriever: Retriever the chain uses to fetch context.
        return_source_documents: Forwarded unchanged to the chain factory.

    Returns:
        The chain created by ``RetrievalQA.from_chain_type``.
    """
    # The model is created with temperature 0.
    chat_model = ChatOpenAI(temperature=0, model_name=llm_name)
    chain_options = {
        "chain_type": chain_type,
        "retriever": retriever,
        "return_source_documents": return_source_documents,
    }
    return RetrievalQA.from_chain_type(llm=chat_model, **chain_options)
def summarize_messages(demo_ephemeral_chat_history, llm):
    """Replace the stored chat history with an LLM-generated summary.

    The first stored message is treated as the human turn and the second as
    the AI turn. Both are sent to *llm* together with the system prompt read
    from the ``SUMARY_MESSAGE_PROMPT`` environment variable; the history is
    then cleared and the model's reply is stored as its only message.

    Args:
        demo_ephemeral_chat_history: History object exposing ``messages``,
            ``clear()`` and ``add_message()``.
        llm: Model (or runnable) piped after the summarization prompt.

    Returns:
        ``False`` when the history holds fewer than two messages (nothing to
        summarize); otherwise the same history object, now containing only
        the summary message.

    Raises:
        KeyError: If ``SUMARY_MESSAGE_PROMPT`` is not set in the environment.
    """
    stored_messages = demo_ephemeral_chat_history.messages
    # Guard BEFORE indexing: the original checked for an empty history only
    # after reading messages[0] and messages[1], so an empty (or one-message)
    # history raised IndexError instead of returning False.
    if len(stored_messages) < 2:
        return False
    human_chat = stored_messages[0].content
    ai_chat = stored_messages[1].content

    human_template = (
        "\n"
        "History:\n"
        "Human: {human}\n"
        "AI: {AI}\n"
        "Output:\n"
    )
    summarization_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", os.environ['SUMARY_MESSAGE_PROMPT']),
            ("human", human_template),
        ]
    )
    summarization_chain = summarization_prompt | llm
    summary_message = summarization_chain.invoke(
        {"AI": ai_chat, "human": human_chat}
    )

    # Swap the full history for the single summary message.
    demo_ephemeral_chat_history.clear()
    demo_ephemeral_chat_history.add_message(summary_message)
    return demo_ephemeral_chat_history
def get_question_from_summarize(summary, question, llm):
    """Ask *llm* for a new question given a conversation summary.

    The system prompt is read from the ``NEW_QUESTION_PROMPT`` environment
    variable; the human message carries the summary and the question.

    Args:
        summary: Text substituted for ``{summary}`` in the prompt.
        question: Text substituted for ``{question}`` in the prompt.
        llm: Model (or runnable) piped after the prompt.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    human_template = (
        "\n"
        "Summary: {summary}\n"
        "Question: {question}\n"
        "Output:\n"
    )
    rewrite_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", os.environ['NEW_QUESTION_PROMPT']),
            ("human", human_template),
        ]
    )
    rewrite_chain = rewrite_prompt | llm
    reply = rewrite_chain.invoke({'summary': summary, 'question': question})
    return reply.content
def get_final_answer(question, context, prompt, llm):
    """Ask *llm* to answer *question* using the supplied *context*.

    Args:
        question: Text substituted for ``{question}`` in the human message.
        context: Text substituted for ``{context}`` in the human message.
        prompt: System prompt for the chat template.
        llm: Model (or runnable) piped after the prompt.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    human_template = (
        "\n"
        "Context: {context}\n"
        "Question: {question}\n"
        "Output: "
    )
    answer_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", prompt),
            ("human", human_template),
        ]
    )
    pipeline = answer_prompt | llm
    reply = pipeline.invoke({'question': question, 'context': context})
    return reply.content
def process_llm_response(llm_response):
    """Print the answer and the source of each supporting document.

    Args:
        llm_response: Mapping with a ``"result"`` entry and a
            ``"source_documents"`` entry; each source document must expose
            ``metadata["source"]``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for cited_doc in llm_response["source_documents"]:
        origin = cited_doc.metadata['source']
        print(origin)