Spaces:

ShynBui
/

Vector_db_test

Runtime error

App Files Files Community

ShynBui committed on Apr 20

Commit

d15d46e

•

1 Parent(s): a2604b2

first commit

Browse files

Files changed (5) hide show

app.py +93 -0
raw_data/data_dang_bang.txt +0 -0
requirements.txt +156 -0
table_data/data_dang_bang.csv +2 -0
utils.py +209 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# -*- coding: utf-8 -*-
+import gradio as gr
+import os
+from langchain.retrievers import EnsembleRetriever
+from utils import *
+import requests
+from pyvi import ViTokenizer, ViPosTagger
+import time
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+import torch
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain_community.chat_message_histories import ChatMessageHistory
+# Module-level setup: everything below runs once at import time.
+# Dense retriever: is_ready=False makes load_the_embedding_retrieve build a new
+# Chroma index from the raw_data/ and table_data/ documents instead of opening
+# the persisted "Data" directory; it returns the top 10 matches per query.
+retriever = load_the_embedding_retrieve(is_ready=False, k=10)
+# Lexical (BM25) retriever over the same documents, returning only the top match.
+bm25_retriever = load_the_bm25_retrieve(k=1)
+# Combine both retrievers; weights follow the order of `retrievers`, so BM25
+# gets 0.1 and the embedding retriever gets 0.9.
+ensemble_retriever = EnsembleRetriever(
+    retrievers=[bm25_retriever, retriever], weights=[0.1, 0.9]
+)
+# Extractive QA tokenizer/model, loaded with the optional HF_TOKEN env var.
+# NOTE(review): neither `tokenizer` nor `model` is referenced by greet3 in this
+# file - confirm whether they are still needed before paying the load cost.
+tokenizer = AutoTokenizer.from_pretrained("ShynBui/vie_qa", token=os.environ.get("HF_TOKEN"))
+model = AutoModelForQuestionAnswering.from_pretrained("ShynBui/vie_qa", token=os.environ.get("HF_TOKEN"))
+# ChatOpenAI is made available by `from utils import *`. OPENAI_API_KEY is
+# read with os.environ[...], so a missing key raises KeyError at import time.
+llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=1, openai_api_key=os.environ["OPENAI_API_KEY"])
+def greet3(quote, history):
+    # print(history)
+    demo_ephemeral_chat_history = ChatMessageHistory()
+    if history == '':
+        history = [("Bạn có thể giải thích về quy chế và quyền của sinh viên tại trường này không?",
+                '''Quy chế và quyền của sinh viên tại trường Đại học Mở TP.HCM được quy định rõ trong các điều khoản sau:
+                1. Hiệu trưởng Trường có quyền ra quyết định thành lập và quy định cụ thể về chức năng, nhiệm vụ, tổ chức và hoạt động của Hội đồng khen thưởng và kỷ luật sinh viên.
+                2. Sinh viên có quyền khiếu nại về khen thưởng, kỷ luật. Khi có vi phạm kỷ luật, sinh viên có quyền được phân tích và đề nghị hình thức kỷ luật thông qua việc họp với các tổ chức sinh viên và gửi biên bản họp đến phòng Công tác sinh viên để trình Hội đồng.
+                3. Sinh viên có quyền đề đạt nguyện vọng và khiếu nại lên Hiệu trưởng Trường để giải quyết các vấn đề có liên quan đến quyền, lợi ích chính đáng của sinh viên.
+                4. Sinh viên được hỗ trợ giới thiệu nhà trọ theo quy định của trường.
+                Các chủ đề liên quan mà bạn có thể muốn tìm hiểu thêm:
+                - Quy chế và quyền của sinh viên tại các trường đại học khác.
+                - Hệ thống hỗ trợ sinh viên tại trường Đại học Mở TP.HCM.
+                - Quy trình khiếu nại và giải quyết tranh chấp sinh viên tại trường Đại học Mở TP.HCM.
+                '''),
+                ("Chào.",
+                "Chào. Chúng ta vừa bắt đầu câu chuyện thôi.")]
+        for user, assistant in history[-1:]:
+            demo_ephemeral_chat_history.add_user_message(user)
+            demo_ephemeral_chat_history.add_ai_message(assistant)
+    else:
+        for user, assistant in eval(history)[-1:]:
+            demo_ephemeral_chat_history.add_user_message(user)
+            demo_ephemeral_chat_history.add_ai_message(assistant)
+    # Summary the message
+    chat_history = summarize_messages(demo_ephemeral_chat_history=demo_ephemeral_chat_history, llm=llm).messages
+    # print("Chat history:", chat_history)
+    # Get the new question
+    new_question = get_question_from_summarize(chat_history[0].content, quote, llm)
+    # Retrieve
+    documents_query = ensemble_retriever.invoke(new_question)
+    # print(documents_query)
+    context = ''
+    for i in documents_query:
+        context += i.page_content + '\n'
+    # print(context)
+    # Get answer
+    answer = get_final_answer(question=new_question, context=context,
+                              prompt=os.environ['PROMPT'], llm=llm)
+    return new_question, answer
+if __name__ == "__main__":
+    quote = "Địa chỉ nhà trường?"
+    iface = gr.Interface(fn=greet3, inputs=["text", "text"], outputs=["text", "text"])
+    iface.launch(share=True)
+# Những cái đã làm tốt hơn những gì - Đóng góp gì
+# 1. Dataset - Xử lý
+# 2. Tăng ngữ cảnh
+# 3. Tăng khả năng truy vết
+# 4.

raw_data/data_dang_bang.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,156 @@

+aiofiles==23.2.1
+aiohttp==3.9.3
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.3.0
+asgiref==3.7.2
+async-timeout==4.0.3
+attrs==23.2.0
+backoff==2.2.1
+bcrypt==4.1.2
+build==1.0.3
+cachetools==5.3.3
+certifi==2024.2.2
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.4.24
+click==8.1.7
+colorama==0.4.6
+coloredlogs==15.0.1
+contourpy==1.2.0
+cycler==0.12.1
+dataclasses-json==0.6.4
+Deprecated==1.2.14
+exceptiongroup==1.2.0
+fastapi==0.110.0
+ffmpy==0.3.2
+filelock==3.13.1
+flatbuffers==23.5.26
+fonttools==4.49.0
+frozenlist==1.4.1
+fsspec==2024.2.0
+google-auth==2.28.1
+googleapis-common-protos==1.62.0
+gradio==4.19.2
+gradio_client==0.10.1
+greenlet==3.0.3
+grpcio==1.62.0
+h11==0.14.0
+httpcore==1.0.4
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.21.1
+humanfriendly==10.0
+idna==3.6
+importlib-metadata==6.11.0
+importlib_resources==6.1.2
+Jinja2==3.1.3
+joblib==1.3.2
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+kubernetes==29.0.0
+langchain==0.1.9
+langchain-community==0.0.24
+langchain-core==0.1.27
+langsmith==0.1.10
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.0
+matplotlib==3.8.3
+mdurl==0.1.2
+mmh3==4.1.0
+monotonic==1.6
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
+networkx==3.2.1
+numpy==1.26.4
+oauthlib==3.2.2
+onnxruntime==1.17.1
+opentelemetry-api==1.23.0
+opentelemetry-exporter-otlp-proto-common==1.23.0
+opentelemetry-exporter-otlp-proto-grpc==1.23.0
+opentelemetry-instrumentation==0.44b0
+opentelemetry-instrumentation-asgi==0.44b0
+opentelemetry-instrumentation-fastapi==0.44b0
+opentelemetry-proto==1.23.0
+opentelemetry-sdk==1.23.0
+opentelemetry-semantic-conventions==0.44b0
+opentelemetry-util-http==0.44b0
+orjson==3.9.15
+overrides==7.7.0
+packaging==23.2
+pandas==2.2.1
+pillow==10.2.0
+posthog==3.4.2
+protobuf==4.25.3
+pulsar-client==3.4.0
+pyasn1==0.5.1
+pyasn1-modules==0.3.0
+pydantic==2.6.3
+pydantic_core==2.16.3
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.1
+PyPika==0.48.9
+pyproject_hooks==1.0.0
+pyreadline3==3.4.1
+python-crfsuite==0.9.10
+python-dateutil==2.8.2
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+pyvi==0.1.1
+PyYAML==6.0.1
+rank-bm25==0.2.2
+referencing==0.33.0
+regex==2023.12.25
+requests==2.31.0
+requests-oauthlib==1.3.1
+rich==13.7.0
+rpds-py==0.18.0
+rsa==4.9
+ruff==0.2.2
+safetensors==0.4.2
+scikit-learn==1.4.1.post1
+scipy==1.12.0
+semantic-version==2.10.0
+sentence-transformers==2.4.0
+shellingham==1.5.4
+six==1.16.0
+sklearn-crfsuite==0.3.6
+sniffio==1.3.1
+SQLAlchemy==2.0.27
+starlette==0.36.3
+sympy==1.12
+tabulate==0.9.0
+tenacity==8.2.3
+threadpoolctl==3.3.0
+tokenizers==0.15.2
+tomli==2.0.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.1
+tqdm==4.66.2
+transformers==4.38.1
+typer==0.9.0
+typing-inspect==0.9.0
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+uvicorn==0.27.1
+watchfiles==0.21.0
+websocket-client==1.7.0
+websockets==11.0.3
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.17.0
+## The following requirements were added by pip freeze:
+distro==1.9.0
+langchain-openai==0.0.8
+openai==1.13.3
+tiktoken==0.6.0

table_data/data_dang_bang.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ data
2	+ NOne

utils.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import pandas as pd
+from langchain_community.document_loaders import TextLoader
+from langchain_community.docstore.document import Document
+from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.retrievers import BM25Retriever
+from langchain_community.llms import OpenAI
+from langchain_openai import ChatOpenAI
+from langchain.chains import RetrievalQA
+from langchain.schema import AIMessage, HumanMessage
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+import os
+def split_with_source(text, source):
+    splitter = CharacterTextSplitter(
+        separator = "\n",
+        chunk_size = 400,
+        chunk_overlap  = 0,
+        length_function = len,
+        add_start_index = True,
+    )
+    documents = splitter.create_documents([text])
+    # print(documents)
+    for doc in documents:
+        doc.metadata["source"] = source
+        # print(doc.metadata)
+    return documents
+def get_document_from_raw_text_each_line():
+    documents = [Document(page_content="", metadata={'source': 0})]
+    files = os.listdir(os.path.join(os.getcwd(), "raw_data"))
+    # print(files)
+    for i in files:
+        file_path = i
+        with open(os.path.join(os.path.join(os.getcwd(), "raw_data"),file_path), 'r', encoding="utf-8") as file:
+            # Xử lý bằng text_spliter
+            # Tiền xử lý văn bản
+            content = file.readlines()
+            text = []
+            #Split
+            for line in content:
+                line = line.strip()
+                documents.append(Document(page_content=line, metadata={"source": i}))
+    return documents
+def count_files_in_folder(folder_path):
+    # Kiểm tra xem đường dẫn thư mục có tồn tại không
+    if not os.path.isdir(folder_path):
+        print("Đường dẫn không hợp lệ.")
+        return None
+    # Sử dụng os.listdir() để lấy danh sách các tập tin và thư mục trong thư mục
+    files = os.listdir(folder_path)
+    # Đếm số lượng tập tin trong danh sách
+    file_count = len(files)
+    return file_count
+def get_document_from_raw_text():
+    documents = [Document(page_content="", metadata={'source': 0})]
+    files = os.listdir(os.path.join(os.getcwd(), "raw_data"))
+    # print(files)
+    for i in files:
+        file_path = i
+        with open(os.path.join(os.path.join(os.getcwd(), "raw_data"),file_path), 'r', encoding="utf-8") as file:
+            # Xử lý bằng text_spliter
+            # Tiền xử lý văn bản
+            content = file.read().replace('\n\n', "\n")
+            # content = ''.join(content.split('.'))
+            new_doc = content
+            texts = split_with_source(new_doc, i)
+            # texts = get_document_from_raw_text_each_line()
+            documents = documents + texts
+            ##Xử lý mỗi khi xuống dòng
+            # for line in file:
+            #     # Loại bỏ khoảng trắng thừa và ký tự xuống dòng ở đầu và cuối mỗi dòng
+            #     line = line.strip()
+            #     documents.append(Document(page_content=line, metadata={"source": i}))
+    # print(documents)
+    return documents
+def get_document_from_table():
+    documents = [Document(page_content="", metadata={'source': 0})]
+    files = os.listdir(os.path.join(os.getcwd(), "table_data"))
+    # print(files)
+    for i in files:
+        file_path = i
+        data = pd.read_csv(os.path.join(os.path.join(os.getcwd(), "table_data"),file_path))
+        for j, row in data.iterrows():
+            documents.append(Document(page_content=row['data'], metadata={"source": file_path}))
+    return documents
+def load_the_embedding_retrieve(is_ready = False, k = 3, model= 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
+    embeddings = HuggingFaceEmbeddings(model_name=model)
+    if is_ready:
+        retriever = Chroma(persist_directory=os.path.join(os.getcwd(), "Data"), embedding_function=embeddings).as_retriever(
+            search_kwargs={"k": k}
+        )
+    else:
+        documents = get_document_from_raw_text() + get_document_from_table()
+        # print(type(documents))
+        retriever = Chroma.from_documents(documents, embeddings).as_retriever(
+            search_kwargs={"k": k}
+        )
+    return retriever
+def load_the_bm25_retrieve(k = 3):
+    documents = get_document_from_raw_text() + get_document_from_table()
+    bm25_retriever = BM25Retriever.from_documents(documents)
+    bm25_retriever.k = k
+    return bm25_retriever
+def get_qachain(llm_name = "gpt-3.5-turbo-0125", chain_type = "stuff", retriever = None, return_source_documents = True):
+    llm = ChatOpenAI(temperature=0,
+                     model_name=llm_name)
+    return RetrievalQA.from_chain_type(llm=llm,
+                                  chain_type=chain_type,
+                                  retriever=retriever,
+                                  return_source_documents=return_source_documents)
+def summarize_messages(demo_ephemeral_chat_history, llm):
+    stored_messages = demo_ephemeral_chat_history.messages
+    human_chat = stored_messages[0].content
+    ai_chat = stored_messages[1].content
+    if len(stored_messages) == 0:
+        return False
+    summarization_prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system", os.environ['SUMARY_MESSAGE_PROMPT'],
+            ),
+            (
+                "human",
+                '''
+                History:
+                Human: {human}
+                AI: {AI}
+                Output:'''
+            )
+            ,
+        ]
+    )
+    summarization_chain = summarization_prompt | llm
+    summary_message = summarization_chain.invoke({"AI": ai_chat, "human": human_chat})
+    demo_ephemeral_chat_history.clear()
+    demo_ephemeral_chat_history.add_message(summary_message)
+    return demo_ephemeral_chat_history
+def get_question_from_summarize(summary, question, llm):
+    new_qa_prompt = ChatPromptTemplate.from_messages([
+        ("system", os.environ['NEW_QUESTION_PROMPT']),
+        ("human",
+         '''
+         Summary: {summary}
+         Question: {question}
+         Output:'''
+         )
+    ]
+    )
+    new_qa_chain = new_qa_prompt | llm
+    return new_qa_chain.invoke({'summary': summary, 'question': question}).content
+def get_final_answer(question, context, prompt, llm):
+    qa_prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", prompt),
+            ("human", '''
+            Context: {context}
+            Question: {question}
+            Output:'''),
+        ]
+    )
+    answer_chain = qa_prompt | llm
+    answer = answer_chain.invoke({'question': question, 'context': context})
+    return answer.content
+def process_llm_response(llm_response):
+    print(llm_response['result'])
+    print('\n\nSources:')
+    for source in llm_response["source_documents"]:
+        print(source.metadata['source'])