Spaces:
Runtime error
Runtime error
Upload 3 files
Browse files- .gitattributes +1 -0
- app.py +130 -0
- quyche_uit_plus_removed.pdf +3 -0
- requirements.txt +13 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
quyche_uit_plus_removed.pdf filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Chatbot_LLM_with_RAG Quyche_FINAL.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1kRGRGeOuF9JORajZPlEI2H0IrvcrgYr0
|
8 |
+
"""
|
9 |
+
|
10 |
+
|
11 |
+
import os
|
12 |
+
import textwrap
|
13 |
+
|
14 |
+
import chromadb
|
15 |
+
import langchain
|
16 |
+
import openai
|
17 |
+
from langchain.chains import RetrievalQA
|
18 |
+
from langchain.chat_models import ChatOpenAI
|
19 |
+
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader, YoutubeLoader, PyPDFLoader
|
20 |
+
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
|
21 |
+
from langchain.indexes import VectorstoreIndexCreator
|
22 |
+
from langchain.llms import OpenAI
|
23 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
24 |
+
from langchain.vectorstores import Chroma
|
25 |
+
from langchain.llms import GPT4All
|
26 |
+
from pdf2image import convert_from_path
|
27 |
+
|
28 |
+
|
29 |
+
|
30 |
+
# !pip uninstall 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a'
|
31 |
+
|
32 |
+
"""Download file pdf"""
|
33 |
+
|
34 |
+
# Download file pdf
|
35 |
+
# !gdown https://drive.google.com/uc?id=19_MlM7Cmw8z_j40dk80PQbITYNET3tL2
|
36 |
+
# !gdown https://drive.google.com/uc?id=1gdM3TfvyQPDXOuFjNS9n_DgD24ThDB84
|
37 |
+
|
38 |
+
FILE_NAME="quyche_uit_plus_removed.pdf"
|
39 |
+
|
40 |
+
"""Load Data & Model"""
|
41 |
+
|
42 |
+
from getpass import getpass


def read_openai_api_key(environ):
    """Return the OpenAI API key stored in ``environ``.

    Args:
        environ: Mapping of environment variable names to values
            (normally ``os.environ``).

    Returns:
        The value of ``OPENAI_API_KEY`` with surrounding whitespace removed,
        or an empty string when the variable is not set.
    """
    return environ.get("OPENAI_API_KEY", "").strip()


# SECURITY: never hard-code the API key in a file that is committed or
# published. Provide it through the OPENAI_API_KEY environment variable
# (for example a Space secret); any key that has ever been committed must be
# revoked. For a local interactive run, ``getpass()`` can prompt for it.
OPENAI_API_KEY = read_openai_api_key(os.environ)

if OPENAI_API_KEY:
    # Write the whitespace-stripped value back for libraries that read the
    # key from the environment.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
# NOTE(review): when the variable is unset nothing is exported here, and the
# OpenAI client created afterwards is expected to report the missing key
# itself - confirm this is the desired failure mode.
|
48 |
+
# LLM later passed to RetrievalQA as ``llm``; constructed with temperature=0.
# NOTE(review): "gpt-3.5-turbo" is a chat model while ``OpenAI`` is the
# completion-style wrapper; ``ChatOpenAI`` is already imported - confirm which
# class is intended.
model = OpenAI(temperature=0, model_name="gpt-3.5-turbo")
|
49 |
+
# (pages)
|
50 |
+
|
51 |
+
# Converts the PDF named by FILE_NAME to images at dpi=88 when the module loads.
# NOTE(review): ``images`` is not referenced again in the visible code, and
# pdf2image presumably needs the poppler binaries on the host - confirm
# whether this call is still required.
images = convert_from_path(FILE_NAME, dpi=88)
|
52 |
+
# len(images)
|
53 |
+
# images[-1]
|
54 |
+
|
55 |
+
"""Use UnstructuredPDFLoader to load PDFs"""
|
56 |
+
|
57 |
+
# Use UnstructuredPDFLoader to load the local PDF file (FILE_NAME)
|
58 |
+
pdf_loader = UnstructuredPDFLoader(FILE_NAME)
|
59 |
+
pdf_pages = pdf_loader.load_and_split()
|
60 |
+
|
61 |
+
# Text Splitters
|
62 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
|
63 |
+
texts = text_splitter.split_documents(pdf_pages)
|
64 |
+
# len(texts)
|
65 |
+
|
66 |
+
# texts[0]
|
67 |
+
|
68 |
+
# texts[-1]
|
69 |
+
|
70 |
+
"""Create Embeddings & Vectorstores"""
|
71 |
+
|
72 |
+
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
73 |
+
hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
|
74 |
+
|
75 |
+
# Build the Chroma vector store from the split documents using the
# HuggingFace embeddings; persist_directory="db" is passed so the store is
# presumably written under ./db - confirm.
db = Chroma.from_documents(texts, hf_embeddings, persist_directory="db")
|
76 |
+
|
77 |
+
"""#Use a Chain"""
|
78 |
+
|
79 |
+
custom_prompt_template = """Sử dụng các thông tin sau đây để trả lời câu hỏi của người dùng.
|
80 |
+
Nếu bạn không biết câu trả lời, chỉ cần nói rằng bạn không biết, đừng cố bịa ra câu trả lời.
|
81 |
+
Tất cả câu trả lời của bạn đều phải trả lời bằng tiếng việt
|
82 |
+
|
83 |
+
Context: {context}
|
84 |
+
Question: {question}
|
85 |
+
|
86 |
+
"""
|
87 |
+
|
88 |
+
from langchain import PromptTemplate
|
89 |
+
def set_custom_prompt():
    """Build the prompt used by the retrieval QA chain.

    Returns:
        A ``PromptTemplate`` wrapping ``custom_prompt_template`` whose input
        variables are ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )
|
96 |
+
|
97 |
+
prompt = set_custom_prompt()
|
98 |
+
chain = RetrievalQA.from_chain_type(
|
99 |
+
llm=model,
|
100 |
+
chain_type="stuff",
|
101 |
+
retriever=db.as_retriever(search_kwargs={"k": 3}),
|
102 |
+
chain_type_kwargs={'prompt': prompt}
|
103 |
+
)
|
104 |
+
|
105 |
+
"""#QA Chatbot"""
|
106 |
+
|
107 |
+
def print_response(response: str):
    """Print ``response`` word-wrapped to a width of 100 characters."""
    wrapped = textwrap.fill(response, width=100)
    print(wrapped)
|
109 |
+
|
110 |
+
# query = "Các môn bổ túc kiến thức của khóa cao học ngành khoa học máy tính gồm những môn nào?"
|
111 |
+
# response = chain.run(query)
|
112 |
+
# print_response(response)
|
113 |
+
|
114 |
+
|
115 |
+
# from langchain.chat_models import ChatOpenAI
|
116 |
+
from langchain.schema import AIMessage, HumanMessage
|
117 |
+
# import openai
|
118 |
+
import gradio as gr
|
119 |
+
|
120 |
+
def predict(message, history):
    """Answer one chat message with the retrieval QA chain.

    Args:
        message: The user's latest message; passed to ``chain.run`` as the
            query.
        history: Earlier turns supplied by the chat interface. Accepted so
            the function matches the callback signature, but deliberately
            unused: every question is answered independently of previous
            turns.

    Returns:
        The value returned by ``chain.run(message)``.
    """
    # No LangChain message list is built from ``history``: the chain only
    # receives the current message, so such a list would be dead work.
    return chain.run(message)
|
128 |
+
|
129 |
+
chatbot=gr.ChatInterface(predict)
|
130 |
+
# Start the Gradio app. NOTE(review): share=True requests a public share
# link; this is likely unnecessary when the app is hosted on Spaces - confirm.
chatbot.launch(share=True)
|
quyche_uit_plus_removed.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7241999a904618f37690171837ca12cded7b85ad175520dd7e4e9ac71fb9fcd2
|
3 |
+
size 7643952
|
requirements.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
openai==0.27.4
|
3 |
+
watermark
|
4 |
+
poppler-utils
|
5 |
+
langchain==0.0.173
|
6 |
+
langchain_community
|
7 |
+
chromadb==0.3.23
|
8 |
+
pypdf==3.8.1
|
9 |
+
pygpt4all==1.1.0
|
10 |
+
pdf2image==1.16.3
|
11 |
+
tiktoken==0.3.3
|
12 |
+
unstructured[local-inference]==0.5.12
|
13 |
+
gradio==3.38.0
|