vanhai123's picture
Update app.py
2df143d verified
raw
history blame contribute delete
6.46 kB
import gradio as gr
import os
import json
import shutil
import html
from datetime import datetime
from retriever import retriever, reload_retriever
from generator import answer_query
from langchain_community.document_loaders import (
PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Path to the CSS file passed to gr.Blocks(css=...) when the UI is built
CUSTOM_CSS_PATH = "gradio_theme.css"
# Upload registry: JSON file the list is persisted to, plus the in-memory list
# itself (entries are dicts with "name" and "timestamp" keys)
UPLOADED_FILES_JSON = "uploaded_files.json"
uploaded_files = []
def save_uploaded_files_to_json():
    """Persist the in-memory upload registry to ``UPLOADED_FILES_JSON``.

    The module-level ``uploaded_files`` list is written as pretty-printed
    UTF-8 JSON with non-ASCII characters kept verbatim.

    The data is serialised before any file is opened and then written to a
    sibling temporary file that replaces the target in one step, so a
    serialisation or write failure never leaves a truncated registry behind.

    Raises:
        TypeError: if ``uploaded_files`` contains a non-JSON-serialisable value.
        OSError: if the temporary file cannot be written or moved into place.
    """
    payload = json.dumps(uploaded_files, ensure_ascii=False, indent=2)
    tmp_path = f"{UPLOADED_FILES_JSON}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(payload)
    # os.replace overwrites the destination in a single rename operation.
    os.replace(tmp_path, UPLOADED_FILES_JSON)
def load_uploaded_files_from_json():
    """Populate the module-level ``uploaded_files`` list from disk.

    Reads ``UPLOADED_FILES_JSON`` as UTF-8 JSON and rebinds ``uploaded_files``
    to its content. The list falls back to empty when the file is missing,
    cannot be read, is not valid JSON, or does not contain a JSON array, so a
    damaged registry file cannot prevent the application from starting.
    """
    global uploaded_files
    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been uploaded yet.
        loaded = []
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Warning: could not read {UPLOADED_FILES_JSON}: {exc}")
        loaded = []
    if not isinstance(loaded, list):
        print(f"Warning: {UPLOADED_FILES_JSON} does not contain a list; ignoring it")
        loaded = []
    uploaded_files = loaded
def update_uploaded_files():
    """Render the upload registry as Markdown.

    Returns:
        A placeholder sentence when ``uploaded_files`` is empty; otherwise a
        heading followed by one bullet per entry showing the entry's name and
        the first 19 characters of its timestamp.
    """
    if uploaded_files:
        bullets = []
        for entry in uploaded_files:
            name = entry['name']
            # Keep only the leading 19 characters of the stored timestamp.
            stamp = entry['timestamp'][:19]
            bullets.append(f"- {name} (Uploaded: {stamp})")
        return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(bullets)
    return "_Chưa có tài liệu nào được tải lên._"
# Load the persisted upload registry once at startup
load_uploaded_files_from_json()
def process_document(file):
    """Index an uploaded document and record it in the upload registry.

    The document is loaded with a loader chosen by file extension, split into
    overlapping chunks, embedded, and written to the ``vectorstore`` directory
    as a FAISS index, after which the retriever is reloaded.

    The previous index is removed only once the new one has been built, so an
    unsupported, unreadable or empty upload leaves the existing index intact.

    NOTE(review): the index is rebuilt from this single file, while the
    registry keeps the names of earlier uploads as well; the index on disk
    therefore only covers the most recent document.

    Args:
        file: The value delivered by the ``gr.File`` input: an object exposing
            the path as ``.name``, a plain path string, or ``None`` when
            nothing was selected.

    Returns:
        A ``(status_message, uploaded_files_markdown)`` tuple.
    """
    if file is None:
        return "Vui lòng chọn một tài liệu để tải lên.", update_uploaded_files()

    # Accept both a tempfile-like wrapper (``.name``) and a bare path, since
    # what Gradio hands over depends on its version and the ``type`` setting.
    file_path = os.fspath(getattr(file, "name", file))
    # Match extensions case-insensitively so "REPORT.PDF" is accepted too.
    lowered = file_path.lower()

    try:
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".csv"):
            loader = CSVLoader(file_path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(file_path, autodetect_encoding=True)
        elif lowered.endswith((".docx", ".doc")):
            loader = UnstructuredWordDocumentLoader(file_path)
        else:
            return "Định dạng file không hỗ trợ.", update_uploaded_files()
        documents = loader.load()
    except Exception as e:
        # UI boundary: report loader failures to the user instead of raising.
        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    if not docs:
        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    # Drop the old index only now that its replacement exists in memory.
    if os.path.exists("vectorstore"):
        shutil.rmtree("vectorstore")
    db.save_local("vectorstore")
    reload_retriever()

    uploaded_files.append(
        {"name": os.path.basename(file_path), "timestamp": datetime.now().isoformat()}
    )
    save_uploaded_files_to_json()
    return f"Đã xử lý {len(docs)} đoạn từ **{file_path}**", update_uploaded_files()
def delete_file(filename):
    """Remove every registry entry whose name equals ``filename``.

    Surrounding whitespace in ``filename`` is ignored. The registry is saved
    back to disk afterwards, whether or not anything matched. Only the
    registry is edited here; the ``vectorstore`` directory is not touched.

    Args:
        filename: Name of the entry to drop, as typed by the user.

    Returns:
        The refreshed Markdown listing of registered documents.
    """
    global uploaded_files
    target = filename.strip()
    remaining = list(filter(lambda entry: entry["name"] != target, uploaded_files))
    uploaded_files = remaining
    save_uploaded_files_to_json()
    return update_uploaded_files()
def clear_inputs():
    """Return two empty strings, one for each component being reset."""
    blank = ""
    return blank, blank
def query_function(question, temperature, include_sources):
    """Answer a question and optionally append the list of sources.

    The answer text and every source label are HTML-escaped before being
    returned, because the result is rendered by a Markdown component and the
    labels originate from uploaded documents' metadata.

    Args:
        question: The user's question. A blank question is rejected without
            calling the generator.
        temperature: Sampling temperature forwarded to ``answer_query``.
        include_sources: When true, a de-duplicated, sorted bullet list of
            sources is appended to the answer.

    Returns:
        The Markdown string to display.
    """
    if not question or not question.strip():
        return "Vui lòng nhập câu hỏi."

    # NOTE(review): this is the embedding model's name, passed as ``model``;
    # confirm that answer_query expects it here.
    fixed_model = "sentence-transformers/all-MiniLM-L6-v2"
    answer, docs = answer_query(question, model=fixed_model, temperature=temperature)
    answer = html.escape(answer)

    if not (include_sources and docs):
        return answer

    unique_sources = set()
    for doc in docs:
        # Prefer the section label; fall back to the source file's base name.
        label = str(doc.metadata.get("section") or "").strip()
        if not label:
            # ``or`` also covers a "source" key that is present but None.
            source = str(doc.metadata.get("source") or "Unknown")
            label = os.path.basename(source).strip()
        if label:
            unique_sources.add(html.escape(label))

    if unique_sources:
        sources_list = [f"- {src}" for src in sorted(unique_sources)]
        answer += "\n\n**Nguồn tham khảo:**\n" + "\n".join(sources_list)
    return answer
# Build the Gradio interface
with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
    # Page header
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(
                "## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ",
                elem_classes="container-box",
            )

    with gr.Tabs():
        # Search tab
        with gr.TabItem("🔍 Tìm kiếm"):
            with gr.Column(elem_classes="container-box"):
                question = gr.Textbox(lines=3, label="Câu hỏi")
                with gr.Row():
                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
                with gr.Row():
                    search_btn = gr.Button(
                        "🔍 Tìm kiếm",
                        variant="primary",
                        elem_classes="button-primary",
                    )
                    clear_btn = gr.Button(
                        "🗑️ Xóa",
                        variant="secondary",
                        elem_classes="button-secondary",
                    )
                output = gr.Markdown(elem_classes="output-box")

                # Run the query and show the answer in the output pane.
                search_btn.click(
                    fn=query_function,
                    inputs=[question, temperature, include_sources],
                    outputs=[output],
                )
                # Reset both the question box and the output pane.
                clear_btn.click(fn=clear_inputs, outputs=[question, output])

        # Document management tab
        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Column(elem_classes="container-box"):
                upload_file = gr.File(
                    label="Tải lên tài liệu",
                    file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"],
                )
                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
                uploaded_files_list = gr.Markdown(
                    value=update_uploaded_files(),
                    elem_classes="scroll-box",
                )
            with gr.Column(elem_classes="container-box"):
                delete_filename = gr.Textbox(label="Tên file muốn xóa")
                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")

            # Index the uploaded file, then refresh the status and the listing.
            upload_btn.click(
                fn=process_document,
                inputs=[upload_file],
                outputs=[upload_status, uploaded_files_list],
            )
            # Drop the named entry from the registry and refresh the listing.
            delete_btn.click(
                fn=delete_file,
                inputs=[delete_filename],
                outputs=[uploaded_files_list],
            )

demo.launch(share=True)