File size: 6,458 Bytes
580e6fc
 
 
 
2df143d
580e6fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2df143d
580e6fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2df143d
 
 
580e6fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2df143d
580e6fc
 
 
 
 
 
2df143d
580e6fc
 
2df143d
 
 
580e6fc
 
 
2df143d
 
580e6fc
2df143d
 
 
 
 
580e6fc
2df143d
580e6fc
 
 
2df143d
580e6fc
 
 
 
2df143d
 
 
 
 
 
 
 
580e6fc
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import gradio as gr
import os
import json
import shutil
import html
from datetime import datetime
from retriever import retriever, reload_retriever
from generator import answer_query
from langchain_community.document_loaders import (
    PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Path of the CSS file handed to gr.Blocks(css=...)
CUSTOM_CSS_PATH = "gradio_theme.css"

# Upload registry: the JSON file it is persisted to, and its in-memory copy
# (a list of {"name", "timestamp"} dicts, see process_document)
UPLOADED_FILES_JSON = "uploaded_files.json"
uploaded_files = []

def save_uploaded_files_to_json():
    """Write the in-memory upload registry to ``UPLOADED_FILES_JSON``.

    The list is stored as UTF-8 JSON, indented by two spaces, with
    non-ASCII characters kept unescaped. Any existing file is overwritten.
    """
    with open(UPLOADED_FILES_JSON, mode="w", encoding="utf-8") as handle:
        serialized = json.dumps(uploaded_files, ensure_ascii=False, indent=2)
        handle.write(serialized)

def load_uploaded_files_from_json():
    """Populate the global ``uploaded_files`` list from ``UPLOADED_FILES_JSON``.

    The registry falls back to an empty list when the file does not exist,
    cannot be decoded as UTF-8 JSON, or holds something other than a JSON
    array. This function runs while the module is being imported, so a
    damaged registry file must not be able to stop the app from starting.
    """
    global uploaded_files
    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        loaded = []
    except ValueError:
        # Covers json.JSONDecodeError (empty/truncated file) and
        # UnicodeDecodeError (file is not valid UTF-8).
        loaded = []

    # The rest of the module appends to and iterates over this value, so
    # anything that is not a list is treated as an unusable registry.
    uploaded_files = loaded if isinstance(loaded, list) else []

def update_uploaded_files():
    """Render the upload registry as Markdown for the document list panel.

    Returns:
        A placeholder sentence when the registry is empty; otherwise a
        heading followed by one bullet per entry with its name and the
        first 19 characters of its timestamp (for the ISO timestamps
        written by ``process_document`` this drops fractional seconds).
    """
    if uploaded_files:
        bullets = []
        for entry in uploaded_files:
            doc_name = entry['name']
            uploaded_at = entry['timestamp'][:19]
            bullets.append(f"- {doc_name} (Uploaded: {uploaded_at})")
        heading = "### 📚 Danh sách tài liệu đã xử lý:\n"
        return heading + "\n".join(bullets)
    return "_Chưa có tài liệu nào được tải lên._"

# Load the saved upload registry once, when this module is executed
load_uploaded_files_from_json()

def process_document(file):
    """Index an uploaded document and record it in the upload registry.

    A loader is chosen from the file extension (.pdf, .csv, .txt, .docx,
    .doc; matched case-insensitively). The loaded content is split with
    ``chunk_size=1000`` / ``chunk_overlap=100``, embedded, and saved as the
    FAISS index in the ``vectorstore`` directory, replacing whatever index
    was there. The retriever is then reloaded and the upload is appended
    to ``uploaded_files`` and persisted.

    Args:
        file: Value delivered by the ``gr.File`` input: either a path
            (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``. ``None`` when no file was selected.

    Returns:
        A ``(status_message, uploaded_files_markdown)`` tuple for the
        status box and the document list. Loader failures are reported in
        the status message instead of being raised.
    """
    if file is None:
        # The button can be clicked before any file has been chosen.
        return "Vui lòng chọn một tài liệu để tải lên.", update_uploaded_files()

    # gr.File hands over a plain path or a wrapper exposing it as ``.name``,
    # depending on the Gradio version / component ``type``.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name
    display_name = os.path.basename(file_path)
    # Compare extensions in lower case so e.g. "REPORT.PDF" is accepted.
    lowered_path = file_path.lower()

    try:
        if lowered_path.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered_path.endswith(".csv"):
            loader = CSVLoader(file_path)
        elif lowered_path.endswith(".txt"):
            loader = TextLoader(file_path, autodetect_encoding=True)
        elif lowered_path.endswith((".docx", ".doc")):
            loader = UnstructuredWordDocumentLoader(file_path)
        else:
            return "Định dạng file không hỗ trợ.", update_uploaded_files()

        documents = loader.load()
    except Exception as e:
        # UI boundary: surface any loader problem as a status message.
        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)

    if not docs:
        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    # Remove the previous index only now that its replacement has been
    # built: an unsupported, unreadable or empty upload must not destroy
    # the index that is currently being served.
    if os.path.exists("vectorstore"):
        shutil.rmtree("vectorstore")
    db.save_local("vectorstore")
    reload_retriever()

    # NOTE(review): the index is rebuilt from this document alone while the
    # registry keeps accumulating entries - confirm whether earlier
    # documents are expected to remain searchable.
    uploaded_files.append({"name": display_name, "timestamp": datetime.now().isoformat()})
    save_uploaded_files_to_json()

    # Report the same base name that is stored in the registry rather than
    # the full upload path.
    return f"Đã xử lý {len(docs)} đoạn từ **{display_name}**", update_uploaded_files()

def delete_file(filename):
    """Remove every registry entry whose name equals ``filename``.

    Leading and trailing whitespace in ``filename`` is ignored. Only the
    upload registry is changed here: the reduced list is saved to disk and
    re-rendered; this function does not touch the vector index.

    Returns:
        The Markdown produced by ``update_uploaded_files``.
    """
    global uploaded_files
    target = filename.strip()
    remaining = []
    for entry in uploaded_files:
        if entry["name"] != target:
            remaining.append(entry)
    uploaded_files = remaining
    save_uploaded_files_to_json()
    return update_uploaded_files()

def clear_inputs():
    """Return two empty strings, one per component reset by the clear button."""
    blank = ""
    return (blank, blank)

def query_function(question, temperature, include_sources):
    """Answer a question and optionally append the list of sources.

    Args:
        question: Text typed by the user. A blank or missing question is
            answered with a prompt to enter one; ``answer_query`` is not
            called in that case.
        temperature: Value forwarded unchanged to ``answer_query``.
        include_sources: When true and documents were returned, a
            de-duplicated, sorted bullet list of source labels is appended.

    Returns:
        Markdown text: the HTML-escaped answer, optionally followed by a
        "Nguồn tham khảo" section. Each source label is the document's
        ``section`` metadata when present, otherwise the base name of its
        ``source`` metadata, otherwise ``"Unknown"``.
    """
    if not question or not question.strip():
        return "Vui lòng nhập câu hỏi."

    fixed_model = "sentence-transformers/all-MiniLM-L6-v2"
    answer, docs = answer_query(question, model=fixed_model, temperature=temperature)
    # The result is shown in a Markdown component, so raw HTML is neutralised.
    answer = html.escape(answer)

    if include_sources and docs:
        unique_sources = set()
        for doc in docs:
            # Metadata values are not guaranteed to be strings (or present).
            section = str(doc.metadata.get("section") or "").strip()
            if section:
                label = section
            else:
                source = str(doc.metadata.get("source") or "Unknown")
                label = os.path.basename(source).strip()
            if label:
                # Labels come from document metadata and end up in the same
                # Markdown output as the answer, so escape them the same way.
                unique_sources.add(html.escape(label))
        if unique_sources:
            sources_list = [f"- {src}" for src in sorted(unique_sources)]
            sources_text = "\n\n**Nguồn tham khảo:**\n" + "\n".join(sources_list)
            answer += sources_text
    return answer

# Build the Gradio interface: a header row plus two tabs (search and
# document management). Components are laid out in the order they are
# created inside the nested context managers.
with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
    # Page header
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")

    with gr.Tabs():
        # Search tab
        with gr.TabItem("🔍 Tìm kiếm"):
            with gr.Column(elem_classes="container-box"):
                question       = gr.Textbox(lines=3, label="Câu hỏi")
                with gr.Row():
                    temperature     = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
                with gr.Row():
                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
                    clear_btn  = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
                # Rendered answer (Markdown) returned by query_function
                output = gr.Markdown(elem_classes="output-box")

            # Search: run the query and show the result in `output`
            search_btn.click(query_function,
                             inputs=[question, temperature, include_sources],
                             outputs=[output])
            # Clear: clear_inputs returns two empty strings, one for the
            # question box and one for the answer area
            clear_btn.click(clear_inputs,
                            outputs=[question, output])

        # Document management tab
        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Column(elem_classes="container-box"):
                upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
                upload_btn  = gr.Button("📄 Tải lên và xử lý", variant="primary")
                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
            # Initial content is the registry as loaded at import time
            uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
            with gr.Column(elem_classes="container-box"):
                delete_filename = gr.Textbox(label="Tên file muốn xóa")
                delete_btn      = gr.Button("🗑️ Xóa tài liệu", variant="secondary")

            # Upload: process_document returns (status text, refreshed list)
            upload_btn.click(process_document,
                             inputs=[upload_file],
                             outputs=[upload_status, uploaded_files_list])
            # Delete: delete_file returns the refreshed list
            delete_btn.click(delete_file,
                             inputs=[delete_filename],
                             outputs=[uploaded_files_list])

    # NOTE(review): launch() runs while this module is being executed, is
    # still inside the `with gr.Blocks(...)` body, and requests a public
    # share link (share=True) - confirm all three are intended.
    demo.launch(share=True)