File size: 9,313 Bytes
dae9f6c
 
 
219ecde
dae9f6c
6a4d5f0
 
 
 
dae9f6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503c9af
 
 
dae9f6c
 
 
 
503c9af
dae9f6c
6a4d5f0
 
 
 
 
 
 
 
dae9f6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3088954
dae9f6c
3088954
dae9f6c
 
 
3088954
dae9f6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a6ca9d
26ba3a1
d182986
8b3b7e7
 
27377dd
 
 
 
dae9f6c
4520a4e
 
dae9f6c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import requests
from bs4 import BeautifulSoup
import gradio as gr
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain
# from langchain.llms import HuggingFaceHub
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect
import json
import os
import numpy as np
import time

# --------------------------
# Configurable Parameters
# --------------------------
CHUNK_SIZE = 500  # number of words per chunk
SIMILARITY_THRESHOLD = 0.3  # fallback threshold if similarity is too low

# Translation pipelines (loaded once at import time; model downloads happen here)
# NOTE(review): this is an EN->RU model only, despite being used for every
# non-Russian query language in chatbot() — non-English queries will translate
# poorly. Confirm whether a multilingual model is needed.
translate_to_ru = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")


# RU->EN, used to translate answers back for non-Russian users
translate_ru_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")


# Extractive question-answering over Russian text (DeepPavlov RuBERT).
# chatbot() calls qa_pipeline directly.
qa_pipeline = pipeline(
    "question-answering",
    model="DeepPavlov/rubert-base-cased",
    tokenizer="DeepPavlov/rubert-base-cased"
)
# NOTE(review): llm/qa_chain are never used below (chatbot() uses qa_pipeline
# directly), and LangChain's HuggingFacePipeline historically rejects the
# "question-answering" task — this line may raise at import. Verify and
# consider removing both.
llm = HuggingFacePipeline(pipeline=qa_pipeline)
qa_chain = load_qa_chain(llm, chain_type="stuff")

# Sentence embedding model used for chunk retrieval (cosine similarity)
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Global cache, (re)populated by fetch_and_structure_content() or from disk below
last_update_time = None       # UTC timestamp string of the last successful update
structured_chunks = []        # list of (section_title, chunk_text) pairs
chunk_embeddings = None       # numpy array, one embedding row per chunk
original_language = "ru"      # default language for knowledge base (Russian)

# --------------------------
# Utility Functions
# --------------------------

def chunk_text(text, chunk_size=CHUNK_SIZE):
    """Split *text* on whitespace and regroup it into chunks of at most
    *chunk_size* words each (the last chunk may be shorter)."""
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def fetch_and_structure_content(url):
    """Download *url*, split its <h3>-delimited sections into word chunks,
    embed each chunk, and persist everything to disk.

    Side effects: mutates the module-level cache (structured_chunks,
    chunk_embeddings, last_update_time) and writes knowledge_base.json
    and embeddings.npy in the working directory.

    Returns a human-readable status string for the admin UI; failures are
    reported as a string rather than raised.
    """
    global structured_chunks, chunk_embeddings, last_update_time

    try:
        # Explicit timeout: requests.get() without one can block forever,
        # which would freeze the Gradio admin callback.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Everything between one <h3> and the next belongs to that section.
        structured_sections = {}
        for section in soup.find_all("h3"):
            section_title = section.get_text(strip=True)
            section_content = []
            for sibling in section.find_next_siblings():
                if sibling.name == "h3":
                    break
                text = sibling.get_text(strip=True)
                if text:
                    section_content.append(text)
            full_section_text = " ".join(section_content).strip()
            if full_section_text:
                structured_sections[section_title] = full_section_text

        # Chunk each section so retrieval works at sub-section granularity.
        structured_chunks = []
        for title, content in structured_sections.items():
            for idx, ch in enumerate(chunk_text(content, CHUNK_SIZE)):
                structured_chunks.append((f"{title} - part {idx+1}", ch))

        # Precompute one embedding per chunk for cosine-similarity retrieval.
        chunk_texts = [ch[1] for ch in structured_chunks]
        chunk_embeddings = embedding_model.encode(chunk_texts)

        # Persist so a restart can reload without re-fetching the URL.
        with open("knowledge_base.json", "w", encoding="utf-8") as f:
            json.dump(structured_chunks, f, ensure_ascii=False)
        np.save("embeddings.npy", chunk_embeddings)

        last_update_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

        return "Knowledge base successfully updated and structured!"
    except Exception as e:
        # Broad catch is deliberate: the admin UI expects a status string,
        # never a traceback.
        return f"Error fetching or structuring content: {str(e)}"

# Restore a previously built knowledge base from disk, but only when both
# cache files are present (chunks and embeddings must stay in sync).
kb_path, emb_path = "knowledge_base.json", "embeddings.npy"
if os.path.exists(kb_path) and os.path.exists(emb_path):
    with open(kb_path, encoding="utf-8") as f:
        structured_chunks = json.load(f)
    chunk_embeddings = np.load(emb_path)

def detect_language(query):
    """Best-effort language detection for *query*.

    Returns an ISO-639-1 code from langdetect, or the sentinel string
    "unknown" when detection fails (e.g. empty or ambiguous input) —
    callers treat "unknown" as "ask the user to retry".
    """
    try:
        return detect(query)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; langdetect raises LangDetectException
        # on undetectable text.
        return "unknown"

def translate_answer_back(result_in_russian, original_lang):
    """Translate the Russian answer back toward *original_lang*.

    - 'ru': returned unchanged.
    - anything else: translated RU->EN. English is the best fallback we
      currently have for non-Russian, non-English languages.
      (Future improvement: a dictionary of available RU->XX models.)
    """
    if original_lang == "ru":
        return result_in_russian
    # The original elif ("en") and else branches were byte-identical
    # duplicated code; merged into a single RU->EN path.
    return translate_ru_to_en(result_in_russian)[0]["translation_text"]

def chatbot(query):
    """Answer *query* against the cached knowledge base.

    Pipeline: detect language -> translate the query to Russian if needed ->
    embed it -> retrieve the single most similar chunk -> run extractive QA
    over that chunk -> translate the answer back toward the user's language.

    Always returns a user-facing string (error conditions included).
    """
    global structured_chunks, chunk_embeddings

    if not structured_chunks or chunk_embeddings is None:
        return "Knowledge base is empty or not loaded. Please run an update."

    query_language = detect_language(query)
    if query_language == "unknown":
        return "Unable to detect the query language. Please try again, or specify your language."

    # The knowledge base and QA model are Russian, so normalize the query.
    # NOTE(review): translate_to_ru is an EN->RU model, so non-English
    # queries will be translated poorly — confirm this is acceptable.
    if query_language != "ru":
        query_in_russian = translate_to_ru(query)[0]["translation_text"]
    else:
        query_in_russian = query

    # Retrieve the best-matching chunk by cosine similarity.
    query_embedding = embedding_model.encode([query_in_russian])[0]
    similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]
    best_idx = similarities.argmax()
    best_sim = similarities[best_idx]

    if best_sim < SIMILARITY_THRESHOLD:
        # No sufficiently relevant chunk. The message is intentionally kept
        # in English for all query languages (the original's non-ru branch
        # was a no-op, removed here as dead code).
        return "I'm sorry, I couldn't find a relevant answer in the knowledge base."

    most_relevant_section = structured_chunks[best_idx][1]

    # Extractive QA over the selected chunk only; returns a dict with an
    # "answer" span plus score/start/end.
    result_in_russian = qa_pipeline(
        question=query_in_russian,
        context=most_relevant_section
    )

    # Translate the extracted answer span back toward the user's language.
    final_answer = translate_answer_back(result_in_russian["answer"], query_language)
    return final_answer

def admin_interface(url):
    """Gradio callback for the admin panel: rebuild the knowledge base
    from *url* and return the status message for display."""
    status_message = fetch_and_structure_content(url)
    return status_message

# Gradio Interface
# Gradio Interface: an admin panel (update the knowledge base from a URL)
# stacked above the end-user chat interface. Runs at import time.
with gr.Blocks() as demo:
    gr.Markdown("## Multilingual Chatbot with Optimized Knowledge Base")
    gr.Markdown("This chatbot fetches documentation from a given URL, structures it, and provides answers to user queries in multiple languages.")
    
    # Admin Panel: URL in -> admin_interface() -> status text out
    with gr.Column():
        gr.Markdown("### Admin Panel")
        gr.Markdown("Enter the source URL below and click 'Update Knowledge Base' to fetch and structure the content.")
        url_input = gr.Textbox(label="Enter the URL of the Documentation")
        update_button = gr.Button("Update Knowledge Base")
        update_output = gr.Textbox(label="Update Status", interactive=False)
        update_button.click(admin_interface, inputs=url_input, outputs=update_output)
        # last_update_time is read once at UI-build time, so this banner only
        # reflects updates that happened before the process started (i.e. a
        # cache reload); it does not refresh after a live update.
        if last_update_time:
            gr.Markdown(f"**Last Update Time (UTC):** {last_update_time}")
        else:
            gr.Markdown("**Knowledge base not yet updated.**")

    # User Query Interface: free-text question -> chatbot() -> answer text
    gr.Markdown("### User Chat Interface")
    gr.Markdown("Ask your question in any language. The system will attempt to detect your language, translate the question into Russian, find the best answer, and then translate the answer back to your language or English if direct translation is not available.")
    query = gr.Textbox(label="Enter your question in any language")
    output = gr.Textbox(label="Answer", interactive=False)
    submit = gr.Button("Submit")
    submit.click(chatbot, inputs=query, outputs=output)

# Blocking call: starts the web server; nothing below this line would run
# until the server stops.
demo.launch()