test4good committed
Commit dae9f6c · verified · 1 Parent(s): 9e96a44

Create app.py

Files changed (1)
1. app.py +203 -0
app.py ADDED
@@ -0,0 +1,203 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import gradio as gr
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain.docstore.document import Document
+ from langchain.llms import HuggingFaceHub
+ from transformers import pipeline
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from langdetect import detect
+ import json
+ import os
+ import numpy as np
+ import time
+
+ # --------------------------
+ # Configurable Parameters
+ # --------------------------
+ CHUNK_SIZE = 500  # number of words per chunk
+ SIMILARITY_THRESHOLD = 0.3  # fallback threshold if similarity is too low
+
+ # Translation pipelines
+ # Translation to Russian (for queries not in Russian)
+ translate_to_ru = pipeline("translation", model="Helsinki-NLP/opus-mt-multi-en-ru")
+
+ # RU->EN for English queries
+ translate_ru_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-ru-en")
+
+ # Russian Language Model for QA
+ llm = HuggingFaceHub(repo_id="DeepPavlov/rubert-base-cased", model_kwargs={"temperature": 0})
+ qa_chain = load_qa_chain(llm, chain_type="stuff")
+
+ # Embedding Model
+ embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ # Global cache
+ last_update_time = None
+ structured_chunks = []  # Will store tuples: (section_title, chunk_text)
+ chunk_embeddings = None
+ original_language = "ru"  # default language for knowledge base (Russian)
+
+ # --------------------------
+ # Utility Functions
+ # --------------------------
+
+ def chunk_text(text, chunk_size=CHUNK_SIZE):
+     """Split a text into chunks of approximately chunk_size words."""
+     words = text.split()
+     chunks = []
+     for i in range(0, len(words), chunk_size):
+         chunk = " ".join(words[i:i+chunk_size])
+         chunks.append(chunk)
+     return chunks
+
+ def fetch_and_structure_content(url):
+     global structured_chunks, chunk_embeddings, last_update_time
+
+     try:
+         response = requests.get(url)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extract content under <h2> tags
+         structured_sections = {}
+         for section in soup.find_all("h2"):
+             section_title = section.get_text(strip=True)
+             section_content = []
+             for sibling in section.find_next_siblings():
+                 if sibling.name == "h2":
+                     break
+                 text = sibling.get_text(strip=True)
+                 if text:
+                     section_content.append(text)
+             full_section_text = " ".join(section_content).strip()
+             if full_section_text:
+                 structured_sections[section_title] = full_section_text
+
+         # Chunking each section to improve retrieval granularity
+         structured_chunks = []
+         for title, content in structured_sections.items():
+             section_chunks = chunk_text(content, CHUNK_SIZE)
+             for idx, ch in enumerate(section_chunks):
+                 # Store (title, chunk_text)
+                 structured_chunks.append((f"{title} - part {idx+1}", ch))
+
+         # Precompute embeddings
+         chunk_texts = [ch[1] for ch in structured_chunks]
+         chunk_embeddings = embedding_model.encode(chunk_texts)
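+         # (encode returns a 2-D NumPy array with one row per chunk, which is what
+         # np.save below and cosine_similarity in chatbot() expect.)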
+
+         # Save structured chunks and embeddings
+         with open("knowledge_base.json", "w", encoding="utf-8") as f:
+             json.dump(structured_chunks, f, ensure_ascii=False)
+         np.save("embeddings.npy", chunk_embeddings)
+
+         last_update_time = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
+
+         return "Knowledge base successfully updated and structured!"
+     except Exception as e:
+         return f"Error fetching or structuring content: {str(e)}"
+
+ # Load from cache if available
+ if os.path.exists("knowledge_base.json") and os.path.exists("embeddings.npy"):
+     with open("knowledge_base.json", "r", encoding="utf-8") as f:
+         structured_chunks = json.load(f)
+     chunk_embeddings = np.load("embeddings.npy")
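+     # Note: the JSON round-trip turns the (title, chunk) tuples into lists; the rest
+     # of the code only indexes them by position, so this is harmless.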
+
+ def detect_language(query):
+     try:
+         lang = detect(query)
+         return lang
+     except Exception:
+         return "unknown"
+
+ def translate_answer_back(result_in_russian, original_lang):
+     """Translate the Russian answer back to the original language if possible.
+     - If original is 'ru': return as is.
+     - If original is 'en': RU -> EN.
+     - Otherwise: fall back to English for now.
+     """
+     if original_lang == "ru":
+         return result_in_russian
+     elif original_lang == "en":
+         return translate_ru_to_en(result_in_russian)[0]["translation_text"]
+     else:
+         # For other languages a more complex approach would be needed.
+         # As a simple fallback, translate to English.
+         # (Future improvement: add a dictionary of available RU->XX models.)
+         return translate_ru_to_en(result_in_russian)[0]["translation_text"]
+
+ def chatbot(query):
+     global structured_chunks, chunk_embeddings
+
+     if not structured_chunks or chunk_embeddings is None:
+         return "Knowledge base is empty or not loaded. Please run an update."
+
+     # Detect query language
+     query_language = detect_language(query)
+     if query_language == "unknown":
+         return "Unable to detect the query language. Please try again, or specify your language."
+
+     # Translate query to Russian if needed
+     if query_language != "ru":
+         # Translate the query into Russian
+         query_in_russian = translate_to_ru(query)[0]["translation_text"]
+     else:
+         query_in_russian = query
+
+     # Compute query embedding
+     query_embedding = embedding_model.encode([query_in_russian])[0]
+
+     # Find the most relevant chunk
+     similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]
+     best_idx = similarities.argmax()
+     best_sim = similarities[best_idx]
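+     # (cosine_similarity returns a (1, n_chunks) matrix; [0] takes the single row,
+     # so best_idx is the index of the closest chunk.)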
+
+     if best_sim < SIMILARITY_THRESHOLD:
+         # Fallback if no good match. The message is left in English as a minimal step;
+         # for full multilingual support, translate it with a language-specific model here.
+         fallback_msg = "I'm sorry, I couldn't find a relevant answer in the knowledge base."
+         return fallback_msg
+
+     most_relevant_section = structured_chunks[best_idx][1]
+
+     # Process the most relevant chunk with the QA chain (the "stuff" chain expects Document objects)
+     result_in_russian = qa_chain.run(
+         input_documents=[Document(page_content=most_relevant_section)],
+         question=query_in_russian,
+     )
+
+     # Translate answer back to the original language as best as we can
+     final_answer = translate_answer_back(result_in_russian, query_language)
+     return final_answer
+
+ def admin_interface(url):
+     return fetch_and_structure_content(url)
+
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("## Multilingual Chatbot with Optimized Knowledge Base")
+     gr.Markdown("This chatbot fetches documentation from a given URL, structures it, and provides answers to user queries in multiple languages.")
+
+     # Admin Panel
+     with gr.Column():
+         gr.Markdown("### Admin Panel")
+         gr.Markdown("Enter the source URL below and click 'Update Knowledge Base' to fetch and structure the content.")
+         url_input = gr.Textbox(label="Enter the URL of the Documentation")
+         update_button = gr.Button("Update Knowledge Base")
+         update_output = gr.Textbox(label="Update Status", interactive=False)
+         update_button.click(admin_interface, inputs=url_input, outputs=update_output)
+         # Display last update time if available
+         if last_update_time:
+             gr.Markdown(f"**Last Update Time (UTC):** {last_update_time}")
+         else:
+             gr.Markdown("**Knowledge base not yet updated.**")
+
+     # User Query Interface
+     gr.Markdown("### User Chat Interface")
+     gr.Markdown("Ask your question in any language. The system will attempt to detect your language, translate the question into Russian, find the best answer, and then translate the answer back to your language, or into English if a direct translation is not available.")
+     query = gr.Textbox(label="Enter your question in any language")
+     output = gr.Textbox(label="Answer", interactive=False)
+     submit = gr.Button("Submit")
+     submit.click(chatbot, inputs=query, outputs=output)
+
+ demo.launch()