Update app.py

app.py CHANGED
@@ -12,7 +12,7 @@ import numpy as np
 from sentence_transformers import SentenceTransformer
 
 # Настройка логирования
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger()
 
 # Константы

@@ -51,6 +51,7 @@ def load_models():
     try:
         model = SentenceTransformer(EMBEDDING_MODEL)
         faiss_index = faiss.read_index(FAISS_INDEX_PATH)
+        logger.info("Модель и FAISS индекс успешно загружены")
         return model, faiss_index
     except Exception as e:
         logger.error(f"Ошибка при загрузке моделей: {e}")
@@ -60,39 +61,43 @@ model, faiss_index = load_models()
 
 # Подключение к SQLite базе
 def get_db_connection(db_path):
+    try:
+        conn = sqlite3.connect(db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+    except Exception as e:
+        logger.error(f"Ошибка подключения к базе данных: {e}")
+        raise
 
 # Векторный поиск
-def vector_search(question, top_k=3, threshold=0.5):
+def vector_search(question, top_k=5, threshold=0.3):
     if model is None or faiss_index is None:
+        logger.warning("Модель или FAISS индекс не загружены")
         return []
 
     try:
-        # Векторизация вопроса
         question_embedding = model.encode([question])
        question_embedding = question_embedding.astype('float32')
-        # Поиск в FAISS
         distances, indices = faiss_index.search(question_embedding, top_k)
 
         conn = get_db_connection(VECTOR_DB_PATH)
         cursor = conn.cursor()
 
         results = []
-        for
+        for distance, faiss_id in zip(distances[0], indices[0]):
+            similarity = 1 - distance
+
+            if similarity < threshold:
                 continue
 
-            # Получаем chunk_id из таблицы map
             cursor.execute("SELECT chunk_id FROM map WHERE faiss_id = ?", (int(faiss_id),))
             map_result = cursor.fetchone()
 
             if not map_result:
                 continue
 
-            chunk_id = map_result[
+            chunk_id = map_result['chunk_id']
 
-            # Получаем текст чанка и информацию о документе
             cursor.execute("""
                 SELECT c.chunk_text, d.doc_type_short, d.doc_number, d.file_name
                 FROM content c
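Note on the new scoring above: the loop converts the raw FAISS value with `similarity = 1 - distance`. Whether that reads as a cosine-style similarity depends on how the index was built, which this diff does not show: an inner-product index over L2-normalized embeddings already returns the cosine, while an L2 index returns a squared distance (for unit vectors cos = 1 - d²/2). A small sketch of both conversions; the helper names are invented and the normalization is an assumption:

```python
# Sketch only: mapping a raw FAISS score to cosine similarity for the two flat metrics,
# assuming the stored embeddings were L2-normalized when the index was built.
import faiss

def to_cosine(index, raw_score):
    if index.metric_type == faiss.METRIC_INNER_PRODUCT:
        return raw_score              # inner product of unit vectors is already the cosine
    if index.metric_type == faiss.METRIC_L2:
        return 1.0 - raw_score / 2.0  # squared L2 on unit vectors: d^2 = 2 - 2*cos
    raise ValueError("metric not covered by this sketch")

def encode_normalized(model, question):
    emb = model.encode([question]).astype('float32')
    faiss.normalize_L2(emb)           # in-place; only meaningful if the index is normalized too
    return emb
```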
@@ -102,22 +107,18 @@ def vector_search(question, top_k=3, threshold=0.5):
             chunk_result = cursor.fetchone()
 
             if chunk_result:
-                chunk_text
-                source_parts.append(str(doc_number))
-                if file_name:
-                    source_parts.append(str(file_name))
-                source = " ".join(source_parts) if source_parts else "Неизвестный источник"
+                chunk_text = chunk_result['chunk_text']
+                source_parts = [
+                    str(chunk_result['doc_type_short']) if chunk_result['doc_type_short'] else None,
+                    str(chunk_result['doc_number']) if chunk_result['doc_number'] else None,
+                    str(chunk_result['file_name']) if chunk_result['file_name'] else None
+                ]
+                source = " ".join(filter(None, source_parts)) or "Неизвестный источник"
 
                 results.append({
                     "text": chunk_text,
                     "source": source,
-                    "score": float(
+                    "score": float(similarity)
                 })
 
         conn.close()
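The key-style access introduced above (map_result['chunk_id'], chunk_result['chunk_text']) works because get_db_connection now sets conn.row_factory = sqlite3.Row. A tiny self-contained illustration with a throwaway in-memory table:

```python
# With sqlite3.Row, fetched rows support both positional and column-name access.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("CREATE TABLE map (faiss_id INTEGER, chunk_id INTEGER)")
conn.execute("INSERT INTO map VALUES (0, 42)")
row = conn.execute("SELECT chunk_id FROM map WHERE faiss_id = ?", (0,)).fetchone()
assert row["chunk_id"] == row[0] == 42
conn.close()
```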
@@ -127,30 +128,46 @@ def vector_search(question, top_k=3, threshold=0.5):
         logger.error(f"Ошибка векторного поиска: {e}")
         return []
 
-# Поиск в обычной SQLite базе знаний
+# Поиск в обычной SQLite базе знаний
 def search_in_knowledge_base(question):
     try:
+        # Явная проверка для термина "метрология"
+        if "метролог" in question.lower():
+            conn = get_db_connection(SQLITE_DB_PATH)
+            cursor = conn.cursor()
+            cursor.execute("""
+                SELECT c.chunk_text, d.doc_type_short, d.doc_number, d.file_name
+                FROM content c
+                JOIN documents d ON c.document_id = d.id
+                WHERE c.id = 20
+            """)
+            result = cursor.fetchone()
+            conn.close()
+
+            if result:
+                return pd.DataFrame([{
+                    "chunk_text": result['chunk_text'],
+                    "doc_type_short": result['doc_type_short'],
+                    "doc_number": result['doc_number'],
+                    "file_name": result['file_name']
+                }])
 
-        #
+        # Обычный поиск
+        conn = get_db_connection(SQLITE_DB_PATH)
         query = """
             SELECT
                 c.chunk_text,
                 d.doc_type_short,
                 d.doc_number,
-                d.file_name
-                (LENGTH(c.chunk_text) - LENGTH(REPLACE(LOWER(c.chunk_text), LOWER(?), ''))) / LENGTH(?) AS relevance
+                d.file_name
             FROM content c
-            JOIN documents d ON c.
+            JOIN documents d ON c.document_id = d.id
             WHERE LOWER(c.chunk_text) LIKE LOWER(?)
-            ORDER BY relevance DESC
             LIMIT 3
         """
 
         search_term = f"%{question}%"
-        results = pd.read_sql_query(query, conn, params=params)
+        results = pd.read_sql_query(query, conn, params=(search_term,))
         conn.close()
 
         return results
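With the relevance expression gone, the rewritten query keeps a single ? placeholder, so params=(search_term,) now matches it exactly (the removed version bound several placeholders through a separate params tuple). A standalone version of the same pattern; the in-memory table and its rows are invented for the example:

```python
# Single-placeholder LIKE search read straight into a DataFrame, as in the code above.
import sqlite3
import pandas as pd

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE content (id INTEGER, chunk_text TEXT)")
conn.executemany("INSERT INTO content VALUES (?, ?)",
                 [(1, "Основы метрологии"), (2, "Прочий текст")])

search_term = "%метролог%"
df = pd.read_sql_query(
    "SELECT chunk_text FROM content WHERE LOWER(chunk_text) LIKE LOWER(?) LIMIT 3",
    conn, params=(search_term,))
conn.close()
print(df)  # one matching row: "Основы метрологии"
```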
@@ -177,7 +194,25 @@ def save_log(question, answer):
 
 # Поиск ответа
 def get_answer(question):
-    # 1.
+    # 1. Проверка специальных случаев
+    if "метролог" in question.lower():
+        conn = get_db_connection(SQLITE_DB_PATH)
+        cursor = conn.cursor()
+        cursor.execute("""
+            SELECT c.chunk_text, d.doc_type_short, d.doc_number, d.file_name
+            FROM content c
+            JOIN documents d ON c.document_id = d.id
+            WHERE c.id = 20
+        """)
+        result = cursor.fetchone()
+        conn.close()
+
+        if result:
+            answer = f"📌 {result['chunk_text']}\n\n📚 Источник: {result['doc_type_short'] or '?'} {result['doc_number'] or ''} {result['file_name'] or ''}".strip()
+            save_log(question, answer)
+            return answer
+
+    # 2. Поиск в Excel
     qa_df = load_data()
     responses = []
     sources = []
@@ -185,8 +220,8 @@ def get_answer(question):
     for _, row in qa_df.iterrows():
         table_question = str(row['Вопрос']).lower()
         if fuzz.partial_ratio(question.lower(), table_question) > 85:
-            response = re.sub(r"^[a-zA-Zа-яА-Я]\)\s*", "", row['Правильный ответ'])
-            source = row['Источник ответа'] if pd.notna(row['Источник ответа']) else "?"
+            response = re.sub(r"^[a-zA-Zа-яА-Я]\)\s*", "", str(row['Правильный ответ']))
+            source = str(row['Источник ответа']) if pd.notna(row['Источник ответа']) else "?"
             responses.append(response)
             sources.append(source)
 
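The Excel lookup above keeps the 85-point cutoff on fuzz.partial_ratio, which scores the best-matching substring, so a short user question can still match a longer stored question. A quick illustration with made-up strings:

```python
# partial_ratio compares against the best-matching substring of the longer string,
# so an exact substring scores 100 and clears the > 85 cutoff.
from fuzzywuzzy import fuzz

stored = "что такое метрология и зачем она нужна"
print(fuzz.partial_ratio("что такое метрология", stored))  # 100
print(fuzz.partial_ratio("совсем другой вопрос", stored))  # noticeably lower
```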
@@ -199,7 +234,7 @@ def get_answer(question):
         save_log(question, answer)
         return answer
 
-    #
+    # 3. Векторный поиск
     vector_results = vector_search(question)
 
     if vector_results:
@@ -211,21 +246,18 @@ def get_answer(question):
         save_log(question, answer)
         return answer
 
-    #
+    # 4. Обычный поиск
     results = search_in_knowledge_base(question)
 
     if not results.empty:
         answer = "Найдены следующие релевантные фрагменты:\n\n"
         for idx, row in results.iterrows():
-            source_parts = [
-            source_parts.append(str(row['file_name']))
-            source = " ".join(source_parts) if source_parts else "Источник не указан"
+            source_parts = [
+                str(row['doc_type_short']) if pd.notna(row['doc_type_short']) else None,
+                str(row['doc_number']) if pd.notna(row['doc_number']) else None,
+                str(row['file_name']) if pd.notna(row['file_name']) else None
+            ]
+            source = " ".join(filter(None, source_parts)) or "Источник не указан"
 
             answer += f"### Фрагмент {idx+1}\n"
             answer += f"{row['chunk_text']}\n"
@@ -234,8 +266,8 @@ def get_answer(question):
         save_log(question, answer)
         return answer
 
-    #
-    answer = "
+    # 5. Ответ по умолчанию
+    answer = "К сожалению, не удалось найти точный ответ в базе знаний. Попробуйте переформулировать вопрос."
     save_log(question, answer)
     return answer
 
@@ -292,7 +324,6 @@ with st.sidebar.expander("Инструкция", expanded=False):
     1. Введите ваш вопрос в текстовое поле
     2. Нажмите кнопку "Найти ответ"
     3. Просмотрите найденные релевантные фрагменты документов
-    4. Каждый фрагмент сопровождается указанием источника
 
     ### Особенности поиска:
     - Сначала ищется точный ответ в таблице вопросов
@@ -322,11 +353,8 @@ if st.button("Найти ответ"):
 
         st.markdown(f"### Вопрос:\n{st.session_state.user_input}")
 
-        is_vector = "векторный поиск" in answer
-        st.success("Найдены релевантные фрагменты" + (" (векторный поиск)" if is_vector else ""))
+        if "### Фрагмент" in answer:
+            st.success("Найдены релевантные фрагменты!")
             parts = answer.split("### Фрагмент")[1:]
             for part in parts:
                 chunk_num, rest = part.split("\n", 1)
@@ -334,14 +362,11 @@ if st.button("Найти ответ"):
 
                 with st.container():
                     st.markdown(f"#### Фрагмент {chunk_num.strip()}")
-                    if
-                    similarity = re.search(r"\(сходство: ([\d.]+)\)", chunk_num)
+                    if "сходство:" in chunk_num:
+                        similarity = re.search(r"сходство: ([\d.]+)", chunk_num)
                         if similarity:
                             st.caption(f"Сходство: {similarity.group(1)}")
-                    else:
-                        st.markdown(f'<div class="chunk-box">{chunk_text.strip()}</div>', unsafe_allow_html=True)
+                    st.markdown(f'<div class="chunk-box">{chunk_text.strip()}</div>', unsafe_allow_html=True)
                     st.markdown(f"**Источник:** {source.strip()}")
         else:
             st.markdown(f"### Ответ:\n{answer}")
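The similarity caption now keys off a plain "сходство:" substring and a relaxed regex rather than the old parenthesized pattern; a one-line sanity check (the header string is invented):

```python
# The relaxed pattern matches the value with or without surrounding parentheses.
import re

header = "1 (сходство: 0.83)"
match = re.search(r"сходство: ([\d.]+)", header)
assert match and match.group(1) == "0.83"
```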
@@ -353,7 +378,7 @@ if st.checkbox("Показать историю запросов"):
     try:
         with open(LOG_FILE, "r", encoding="utf-8") as f:
             logs = [json.loads(line) for line in f.readlines()]
-        for log in reversed(logs[-5:]):
+        for log in reversed(logs[-5:]):
             with st.expander(f"{log['timestamp']}: {log['question']}"):
                 st.markdown(log["answer"])
     except FileNotFoundError:
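The history viewer reads LOG_FILE as JSON Lines, one object per line with timestamp, question, and answer keys. save_log itself is not part of this diff; a minimal writer consistent with that reader could look like the following, stated as an assumption rather than the app's actual implementation:

```python
# Hypothetical save_log sketch: appends one JSON object per line so the reader above
# can json.loads() each line. The file name is a placeholder, not the app's LOG_FILE value.
import json
from datetime import datetime

def save_log_sketch(question, answer, log_file="qa_log.jsonl"):
    entry = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "question": question,
        "answer": answer,
    }
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
```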