# Page header captured with the source (not part of the program):
# Spaces status: Sleeping; file size: 6,221 bytes; revision 73b6937.
import gradio as gr
import glob
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import numpy as np
def is_header(txt):
    """Heuristically decide whether *txt* looks like a section heading.

    A string counts as a heading when it does not end with sentence
    punctuation (``.``, ``:``, ``?``, ``!``) and either

    * is shorter than 35 characters and contains no lowercase letters, or
    * is title-cased and has fewer than 6 words.

    Empty or ``None`` input carries no body text and is reported as a
    heading, so callers fall back to real paragraph content.

    Args:
        txt: Text to classify (``str`` or ``None``).

    Returns:
        ``True`` if the text looks like a heading, ``False`` otherwise.
    """
    # Previously ``None`` passed the ``not txt`` guard and then crashed on
    # ``txt.upper()``; handle "no text" explicitly instead.
    if not txt:
        return True

    # Both heading rules require the absence of sentence-ending punctuation.
    if txt.endswith(('.', ':', '?', '!')):
        return False

    # Short line written entirely in capitals (or without any letters).
    if len(txt) < 35 and txt == txt.upper():
        return True

    # Short title-cased line, e.g. "Reference List".
    return txt.istitle() and len(txt.split()) < 6
def get_blocks_from_docx():
    """Extract text blocks from the first ``.docx`` file in the working directory.

    Paragraphs and table rows are collected as text blocks. Table rows are
    rendered as their non-empty cell texts joined with ``" | "``.

    Returns:
        A ``(blocks, normal_blocks)`` tuple of lists of unique strings in
        document order (paragraphs first, then table rows):

        * ``blocks`` - every paragraph with more than three words and every
          table row with more than three whitespace-separated tokens and
          more than 25 characters;
        * ``normal_blocks`` - the subset of ``blocks`` that ``is_header``
          does not classify as a heading (paragraphs must also be longer
          than 25 characters).

        Both lists are empty when no ``.docx`` file is found.
    """
    # glob returns files in arbitrary order; sort so the chosen document is
    # deterministic when several .docx files are present.
    docx_list = sorted(glob.glob("*.docx"))
    if not docx_list:
        return [], []
    doc = Document(docx_list[0])

    blocks = []
    normal_blocks = []

    for p in doc.paragraphs:
        txt = p.text.strip()
        # Requiring more than three words already excludes empty strings and
        # bare page numbers (up to three digits), so no separate check is needed.
        if len(txt.split()) > 3:
            blocks.append(txt)
            if not is_header(txt) and len(txt) > 25:
                normal_blocks.append(txt)

    for table in doc.tables:
        for row in table.rows:
            cell_texts = (cell.text.strip() for cell in row.cells)
            row_text = " | ".join(text for text in cell_texts if text)
            if len(row_text.split()) > 3 and len(row_text) > 25:
                blocks.append(row_text)
                if not is_header(row_text):
                    normal_blocks.append(row_text)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(blocks)), list(dict.fromkeys(normal_blocks))
# Placeholder stored as the only block when the document yields no usable text.
EMPTY_KB_MESSAGE = "База знаний пуста: проверьте содержимое и структуру вашего .docx!"
# Checkpoint used for both the tokenizer and the generation model.
MODEL_NAME = "cointegrated/rut5-base-multitask"

# Knowledge base: all text blocks, and the subset not classified as headings.
blocks, normal_blocks = get_blocks_from_docx()
if not blocks or not normal_blocks:
    blocks = [EMPTY_KB_MESSAGE]
    normal_blocks = [EMPTY_KB_MESSAGE]

# TF-IDF index over all blocks; one row of `matrix` per entry of `blocks`.
vectorizer = TfidfVectorizer(lowercase=True).fit(blocks)
matrix = vectorizer.transform(blocks)

device = 'cpu'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# Keep the model on the same device the inputs are sent to, so changing
# `device` does not cause a device mismatch during generation.
model.to(device)
model.eval()
def rut5_answer(question, context, max_input_tokens=512):
    """Generate an answer to *question* from *context* with the T5 model.

    Args:
        question: The user's question.
        context: Text the answer should be based on.
        max_input_tokens: Upper bound on the tokenized prompt length. Longer
            prompts are truncated at the end, i.e. the tail of the context
            is dropped while the question is kept.

    Returns:
        The decoded model output with special tokens removed.
    """
    # NOTE(review): the prompt layout is kept unchanged; confirm that it
    # matches the task format this checkpoint expects (see its model card).
    prompt = f"question: {question} context: {context}"
    # Truncate explicitly: several concatenated blocks can otherwise produce
    # an arbitrarily long input.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=250, num_beams=4, min_length=40,
            no_repeat_ngram_size=3, do_sample=False
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
def flatten_index(idx):
    """Reduce an index-like value to a plain ``int``, never raising.

    Handles Python and NumPy scalars, (nested) lists, tuples and arrays
    (the first element is used), objects exposing ``tolist()``, and anything
    else ``int()`` accepts.

    Args:
        idx: Scalar, sequence, array or other index-like object.

    Returns:
        The extracted integer, or ``0`` when the value is empty or cannot be
        converted (including NaN and infinite floats).
    """
    # Universal way to get an int out of anything.
    if isinstance(idx, (int, float, np.integer, np.floating)):
        try:
            return int(idx)
        except (ValueError, OverflowError):
            # NaN or +/-inf cannot be converted to an integer.
            return 0
    if isinstance(idx, np.ndarray):
        # `.flat` also works for 0-d arrays, where len() would raise.
        return flatten_index(idx.flat[0]) if idx.size else 0
    if isinstance(idx, (list, tuple)):
        # Descend into the first element; recursing on the container itself
        # would never terminate.
        return flatten_index(idx[0]) if idx else 0
    if hasattr(idx, "tolist"):
        return flatten_index(idx.tolist())
    try:
        return int(idx)
    except (TypeError, ValueError, OverflowError):
        return 0
def ask_chatbot(question):
    """Answer *question* using the document knowledge base.

    The up-to-three blocks most similar to the question (TF-IDF cosine
    similarity) form the context passed to ``rut5_answer``. If the generated
    answer is short (fewer than 8 words or fewer than 2 periods), the most
    similar non-heading block is appended; if the result still looks like a
    heading, that block is returned instead.

    Args:
        question: The user's question (``str``; ``None`` is treated as empty).

    Returns:
        The answer text, or a user-facing message when the question is empty
        or the knowledge base is empty.
    """
    question = (question or "").strip()
    if not question:
        return "Пожалуйста, введите вопрос."
    if not normal_blocks or normal_blocks == ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]:
        return "Ошибка: база знаний пуста. Проверьте .docx и перезапустите Space."
    n_blocks = min(3, len(blocks))
    if n_blocks == 0:
        return "Ошибка: база знаний отсутствует или пуста."

    user_vec = vectorizer.transform([question.lower()])

    # Context: the n_blocks most similar blocks, best match first.
    sims = cosine_similarity(user_vec, matrix)[0]
    top_idxs = sims.argsort()[-n_blocks:][::-1]
    context_blocks = [
        blocks[int(i)] for i in top_idxs if 0 <= int(i) < len(blocks)
    ]
    context = " ".join(context_blocks)

    # Fallback text comes only from paragraphs, not from headings.
    # Vectorize all candidates in one batch instead of one call per block;
    # argmax keeps the first block among equally similar ones.
    normal_matrix = vectorizer.transform([nb.lower() for nb in normal_blocks])
    normal_sims = cosine_similarity(user_vec, normal_matrix)[0]
    best_normal_block = normal_blocks[int(normal_sims.argmax())]
    if not best_normal_block and context_blocks:
        # Must stay a string: it is concatenated to the answer below.
        best_normal_block = context_blocks[0]

    answer = rut5_answer(question, context)
    if len(answer.strip().split()) < 8 or answer.count('.') < 2:
        answer += "\n\n" + best_normal_block
    if is_header(answer):
        answer = best_normal_block
    return answer
# Sample questions offered below the input box.
EXAMPLES = [
    "Как оформить список литературы?",
    "Какие сроки сдачи и защиты ВКР?",
    "Какой процент оригинальности требуется?",
    "Как оформлять формулы?"
]

# Contact section rendered at the bottom of the page (template to fill in).
CONTACTS_MARKDOWN = """
---
### Контакты (укажите свои)
Преподаватель: ___________________
Email: ___________________________
Кафедра: _________________________
"""

with gr.Blocks() as demo:
    gr.Markdown(
        "# Русскоязычный Чат-бот по методичке (AI+документ)\nЗадайте вопрос — получите развернутый ответ на основании вашего документа!"
    )
    question = gr.Textbox(label="Ваш вопрос", lines=2)
    ask_btn = gr.Button("Получить ответ")
    answer = gr.Markdown(label="Ответ", visible=True)

    def with_spinner(q):
        """Yield a progress message first, then the actual answer."""
        yield "Чат-бот думает..."
        yield ask_chatbot(q)

    # The button and the Enter key in the textbox trigger the same handler.
    ask_btn.click(with_spinner, question, answer)
    question.submit(with_spinner, question, answer)
    gr.Markdown("#### Примеры вопросов:")
    gr.Examples(EXAMPLES, inputs=question)
    gr.Markdown(CONTACTS_MARKDOWN)

demo.launch()