"""Russian-language Q&A chatbot over a .docx knowledge base.

TF-IDF retrieval picks the most relevant document blocks, a ruT5 model
generates the answer, and the app is served through a Gradio interface.
"""
import gradio as gr
import glob
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import numpy as np
def is_header(txt):
    """Heuristically decide whether *txt* looks like a section header.

    A short line (under 35 characters) counts as a header when it is
    either fully upper-case or title-case with few words, and does not
    end with sentence punctuation.

    Args:
        txt: Paragraph text (already stripped by the caller).

    Returns:
        True if the text looks like a header, False otherwise.
    """
    # BUGFIX: empty text must never count as a header — the original
    # upper-case check ("" == "".upper()) was vacuously true for "".
    if not txt:
        return False
    if len(txt) < 35:
        # Short ALL-CAPS line without terminal punctuation.
        if txt == txt.upper() and not txt.endswith(('.', ':', '?', '!')):
            return True
        # Short Title-Case line with few words and no terminal punctuation.
        if txt.istitle() and len(txt.split()) < 6 and not txt.endswith(('.', ':', '?', '!')):
            return True
    return False
def get_blocks_from_docx():
    """Load the first *.docx in the working directory and split it into text blocks.

    Paragraphs and table rows are kept only when they carry real content
    (more than 3 words, not a bare page number). Two parallel lists are
    built: all blocks (the retrieval corpus) and "normal" blocks only
    (non-header body text, used as answer fallback).

    Returns:
        Tuple ``(blocks, normal_blocks)``, both de-duplicated with the
        original order preserved; both empty when no .docx file exists.
    """
    docx_list = glob.glob("*.docx")
    if not docx_list:
        return [], []
    doc = Document(docx_list[0])
    blocks = []
    normal_blocks = []
    for p in doc.paragraphs:
        txt = p.text.strip()
        # Skip empty lines, bare page numbers and very short fragments.
        if (
            txt
            and not (len(txt) <= 3 and txt.isdigit())
            and len(txt.split()) > 3
        ):
            blocks.append(txt)
            if not is_header(txt) and len(txt) > 25:
                normal_blocks.append(txt)
    for table in doc.tables:
        for row in table.rows:
            # Join non-empty cells into one pipe-separated "row block".
            row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
            if row_text and len(row_text.split()) > 3 and len(row_text) > 25:
                blocks.append(row_text)
                if not is_header(row_text):
                    normal_blocks.append(row_text)
    # Order-preserving de-duplication: dict keys keep insertion order,
    # replacing the two hand-rolled seen-set loops of the original.
    return list(dict.fromkeys(blocks)), list(dict.fromkeys(normal_blocks))
# Build the knowledge base once at import time.
blocks, normal_blocks = get_blocks_from_docx()
if not blocks or not normal_blocks:
    # Fall back to a single placeholder block so the TF-IDF vectorizer
    # below can still be fitted on a non-empty corpus.
    blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]
    normal_blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]
# TF-IDF index over all blocks; user questions are compared against `matrix`.
vectorizer = TfidfVectorizer(lowercase=True).fit(blocks)
matrix = vectorizer.transform(blocks)
# Russian multitask T5 model used to generate answers from retrieved context.
tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")
model = T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-base-multitask")
model.eval()  # inference only — disables dropout
device = 'cpu'  # NOTE(review): CPU-only; move model and inputs together if this changes
def rut5_answer(question, context):
    """Generate an answer with the ruT5 multitask model.

    Builds a ``question: ... context: ...`` prompt, runs deterministic
    beam-search generation without gradients, and decodes the best beam.

    Args:
        question: The user's question.
        context: Retrieved document text to answer from.

    Returns:
        Decoded answer string (special tokens stripped).
    """
    encoded = tokenizer(f"question: {question} context: {context}", return_tensors="pt")
    generation_params = dict(
        max_length=250,
        min_length=40,
        num_beams=4,
        no_repeat_ngram_size=3,
        do_sample=False,
    )
    with torch.no_grad():
        generated = model.generate(encoded.input_ids.to(device), **generation_params)
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)
def flatten_index(idx):
    """Coerce *idx* — a scalar, (nested) sequence, or numpy value — to a plain int.

    argsort/similarity code can hand back numpy scalars, arrays or nested
    lists; this unwraps the first element recursively until a scalar is
    reached.

    Args:
        idx: int/float, numpy scalar, ndarray, list or tuple.

    Returns:
        The value (or its first leaf) as ``int``; 0 for empty sequences
        or unconvertible input.
    """
    if isinstance(idx, (int, float, np.integer, np.floating)):
        return int(idx)
    if isinstance(idx, (list, tuple, np.ndarray)):
        if len(idx) == 0:
            return 0
        # BUGFIX: recurse into the FIRST ELEMENT. The original recursed on
        # the whole sequence (`flatten_index(idx)`), which looped forever
        # (RecursionError) for every non-empty list/tuple/ndarray.
        return flatten_index(idx[0])
    if hasattr(idx, "tolist"):
        item = idx.tolist()
        return flatten_index(item)
    try:
        return int(idx)
    except Exception:
        # Unconvertible input: fall back to index 0 rather than crashing.
        return 0
def ask_chatbot(question):
    """Answer *question* from the .docx knowledge base.

    Pipeline: TF-IDF retrieval of the top-3 most similar blocks →
    ruT5 generation over that concatenated context → fallback to the
    closest non-header paragraph when the generated answer is too short
    or itself looks like a header.

    Args:
        question: Raw user input.

    Returns:
        Answer text (Russian), or an error message for empty input /
        an empty knowledge base.
    """
    question = question.strip()
    if not question:
        return "Пожалуйста, введите вопрос."
    if not normal_blocks or normal_blocks == ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]:
        return "Ошибка: база знаний пуста. Проверьте .docx и перезапустите Space."
    user_vec = vectorizer.transform([question.lower()])
    sims = cosine_similarity(user_vec, matrix)[0]
    n_blocks = min(3, len(blocks))
    if n_blocks == 0:
        return "Ошибка: база знаний отсутствует или пуста."
    # Indices of the top-n most similar blocks, best first.
    sorted_idxs = sims.argsort()[-n_blocks:][::-1]
    context_blocks = []
    for idx in sorted_idxs:
        idx_int = flatten_index(idx)
        if isinstance(idx_int, int) and 0 <= idx_int < len(blocks):
            context_blocks.append(blocks[idx_int])
    context = " ".join(context_blocks)
    # Pick the fallback answer from body paragraphs only — never headers.
    best_normal_block = ""
    max_sim = -1.0
    for nb in normal_blocks:
        v_nb = vectorizer.transform([nb.lower()])
        # BUGFIX: extract the scalar similarity; the original kept a
        # 1-element ndarray, leaking an array into `max_sim`.
        sim = cosine_similarity(user_vec, v_nb)[0][0]
        if sim > max_sim:
            max_sim = sim
            best_normal_block = nb
    if not best_normal_block:
        # BUGFIX: the original assigned the whole LIST `context_blocks`
        # here, which later crashed on `answer += "\n\n" + best_normal_block`
        # (str + list). Use the single best retrieved block instead.
        best_normal_block = context_blocks[0] if context_blocks else ""
    answer = rut5_answer(question, context)
    # Pad answers that look too short or too terse with the fallback paragraph.
    if len(answer.strip().split()) < 8 or answer.count('.') < 2:
        answer += "\n\n" + best_normal_block
    # Never return a bare header as the final answer.
    if is_header(answer):
        answer = best_normal_block
    return answer
# Sample questions shown under the input box (wired into gr.Examples below).
EXAMPLES = [
    "Как оформить список литературы?",
    "Какие сроки сдачи и защиты ВКР?",
    "Какой процент оригинальности требуется?",
    "Как оформлять формулы?"
]
# --- Gradio UI --------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Русскоязычный Чат-бот по методичке (AI+документ)\nЗадайте вопрос — получите развернутый ответ на основании вашего документа!"
    )
    question = gr.Textbox(label="Ваш вопрос", lines=2)
    ask_btn = gr.Button("Получить ответ")
    answer = gr.Markdown(label="Ответ", visible=True)

    def with_spinner(q):
        # Generator handler: the first yield shows a "thinking" placeholder,
        # the second replaces it with the real answer.
        yield "Чат-бот думает..."
        yield ask_chatbot(q)

    # Both the button click and pressing Enter in the textbox trigger the bot.
    ask_btn.click(with_spinner, question, answer)
    question.submit(with_spinner, question, answer)
    gr.Markdown("#### Примеры вопросов:")
    gr.Examples(EXAMPLES, inputs=question)
    gr.Markdown("""
---
### Контакты (укажите свои)
Преподаватель: ___________________
Email: ___________________________
Кафедра: _________________________
""")
demo.launch()