Spaces:
Running
Running
File size: 6,332 Bytes
14a84c2 c4d8795 14a84c2 d0e25a2 14a84c2 d0e25a2 14a84c2 2160154 14a84c2 2b9cf4d b1b58ec 2b9cf4d 14a84c2 d0e25a2 2160154 14a84c2 d0e25a2 14a84c2 d0e25a2 2160154 14a84c2 d0e25a2 14a84c2 2160154 d0e25a2 2160154 d0e25a2 2160154 2dda242 d0e25a2 2160154 2dda242 373e7a9 14a84c2 b1b58ec 2dda242 b1b58ec 14a84c2 c4d8795 5964260 c4d8795 5964260 c4d8795 5964260 c4d8795 14a84c2 373e7a9 14a84c2 2160154 2dda242 d0e25a2 373e7a9 5964260 b1b58ec d0e25a2 2160154 0d6bcb5 2dda242 c4d8795 5964260 c4d8795 2b9cf4d 2160154 d0e25a2 2160154 d0e25a2 14a84c2 373e7a9 d0e25a2 bf2aee6 14a84c2 f2bcdcf 14a84c2 2b9cf4d b1b58ec 2b9cf4d 14a84c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import gradio as gr
import glob
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import numpy as np
def is_header(txt):
    """Heuristically decide whether a text block is a section header.

    A header is assumed to be short (< 35 characters) and either ALL-CAPS
    or Title Case with few words, without sentence-ending punctuation.

    NOTE(review): indentation was lost in the source; the nesting below is
    reconstructed — short lines gate both capitalisation checks.
    """
    # Long lines are body text, never headers.
    if not txt or len(txt) < 35:
        # ALL-CAPS line without terminal punctuation -> header.
        if txt == txt.upper() and not txt.endswith(('.', ':', '?', '!')):
            return True
        # Short Title-Case line (< 6 words) without terminal punctuation -> header.
        if txt.istitle() and len(txt.split()) < 6 and not txt.endswith(('.', ':', '?', '!')):
            return True
    return False
def get_blocks_from_docx():
    """Load text blocks from the first ``*.docx`` file in the working directory.

    Returns:
        tuple[list[str], list[str]]: ``(blocks, normal_blocks)`` where
        ``blocks`` holds every meaningful paragraph and table row (headers
        included) and ``normal_blocks`` holds only body-text blocks with
        headers filtered out.  Both lists are de-duplicated preserving
        first-seen order.  Returns ``([], [])`` when no .docx file exists.
    """
    docx_list = glob.glob("*.docx")
    if not docx_list:
        return [], []
    doc = Document(docx_list[0])
    blocks = []
    normal_blocks = []
    for p in doc.paragraphs:
        txt = p.text.strip()
        # Skip empty lines, bare page numbers and fragments of <= 3 words.
        if (
            txt
            and not (len(txt) <= 3 and txt.isdigit())
            and len(txt.split()) > 3
        ):
            blocks.append(txt)
            if not is_header(txt) and len(txt) > 25:
                normal_blocks.append(txt)
    # Table rows are flattened into "cell | cell | ..." strings.
    for table in doc.tables:
        for row in table.rows:
            row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
            if row_text and len(row_text.split()) > 3 and len(row_text) > 25:
                blocks.append(row_text)
                if not is_header(row_text):
                    normal_blocks.append(row_text)
    # dict.fromkeys de-duplicates in O(n) while keeping insertion order.
    return list(dict.fromkeys(blocks)), list(dict.fromkeys(normal_blocks))
# --- Module-level setup: knowledge base, TF-IDF index, QA model -------------

# Extract the knowledge base once at startup from the bundled .docx file.
blocks, normal_blocks = get_blocks_from_docx()
if not blocks or not normal_blocks:
    # Keep the app usable with a single placeholder block.  The Russian text
    # is shown verbatim to the user and compared against in ask_chatbot(),
    # so it must stay byte-identical.
    blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]
    normal_blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]

# TF-IDF retrieval index over all blocks (lowercased).
vectorizer = TfidfVectorizer(lowercase=True).fit(blocks)
matrix = vectorizer.transform(blocks)

# Russian multitask T5 model used to generate the final answer; CPU only.
tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")
model = T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-base-multitask")
model.eval()
device = 'cpu'
def rut5_answer(question, context):
    """Generate an answer with the ruT5 model from a question and context.

    Args:
        question: the user's question (plain text).
        context: concatenated retrieved blocks to answer from.

    Returns:
        The decoded model output (special tokens stripped).  Generation is
        deterministic: beam search (4 beams), no sampling, 3-gram repeats
        forbidden to reduce degenerate output.
    """
    prompt = f"question: {question} context: {context}"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    with torch.no_grad():  # inference only — no gradients needed
        output_ids = model.generate(
            input_ids,
            max_length=250, num_beams=4, min_length=40,
            no_repeat_ngram_size=3, do_sample=False
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
def flatten_index(idx):
    """Coerce an arbitrarily nested index value into a plain Python ``int``.

    Accepts ints, floats, numpy scalars, (nested) lists/tuples/arrays and
    anything convertible via ``int()``.  For sequences it recurses into the
    FIRST element; returns 0 for empty sequences or unconvertible values.
    """
    if isinstance(idx, (int, float, np.integer, np.floating)):
        return int(idx)
    if isinstance(idx, (list, tuple, np.ndarray)):
        if len(idx) == 0:
            return 0
        # BUGFIX: recurse into the first element.  The original recursed on
        # the same sequence object, causing infinite recursion
        # (RecursionError) for any non-empty list/tuple/ndarray.
        return flatten_index(idx[0])
    if hasattr(idx, "tolist"):
        # e.g. torch tensors / 0-d numpy arrays expose .tolist().
        return flatten_index(idx.tolist())
    try:
        return int(idx)
    except Exception:
        # Last-resort fallback keeps callers crash-free on odd inputs.
        return 0
def ask_chatbot(question):
    """Answer *question* via TF-IDF retrieval plus ruT5 generation.

    Retrieves the top-3 most similar knowledge-base blocks as context,
    generates an answer, and falls back to the most similar body-text
    (non-header) block when the generated answer is too short or itself
    looks like a header.
    """
    question = question.strip()
    if not question:
        return "Пожалуйста, введите вопрос."
    if not normal_blocks or normal_blocks == ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]:
        return "Ошибка: база знаний пуста. Проверьте .docx и перезапустите Space."
    user_vec = vectorizer.transform([question.lower()])
    sims = cosine_similarity(user_vec, matrix)[0]
    n_blocks = min(3, len(blocks))
    if n_blocks == 0:
        return "Ошибка: база знаний отсутствует или пуста."
    # Indices of the n_blocks most similar blocks, best first.
    sorted_idxs = sims.argsort()[-n_blocks:][::-1]
    context_blocks = []
    for idx in sorted_idxs:
        idx_int = flatten_index(idx)
        if 0 <= idx_int < len(blocks):
            context_blocks.append(blocks[idx_int])
    context = " ".join(context_blocks)
    # Fallback answer: the single most similar *body-text* block
    # (headers are excluded so the fallback is a real paragraph).
    best_normal_block = ""
    max_sim = -1.0
    for nb in normal_blocks:
        v_nb = vectorizer.transform([nb.lower()])
        # BUGFIX: extract the scalar.  cosine_similarity(...)[0] is still a
        # 1-element array, so `sim > max_sim` relied on deprecated
        # single-element-array truthiness.
        sim = float(cosine_similarity(user_vec, v_nb)[0][0])
        if sim > max_sim:
            max_sim = sim
            best_normal_block = nb
    if not best_normal_block:
        # BUGFIX: take the best context *string*.  The original assigned the
        # whole context_blocks LIST, making the string concatenation below
        # raise TypeError.
        best_normal_block = context_blocks[0] if context_blocks else ""
    answer = rut5_answer(question, context)
    # Pad suspiciously short / under-punctuated answers with the best
    # matching paragraph so the user always gets document-grounded text.
    if len(answer.strip().split()) < 8 or answer.count('.') < 2:
        answer += "\n\n" + best_normal_block
    if is_header(answer):
        answer = best_normal_block
    return answer
# Canned example questions shown under the chat input (Russian UI text;
# must stay verbatim — these are user-facing strings).
EXAMPLES = [
"Как оформить список литературы?",
"Какие сроки сдачи и защиты ВКР?",
"Какой процент оригинальности требуется?",
"Как оформлять формулы?"
]
# --- Gradio UI --------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Русскоязычный Чат-бот по методичке (AI+документ)\nЗадайте вопрос — получите развернутый ответ на основании вашего документа!"
    )
    question = gr.Textbox(label="Ваш вопрос", lines=2)
    ask_btn = gr.Button("Получить ответ")
    answer = gr.Markdown(label="Ответ", visible=True)

    def with_spinner(q):
        """Generator handler: first yield a 'thinking' placeholder so the UI
        updates immediately, then yield the real answer."""
        yield "Чат-бот думает..."
        yield ask_chatbot(q)

    # Both the button click and pressing Enter in the textbox trigger the bot.
    ask_btn.click(with_spinner, question, answer)
    question.submit(with_spinner, question, answer)
    gr.Markdown("#### Примеры вопросов:")
    gr.Examples(EXAMPLES, inputs=question)
    gr.Markdown("""
---
### Контакты (укажите свои)
Преподаватель: ___________________
Email: ___________________________
Кафедра: _________________________
""")

demo.launch()
|