import gradio as gr import glob from docx import Document from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import torch from transformers import T5ForConditionalGeneration, T5Tokenizer import numpy as np def is_header(txt): if not txt or len(txt) < 35: if txt == txt.upper() and not txt.endswith(('.', ':', '?', '!')): return True if txt.istitle() and len(txt.split()) < 6 and not txt.endswith(('.', ':', '?', '!')): return True return False def get_blocks_from_docx(): docx_list = glob.glob("*.docx") if not docx_list: return [], [] doc = Document(docx_list[0]) blocks = [] normal_blocks = [] for p in doc.paragraphs: txt = p.text.strip() if ( txt and not (len(txt) <= 3 and txt.isdigit()) and len(txt.split()) > 3 ): blocks.append(txt) if not is_header(txt) and len(txt) > 25: normal_blocks.append(txt) for table in doc.tables: for row in table.rows: row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip()) if row_text and len(row_text.split()) > 3 and len(row_text) > 25: blocks.append(row_text) if not is_header(row_text): normal_blocks.append(row_text) # remove duplicates seen = set(); blocks_clean = [] for b in blocks: if b not in seen: blocks_clean.append(b) seen.add(b) seen = set(); normal_blocks_clean = [] for b in normal_blocks: if b not in seen: normal_blocks_clean.append(b) seen.add(b) return blocks_clean, normal_blocks_clean blocks, normal_blocks = get_blocks_from_docx() if not blocks or not normal_blocks: blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"] normal_blocks = ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"] vectorizer = TfidfVectorizer(lowercase=True).fit(blocks) matrix = vectorizer.transform(blocks) tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask") model = T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-base-multitask") model.eval() device = 'cpu' def rut5_answer(question, context): prompt = f"question: {question} context: {context}" input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) with torch.no_grad(): output_ids = model.generate( input_ids, max_length=250, num_beams=4, min_length=40, no_repeat_ngram_size=3, do_sample=False ) return tokenizer.decode(output_ids[0], skip_special_tokens=True) def flatten_index(idx): # Универсальный способ из всего достать int if isinstance(idx, (int, float, np.integer, np.floating)): return int(idx) if isinstance(idx, (list, tuple, np.ndarray)): if len(idx) == 0: return 0 return flatten_index(idx) if hasattr(idx, "tolist"): item = idx.tolist() return flatten_index(item) try: return int(idx) except Exception: return 0 def ask_chatbot(question): question = question.strip() if not question: return "Пожалуйста, введите вопрос." if not normal_blocks or normal_blocks == ["База знаний пуста: проверьте содержимое и структуру вашего .docx!"]: return "Ошибка: база знаний пуста. Проверьте .docx и перезапустите Space." user_vec = vectorizer.transform([question.lower()]) sims = cosine_similarity(user_vec, matrix)[0] n_blocks = min(3, len(blocks)) if n_blocks == 0: return "Ошибка: база знаний отсутствует или пуста." sorted_idxs = sims.argsort()[-n_blocks:][::-1] context_blocks = [] for idx in sorted_idxs: idx_int = flatten_index(idx) if isinstance(idx_int, int) and 0 <= idx_int < len(blocks): context_blocks.append(blocks[idx_int]) context = " ".join(context_blocks) # Ответ только из абзацев, не заголовков! best_normal_block = "" max_sim = -1 for nb in normal_blocks: v_nb = vectorizer.transform([nb.lower()]) sim = cosine_similarity(user_vec, v_nb)[0] if sim > max_sim: max_sim = sim best_normal_block = nb if not best_normal_block: best_normal_block = context_blocks if context_blocks else "" answer = rut5_answer(question, context) if len(answer.strip().split()) < 8 or answer.count('.') < 2: answer += "\n\n" + best_normal_block if is_header(answer): answer = best_normal_block return answer EXAMPLES = [ "Как оформить список литературы?", "Какие сроки сдачи и защиты ВКР?", "Какой процент оригинальности требуется?", "Как оформлять формулы?" ] with gr.Blocks() as demo: gr.Markdown( "# Русскоязычный Чат-бот по методичке (AI+документ)\nЗадайте вопрос — получите развернутый ответ на основании вашего документа!" ) question = gr.Textbox(label="Ваш вопрос", lines=2) ask_btn = gr.Button("Получить ответ") answer = gr.Markdown(label="Ответ", visible=True) def with_spinner(q): yield "Чат-бот думает..." yield ask_chatbot(q) ask_btn.click(with_spinner, question, answer) question.submit(with_spinner, question, answer) gr.Markdown("#### Примеры вопросов:") gr.Examples(EXAMPLES, inputs=question) gr.Markdown(""" --- ### Контакты (укажите свои) Преподаватель: ___________________ Email: ___________________________ Кафедра: _________________________ """) demo.launch()