QAway-to committed
Commit 9458365 · 1 Parent(s): a5e0a96

google/flan-t5-small . app.py v2.0

Files changed (1):
  1. core/interviewer.py +74 -92

core/interviewer.py CHANGED
@@ -1,109 +1,91 @@
  # core/interviewer.py
- import random
- import difflib
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

- # Pick one
  QG_MODEL = "google/flan-t5-small"
- # QG_MODEL = "iarfmoose/t5-base-question-generator"
-
  tokenizer = AutoTokenizer.from_pretrained(QG_MODEL)
  model = AutoModelForSeq2SeqLM.from_pretrained(QG_MODEL)

- qg = pipeline(
      "text2text-generation",
      model=model,
      tokenizer=tokenizer,
      max_new_tokens=40,
      num_beams=4,
-     no_repeat_ngram_size=4
  )

- CATEGORIES = [
-     "Introversion", "Extroversion",
-     "Sensing", "Intuition",
-     "Thinking", "Feeling",
-     "Judging", "Perceiving"
- ]
-
- # Simple "memory" with protection against repeats
- session_state = {}
-
- def init_session(user_id: str):
-     session_state[user_id] = {"asked": [], "answers": {}, "questions": []}
-
- def _too_similar(q: str, prev: list[str], thresh=0.86) -> bool:
-     qn = q.lower().strip()
-     for p in prev:
-         if difflib.SequenceMatcher(None, qn, p.lower().strip()).ratio() >= thresh:
-             return True
-     return False
-
- def _clean(q: str) -> str:
-     q = q.strip().strip('"').strip("'")
-     # strip prefixes like "question:", "generate a question:", etc.
-     bad = ["question:", "generate a question", "ask", "instruction", "output only", "you are"]
-     low = q.lower()
-     for b in bad:
-         if b in low:
-             # take the part to the right of the colon, if there is one
-             if ":" in q:
-                 q = q.split(":", 1)[-1]
-             q = q.replace(b, "")
-     q = q.strip()
-     if not q.endswith("?"):
-         q += "?"
-     # too short / garbage output -> fall back to a stock question
-     if len(q.split()) < 3:
-         return "What do you usually enjoy doing in your free time?"
-     return q
-
- def _template(category: str, user_answer: str) -> str:
      """
-     T5 handles short templates better than long instructions.
-     The wording differs slightly between models, but the gist is the same:
      """
-     if "flan" in QG_MODEL:
-         # FLAN prefers simple instruction-tuning-style tasks
-         return (
-             f"Generate one open-ended question about {category.lower()} based on the user's answer.\n"
-             f"User: {user_answer}\n"
-             f"Question:"
-         )
-     elif "question-generator" in QG_MODEL:
-         # The model is trained for QG; context alone is enough for it
-         return f"generate question: {user_answer} (topic: {category})"
-     else:
-         # very small QG
-         return f"answer: {user_answer} topic: {category} -> question"
-
- def generate_question(user_id: str, user_answer: str) -> str:
-     if user_id not in session_state:
-         init_session(user_id)
-
-     S = session_state[user_id]
-
-     # pick a category that has NOT been asked yet
-     remaining = [c for c in CATEGORIES if c not in S["asked"]]
-     if not remaining:
-         return "✅ All 8 categories completed."
-
-     category = random.choice(remaining)
-
-     # a short, "terse" template (T5 likes these)
-     prompt = _template(category, user_answer)
-
-     out = qg(prompt)[0]["generated_text"]
-     q = _clean(out)
-
-     # guard against repeats/paraphrases
-     tries = 0
-     while _too_similar(q, S["questions"]) and tries < 3:
-         out = qg(prompt)[0]["generated_text"]
-         q = _clean(out)
-         tries += 1
-
-     S["asked"].append(category)
-     S["questions"].append(q)
-     return f"({category}) {q}"
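For reference, the removed repeat guard relied on difflib.SequenceMatcher. A minimal, self-contained sketch of how its 0.86 threshold behaves on two near-paraphrases (the example strings are invented for illustration):

    import difflib

    a = "What do you enjoy doing in your free time?"
    b = "What do you like doing in your free time?"
    # ratio() returns a similarity score in [0, 1]; the removed _too_similar()
    # treated anything >= 0.86 as a repeat and retried generation.
    score = difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()
    print(score)  # roughly 0.9, so this paraphrase would have been rejected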
  # core/interviewer.py
+ """
+ Interviewer logic module.
+ Generates context-aware MBTI interview questions, one per MBTI category,
+ using a Flan-T5 model.
+ """
+
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

+ # --------------------------------------------------------------
+ # 1️⃣ Settings
+ # --------------------------------------------------------------
  QG_MODEL = "google/flan-t5-small"

  tokenizer = AutoTokenizer.from_pretrained(QG_MODEL)
  model = AutoModelForSeq2SeqLM.from_pretrained(QG_MODEL)

+ qg_pipe = pipeline(
      "text2text-generation",
      model=model,
      tokenizer=tokenizer,
      max_new_tokens=40,
      num_beams=4,
+     no_repeat_ngram_size=4,
  )

+ # --------------------------------------------------------------
+ # 2️⃣ Session state
+ # --------------------------------------------------------------
+ session_state = {
+     "history": {},
+     "categories": [
+         "Extroversion", "Introversion",
+         "Sensing", "Intuition",
+         "Thinking", "Feeling",
+         "Judging", "Perceiving"
+     ],
+ }
+
+
+ # --------------------------------------------------------------
+ # 3️⃣ Generating a new question
+ # --------------------------------------------------------------
+ def generate_question(user_id: str, user_answer: str | None = None):
      """
+     Generates one question per MBTI axis.
+     Avoids repeating previous ones within the same session.
      """
+     history = session_state["history"].get(user_id, {"asked": []})
+     asked = history["asked"]
+     all_cats = session_state["categories"]
+
+     # If all categories have been covered, stop
+     if len(asked) >= len(all_cats):
+         return "✅ All MBTI axes covered."
+
+     # Pick the next category that has not been asked yet
+     next_cat = next(c for c in all_cats if c not in asked)
+     history["asked"].append(next_cat)
+     session_state["history"][user_id] = history
+
+     # Prompt for T5
+     prompt = (
+         f"Generate one open-ended question about {next_cat} based on this context: '{user_answer}'. "
+         "Do not repeat or explain. Output only the question itself."
+     )
+
+     try:
+         output = qg_pipe(prompt)[0]["generated_text"].strip()
+     except Exception as e:
+         return f"⚠️ Generation error: {e}"
+
+     # Strip leaked instruction text such as "Generate a question about..."
+     bad_phrases = [
+         "generate", "question about", "output", "explain", "instruction", "user said"
+     ]
+     for bp in bad_phrases:
+         # search case-insensitively so that e.g. "Generate ..." is caught too
+         idx = output.lower().find(bp)
+         if idx != -1:
+             output = output[idx + len(bp):].strip().lstrip(":").strip()
+
+     # Make sure the question starts with a capital letter
+     if output and not output[0].isupper():
+         output = output.capitalize()
+
+     if "?" not in output:
+         output += "?"
+
+     return f"({next_cat}) {output}"
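A minimal driver sketch (not part of this commit) showing how the rewritten generate_question is meant to be called in a loop; the user id and first answer are hypothetical, and it assumes the module is importable as core.interviewer:

    from core.interviewer import generate_question

    answer = "I spend most weekends hiking alone."  # hypothetical first answer
    for _ in range(9):  # 8 MBTI axes, plus the final completion message
        reply = generate_question("user-123", answer)  # "user-123" is a made-up id
        print(reply)
        if reply.startswith("✅"):
            break
        answer = input("> ")  # each new reply becomes context for the next question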