tuliodisanto committed
Commit 218b8c6 · verified · 1 Parent(s): f29ffb9

Update app.py

Files changed (1)
  1. app.py +27 -36
app.py CHANGED
@@ -5,7 +5,6 @@ import os
 import sys
 import traceback
 import subprocess
-# A importação do SentenceTransformer (Bi-Encoder) não é mais necessária
 from sentence_transformers import CrossEncoder
 import csv
 from collections import defaultdict
@@ -50,40 +49,35 @@ DATA_HAS_CHANGED = False
 
 # --- Funções de Feedback ---
 def normalize_text_for_feedback(text):
+    """Função de normalização usada para consistência no arquivo de feedback."""
     if pd.isna(text): return ""
     try:
-        from enhanced_search_v2 import normalize_text as es_normalize_text
-        return es_normalize_text(str(text).strip())
+        from enhanced_search_v2 import sanitize_text as es_sanitize_text
+        return es_sanitize_text(str(text).strip())
     except ImportError:
         import unidecode
-        return unidecode.unidecode(str(text).lower().strip())
+        # Fallback de higienização caso o import falhe
+        normalized = unidecode.unidecode(str(text).lower())
+        sanitized = re.sub(r'[^\w\s]', ' ', normalized)
+        return re.sub(r'\s+', ' ', sanitized).strip()
 
 def load_user_feedback():
+    """Carrega o arquivo de feedback e compila as contagens de 'melhor correspondência' por query."""
     global USER_BEST_MATCHES_COUNTS
-    USER_BEST_MATCHES_COUNTS = {}
+    USER_BEST_MATCHES_COUNTS = defaultdict(lambda: defaultdict(int))
     feedback_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), USER_FEEDBACK_FILE)
     if not os.path.exists(feedback_file_path):
         with open(feedback_file_path, 'w', newline='', encoding='utf-8') as f: csv.writer(f).writerow(FEEDBACK_CSV_COLUMNS)
         return
     try:
         with open(feedback_file_path, 'r', encoding='utf-8') as f:
-            reader = csv.reader(f)
-            try:
-                header = next(reader)
-                if [col.strip() for col in header] != FEEDBACK_CSV_COLUMNS:
-                    print(f"--- [AVISO] Cabeçalho do {USER_FEEDBACK_FILE} incorreto.")
-                    return
-            except StopIteration:
-                return
-
+            reader = csv.DictReader(f)
             for row in reader:
-                if len(row) == len(FEEDBACK_CSV_COLUMNS):
-                    row_dict = dict(zip(FEEDBACK_CSV_COLUMNS, row))
-                    query_norm, tuss_code = row_dict.get('query_normalized', ''), row_dict.get('tuss_code_submitted', '')
-                    if query_norm and tuss_code:
-                        if query_norm not in USER_BEST_MATCHES_COUNTS: USER_BEST_MATCHES_COUNTS[query_norm] = {}
-                        USER_BEST_MATCHES_COUNTS[query_norm][tuss_code] = USER_BEST_MATCHES_COUNTS[query_norm].get(tuss_code, 0) + 1
-        print(f"--- [SUCESSO] Feedback de usuário carregado/sincronizado. ---")
+                query_norm = row.get('query_normalized', '')
+                tuss_code = row.get('tuss_code_submitted', '')
+                if query_norm and tuss_code:
+                    USER_BEST_MATCHES_COUNTS[query_norm][tuss_code] += 1
+        print(f"--- [SUCESSO] Feedback de usuário carregado. {len(USER_BEST_MATCHES_COUNTS)} queries com feedback. ---")
     except Exception as e: print(f"--- [ERRO] Falha ao carregar feedback: {e} ---"); traceback.print_exc()
 
 # --- Execução de Scripts e Importações ---
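
Note on the hunk above: the rewritten load_user_feedback drops the manual header check in favour of csv.DictReader and counts votes in a nested defaultdict. A minimal, self-contained sketch of that pattern follows; the column names come from the diff, while the helper name normalize_fallback is introduced here only for illustration and mirrors the unidecode + regex fallback added to normalize_text_for_feedback.

    import csv
    import re
    from collections import defaultdict

    import unidecode

    def normalize_fallback(text):
        # Same idea as the fallback branch above: strip accents, drop punctuation, collapse whitespace.
        normalized = unidecode.unidecode(str(text).lower())
        sanitized = re.sub(r'[^\w\s]', ' ', normalized)
        return re.sub(r'\s+', ' ', sanitized).strip()

    def load_counts(path):
        # query_normalized -> tuss_code_submitted -> number of times users confirmed that code.
        counts = defaultdict(lambda: defaultdict(int))
        with open(path, 'r', encoding='utf-8') as f:
            for row in csv.DictReader(f):  # DictReader consumes the header and yields dicts keyed by column name
                query_norm = row.get('query_normalized', '')
                tuss_code = row.get('tuss_code_submitted', '')
                if query_norm and tuss_code:
                    counts[query_norm][tuss_code] += 1
        return counts

    print(normalize_fallback('Ressonância magnética - crânio!'))  # -> 'ressonancia magnetica cranio'
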
@@ -99,7 +93,6 @@ app = Flask(__name__)
 DF_ORIGINAL, DF_NORMALIZED, FUZZY_CORPUS, BM25_MODEL, DB_WORD_SET, doc_freq, tuss_map = (None, None, None, None, set(), {}, {})
 CORRECTION_CORPUS, NORMALIZED_CORRECTION_CORPUS = [], []
 PORTUGUESE_WORD_SET = set()
-# O Bi-Encoder (SEMANTIC_MODEL) não é mais usado, então a variável foi removida.
 CROSS_ENCODER_MODEL = None
 
 try:
@@ -113,7 +106,6 @@ try:
     PORTUGUESE_WORD_SET = load_general_dictionary(general_dict_path)
     load_user_feedback()
 
-    # O carregamento do Bi-Encoder (SEMANTIC_MODEL) foi removido para economizar memória.
     print("\n--- [SETUP] Carregando modelo Cross-Encoder (Etapa de reordenação)... ---")
     cross_encoder_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
     CROSS_ENCODER_MODEL = CrossEncoder(cross_encoder_model_name, device='cpu')
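
Note on the hunk above: only the Cross-Encoder is loaded now; the Bi-Encoder stage is gone. For context, a minimal sketch of how a sentence-transformers CrossEncoder is used to rerank candidates; the query and candidate strings are invented for illustration.

    from sentence_transformers import CrossEncoder

    # Same model name and device as in the diff.
    model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cpu')

    query = 'ressonancia magnetica do cranio'
    candidates = [
        'RM de cranio (encefalo)',              # made-up procedure descriptions
        'Tomografia computadorizada do torax',
    ]

    # predict() scores each (query, candidate) pair; higher means more relevant,
    # so the reranking step sorts candidates by descending score.
    scores = model.predict([(query, c) for c in candidates])
    for text, score in sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True):
        print(f'{score:.3f}  {text}')
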
@@ -135,19 +127,18 @@ def search():
     data = request.get_json()
     query = data.get('query', '').strip()
 
-    # O parâmetro 'semantic_model' foi removido da chamada da função
+    # CORREÇÃO: A chamada da função foi atualizada para corresponder à nova assinatura
+    # em 'enhanced_search_v2.py', removendo os argumentos 'tuss_map' e 'limit_per_layer'.
     results = search_procedure_with_log(
-        query,
-        DF_ORIGINAL,
-        DF_NORMALIZED,
-        FUZZY_CORPUS,
-        (CORRECTION_CORPUS, NORMALIZED_CORRECTION_CORPUS),
-        PORTUGUESE_WORD_SET,
-        BM25_MODEL,
-        DB_WORD_SET,
-        doc_freq,
-        tuss_map,
-        limit_per_layer=15,
+        query=query,
+        df_original=DF_ORIGINAL,
+        df_normalized=DF_NORMALIZED,
+        fuzzy_search_corpus=FUZZY_CORPUS,
+        correction_corpus=(CORRECTION_CORPUS, NORMALIZED_CORRECTION_CORPUS),
+        portuguese_word_set=PORTUGUESE_WORD_SET,
+        bm25_model=BM25_MODEL,
+        db_word_set=DB_WORD_SET,
+        doc_freq=doc_freq,
         cross_encoder_model=CROSS_ENCODER_MODEL,
         user_best_matches_counts=USER_BEST_MATCHES_COUNTS,
         user_feedback_threshold=USER_FEEDBACK_THRESHOLD
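
Note on the hunk above: the call now uses keyword arguments and no longer passes tuss_map or limit_per_layer. The parameter names below are copied from that call; the stub itself is a purely hypothetical illustration of the signature the new enhanced_search_v2.search_procedure_with_log presumably exposes, not its actual code.

    # Hypothetical stub; the real implementation lives in enhanced_search_v2.py.
    def search_procedure_with_log(query, df_original, df_normalized, fuzzy_search_corpus,
                                  correction_corpus, portuguese_word_set, bm25_model,
                                  db_word_set, doc_freq, cross_encoder_model=None,
                                  user_best_matches_counts=None, user_feedback_threshold=0):
        """Return ranked procedure candidates for `query`."""
        raise NotImplementedError

    # Calling it with keyword arguments (as app.py now does) means any further signature
    # change raises a TypeError immediately instead of silently binding values to the
    # wrong positional parameters.
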
@@ -184,7 +175,7 @@ def submit_feedback_route():
 
     DATA_HAS_CHANGED = True
     print(f"--- [DADOS] '{USER_FEEDBACK_FILE}' foi modificado. Commit agendado para o desligamento. ---")
-
+
     load_user_feedback()
 
     return jsonify({"status": "success", "message": "Feedback recebido!"}), 200
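
Note on the final hunk: after a submission is persisted, the route calls load_user_feedback() again so the in-memory counts take effect immediately. A rough smoke test with Flask's test client; the '/submit_feedback' path and the payload fields are guesses, since the route decorator and request parsing sit outside this diff.

    from app import app  # importing app.py runs the full setup, including the Cross-Encoder load

    with app.test_client() as client:
        # Endpoint path and JSON fields are placeholders; adjust to the real route.
        resp = client.post('/submit_feedback', json={
            'query': 'ressonancia magnetica do cranio',
            'tuss_code': '00000000',  # placeholder code
        })
        print(resp.status_code, resp.get_json())
        # On success the route returns 200 with {"status": "success", "message": "Feedback recebido!"}
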
 