nn-semantic-search-api-hf-ep8-with-teasers

Runtime error

App Files Community

muryshev committed on Feb 5

Commit

58de913

•

1 Parent(s): d84c926

Updated search

Browse files

Files changed (2) hide show

legal_info_search_data/internal_docs.json +0 -0
semantic_search.py +10 -3

legal_info_search_data/internal_docs.json ADDED Viewed

The diff for this file is too large to render. See raw diff

semantic_search.py CHANGED Viewed

@@ -18,7 +18,7 @@ from legal_info_search_utils.metrics import calculate_metrics_at_k
 global_data_path = os.environ.get("GLOBAL_DATA_PATH", "legal_info_search_data/")
 global_model_path = os.environ.get("GLOBAL_MODEL_PATH",
-                                   "legal_info_search_model/20240120_122822_ep6/")
 # размеченные консультации
 data_path_consult = os.environ.get("DATA_PATH_CONSULT",
@@ -28,6 +28,10 @@ data_path_consult = os.environ.get("DATA_PATH_CONSULT",
 data_path_consult_ids = os.environ.get("DATA_PATH_CONSULT_IDS",
                                        global_data_path + "data_ids.json")
 # состав БД
 # $ export DB_SUBSETS='["train", "valid", "test"]'
 db_subsets = os.environ.get("DB_SUBSETS", ["train", "valid", "test"])
@@ -78,20 +82,23 @@ class SemanticSearch:
         with open(data_path_consult_ids, "r", encoding="utf-8") as f:
             data_ids = json.load(f)
         db_data = get_subsets_for_db(db_subsets, data_ids, all_docs)
         filtered_all_docs = filter_qa_data_types(db_data_types, all_docs)
         self.mean_refs_count = self.get_mean_refs_counts(db_data_types, filtered_all_docs)
         self.filtered_db_data = filter_db_data_types(db_data_types, db_data)
         self.all_docs_qa = get_subsets_for_qa(["valid"], data_ids, filtered_all_docs)
     def load_model(self):
         if hf_token and hf_model_name:
-            print('Using model '+hf_model_name)
             self.tokenizer = AutoTokenizer.from_pretrained(hf_model_name, use_auth_token=True)
             self.model = AutoModel.from_pretrained(hf_model_name, use_auth_token=True).to(self.device)
         else:
-            print('Using model '+global_model_path)
             self.tokenizer = AutoTokenizer.from_pretrained(global_model_path)
             self.model = AutoModel.from_pretrained(global_model_path).to(self.device)

 global_data_path = os.environ.get("GLOBAL_DATA_PATH", "legal_info_search_data/")
 global_model_path = os.environ.get("GLOBAL_MODEL_PATH",
+                                   "legal_info_search_model/20240202_204910_ep8/")
 # размеченные консультации
 data_path_consult = os.environ.get("DATA_PATH_CONSULT",
 data_path_consult_ids = os.environ.get("DATA_PATH_CONSULT_IDS",
                                        global_data_path + "data_ids.json")
+# предобработанные внутренние документы
+data_path_internal_docs = os.environ.get("DATA_PATH_INTERNAL_DOCS",
+                                       global_data_path + "internal_docs.json")
 # состав БД
 # $ export DB_SUBSETS='["train", "valid", "test"]'
 db_subsets = os.environ.get("DB_SUBSETS", ["train", "valid", "test"])
         with open(data_path_consult_ids, "r", encoding="utf-8") as f:
             data_ids = json.load(f)
+        with open(data_path_internal_docs, "r", encoding="utf-8") as f:
+            internal_docs = json.load(f)
         db_data = get_subsets_for_db(db_subsets, data_ids, all_docs)
         filtered_all_docs = filter_qa_data_types(db_data_types, all_docs)
         self.mean_refs_count = self.get_mean_refs_counts(db_data_types, filtered_all_docs)
+        self.mean_refs_count['Внутренний документ'] = 3
         self.filtered_db_data = filter_db_data_types(db_data_types, db_data)
+        self.filtered_db_data.update(internal_docs)
         self.all_docs_qa = get_subsets_for_qa(["valid"], data_ids, filtered_all_docs)
     def load_model(self):
         if hf_token and hf_model_name:
             self.tokenizer = AutoTokenizer.from_pretrained(hf_model_name, use_auth_token=True)
             self.model = AutoModel.from_pretrained(hf_model_name, use_auth_token=True).to(self.device)
         else:
             self.tokenizer = AutoTokenizer.from_pretrained(global_model_path)
             self.model = AutoModel.from_pretrained(global_model_path).to(self.device)