fix parameter name in eval_mteb

#6
Files changed (1)
  1. scripts/eval_mteb.py +6 -3
scripts/eval_mteb.py CHANGED
@@ -119,6 +119,7 @@ CMTEB_TASK_LIST = ['TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'Onl
119
  'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
120
  'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC', 'STS22']
121
 
 
122
  MTEB_PL = [
123
  "CBD","PolEmo2.0-IN","PolEmo2.0-OUT","AllegroReviews","PAC","MassiveIntentClassification","MassiveScenarioClassification",
124
  "SICK-E-PL","PPC","CDSC-E","PSC","8TagsClustering","SICK-R-PL","CDSC-R","STS22",
@@ -405,6 +406,8 @@ class Wrapper:
405
  self._target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
406
  self.eod_id = self.tokenizer.convert_tokens_to_ids("<|endoftext|>")
407
  self.instruction = instruction
 
 
408
 
409
  if self.tokenizer.padding_side != 'right':
410
  logger.warning(f"Change tokenizer.padding_side from {self.tokenizer.padding_side} to right")
@@ -544,9 +547,9 @@ class Wrapper:
544
 
545
  def _tokenize(self, sentences: List[str], is_query: bool):
546
 
547
- batch_dict = tokenizer(sentences, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
548
- batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
549
- batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
550
  batch_dict['is_causal'] = False
551
  return batch_dict
552
 
 
119
  'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
120
  'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC', 'STS22']
121
 
122
+
123
  MTEB_PL = [
124
  "CBD","PolEmo2.0-IN","PolEmo2.0-OUT","AllegroReviews","PAC","MassiveIntentClassification","MassiveScenarioClassification",
125
  "SICK-E-PL","PPC","CDSC-E","PSC","8TagsClustering","SICK-R-PL","CDSC-R","STS22",
 
406
  self._target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
407
  self.eod_id = self.tokenizer.convert_tokens_to_ids("<|endoftext|>")
408
  self.instruction = instruction
409
+ self.default_query = default_query
410
+ self.force_default = force_default
411
 
412
  if self.tokenizer.padding_side != 'right':
413
  logger.warning(f"Change tokenizer.padding_side from {self.tokenizer.padding_side} to right")
 
547
 
548
  def _tokenize(self, sentences: List[str], is_query: bool):
549
 
550
+ batch_dict = self.tokenizer(sentences, max_length=self.max_seq_len - 1, return_attention_mask=False, padding=False, truncation=True)
551
+ batch_dict['input_ids'] = [input_ids + [self.tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
552
+ batch_dict = self.tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
553
  batch_dict['is_causal'] = False
554
  return batch_dict
555