ritaranx committed on
Commit
87974da
1 Parent(s): 784f855

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +10 -5
README.md CHANGED
@@ -45,17 +45,18 @@ from transformers import AutoTokenizer, AutoModel
45
 
46
  def last_token_pool(last_hidden_states: Tensor,
47
  attention_mask: Tensor) -> Tensor:
 
48
  left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
49
  if left_padding:
50
- return last_hidden_states[:, -1]
51
  else:
52
  sequence_lengths = attention_mask.sum(dim=1) - 1
53
- batch_size = last_hidden_states.shape[0]
54
- return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
55
-
56
 
57
  def get_detailed_instruct_query(task_description: str, query: str) -> str:
58
- return f'Instruct: {task_description}\nQuery: {query}'
59
 
60
  def get_detailed_instruct_passage(passage: str) -> str:
61
  return f'Represent this passage\npassage: {passage}'
@@ -79,6 +80,10 @@ max_length = 512
79
  # Tokenize the input texts
80
  batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
81
 
 
 
 
 
82
  model.eval()
83
  with torch.no_grad():
84
  outputs = model(**batch_dict)
 
45
 
46
  def last_token_pool(last_hidden_states: Tensor,
47
  attention_mask: Tensor) -> Tensor:
48
+ last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
49
  left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
50
  if left_padding:
51
+ embedding = last_hidden[:, -1]
52
  else:
53
  sequence_lengths = attention_mask.sum(dim=1) - 1
54
+ batch_size = last_hidden.shape[0]
55
+ embedding = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths]
56
+ return embedding
57
 
58
  def get_detailed_instruct_query(task_description: str, query: str) -> str:
59
+ return f'{task_description}\nQuery: {query}'
60
 
61
  def get_detailed_instruct_passage(passage: str) -> str:
62
  return f'Represent this passage\npassage: {passage}'
 
80
  # Tokenize the input texts
81
  batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
82
 
83
+ # Important! Adding EOS token at the end
84
+ batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
85
+ batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt').to("cuda")
86
+
87
  model.eval()
88
  with torch.no_grad():
89
  outputs = model(**batch_dict)