Tom Aarsen committed
Commit c67b216 · 1 Parent(s): 033205f

Automatically add EOS via Tokenizer, add Sentence Transformers snippet

Files changed (2)
  1. README.md +52 -7
  2. tokenizer.json +2 -2
README.md CHANGED
@@ -5,9 +5,11 @@ datasets:
 - codefuse-ai/F2LLM
 language:
 - en
+tags:
+- transformers
 license: apache-2.0
 pipeline_tag: feature-extraction
-library_name: transformers
+library_name: sentence-transformers
 ---
 
 # F2LLM-4B: Matching SOTA Embedding Performance with 6 Million Open-Source Data
@@ -20,7 +22,38 @@ F2LLMs (Foundation to Feature Large Language Models) are foundation models direc
 
 ## Usage
 
-To encode a batch of sentences:
+### With Sentence Transformers
+
+To encode text using F2LLM with the [Sentence Transformers](https://www.sbert.net/) library:
+
+```python
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer("codefuse-ai/F2LLM-4B", model_kwargs={"torch_dtype": "bfloat16"})
+
+# Some sample query and documents
+query = "What is F2LLM used for?"
+documents = [
+    'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
+    'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.',
+    'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.'
+]
+
+# Encode the query and documents separately, the encode_query method uses the query prompt
+query_embedding = model.encode_query(query)
+document_embeddings = model.encode_document(documents)
+print(query_embedding.shape, document_embeddings.shape)
+# (2560,) (3, 2560)
+
+# Compute cosine similarity between the query and documents
+similarity = model.similarity(query_embedding, document_embeddings)
+print(similarity)
+# tensor([[0.5209, 0.5680, 0.7818]])
+```
+
+### With Transformers
+
+Or directly with the [Transformers](https://huggingface.co/docs/transformers/index) library:
 
 ```python
 from transformers import AutoModel, AutoTokenizer
@@ -32,22 +65,34 @@ model_path = "codefuse-ai/F2LLM-4B"
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map={'': 0})
 
-sentences = [
+query = "What is F2LLM used for?"
+query_prompt = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:"
+documents = [
     'We present F2LLM, a family of fully open embedding LLMs that achieve a strong balance between model size, training data, and embedding performance.',
-    'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.'
+    'Model checkpoints, training datasets, and training code are released, positioning F2LLM as a strong, reproducible, and budget-friendly baseline for future research in text embedding models.',
+    'F2LLM is a model for computing text embeddings that can be used for various NLP tasks such as information retrieval, semantic search, and text classification.'
 ]
 
 def encode(sentences):
     batch_size = len(sentences)
-    sentences = [s+tokenizer.eos_token for s in sentences]
-    tokenized_inputs = tokenizer(sentences, padding=True, return_tensors='pt', add_special_tokens=False).to(model.device)
+    tokenized_inputs = tokenizer(sentences, padding=True, return_tensors='pt').to(model.device)
     last_hidden_state = model(**tokenized_inputs).last_hidden_state
     eos_positions = tokenized_inputs.attention_mask.sum(dim=1) - 1
     embeddings = last_hidden_state[torch.arange(batch_size, device=model.device), eos_positions]
     embeddings = F.normalize(embeddings, p=2, dim=1)
     return embeddings
 
-embeddings = encode(sentences)
+# Encode the query and documents
+query_embedding = encode([query_prompt + query])
+document_embeddings = encode(documents)
+print(query_embedding.shape, document_embeddings.shape)
+# torch.Size([1, 2560]) torch.Size([3, 2560])
+
+# Compute cosine similarity between the query and documents
+similarity = query_embedding @ document_embeddings.T
+print(similarity)
+# tensor([[0.5156, 0.5664, 0.7773]], device='cuda:0', dtype=torch.bfloat16,
+#        grad_fn=<MmBackward0>)
 ```
 
 ## Evaluation
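
The updated Transformers snippet no longer appends `tokenizer.eos_token` manually or passes `add_special_tokens=False`; per the commit title, the EOS token is now added by the tokenizer itself. A minimal sketch to check that behavior against the updated tokenizer (the test sentence and variable names are illustrative only):

```python
from transformers import AutoTokenizer

# Load the tokenizer updated in this commit
tokenizer = AutoTokenizer.from_pretrained("codefuse-ai/F2LLM-4B")

# No manual EOS handling: rely on the tokenizer's own special-token logic
ids = tokenizer("A quick test sentence.")["input_ids"]

# If EOS is appended automatically, the last id is the EOS id
print(ids[-1] == tokenizer.eos_token_id)  # expected: True
```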
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
-size 11422654
+oid sha256:38360d5a512a43641b36d6fba2df87b8a3f5464c6b5c76f03e82d6d795175566
+size 11423195
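
The tokenizer.json change is visible here only as an LFS pointer swap (new oid, size up by 541 bytes), so the diff does not show how the automatic EOS append is wired in. For orientation, a hypothetical sketch of one common way to bake this into tokenizer.json, using the `tokenizers` TemplateProcessing post-processor; this is an assumption about the mechanism, not a reproduction of this commit's actual change:

```python
from transformers import AutoTokenizer
from tokenizers.processors import TemplateProcessing

tokenizer = AutoTokenizer.from_pretrained("codefuse-ai/F2LLM-4B")
eos = tokenizer.eos_token

# Post-processor that appends EOS to every encoded sequence (hypothetical config)
tokenizer.backend_tokenizer.post_processor = TemplateProcessing(
    single=f"$A {eos}",
    pair=f"$A {eos} $B:1 {eos}:1",
    special_tokens=[(eos, tokenizer.eos_token_id)],
)

# Saving would serialize the post-processor into tokenizer.json
# tokenizer.save_pretrained("f2llm-4b-with-eos")
```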