Tom Aarsen committed on
Commit
d43334a
1 Parent(s): 20e3552

Add files for Sentence Transformers integration

Browse files

Note: running it with Transformers or anything else still works as usual :)

1_Pooling/config.json CHANGED
@@ -1,10 +1,9 @@
1
  {
2
- "word_embedding_dimension": 768,
3
- "pooling_mode_cls_token": false,
4
- "pooling_mode_mean_tokens": true,
5
- "pooling_mode_max_tokens": false,
6
- "pooling_mode_mean_sqrt_len_tokens": false,
7
- "pooling_mode_weightedmean_tokens": false,
8
- "pooling_mode_lasttoken": false
9
- }
10
-
 
1
  {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false
9
+ }
 
README.md CHANGED
@@ -2655,6 +2655,17 @@ Training data to train the models is released in its entirety. For more details,
2655
 
2656
  ## Usage
2657
 
 
 
 
 
 
 
 
 
 
 
 
2658
 
2659
  ```python
2660
  import torch
@@ -2669,7 +2680,7 @@ def mean_pooling(model_output, attention_mask):
2669
  sentences = ['What is TSNE?', 'Who is Laurens van der Maaten?']
2670
 
2671
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2672
- model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1-unsupervised', trust_remote_code=True)
2673
 
2674
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
2675
 
@@ -2688,8 +2699,8 @@ The model natively supports scaling of the sequence length past 2048 tokens. To
2688
  + tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
2689
 
2690
 
2691
- - model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1-unsupervised', trust_remote_code=True)
2692
- + model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1-unsupervised', trust_remote_code=True, rotary_scaling_factor=2)
2693
  ```
2694
 
2695
  # Join the Nomic Community
 
2655
 
2656
  ## Usage
2657
 
2658
+ ### Sentence Transformers
2659
+ ```python
2660
+ from sentence_transformers import SentenceTransformer
2661
+
2662
+ model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
2663
+ sentences = ['What is TSNE?', 'Who is Laurens van der Maaten?']
2664
+ embeddings = model.encode(sentences)
2665
+ print(embeddings)
2666
+ ```
2667
+
2668
+ ### Transformers
2669
 
2670
  ```python
2671
  import torch
 
2680
  sentences = ['What is TSNE?', 'Who is Laurens van der Maaten?']
2681
 
2682
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
2683
+ model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
2684
 
2685
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
2686
 
 
2699
  + tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=8192)
2700
 
2701
 
2702
+ - model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)
2703
+ + model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1', trust_remote_code=True, rotary_scaling_factor=2)
2704
  ```
2705
 
2706
  # Join the Nomic Community
config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.4.0.dev0",
4
+ "transformers": "4.37.2",
5
+ "pytorch": "2.1.0+cu121"
6
+ }
7
+ }
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 8192,
3
+ "do_lower_case": false
4
+ }
tokenizer_config.json CHANGED
@@ -45,7 +45,7 @@
45
  "cls_token": "[CLS]",
46
  "do_lower_case": true,
47
  "mask_token": "[MASK]",
48
- "model_max_length": 512,
49
  "pad_token": "[PAD]",
50
  "sep_token": "[SEP]",
51
  "strip_accents": null,
 
45
  "cls_token": "[CLS]",
46
  "do_lower_case": true,
47
  "mask_token": "[MASK]",
48
+ "model_max_length": 8192,
49
  "pad_token": "[PAD]",
50
  "sep_token": "[SEP]",
51
  "strip_accents": null,