Tom Aarsen committed on
Commit
c28c400
1 Parent(s): 026a71c

Add Sentence Transformers integration + README

Browse files
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": true,
4
+ "pooling_mode_mean_tokens": false,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
README.md CHANGED
@@ -2936,6 +2936,39 @@ Based on the [intfloat/e5-large-unsupervised](https://huggingface.co/intfloat/e5
2936
  ## Usage
2937
 
2938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2939
  ### Using Huggingface transformers
2940
 
2941
 
@@ -2948,7 +2981,7 @@ import torch
2948
  from transformers import AutoModel, AutoTokenizer
2949
 
2950
  tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-m-long')
2951
- model = AutoModel.from_pretrained('Snowflake/snowflake-arctic-embed-m-long', add_pooling_layer=False)
2952
  model.eval()
2953
 
2954
  query_prefix = 'Represent this sentence for searching relevant passages: '
 
2936
  ## Usage
2937
 
2938
 
2939
+ ### Using Sentence Transformers
2940
+
2941
+ You can use the sentence-transformers package to use a snowflake-arctic-embed model, as shown below.
2942
+
2943
+ ```python
2944
+ from sentence_transformers import SentenceTransformer
2945
+
2946
+ model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m-long", trust_remote_code=True)
2947
+
2948
+ queries = ['what is snowflake?', 'Where can I get the best tacos?']
2949
+ documents = ['The Data Cloud!', 'Mexico City of Course!']
2950
+
2951
+ query_embeddings = model.encode(queries, prompt_name="query")
2952
+ document_embeddings = model.encode(documents)
2953
+
2954
+ scores = query_embeddings @ document_embeddings.T
2955
+ for query, query_scores in zip(queries, scores):
2956
+ doc_score_pairs = list(zip(documents, query_scores))
2957
+ doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
2958
+ # Output passages & scores
2959
+ print("Query:", query)
2960
+ for document, score in doc_score_pairs:
2961
+ print(score, document)
2962
+ ```
2963
+ ```
2964
+ Query: what is snowflake?
2965
+ 0.46484852 The Data Cloud!
2966
+ 0.3758855 Mexico City of Course!
2967
+ Query: Where can I get the best tacos?
2968
+ 0.42407742 Mexico City of Course!
2969
+ 0.36740506 The Data Cloud!
2970
+ ```
2971
+
2972
  ### Using Huggingface transformers
2973
 
2974
 
 
2981
  from transformers import AutoModel, AutoTokenizer
2982
 
2983
  tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-m-long')
2984
+ model = AutoModel.from_pretrained('Snowflake/snowflake-arctic-embed-m-long', trust_remote_code=True, add_pooling_layer=False)
2985
  model.eval()
2986
 
2987
  query_prefix = 'Represent this sentence for searching relevant passages: '
config_sentence_transformers.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "2.7.0.dev0",
4
+ "transformers": "4.39.3",
5
+ "pytorch": "2.1.0+cu121"
6
+ },
7
+ "prompts": {
8
+ "query": "Represent this sentence for searching relevant passages: "
9
+ },
10
+ "default_prompt_name": null
11
+ }
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 8192,
3
+ "do_lower_case": false
4
+ }