nickprock committed
Commit e4c2f96
1 Parent(s): 5aad393

Update README.md

Files changed (1): README.md (+61, -21)
README.md CHANGED
@@ -8,7 +8,7 @@ tags:

---

- # {MODEL_NAME}
+ # mmarco-sentence-flare-it

This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.

@@ -25,12 +25,31 @@ pip install -U sentence-transformers
Then you can use the model like this:

```python
- from sentence_transformers import SentenceTransformer
- sentences = ["This is an example sentence", "Each sentence is converted"]
-
- model = SentenceTransformer('{MODEL_NAME}')
- embeddings = model.encode(sentences)
- print(embeddings)
+ from sentence_transformers import SentenceTransformer, util
+
+ query = "Quante persone vivono a Londra?"
+ docs = ["A Londra vivono circa 9 milioni di persone", "Londra è conosciuta per il suo quartiere finanziario"]
+
+ #Load the model
+ model = SentenceTransformer('nickprock/mmarco-sentence-flare-it')
+
+ #Encode query and documents
+ query_emb = model.encode(query)
+ doc_emb = model.encode(docs)
+
+ #Compute dot score between query and all document embeddings
+ scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
+
+ #Combine docs & scores
+ doc_score_pairs = list(zip(docs, scores))
+
+ #Sort by decreasing score
+ doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+
+ #Output passages & scores
+ for doc, score in doc_score_pairs:
+     print(score, doc)
```
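The README claims the embeddings work for clustering as well as semantic search, though the committed example only demonstrates search. For completeness, here is a minimal clustering sketch, not part of this commit: the model name comes from the diff, but the sentences and the use of scikit-learn's `KMeans` are illustrative assumptions.

```python
# Illustrative sketch (not from the commit): cluster Italian sentences by
# embedding them with the model and running k-means on the vectors.
# Assumes scikit-learn is installed (pip install scikit-learn).
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

sentences = [
    "A Londra vivono circa 9 milioni di persone",            # ~9 million people live in London
    "Londra è conosciuta per il suo quartiere finanziario",  # London is known for its financial district
    "Il gatto dorme sul divano",                             # the cat sleeps on the sofa
    "Il mio gatto gioca in giardino",                        # my cat plays in the garden
]

model = SentenceTransformer('nickprock/mmarco-sentence-flare-it')
embeddings = model.encode(sentences)  # ndarray of shape (4, 384)

# Two clusters expected: one about London, one about cats
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10).fit(embeddings)
for label, sentence in zip(kmeans.labels_, sentences):
    print(label, sentence)
```

For larger corpora, `sentence_transformers.util.community_detection`, which works directly on the embedding matrix, is an alternative worth checking.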
@@ -42,33 +61,54 @@ Without [sentence-transformers](https://www.SBERT.net), you can use the model li
from transformers import AutoTokenizer, AutoModel
import torch

-
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+     token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


+ #Encode text
+ def encode(texts):
+     # Tokenize sentences
+     encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
+
+     # Compute token embeddings
+     with torch.no_grad():
+         model_output = model(**encoded_input, return_dict=True)
+
+     # Perform pooling
+     embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+     return embeddings
+
+
# Sentences we want sentence embeddings for
- sentences = ['This is an example sentence', 'Each sentence is converted']
+ query = "Quante persone vivono a Londra?"
+ docs = ["A Londra vivono circa 9 milioni di persone", "Londra è conosciuta per il suo quartiere finanziario"]

# Load model from HuggingFace Hub
- tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
- model = AutoModel.from_pretrained('{MODEL_NAME}')
+ tokenizer = AutoTokenizer.from_pretrained("nickprock/mmarco-sentence-flare-it")
+ model = AutoModel.from_pretrained("nickprock/mmarco-sentence-flare-it")
+
+ #Encode query and docs
+ query_emb = encode(query)
+ doc_emb = encode(docs)
+
+ #Compute dot score between query and all document embeddings
+ scores = torch.mm(query_emb, doc_emb.transpose(0, 1))[0].cpu().tolist()

- # Tokenize sentences
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+ #Combine docs & scores
+ doc_score_pairs = list(zip(docs, scores))

- # Compute token embeddings
- with torch.no_grad():
-     model_output = model(**encoded_input)
+ #Sort by decreasing score
+ doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

- # Perform pooling. In this case, mean pooling.
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
- print("Sentence embeddings:")
- print(sentence_embeddings)
+ #Output passages & scores
+ print("Query:", query)
+ for doc, score in doc_score_pairs:
+     print(score, doc)
```
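A note on this hunk: `mean_pooling` now reads `model_output.last_hidden_state`, which matches the `return_dict=True` call added in `encode()`. If the model's saved sentence-transformers pipeline is a plain transformer followed by mean pooling (an assumption, not something this diff states), the manual path should reproduce `SentenceTransformer.encode()`; a sanity-check sketch:

```python
# Sanity-check sketch (not from the commit): the manual transformers path
# should match SentenceTransformer.encode(), assuming the model's pipeline
# is Transformer -> mean pooling with no normalization module.
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

texts = ["Una frase di prova"]  # "a test sentence"

st_model = SentenceTransformer('nickprock/mmarco-sentence-flare-it')
st_emb = torch.from_numpy(st_model.encode(texts))

tokenizer = AutoTokenizer.from_pretrained("nickprock/mmarco-sentence-flare-it")
model = AutoModel.from_pretrained("nickprock/mmarco-sentence-flare-it")
encoded = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    out = model(**encoded, return_dict=True)
manual_emb = mean_pooling(out, encoded['attention_mask'])

# Prints True if the two paths agree up to float tolerance
print(torch.allclose(st_emb, manual_emb, atol=1e-5))
```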
@@ -126,4 +166,4 @@ SentenceTransformer(

## Citing & Authors

- <!--- Describe where people can find more information -->
+ More information about the base model can be found [here](https://huggingface.co/osiria/flare-it/).