raphaelsty committed on
Commit
2e6cde3
Parent: a05a32a

Create README.md

Files changed (1): README.md (+61, -0)
README.md ADDED
@@ -0,0 +1,61 @@
---
language:
- en
---

This model was trained with [Sparsembed](https://github.com/raphaelsty/sparsembed); see the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository for details on how to use it.

```sh
pip install sparsembed
```

```python
from sparsembed import model, retrieve
from transformers import AutoModelForMaskedLM, AutoTokenizer

device = "cuda"  # or "cpu"

batch_size = 10

# Documents to index:
documents = [
    {
        "id": 0,
        "title": "Paris",
        "url": "https://en.wikipedia.org/wiki/Paris",
        "text": "Paris is the capital and most populous city of France.",
    },
    {
        "id": 1,
        "title": "Paris",
        "url": "https://en.wikipedia.org/wiki/Paris",
        "text": "Since the 17th century, Paris has been one of Europe's major centres of science, and arts.",
    },
    {
        "id": 2,
        "title": "Paris",
        "url": "https://en.wikipedia.org/wiki/Paris",
        "text": "The City of Paris is the centre and seat of government of the region and province of Île-de-France.",
    },
]

model = model.Splade(
    model=AutoModelForMaskedLM.from_pretrained("raphaelsty/splade_max").to(device),
    tokenizer=AutoTokenizer.from_pretrained("raphaelsty/splade_max"),
    device=device,
)

retriever = retrieve.SpladeRetriever(
    key="id",  # Unique identifier of each document.
    on=["title", "text"],  # Fields to search.
    model=model,  # Splade model used to encode documents and queries.
)

retriever = retriever.add(
    documents=documents,
    batch_size=batch_size,
    k_tokens=256,  # Number of activated tokens per document.
)

retriever(
    ["paris", "Toulouse"],  # Queries.
    k_tokens=20,  # Maximum number of activated tokens per query.
    k=100,  # Number of documents to retrieve.
    batch_size=batch_size,
)
```
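
Below is a minimal sketch (not part of the original card) of how you might inspect the retriever's output, continuing from the example above. It assumes the retriever returns one ranked list of matches per query, each match being a dict that carries the document key (`id`) and a score; the exact field names (e.g. `similarity`) are assumptions, so check the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository for the precise output format.

```python
# Minimal sketch: print the top matches per query.
# Assumption: the retriever returns one list of matches per query, where each
# match is a dict with the document key ("id") and a score (e.g. "similarity").
queries = ["paris", "Toulouse"]

matches = retriever(
    queries,
    k_tokens=20,      # Maximum number of activated tokens per query.
    k=3,              # Keep only the top 3 documents per query for display.
    batch_size=batch_size,
)

for query, query_matches in zip(queries, matches):
    print(query)
    for match in query_matches:
        print(match)  # e.g. {"id": 0, "similarity": ...}
```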