davidmezzetti committed on
Commit
03883d3
1 Parent(s): fec9519
Files changed (5) hide show
  1. .gitattributes +2 -0
  2. README.md +51 -1
  3. config.json +28 -0
  4. documents +3 -0
  5. embeddings +3 -0
.gitattributes CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ documents filter=lfs diff=lfs merge=lfs -text
36
+ embeddings filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,53 @@
1
  ---
2
- license: cc-by-sa-3.0
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ inference: false
3
+ language: en
4
+ license:
5
+ - cc-by-sa-3.0
6
+ - gfdl
7
+ library_name: txtai
8
+ tags:
9
+ - sentence-similarity
10
+ datasets:
11
+ - olm/olm-wikipedia-20221220
12
  ---
13
+
14
+ # Wikipedia txtai embeddings index
15
+
16
+ This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
17
+
18
+ This index is built from the [OLM Wikipedia December 2022 dataset](https://huggingface.co/datasets/olm/olm-wikipedia-20221220).
19
+ Only the first paragraph of the [lead section](https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Lead_section) from each article is included in the index.
20
+ This is similar to an abstract of the article.
21
+
22
+ It also uses [Wikipedia Page Views](https://dumps.wikimedia.org/other/pageviews/readme.html) data to add a `percentile` field. The `percentile` field can be used
23
+ to only match commonly visited pages.
24
+
25
+ txtai must be [installed](https://neuml.github.io/txtai/install/) to use this model.
26
+
27
+ ## Example
28
+
29
+ txtai 5.4 added support for loading embeddings indexes from the Hugging Face Hub. See the example below.
30
+
31
+ ```python
32
+ from txtai.embeddings import Embeddings
33
+
34
+ # Load the index from the HF Hub
35
+ embeddings = Embeddings()
36
+ embeddings.load(provider="huggingface-hub", container="neuml/txtai-wikipedia")
37
+
38
+ # Run a search
39
+ embeddings.search("Roman Empire")
40
+
41
+ # Run a search matching only the top 1% of articles by page views
42
+ embeddings.search("""
43
+ SELECT id, text, score, percentile FROM txtai WHERE similar('Boston') AND
44
+ percentile >= 0.99
45
+ """)
46
+ ```
47
+
48
+ ## Use Cases
49
+
50
+ An embeddings index generated by txtai is a fully encapsulated index format. It doesn't require a database server or dependencies outside of the Python install.
51
+
52
+ The Wikipedia index works well as a fact-based context source for conversational search. In other words, search results from this model can be passed to LLM prompts as the
53
+ context in which to answer questions.
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "format": "json",
3
+ "path": "intfloat/e5-base",
4
+ "instructions": {
5
+ "query": "query: ",
6
+ "data": "passage: "
7
+ },
8
+ "batch": 8192,
9
+ "encodebatch": 128,
10
+ "faiss": {
11
+ "quantize": true,
12
+ "sample": 0.05
13
+ },
14
+ "content": true,
15
+ "dimensions": 768,
16
+ "backend": "faiss",
17
+ "offset": 6013092,
18
+ "build": {
19
+ "create": "2023-02-20T21:57:46Z",
20
+ "python": "3.7.16",
21
+ "settings": {
22
+ "components": "IVF2193,SQ8"
23
+ },
24
+ "system": "Linux (x86_64)",
25
+ "txtai": "5.4.0"
26
+ },
27
+ "update": "2023-02-20T21:57:46Z"
28
+ }
documents ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fecb90d975ac6d48caabd1b7a5a4b94350b1af6c052e28dbc6fab4afa6051708
3
+ size 3138019328
embeddings ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb7bd5798472eb259459edf5037231198814b639649239b05799253db2df8529
3
+ size 4672920160