davidmezzetti
committed on
Commit
•
03883d3
1
Parent(s):
fec9519
Add model
Browse files
- .gitattributes +2 -0
- README.md +51 -1
- config.json +28 -0
- documents +3 -0
- embeddings +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
documents filter=lfs diff=lfs merge=lfs -text
|
36 |
+
embeddings filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,3 +1,53 @@
|
|
1 |
---
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
inference: false
|
3 |
+
language: en
|
4 |
+
license:
|
5 |
+
- cc-by-sa-3.0
|
6 |
+
- gfdl
|
7 |
+
library_name: txtai
|
8 |
+
tags:
|
9 |
+
- sentence-similarity
|
10 |
+
datasets:
|
11 |
+
- olm/olm-wikipedia-20221220
|
12 |
---
|
13 |
+
|
14 |
+
# Wikipedia txtai embeddings index
|
15 |
+
|
16 |
+
This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
|
17 |
+
|
18 |
+
This index is built from the [OLM Wikipedia December 2022 dataset](https://huggingface.co/datasets/olm/olm-wikipedia-20221220).
|
19 |
+
Only the first paragraph of the [lead section](https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Lead_section) from each article is included in the index.
|
20 |
+
This is similar to an abstract of the article.
|
21 |
+
|
22 |
+
It also uses [Wikipedia Page Views](https://dumps.wikimedia.org/other/pageviews/readme.html) data to add a `percentile` field. The `percentile` field can be used
|
23 |
+
to match only commonly visited pages.
|
24 |
+
|
25 |
+
txtai must be [installed](https://neuml.github.io/txtai/install/) to use this model.
|
26 |
+
|
27 |
+
## Example
|
28 |
+
|
29 |
+
txtai version 5.4 added support for loading embeddings indexes from the Hugging Face Hub. See the example below.
|
30 |
+
|
31 |
+
```python
|
32 |
+
from txtai.embeddings import Embeddings
|
33 |
+
|
34 |
+
# Load the index from the HF Hub
|
35 |
+
embeddings = Embeddings()
|
36 |
+
embeddings.load(provider="huggingface-hub", container="neuml/txtai-wikipedia")
|
37 |
+
|
38 |
+
# Run a search
|
39 |
+
embeddings.search("Roman Empire")
|
40 |
+
|
41 |
+
# Run a search matching only the Top 1% of articles
|
42 |
+
embeddings.search("""
|
43 |
+
SELECT id, text, score, percentile FROM txtai WHERE similar('Boston') AND
|
44 |
+
percentile >= 0.99
|
45 |
+
""")
|
46 |
+
```
|
47 |
+
|
48 |
+
## Use Cases
|
49 |
+
|
50 |
+
An embeddings index generated by txtai is a fully encapsulated index format. It doesn't require a database server or dependencies outside of the Python install.
|
51 |
+
|
52 |
+
The Wikipedia index works well as a fact-based context source for conversational search. In other words, search results from this model can be passed to LLM prompts as the
|
53 |
+
context in which to answer questions.
|
config.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"format": "json",
|
3 |
+
"path": "intfloat/e5-base",
|
4 |
+
"instructions": {
|
5 |
+
"query": "query: ",
|
6 |
+
"data": "passage: "
|
7 |
+
},
|
8 |
+
"batch": 8192,
|
9 |
+
"encodebatch": 128,
|
10 |
+
"faiss": {
|
11 |
+
"quantize": true,
|
12 |
+
"sample": 0.05
|
13 |
+
},
|
14 |
+
"content": true,
|
15 |
+
"dimensions": 768,
|
16 |
+
"backend": "faiss",
|
17 |
+
"offset": 6013092,
|
18 |
+
"build": {
|
19 |
+
"create": "2023-02-20T21:57:46Z",
|
20 |
+
"python": "3.7.16",
|
21 |
+
"settings": {
|
22 |
+
"components": "IVF2193,SQ8"
|
23 |
+
},
|
24 |
+
"system": "Linux (x86_64)",
|
25 |
+
"txtai": "5.4.0"
|
26 |
+
},
|
27 |
+
"update": "2023-02-20T21:57:46Z"
|
28 |
+
}
|
documents
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fecb90d975ac6d48caabd1b7a5a4b94350b1af6c052e28dbc6fab4afa6051708
|
3 |
+
size 3138019328
|
embeddings
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fb7bd5798472eb259459edf5037231198814b639649239b05799253db2df8529
|
3 |
+
size 4672920160
|