davidmezzetti
committed on
Commit
•
99e1eb8
1
Parent(s):
576db1d
September 2024 data update
Browse files
- README.md +6 -6
- config.json +6 -6
- documents +2 -2
- embeddings +2 -2
README.md
CHANGED
@@ -8,14 +8,14 @@ library_name: txtai
|
|
8 |
tags:
|
9 |
- sentence-similarity
|
10 |
datasets:
|
11 |
-
- NeuML/wikipedia-
|
12 |
---
|
13 |
|
14 |
# Wikipedia txtai embeddings index
|
15 |
|
16 |
This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
|
17 |
|
18 |
-
This index is built from the [Wikipedia
|
19 |
|
20 |
It also uses [Wikipedia Page Views](https://dumps.wikimedia.org/other/pageviews/readme.html) data to add a `percentile` field. The `percentile` field can be used
|
21 |
to only match commonly visited pages.
|
@@ -24,7 +24,7 @@ txtai must be [installed](https://neuml.github.io/txtai/install/) to use this mo
|
|
24 |
|
25 |
## Example
|
26 |
|
27 |
-
|
28 |
|
29 |
```python
|
30 |
from txtai.embeddings import Embeddings
|
@@ -75,7 +75,7 @@ pip install txtchat mwparserfromhell datasets
|
|
75 |
- Download and build pageviews database
|
76 |
```bash
|
77 |
mkdir -p pageviews/data
|
78 |
-
wget -P pageviews/data https://dumps.wikimedia.org/other/pageview_complete/monthly/
|
79 |
python -m txtchat.data.wikipedia.views -p en.wikipedia -v pageviews
|
80 |
```
|
81 |
|
@@ -85,7 +85,7 @@ python -m txtchat.data.wikipedia.views -p en.wikipedia -v pageviews
|
|
85 |
from datasets import load_dataset
|
86 |
|
87 |
# Data dump date from https://dumps.wikimedia.org/enwiki/
|
88 |
-
date = "
|
89 |
|
90 |
# Build and save dataset
|
91 |
ds = load_dataset("neuml/wikipedia", language="en", date=date)
|
@@ -95,7 +95,7 @@ ds.save_to_disk(f"wikipedia-{date}")
|
|
95 |
- Build txtai-wikipedia index
|
96 |
```bash
|
97 |
python -m txtchat.data.wikipedia.index \
|
98 |
-
-d wikipedia-
|
99 |
-o txtai-wikipedia \
|
100 |
-v pageviews/pageviews.sqlite
|
101 |
```
|
|
|
8 |
tags:
|
9 |
- sentence-similarity
|
10 |
datasets:
|
11 |
+
- NeuML/wikipedia-20240901
|
12 |
---
|
13 |
|
14 |
# Wikipedia txtai embeddings index
|
15 |
|
16 |
This is a [txtai](https://github.com/neuml/txtai) embeddings index for the [English edition of Wikipedia](https://en.wikipedia.org/).
|
17 |
|
18 |
+
This index is built from the [Wikipedia September 2024 dataset](https://huggingface.co/datasets/neuml/wikipedia-20240901). Only the first paragraph of the [lead section](https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Lead_section) from each article is included in the index. This is similar to an abstract of the article.
|
19 |
|
20 |
It also uses [Wikipedia Page Views](https://dumps.wikimedia.org/other/pageviews/readme.html) data to add a `percentile` field. The `percentile` field can be used
|
21 |
to only match commonly visited pages.
|
|
|
24 |
|
25 |
## Example
|
26 |
|
27 |
+
See the example below. This index requires txtai >= 7.4.
|
28 |
|
29 |
```python
|
30 |
from txtai.embeddings import Embeddings
|
|
|
75 |
- Download and build pageviews database
|
76 |
```bash
|
77 |
mkdir -p pageviews/data
|
78 |
+
wget -P pageviews/data https://dumps.wikimedia.org/other/pageview_complete/monthly/2024/2024-08/pageviews-202408-user.bz2
|
79 |
python -m txtchat.data.wikipedia.views -p en.wikipedia -v pageviews
|
80 |
```
|
81 |
|
|
|
85 |
from datasets import load_dataset
|
86 |
|
87 |
# Data dump date from https://dumps.wikimedia.org/enwiki/
|
88 |
+
date = "20240901"
|
89 |
|
90 |
# Build and save dataset
|
91 |
ds = load_dataset("neuml/wikipedia", language="en", date=date)
|
|
|
95 |
- Build txtai-wikipedia index
|
96 |
```bash
|
97 |
python -m txtchat.data.wikipedia.index \
|
98 |
+
-d wikipedia-20240901 \
|
99 |
-o txtai-wikipedia \
|
100 |
-v pageviews/pageviews.sqlite
|
101 |
```
|
config.json
CHANGED
@@ -14,15 +14,15 @@
|
|
14 |
"content": true,
|
15 |
"dimensions": 768,
|
16 |
"backend": "faiss",
|
17 |
-
"offset":
|
18 |
"build": {
|
19 |
-
"create": "2024-
|
20 |
-
"python": "3.8.
|
21 |
"settings": {
|
22 |
-
"components": "
|
23 |
},
|
24 |
"system": "Linux (x86_64)",
|
25 |
-
"txtai": "
|
26 |
},
|
27 |
-
"update": "2024-
|
28 |
}
|
|
|
14 |
"content": true,
|
15 |
"dimensions": 768,
|
16 |
"backend": "faiss",
|
17 |
+
"offset": 6272285,
|
18 |
"build": {
|
19 |
+
"create": "2024-09-12T17:06:38Z",
|
20 |
+
"python": "3.8.19",
|
21 |
"settings": {
|
22 |
+
"components": "IVF2240,SQ8"
|
23 |
},
|
24 |
"system": "Linux (x86_64)",
|
25 |
+
"txtai": "7.4.0"
|
26 |
},
|
27 |
+
"update": "2024-09-12T17:06:38Z"
|
28 |
}
|
documents
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a54c4473c76d15a7bf2d1b8a1b590d3aaeacc0426324c4a5b1d886d729a43b92
|
3 |
+
size 3292749824
|
embeddings
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47bcc78a1602223ac3b2b8355b3c244a62fb2e4d0e2f04cdc8a8d0a865692b35
|
3 |
+
size 4874198688
|