birgermoell
committed on
Commit
•
b452553
1
Parent(s):
d1850c7
Update README.md
Browse files
README.md
CHANGED
@@ -10,6 +10,45 @@ part of the wiki40b dataset.
|
|
10 |
|
11 |
https://huggingface.co/datasets/wiki40b
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
## Data cleaning and preprocessing
|
15 |
The data was cleaned and preprocessed using the following script. Make sure to install the dependencies for beam_runner so that the dataset loads correctly.
|
@@ -26,10 +65,18 @@ def load_and_clean_wiki():
|
|
26 |
return filtered_dataset
|
27 |
|
28 |
def filter_wikipedia(batch):
|
29 |
-
batch["text"] = " ".join(batch["text"].split("\
|
30 |
-
|
31 |
-
|
32 |
-
batch["text"] = " ".join(batch["text"].split("\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
batch["text"] = " ".join(batch["text"].split("_NEWLINE_"))
|
34 |
batch["text"] = " ".join(batch["text"].split("\xa0"))
|
35 |
return batch
|
|
|
10 |
|
11 |
https://huggingface.co/datasets/wiki40b
|
12 |
|
13 |
+
## Model series
|
14 |
+
This model is part of a series of models trained on TPU with Flax/JAX during the Hugging Face Flax/JAX challenge.
|
15 |
+
|
16 |
+
## Gpt models
|
17 |
+
|
18 |
+
## Swedish Gpt
|
19 |
+
https://huggingface.co/birgermoell/swedish-gpt/
|
20 |
+
|
21 |
+
## Swedish gpt wiki
|
22 |
+
https://huggingface.co/flax-community/swe-gpt-wiki
|
23 |
+
|
24 |
+
## Nordic gpt wiki
|
25 |
+
https://huggingface.co/flax-community/nordic-gpt-wiki
|
26 |
+
|
27 |
+
## Dansk gpt wiki
|
28 |
+
https://huggingface.co/flax-community/dansk-gpt-wiki
|
29 |
+
|
30 |
+
## Norsk gpt wiki
|
31 |
+
https://huggingface.co/flax-community/norsk-gpt-wiki
|
32 |
+
|
33 |
+
## Roberta models
|
34 |
+
|
35 |
+
## Nordic Roberta Wiki
|
36 |
+
https://huggingface.co/flax-community/nordic-roberta-wiki
|
37 |
+
|
38 |
+
## Swe Roberta Wiki Oscar
|
39 |
+
https://huggingface.co/flax-community/swe-roberta-wiki-oscar
|
40 |
+
|
41 |
+
## Roberta Swedish Scandi
|
42 |
+
https://huggingface.co/birgermoell/roberta-swedish-scandi
|
43 |
+
|
44 |
+
## Roberta Swedish
|
45 |
+
https://huggingface.co/birgermoell/roberta-swedish
|
46 |
+
|
47 |
+
## Swedish T5 model
|
48 |
+
https://huggingface.co/birgermoell/t5-base-swedish
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
|
53 |
## Data cleaning and preprocessing
|
54 |
The data was cleaned and preprocessed using the following script. Make sure to install the dependencies for beam_runner so that the dataset loads correctly.
|
|
|
65 |
return filtered_dataset
|
66 |
|
67 |
def filter_wikipedia(batch):
|
68 |
+
batch["text"] = " ".join(batch["text"].split("\
|
69 |
+
_START_SECTION_\
|
70 |
+
"))
|
71 |
+
batch["text"] = " ".join(batch["text"].split("\
|
72 |
+
_START_ARTICLE_\
|
73 |
+
"))
|
74 |
+
batch["text"] = " ".join(batch["text"].split("\
|
75 |
+
_START_ARTICLE_\
|
76 |
+
"))
|
77 |
+
batch["text"] = " ".join(batch["text"].split("\
|
78 |
+
_START_PARAGRAPH_\
|
79 |
+
"))
|
80 |
batch["text"] = " ".join(batch["text"].split("_NEWLINE_"))
|
81 |
batch["text"] = " ".join(batch["text"].split("\xa0"))
|
82 |
return batch
|