Add normalization steps, fix some bugs, add tfboard tracker
- .gitattributes +1 -0
- README.md +2 -2
- src/data_utils.py +3 -7
- src/requirements.txt +2 -1
.gitattributes
CHANGED
@@ -14,3 +14,4 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -33,9 +33,9 @@ python create_config.py --name_or_path gpt2-medium --params '{"vocab_size": 4200
 
 Steps:
 
-- [ ] Remove stretched words such as ســــــــــلام
+- [x] Remove stretched words such as ســــــــــلام
 
-- [ ] Remove links, user-mentioning (such as @jane_doe)
+- [x] Remove links, user-mentioning (such as @jane_doe)
 
 - [ ] Remove Telegram, Instagram advertisements, or posts (a whole record)
 
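The two newly checked steps are implemented in the repo's normalizer module, which is not part of this diff. As a rough illustration only, they could be expressed with regexes along the following lines; the function names, the kashida handling, and the URL/mention patterns are assumptions, not the project's actual code.

import re

# Illustrative sketch only -- not the repo's normalizer module.
TATWEEL = "\u0640"  # Arabic kashida used to stretch words, e.g. "ســــــــــلام"

def remove_stretched_words(text):
    # Drop kashida characters, then collapse any character repeated 3+ times.
    text = text.replace(TATWEEL, "")
    return re.sub(r"(.)\1{2,}", r"\1", text)

def remove_links_and_mentions(text):
    # Strip URLs and @user mentions such as "@jane_doe".
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    return re.sub(r"@\w+", " ", text)

print(remove_stretched_words("ســــــــــلام"))                           # -> سلام
print(remove_links_and_mentions("follow @jane_doe at https://example.com"))  # mention and link stripped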
src/data_utils.py
CHANGED
@@ -2,7 +2,6 @@ from hazm import word_tokenize
 from hazm import sent_tokenize
 import re
 import six
-import string
 
 from normalizer import normalize
 
@@ -13,15 +12,15 @@ def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئا
     candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
     text = text.replace(" ", "")
 
-    return
+    return (len(candidate_text) / len(text)) > ratio
 
 
 def filter_by_num_tokens(text, gt=64):
-    return
+    return len(word_tokenize(text)) > gt
 
 
 def filter_by_num_sents(text, gt=2):
-    return
+    return len(sent_tokenize(text)) > gt
 
 
 def normalizer(text, do_lowercase=False):
@@ -31,6 +30,3 @@ def normalizer(text, do_lowercase=False):
         text = text.lower()
 
     return text
-
-
-
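With the three filters now returning booleans, a quick sanity check of the fixed functions might look like the snippet below. This is a sketch only: the sample records are made up, the thresholds are lowered for the demo, and it assumes hazm plus the repo's normalizer module are importable alongside data_utils.

from data_utils import (
    filter_by_lang_regex,
    filter_by_num_tokens,
    filter_by_num_sents,
    normalizer,
)

# Made-up records purely for illustration.
records = [
    "این یک متن فارسی نمونه است. این هم جملهٔ دوم آن است.",
    "too short",
]

cleaned = [
    normalizer(text)
    for text in records
    if filter_by_lang_regex(text)          # mostly Persian characters
    and filter_by_num_tokens(text, gt=8)   # more than 8 tokens (demo threshold)
    and filter_by_num_sents(text, gt=1)    # more than one sentence
]
print(cleaned)  # only the Persian multi-sentence record survives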
src/requirements.txt
CHANGED
@@ -3,4 +3,5 @@ jax>=0.2.8
 jaxlib>=0.1.59
 flax>=0.3.4
 optax>=0.0.8
-hazm
+hazm
+tensorboard
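The new tensorboard requirement and the *tfevents* LFS rule go together: training can log metrics as TensorBoard event files, and those files get stored through Git LFS. A minimal sketch of such a tracker, assuming Flax's bundled flax.metrics.tensorboard writer; the log directory, metric name, and loop are placeholders, not the repo's actual training code.

from flax.metrics import tensorboard

# Placeholder log directory and metric -- illustration only.
summary_writer = tensorboard.SummaryWriter("./runs/exp-1")

for step in range(100):               # stand-in for the real training loop
    train_loss = 1.0 / (step + 1)     # dummy value
    summary_writer.scalar("train_loss", train_loss, step)

summary_writer.close()
# The resulting events.out.tfevents.* files are what the new
# .gitattributes rule tracks with Git LFS.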