Spaces:
Sleeping
Sleeping
Yurii Paniv
committed on
Commit
•
0659669
1
Parent(s):
aa0bba0
Replace apostrophe
Browse files
- scripts/extract_text_corpus.py +2 -1
- scripts/wiki_import.py +2 -1
scripts/extract_text_corpus.py
CHANGED
@@ -11,7 +11,7 @@ text_file = open(OUT_FILE, mode="a")
|
|
11 |
tokenizer = nltk.SpaceTokenizer()
|
12 |
paranthesis_regex = re.compile(r'\(.*\)')
|
13 |
allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
|
14 |
-
"м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "
|
15 |
|
16 |
for subdir, dirs, files in os.walk(FOLDER):
|
17 |
for file in files:
|
@@ -25,6 +25,7 @@ for subdir, dirs, files in os.walk(FOLDER):
|
|
25 |
input_file = open(file_path, encoding="cp1251")
|
26 |
cleaned_text = input_file.read()
|
27 |
cleaned_text = cleaned_text.lower()
|
|
|
28 |
cleaned_text = paranthesis_regex.sub('', cleaned_text)
|
29 |
cleaned_text = cleaned_text.strip()
|
30 |
cleaned_text = cleaned_text.split(".")
|
|
|
11 |
tokenizer = nltk.SpaceTokenizer()
|
12 |
paranthesis_regex = re.compile(r'\(.*\)')
|
13 |
allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
|
14 |
+
"м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "’"]
|
15 |
|
16 |
for subdir, dirs, files in os.walk(FOLDER):
|
17 |
for file in files:
|
|
|
25 |
input_file = open(file_path, encoding="cp1251")
|
26 |
cleaned_text = input_file.read()
|
27 |
cleaned_text = cleaned_text.lower()
|
28 |
+
cleaned_text = cleaned_text.replace("'", "’")
|
29 |
cleaned_text = paranthesis_regex.sub('', cleaned_text)
|
30 |
cleaned_text = cleaned_text.strip()
|
31 |
cleaned_text = cleaned_text.split(".")
|
scripts/wiki_import.py
CHANGED
@@ -15,7 +15,7 @@ text_file = open(OUT_PATH, mode="a")
|
|
15 |
tokenizer = nltk.SpaceTokenizer()
|
16 |
paranthesis_regex = re.compile(r'\(.*\)')
|
17 |
allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
|
18 |
-
"м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "
|
19 |
|
20 |
cleaner = Cleaner()
|
21 |
# iter = 0
|
@@ -27,6 +27,7 @@ for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.x
|
|
27 |
cleaned_text = cleaned_text.replace("н. е.", "нашої ери")
|
28 |
cleaned_text = cleaned_text.replace("ім.", "імені")
|
29 |
cleaned_text = cleaned_text.replace("див.", "дивись")
|
|
|
30 |
cleaned_text = paranthesis_regex.sub('', cleaned_text)
|
31 |
cleaned_text = cleaned_text.strip()
|
32 |
cleaned_text = cleaned_text.split(".")
|
|
|
15 |
tokenizer = nltk.SpaceTokenizer()
|
16 |
paranthesis_regex = re.compile(r'\(.*\)')
|
17 |
allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
|
18 |
+
"м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "’"]
|
19 |
|
20 |
cleaner = Cleaner()
|
21 |
# iter = 0
|
|
|
27 |
cleaned_text = cleaned_text.replace("н. е.", "нашої ери")
|
28 |
cleaned_text = cleaned_text.replace("ім.", "імені")
|
29 |
cleaned_text = cleaned_text.replace("див.", "дивись")
|
30 |
+
cleaned_text = cleaned_text.replace("'", "’")
|
31 |
cleaned_text = paranthesis_regex.sub('', cleaned_text)
|
32 |
cleaned_text = cleaned_text.strip()
|
33 |
cleaned_text = cleaned_text.split(".")
|