File size: 2,413 Bytes
0f120d1
8451e68
 
0f120d1
8451e68
 
 
 
0f120d1
8451e68
0f120d1
 
 
8451e68
 
 
 
0659669
8451e68
 
0f120d1
8451e68
 
 
 
 
 
 
 
0659669
8451e68
 
 
 
 
 
 
 
 
 
 
 
 
0f120d1
 
 
 
 
 
 
 
 
 
 
 
8451e68
 
 
 
 
 
 
 
 
0f120d1
 
 
8451e68
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# this script is used for importing wiki text into scorer format
from wiki_dump_reader import Cleaner, iterate
from os import remove
from os.path import exists
import nltk
import re
nltk.download("punkt")

OUT_PATH = "../data/wiki_text.txt"
DUMP_PATH = "../data/ukwiki-20210320-pages-articles-multistream.xml"

# Matches one innermost "(...)" group that does not span a line break.
# A greedy r'\(.*\)' would also swallow the text *between* two separate
# parenthesised groups on the same line.
paranthesis_regex = re.compile(r'\([^()\n]*\)')

# Abbreviations are expanded before the text is split on "." so that their
# dots are not taken for sentence ends. The lookbehind keeps the patterns
# from firing on the tail of a longer word (e.g. "останнім.", "проходив.").
abbreviation_regexes = (
    (re.compile(r'(?<!\w)н\. е\.'), "нашої ери"),
    (re.compile(r'(?<!\w)ім\.'), "імені"),
    (re.compile(r'(?<!\w)див\.'), "дивись"),
)

# Lower-case Ukrainian alphabet plus hyphen and the typographic apostrophe.
# A frozenset gives O(1) membership tests in the per-character check.
allowed_chars = frozenset("абвгґдеєжзиіїйклмнопрстуфхцчшщьюя-’")

# Sentences starting with one of these prefixes are dropped.
skipped_prefixes = ("redirect", "перенаправлення")


def strip_parentheses(text):
    """Return *text* with every single-line "(...)" group removed.

    The innermost groups are removed repeatedly until nothing changes, so
    nested parentheses disappear completely while text between two separate
    groups is preserved.
    """
    previous = None
    while previous != text:
        previous = text
        text = paranthesis_regex.sub('', text)
    return text


def split_sentences(text):
    """Normalise article text and split it into stripped sentence candidates.

    Steps, in order: lower-case, turn no-break spaces into plain spaces,
    expand the known abbreviations, replace the ASCII apostrophe with "’",
    drop parenthesised groups, then split on ".".
    """
    # NOTE(review): the original literal here rendered like a plain space;
    # it is assumed to have been a no-break space and is spelled as an
    # escape so the intent is unambiguous - confirm against the real file.
    text = text.lower().replace("\u00a0", " ")
    for pattern, expansion in abbreviation_regexes:
        text = pattern.sub(expansion, text)
    text = text.replace("'", "’")
    text = strip_parentheses(text)
    return [part.strip() for part in text.strip().split(".")]


def filter_words(tokens):
    """Return the tokens that consist solely of characters in allowed_chars.

    Leading hyphens are stripped from each kept token. Empty tokens (from
    consecutive spaces, or a token made only of hyphens) are discarded so
    they cannot introduce doubled spaces in the output. Tokens containing
    digits, Latin letters or punctuation are rejected because those
    characters are not in allowed_chars.
    """
    kept = []
    for token in tokens:
        word = token.lstrip("-")
        if word and allowed_chars.issuperset(word):
            kept.append(word)
    return kept


def sentence_to_line(sentence, tokenize):
    """Convert one sentence into an output line, or None if it is skipped.

    *tokenize* is a callable mapping a string to a list of tokens.
    A sentence is skipped when it ends with ", що вивчає", starts with one
    of skipped_prefixes, or has no remaining word longer than one character
    (which includes the case of no remaining words at all).
    """
    if sentence.endswith(", що вивчає"):
        return None
    if sentence.startswith(skipped_prefixes):
        return None
    words = filter_words(tokenize(sentence))
    if all(len(word) <= 1 for word in words):
        return None
    return " ".join(words)


def article_to_lines(text, tokenize):
    """Return the list of output lines produced from one article's text."""
    candidates = (
        sentence_to_line(sentence, tokenize)
        for sentence in split_sentences(text)
    )
    return [line for line in candidates if line is not None]


def main():
    """Read the wiki dump at DUMP_PATH and write one sentence per line to OUT_PATH.

    Any existing output file is overwritten. The file is written as UTF-8
    explicitly so the Cyrillic output does not depend on the locale, and it
    is closed even if processing fails part-way.
    """
    tokenizer = nltk.SpaceTokenizer()
    cleaner = Cleaner()
    with open(OUT_PATH, mode="w", encoding="utf-8") as text_file:
        for _title, text in iterate(DUMP_PATH):
            text = cleaner.clean_text(text)
            cleaned_text, _ = cleaner.build_links(text)
            lines = article_to_lines(cleaned_text, tokenizer.tokenize)
            if lines:
                text_file.write("\n".join(lines) + "\n")


if __name__ == "__main__":
    main()