Spaces:

robinhad
/

ukrainian-stt

Running

File size: 2,183 Bytes

# This script imports arbitrary texts from a folder and converts them into the plain-text format used to build the scorer.
import os
import nltk
import re
FOLDER = "../data/текст/"
OUT_FILE = "../data/texts.txt"

# Non-greedy, so every "(...)" group is removed on its own instead of also
# swallowing the text between the first "(" and the last ")" on a line.
# "." does not match a newline, so a group must open and close on one line.
paranthesis_regex = re.compile(r'\(.*?\)')

# Lowercase Ukrainian alphabet plus the hyphen and the typographic apostrophe.
# A frozenset gives constant-time membership tests and cannot be mutated.
allowed_chars = frozenset("абвгґдеєжзиіїйклмнопрстуфхцчшщьюя-’")


def read_text(file_path: str) -> str:
    """Return the decoded contents of *file_path*.

    The file is first read as UTF-8 ("utf-8-sig" also accepts a leading byte
    order mark, which would otherwise stick to the first word and get that
    word filtered out). If it is not valid UTF-8 it is re-read as cp1251.

    Raises:
        OSError: if the file cannot be opened or read.
        UnicodeDecodeError: if the file is valid in neither encoding.
    """
    try:
        with open(file_path, encoding="utf-8-sig") as input_file:
            return input_file.read()
    except UnicodeDecodeError:
        with open(file_path, encoding="cp1251") as input_file:
            return input_file.read()


def clean_text(raw_text: str) -> str:
    """Convert raw text into newline-separated, space-joined sentences.

    Steps:
      * lowercase the text and turn the ASCII apostrophe into "’";
      * delete parenthesised fragments;
      * split into sentences on ".";
      * split each sentence on any whitespace, strip leading hyphens from
        every token, and keep only non-empty tokens made solely of
        ``allowed_chars``;
      * drop sentences that are empty or contain only one-letter words.

    Returns:
        The surviving sentences joined with "\n" (no trailing newline), or an
        empty string when nothing survives.
    """
    text = raw_text.lower().replace("'", "’")
    text = paranthesis_regex.sub("", text)

    sentences = []
    for fragment in text.split("."):
        words = []
        # str.split() splits on every whitespace run, so words next to a line
        # break are kept and repeated spaces do not produce empty tokens.
        for token in fragment.split():
            word = token.lstrip("-")
            # Digits, Latin letters and punctuation are not in allowed_chars,
            # so numbers and tokens such as "слово," are dropped as a whole.
            # NOTE(review): dropping words with attached punctuation is kept
            # from the original; confirm this is intended for the scorer.
            if word and allowed_chars.issuperset(word):
                words.append(word)
        # all() over an empty list is True, so this also skips empty sentences.
        if all(len(word) <= 1 for word in words):
            continue
        sentences.append(" ".join(words))
    return "\n".join(sentences)


def main() -> None:
    """Walk FOLDER and append the cleaned text of every file to OUT_FILE."""
    # Kept from the original script. The whitespace tokenisation above does
    # not need the "punkt" data.
    # NOTE(review): confirm whether another pipeline step relies on this
    # download before removing it.
    nltk.download("punkt")

    # Explicit UTF-8 so the corpus does not depend on the platform locale;
    # the context manager closes the file even if a source file fails.
    with open(OUT_FILE, mode="a", encoding="utf-8") as text_file:
        for subdir, _dirs, files in os.walk(FOLDER):
            for file in files:
                file_path = os.path.join(subdir, file)
                print(file_path)
                cleaned_text = clean_text(read_text(file_path))
                if cleaned_text:
                    text_file.write(cleaned_text + "\n")


if __name__ == "__main__":
    main()