Spaces:

robinhad
/

ukrainian-stt

Running

ukrainian-stt/scripts/extract_text_corpus.py

Yurii Paniv

Replace apostrophe

0659669 over 3 years ago

2.18 kB

	# This script imports arbitrary texts from a folder and converts them into a text corpus for the scorer.
	import os
	import nltk
	import re
	# NOTE(review): only nltk.SpaceTokenizer is used below, which presumably does
	# not need the "punkt" model; the download is kept so the script's start-up
	# behaviour is unchanged -- confirm before removing it.
	nltk.download("punkt")

	# Directory that is walked recursively for input texts, and the corpus file
	# that the cleaned sentences are appended to.
	FOLDER = "../data/текст/"
	OUT_FILE = "../data/texts.txt"
	# Opened in append mode, so repeated runs add to the existing file. The
	# encoding is pinned to UTF-8 because the output is Cyrillic text and the
	# platform default encoding is not guaranteed to be able to represent it.
	text_file = open(OUT_FILE, mode="a", encoding="utf-8")

	tokenizer = nltk.SpaceTokenizer()
	# Matches one parenthesised remark. The quantifier is non-greedy so that a
	# line such as "a (x) b (y) c" loses only "(x)" and "(y)"; a greedy ".*"
	# would also delete the " b " between the two remarks.
	paranthesis_regex = re.compile(r'\(.*?\)')
	# Lower-case Ukrainian alphabet plus the hyphen and the typographic
	# apostrophe. A frozenset gives O(1) membership tests for the per-character
	# filter applied to every token.
	allowed_chars = frozenset("абвгґдеєжзиіїйклмнопрстуфхцчшщьюя" "-’")

	def _read_text(path):
	    """Return the contents of *path* decoded as UTF-8, or cp1251 as a fallback.

	    The file is always closed before returning. A decoding failure with the
	    fallback encoding propagates to the caller.
	    """
	    try:
	        with open(path, encoding="utf-8") as source:
	            return source.read()
	    except UnicodeDecodeError:
	        # Only a decoding failure triggers the fallback; any other error
	        # (missing file, permissions, Ctrl-C) is left to propagate.
	        with open(path, encoding="cp1251") as source:
	            return source.read()


	def _clean_words(sentence):
	    """Return the tokens of *sentence* that are made only of allowed characters.

	    Purely numeric tokens and tokens containing any character outside
	    ``allowed_chars`` are dropped. Leading hyphens are removed from the
	    tokens that are kept, and tokens that end up empty are skipped.
	    """
	    kept = []
	    # NOTE(review): if the tokenizer splits on single spaces only, a token
	    # spanning a line break contains "\n" and is dropped by the character
	    # filter below -- confirm this is intended.
	    for token in tokenizer.tokenize(sentence.strip()):
	        if token.isdigit():
	            continue
	        if any(letter not in allowed_chars for letter in token):
	            continue
	        word = token.lstrip("-")
	        # Empty tokens (double spaces, a lone "-") would otherwise add
	        # stray spaces to the joined sentence.
	        if word:
	            kept.append(word)
	    return kept


	def _extract_sentences(raw_text):
	    """Return the cleaned sentences of *raw_text*, each as one space-joined string.

	    The text is lower-cased, ASCII apostrophes are replaced with "’",
	    parenthesised remarks are removed and the rest is split on ".".
	    Sentences whose remaining words are all at most one character long
	    (including sentences with no words left) are discarded.
	    """
	    text = raw_text.lower().replace("'", "’")
	    text = paranthesis_regex.sub('', text).strip()
	    sentences = []
	    for sentence in text.split("."):
	        words = _clean_words(sentence)
	        # all() is also True for an empty list, so this covers "no words".
	        if all(len(word) <= 1 for word in words):
	            continue
	        sentences.append(" ".join(words))
	    return sentences


	try:
	    for subdir, _dirs, files in os.walk(FOLDER):
	        for file_name in files:
	            file_path = os.path.join(subdir, file_name)
	            print(file_path)
	            sentences = _extract_sentences(_read_text(file_path))
	            # Files that yield no usable sentence contribute nothing.
	            if sentences:
	                text_file.write("\n".join(sentences) + "\n")
	finally:
	    # Close the corpus file even if a source file fails to process.
	    text_file.close()