ice-roberta / preprocess.py
Sigurdur's picture
Upload 16 files
787eab8
raw
history blame
2.53 kB
#!/usr/bin/python3
import xml.etree.ElementTree as ET
from os import listdir, walk, remove
from os.path import isfile, join
import sys
from glob import glob
import urllib.request
def download_data(urls=None):
    """Download the corpus resources and report where they were stored.

    Args:
        urls: Optional iterable of URLs to fetch. When omitted, the three
            default repository URLs hard-coded below are used.

    Returns:
        A list holding, for each URL in order, the local file path
        reported by ``urllib.request.urlretrieve``.
    """
    if urls is None:
        urls = ["http://hdl.handle.net/20.500.12537/192", "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/32/allzip", "https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/96/icesum.json?sequence=1&isAllowed=y"]
    # No target filename is passed, so urlretrieve picks the location itself;
    # keep the path it returns, otherwise the downloaded file cannot be found.
    return [urllib.request.urlretrieve(url)[0] for url in urls]
def get_contents(path='./{http://www.tei-c.org/ns/1.0}text/*/*/*', root=None):
    """Return the text of every element matched by *path* under *root*.

    Args:
        path: ElementTree path expression evaluated with ``root.findall``.
            The default matches elements three levels below the
            TEI-namespaced ``text`` child of *root*.
        root: Parsed XML element to search. It is required even though it
            has a keyword default.

    Returns:
        A single string containing the full inner text of each matched
        element (nested tags flattened), each followed by a newline.
        Empty string when nothing matches.

    Raises:
        ValueError: If *root* is ``None``.
    """
    if root is None:
        raise ValueError("get_contents() requires a parsed XML element as 'root'")
    # itertext() walks nested inline elements, so markup inside a paragraph
    # is flattened into plain text.
    return ''.join(''.join(element.itertext()) + '\n'
                   for element in root.findall(path))
def save(path, data):
    """Write *data* to *path* as UTF-8 text, replacing any existing file.

    Args:
        path: Destination file path; its directory must already exist.
        data: String to write.
    """
    # The context manager closes the handle even if write() raises.
    with open(path, 'w', encoding='utf8') as f:
        f.write(data)
def delete_contents_of_dir(path='./data/train/*'):
    """Remove every file whose path matches the glob pattern *path*.

    Args:
        path: Glob pattern selecting the files to delete.
    """
    for matched in glob(path):
        remove(matched)
def path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is'):
    """Return the paths of all regular files found under *path*, recursively.

    Args:
        path: Root directory to search.

    Returns:
        A sorted list of ``'<directory>/<file name>'`` strings. Empty when
        *path* does not exist or holds no files.
    """
    the_paths = [
        f'{subdirectory}/{file}'
        for subdirectory, _subdirs, files in walk(path)
        for file in files
        # keep regular files only (walk also lists e.g. broken symlinks)
        if isfile(join(subdirectory, file))
    ]
    # Directory enumeration order is filesystem dependent; sorting makes the
    # result (and any split derived from it) reproducible between runs.
    return sorted(the_paths)
def convert_n_format(paths):
    """Convert each XML file in *paths* to plain text in a train/test split.

    The first 80% of *paths* are written to ``./data/train/<n>_news.txt``;
    the remainder go to ``./data/test/<k>_news.txt`` with ``k`` counted
    from 0.

    Args:
        paths: Sequence of paths to XML files readable by ``ET.parse``.
    """
    from os import makedirs

    # train/test ratio: the first `train` files form the training set
    train = int(len(paths) * .8)
    label = 'news'

    # make sure both output directories exist before writing into them
    makedirs('./data/train', exist_ok=True)
    makedirs('./data/test', exist_ok=True)

    for n, path in enumerate(paths):
        # parse the xml file and extract its text
        root = ET.parse(path).getroot()
        contents = get_contents(root=root)
        # indices 0..train-1 are training files; the rest are renumbered
        # from 0 so that no test file gets a negative index
        if n < train:
            output_path = f'./data/train/{n}_{label}.txt'
        else:
            output_path = f'./data/test/{n - train}_{label}.txt'
        save(output_path, contents)
# Script body: locate the news corpus files, then write the train/test split.
print("Beginning data preproccessing")

# gather the path of every corpus file
print("Collecting paths")
corpus_dir = './CC_BY/IGC-News1-21.05.TEI/frettabladid_is'
all_paths = path_to_files(corpus_dir)
print("Found ", len(all_paths))

# turn each file into plain text under ./data
print("converting files...")
convert_n_format(all_paths)
print("\nDone!")