#!/usr/bin/env python3
"""Convert TEI-encoded news articles into plain-text train/test files.

The script walks a directory of TEI XML files, extracts the text found under
the TEI ``text`` element of each file and writes it to ``./data/train`` or
``./data/test`` using an 80/20 split based on the position of the file in
the collected path list.
"""
import xml.etree.ElementTree as ET
from os import listdir, walk, remove, makedirs
from os.path import isfile, join
import sys
from glob import glob
import urllib.request


def download_data():
    """Download the source archives and return their local file paths.

    ``urlretrieve`` is called without a target filename, so every download
    is stored in a temporary file with a generated name. The generated
    paths are returned so the caller can actually locate the data.

    Returns:
        A list with the local path of each downloaded file, in URL order.
    """
    urls = ["http://hdl.handle.net/20.500.12537/192",
            "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/32/allzip",
            "https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/96/icesum.json?sequence=1&isAllowed=y"]
    return [urllib.request.urlretrieve(url)[0] for url in urls]


def get_contents(path='./{http://www.tei-c.org/ns/1.0}text/*/*/*', root=None):
    """Return the text of every element matching *path*, one per line.

    Args:
        path: ElementTree path expression evaluated relative to *root*.
        root: Parsed XML root element. ``None`` yields an empty string.

    Returns:
        The concatenated inner text of each matched element, each followed
        by a newline.
    """
    if root is None:
        return ''
    return ''.join(
        ''.join(p.itertext()) + '\n' for p in root.findall(path)
    )


def save(path, data):
    """Write *data* to *path* as UTF-8, replacing any existing file."""
    # The context manager closes the handle even if the write fails.
    with open(path, 'w', encoding='utf8') as f:
        f.write(data)


def delete_contents_of_dir(path='./data/train/*'):
    """Remove every path matched by the glob pattern *path*.

    ``os.remove`` is used, so a match that is a directory raises ``OSError``.
    """
    for f in glob(path):
        remove(f)


def path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is'):
    """Collect the paths of all regular files below *path*, recursively.

    Directories and file names are visited in sorted order so the result,
    and therefore the train/test split derived from it, is reproducible
    across runs and file systems.

    Returns:
        A list of ``'<directory>/<file>'`` strings. Empty when *path* does
        not exist or contains no files.
    """
    the_paths = []
    for subdirectory, subdirs, files in walk(path):
        # Sorting in place makes os.walk descend in a deterministic order.
        subdirs.sort()
        for file in sorted(files):
            # Skip entries that are not regular files (e.g. broken links).
            if isfile(join(subdirectory, file)):
                the_paths.append(f'{subdirectory}/{file}')
    return the_paths


def convert_n_format(paths):
    """Extract the text of each XML file and save it as a ``.txt`` file.

    The first 80% of *paths* (rounded down) are written to ``./data/train``
    as ``<n>_news.txt``; the remainder go to ``./data/test`` and are
    numbered from 0. Both directories are created when missing.

    Args:
        paths: Sequence of paths to TEI XML files.
    """
    if not paths:
        return

    # train test ratio
    train = int(len(paths) * .8)
    label = 'news'

    for directory in ('./data/train', './data/test'):
        makedirs(directory, exist_ok=True)

    # NOTE: existing files in the output directories are not removed first;
    # call delete_contents_of_dir beforehand to avoid leaking stale data
    # between runs.
    for n, path in enumerate(paths):
        # parse the xml file and pull out its text
        root = ET.parse(path).getroot()
        contents = get_contents(root=root)

        # Indices 0..train-1 are training data. The original comparison
        # `n < train - 1` sent index train-1 to the test set as "-1_news".
        if n < train:
            output_path = f'./data/train/{n}_{label}.txt'
        else:
            output_path = f'./data/test/{n - train}_{label}.txt'
        save(output_path, contents)


def main():
    """Run the full preprocessing pipeline on the default corpus path."""
    print("Beginning data preproccessing")

    # get all the paths to the files
    print("Collecting paths")
    all_paths = path_to_files('./CC_BY/IGC-News1-21.05.TEI/frettabladid_is')
    print("Found ", len(all_paths))

    # convert and format all the files
    print("converting files...")
    convert_n_format(all_paths)
    print("\nDone!")


if __name__ == "__main__":
    main()