|
|
|
import xml.etree.ElementTree as ET |
|
from os import listdir, walk, remove |
|
from os.path import isfile, join |
|
import sys |
|
from glob import glob |
|
import urllib.request |
|
|
|
|
|
def download_data(urls=None):
    """Fetch each URL and return the local paths the data was stored at.

    Args:
        urls: Optional iterable of URLs to retrieve. When omitted, the three
            default repository URLs listed below are used.

    Returns:
        A list with one local file path per URL, in the same order as
        ``urls``. No target filename is passed to ``urlretrieve``, so the
        storage location is chosen by ``urllib``; returning the paths is
        what lets a caller find the downloaded data afterwards.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed request
        (for example ``urllib.error.URLError``).
    """
    if urls is None:
        urls = [
            "http://hdl.handle.net/20.500.12537/192",
            "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/32/allzip",
            "https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/96/icesum.json?sequence=1&isAllowed=y",
        ]

    # urlretrieve returns (local_filename, headers); keep only the filename.
    return [urllib.request.urlretrieve(url)[0] for url in urls]
|
|
|
|
|
def get_contents(path='./{http://www.tei-c.org/ns/1.0}text/*/*/*', root=None):
    """Return the text of every element matching ``path``, one per line.

    Args:
        path: ElementTree path expression evaluated relative to ``root``.
            The default selects elements three levels below the TEI
            ``text`` element.
        root: Element to search from. Must be supplied; the ``None``
            default is not usable because ``findall`` is called on it.

    Returns:
        A single string in which each matched element contributes all of
        its nested text followed by a newline. Empty when nothing matches.
    """
    return ''.join(
        ''.join(node.itertext()) + '\n'
        for node in root.findall(path)
    )
|
|
|
|
|
def save(path, data):
    """Write ``data`` to ``path`` as UTF-8 text, replacing any existing file.

    Args:
        path: Destination file path; its parent directory must exist.
        data: String to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the handle even when write() raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(path, 'w', encoding='utf8') as handle:
        handle.write(data)
|
|
|
|
|
def delete_contents_of_dir(path='./data/train/*'):
    """Remove every file matched by the glob pattern ``path``.

    Args:
        path: Glob pattern selecting the entries to delete. Each match is
            passed to ``os.remove``, so a matching directory raises.
    """
    for match in glob(path):
        remove(match)
|
|
|
|
|
def path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is'):
    """Collect the paths of all regular files found beneath ``path``.

    Args:
        path: Directory to traverse recursively.

    Returns:
        A list of ``"<directory>/<filename>"`` strings, one per entry that
        ``isfile`` accepts, in the order the filesystem reports them.
        Empty when ``path`` does not exist or contains no files.
    """
    return [
        f'{folder}/{entry}'
        for folder, _subdirs, _names in walk(path)
        for entry in listdir(folder)
        if isfile(join(folder, entry))
    ]
|
|
|
|
|
def convert_n_format(paths):
    """Extract text from each XML file and write it into a train/test split.

    The first 80% of ``paths`` (rounded down) are written to
    ``./data/train/<n>_news.txt``, where ``n`` is the position in ``paths``.
    The remaining files are written to ``./data/test/<k>_news.txt``, with
    ``k`` counting up from 0.

    Args:
        paths: Sequence of paths to XML files that ``ElementTree`` can
            parse. Text is extracted with ``get_contents`` using its
            default element path.

    Note:
        This function does not create the output directories itself.
    """
    label = 'news'
    train = int(len(paths) * .8)

    for n, path in enumerate(paths):
        root = ET.parse(path).getroot()
        contents = get_contents(root=root)

        # Indices 0..train-1 form the training split. The earlier
        # `n < train - 1` test pushed index train-1 into the test branch,
        # where `n - train` evaluated to -1 and produced "-1_news.txt".
        if n < train:
            output_path = f'./data/train/{n}_{label}.txt'
        else:
            output_path = f'./data/test/{n - train}_{label}.txt'

        save(output_path, contents)
|
|
|
|
|
|
|
# Script body: runs the whole preprocessing pipeline when the module executes.
print("Beginning data preproccessing")

# Step 1: list every file beneath the news corpus directory.
print("Collecting paths")
all_paths = path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is')
print("Found ", len(all_paths))

# Step 2: extract the text of each file and write the train/test output.
print("converting files...")
convert_n_format(paths=all_paths)
print("\nDone!")