Sigurdur
/

ice-roberta

Model card Files Files and versions

ice-roberta / preprocess.py

Sigurdur's picture

Upload 16 files

787eab8 over 2 years ago

2.53 kB

	#!/usr/bin/python3
	import xml.etree.ElementTree as ET
	from os import listdir, walk, remove
	from os.path import isfile, join
	import sys
	from glob import glob
	import urllib.request


	def download_data(urls=None):
	    """Download the corpus resources and return where they were stored.

	    ``urllib.request.urlretrieve`` is called without a target filename, so
	    the standard library decides where each resource ends up (a temporary
	    file for remote URLs).  The resulting local paths are returned so the
	    caller can actually locate the downloads.

	    Args:
	        urls: Optional iterable of URLs to fetch.  When omitted, the three
	            hard-coded default URLs below are used.

	    Returns:
	        A list with one local file path per URL, in the same order.

	    Raises:
	        urllib.error.URLError: (or a subclass) if a resource cannot be
	            retrieved.
	    """
	    if urls is None:
	        urls = (
	            "http://hdl.handle.net/20.500.12537/192",
	            "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/32/allzip",
	            "https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/96/icesum.json?sequence=1&isAllowed=y",
	        )

	    # urlretrieve returns (local_filename, headers); only the path is needed.
	    return [urllib.request.urlretrieve(url)[0] for url in urls]


	def get_contents(path='./{http://www.tei-c.org/ns/1.0}text///*', root=None):
	    """Return the text of every element matched by ``path``, one per line.

	    Args:
	        path: ElementPath expression evaluated against ``root``.
	        root: Parsed XML element to search.  It must be supplied:
	            ``root.findall`` is called on it unconditionally.

	    Returns:
	        A string in which each matched element contributes all of its
	        (nested) text followed by a newline; ``''`` when nothing matches.

	    NOTE(review): the default ``path`` contains ``///``.  Confirm that the
	    ElementPath parser accepts it as intended, since ``//`` is normally
	    followed directly by a tag name or ``*``.
	    """
	    # Build the result with a single join rather than repeated ``+=``.
	    return ''.join(
	        ''.join(element.itertext()) + '\n'
	        for element in root.findall(path)
	    )


	def save(path, data):
	    """Write ``data`` to ``path`` as UTF-8 text, replacing any existing file.

	    Missing parent directories are created first, so the call does not
	    fail merely because the output directory has not been set up yet.

	    Args:
	        path: Destination file path.
	        data: Text to write.
	    """
	    # Local import: the module only imports selected names from ``os``.
	    import os

	    parent = os.path.dirname(path)
	    if parent:
	        os.makedirs(parent, exist_ok=True)

	    # The context manager closes the file even if write() raises.
	    with open(path, 'w', encoding='utf8') as handle:
	        handle.write(data)


	def delete_contents_of_dir(path='./data/train/*'):
	    """Delete every regular file matched by the glob pattern ``path``.

	    Directories matched by the pattern are left in place: ``os.remove``
	    cannot delete them and would otherwise abort the clean-up part-way
	    through.

	    Args:
	        path: Glob pattern selecting the entries to delete.
	    """
	    for candidate in glob(path):
	        if isfile(candidate):
	            remove(candidate)


	def path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is'):
	    """Collect the paths of all regular files below ``path``, recursively.

	    Args:
	        path: Directory to search.  A directory that does not exist simply
	            yields an empty result.

	    Returns:
	        A sorted list of ``'<directory>/<filename>'`` strings.  Sorting
	        makes the order (and any split based on it) independent of the
	        order in which the filesystem happens to list entries.
	    """
	    found = []

	    # walk() already yields the file names of each directory, so there is
	    # no need to list every directory a second time.
	    for directory, _subdirs, filenames in walk(path):
	        for name in filenames:
	            candidate = f'{directory}/{name}'
	            # walk() reports every non-directory entry; keep regular files
	            # only (this drops e.g. broken symlinks).
	            if isfile(candidate):
	                found.append(candidate)

	    found.sort()
	    return found


	def convert_n_format(paths, train_ratio=.8):
	    """Extract the text of each XML file and save it as a train or test .txt.

	    The first ``int(len(paths) * train_ratio)`` files are written to
	    ``./data/train/<n>_news.txt`` (``n`` being the position in ``paths``);
	    the remaining files are written to ``./data/test/<k>_news.txt`` with
	    ``k`` counting from 0.

	    The output directories are not emptied beforehand, so files left over
	    from an earlier run with more inputs stay in place.

	    Args:
	        paths: Sequence of paths to the XML files to convert.
	        train_ratio: Fraction of ``paths`` that goes to the training set.
	    """
	    # number of files that belong to the training split
	    train = int(len(paths) * train_ratio)
	    suffix = 'news'

	    for n, path in enumerate(paths):
	        # parse the xml file and pull out its text
	        root = ET.parse(path).getroot()
	        contents = get_contents(root=root)

	        # Indices 0..train-1 go to train.  Comparing against ``train - 1``
	        # would push the last training file into the test set under the
	        # index -1.
	        if n < train:
	            output_path = f'./data/train/{n}_{suffix}.txt'
	        else:
	            output_path = f'./data/test/{n - train}_{suffix}.txt'

	        save(output_path, contents)



	# Script body: these statements sit at module level, so they run whenever
	# the file is executed.  They gather the source files and convert them
	# into the train/test text files.
	print("Beginning data preproccessing")

	# get all the paths to the files under the news corpus directory
	print("Collecting paths")
	all_paths = path_to_files('./CC_BY/IGC-News1-21.05.TEI/frettabladid_is')
	print("Found ", len(all_paths))

	# convert every collected file and write it to the output directories
	print("converting files...")
	convert_n_format(all_paths)
	print("\nDone!")