File size: 2,529 Bytes
787eab8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
import xml.etree.ElementTree as ET
from os import listdir, walk, remove
from os.path import isfile, join
import sys
from glob import glob
import urllib.request


def download_data(urls=None, dest_dir=None):
    """Download the data files and report where each one was stored.

    Args:
        urls: Iterable of URLs to fetch. When ``None`` the three built-in
            URLs listed below are used.
        dest_dir: Directory to save the downloads into. It is created when
            missing, and every file is named after the last segment of its
            URL path (``download_<index>`` if that segment is empty).
            When ``None`` no file name is passed to ``urlretrieve``, which
            then picks the storage location itself.

    Returns:
        A list holding the local path of every download, in ``urls`` order.
    """
    # Function-scope imports: the module only imports ``urllib.request``
    # and a few names from ``os``/``os.path``.
    from os import makedirs
    from os.path import basename
    from os.path import join as join_path
    from urllib.parse import urlsplit

    if urls is None:
        urls = [
            "http://hdl.handle.net/20.500.12537/192",
            "https://repository.clarin.is/repository/xmlui/handle/20.500.12537/32/allzip",
            "https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/96/icesum.json?sequence=1&isAllowed=y",
        ]

    if dest_dir is not None:
        makedirs(dest_dir, exist_ok=True)

    saved = []
    for index, url in enumerate(urls):
        target = None
        if dest_dir is not None:
            # Use only the path component so query strings such as
            # "?sequence=1" do not end up in the file name.
            name = basename(urlsplit(url).path) or f"download_{index}"
            target = join_path(dest_dir, name)

        # urlretrieve returns (local_path, headers); keep the path so the
        # caller can actually find the downloaded file.
        local_path, _headers = urllib.request.urlretrieve(url, target)
        saved.append(local_path)

    return saved


def get_contents(path='./{http://www.tei-c.org/ns/1.0}text/*/*/*', root=None):
    """Collect the text of every element under ``root`` that matches ``path``.

    Args:
        path: ElementTree path expression handed to ``root.findall``.
        root: Parsed XML element to search. Required, even though the
            signature gives it a ``None`` default.

    Returns:
        One string in which each matched element contributes all of its
        (nested) text followed by a newline. Empty when nothing matches.

    Raises:
        ValueError: If ``root`` is ``None``.
    """
    if root is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('get_contents() needs a parsed XML element as `root`')

    # Join once instead of growing a string with += inside a loop.
    return ''.join(
        ''.join(node.itertext()) + '\n' for node in root.findall(path)
    )


def save(path, data):
    """Write ``data`` to ``path`` as UTF-8 text, replacing any existing file.

    Args:
        path: Destination file path.
        data: String to write.
    """
    # The context manager closes the handle even if the write fails.
    with open(path, 'w', encoding='utf8') as handle:
        handle.write(data)


def delete_contents_of_dir(path='./data/train/*'):
    """Delete everything matched by the glob pattern ``path`` except directories.

    Args:
        path: Glob pattern selecting the entries to delete.
    """
    # Function-scope import: the module only imports ``isfile`` and ``join``
    # from ``os.path``.
    from os.path import isdir, islink

    for match in glob(path):
        # ``remove`` raises on a real directory; skip those so one
        # sub-directory does not abort the clean-up halfway through.
        # Symlinks are still removed, whatever they point at.
        if isdir(match) and not islink(match):
            continue
        remove(match)


def path_to_files(path='./CC_BY/IGC-News1-21.05.TEI/frettabladid_is'):
    """Return the path of every regular file found below ``path``.

    The tree is walked top-down; sub-directories and file names are visited
    in sorted order, so the result is the same on every run and filesystem.

    Args:
        path: Root directory to search recursively.

    Returns:
        A list of ``'<directory>/<file name>'`` strings. Empty when ``path``
        does not exist or contains no files.
    """
    the_paths = []

    for subdirectory, dirnames, filenames in walk(path):
        # Sorting in place fixes the order in which ``walk`` descends.
        dirnames.sort()

        for file in sorted(filenames):
            # keep regular files only (e.g. drops dangling symlinks)
            if isfile(join(subdirectory, file)):
                the_paths.append(f'{subdirectory}/{file}')

    return the_paths


def convert_n_format(paths, train_ratio=.8):
    """Convert XML files to plain text and split them into train/test sets.

    Each file in ``paths`` is parsed, its text is extracted with
    ``get_contents`` and written with ``save``. The first
    ``int(len(paths) * train_ratio)`` files are stored as
    ``./data/train/<n>_news.txt``; the remaining ones are stored as
    ``./data/test/<k>_news.txt`` with ``k`` counting from zero.

    The output directories are not emptied first, so files left over from
    an earlier run stay in place.

    Args:
        paths: Sequence of paths to the XML files to convert.
        train_ratio: Fraction of ``paths`` that goes to the training split.
    """
    # number of leading files that belong to the training split
    train = int(len(paths) * train_ratio)
    label = 'news'

    for n, path in enumerate(paths):
        # parse the xml file and pull out its text
        root = ET.parse(path).getroot()
        contents = get_contents(root=root)

        # Compare against `train` itself: using `train - 1` here would push
        # the last training file into the test split under index -1.
        if n < train:
            output_path = f'./data/train/{n}_{label}.txt'
        else:
            output_path = f'./data/test/{n - train}_{label}.txt'

        save(output_path, contents)



# Script body: collect the XML files of the corpus directory and convert them
# into train/test text files, printing progress along the way.
# NOTE(review): these statements are at module level, so they also run when
# this file is imported -- consider an `if __name__ == "__main__":` guard.
print("Beginning data preproccessing")

# gather the path of every file below the corpus directory
print("Collecting paths")
all_paths = path_to_files('./CC_BY/IGC-News1-21.05.TEI/frettabladid_is')
print("Found ", len(all_paths))

# convert each file to text and write it to the train or test directory
print("converting files...")
convert_n_format(all_paths)
print("\nDone!")