|
import argparse |
|
import os |
|
|
|
|
|
|
|
def write_ud_files(args): |
|
languages_for_low_resource = ['el'] |
|
|
|
languages = sorted(list(set(languages_for_low_resource))) |
|
splits = ['train', 'dev', 'test'] |
|
lng_to_files = dict((language, {}) for language in languages) |
|
for language, d in lng_to_files.items(): |
|
for split in splits: |
|
d[split] = [] |
|
lng_to_files[language] = d |
|
sub_folders = os.listdir(args.ud_data_path) |
|
for sub_folder in sub_folders: |
|
folder = os.path.join(args.ud_data_path, sub_folder) |
|
files = os.listdir(folder) |
|
for file in files: |
|
for language in languages: |
|
if file.startswith(language) and file.endswith('conllu'): |
|
for split in splits: |
|
if split in file: |
|
full_path = os.path.join(folder, file) |
|
lng_to_files[language][split].append(full_path) |
|
break |
|
|
|
for language, split_dict in lng_to_files.items(): |
|
for split, files in split_dict.items(): |
|
if split == 'dev' and len(files) == 0: |
|
files = split_dict['train'] |
|
print('No dev files were found, copying train files instead') |
|
sentences = [] |
|
num_sentences = 0 |
|
for file in files: |
|
with open(file, 'r') as file: |
|
for line in file: |
|
new_line = [] |
|
line = line.strip() |
|
if len(line) == 0: |
|
sentences.append(new_line) |
|
num_sentences += 1 |
|
continue |
|
tokens = line.split('\t') |
|
if not tokens[0].isdigit(): |
|
continue |
|
id = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[3] |
|
ner = tokens[5] |
|
head = tokens[6] |
|
arc_tag = tokens[7] |
|
new_line = [id, word, pos, ner, head, arc_tag] |
|
sentences.append(new_line) |
|
print('Language: %s Split: %s Num. Sentences: %s ' % (language, split, num_sentences)) |
|
if not os.path.exists('data'): |
|
os.makedirs('data') |
|
write_data_path = 'data/MRL/ud_pos_ner_dp_' + split + '_' + language |
|
print('creating %s' % write_data_path) |
|
with open(write_data_path, 'w') as f: |
|
for line in sentences: |
|
f.write('\t'.join(line) + '\n') |
|
|
|
def main(): |
|
|
|
args_ = argparse.ArgumentParser() |
|
args_.add_argument('--ud_data_path', help='Directory path of the UD treebanks.', required=True) |
|
|
|
args = args_.parse_args() |
|
write_ud_files(args) |
|
|
|
if __name__ == "__main__": |
|
main() |
|
|