|
import os
import json
import re
|
from .prepare_data import ROOT, END |
|
|
def get_split(path): |
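    """Infer the data split ('extra_train', 'train', 'extra_dev', 'dev' or 'test') from a file path."""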
|
if 'train' in path: |
|
if 'extra_train' in path: |
|
split = 'extra_train' |
|
else: |
|
split = 'train' |
|
elif 'dev' in path: |
|
if 'extra_dev' in path: |
|
split = 'extra_dev' |
|
else: |
|
split = 'dev' |
|
else: |
|
split = 'test' |
|
return split |
|
|
|
def add_number_of_children(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
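    """Auxiliary task: label every token with the number of children its node
    has in the (predicted or gold) dependency tree, then write the labeled
    splits to disk. Returns a dict mapping split names to output paths."""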
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and 'poetry' not in file and 'prose' not in file and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
number_of_children = {} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
node = str(idx + 1) |
|
if node not in number_of_children: |
|
lines[idx].append('0') |
|
else: |
|
lines[idx].append(str(number_of_children[node])) |
|
if len(lines) > 0: |
|
                        tmp_root_line = root_line + [str(number_of_children.get('0', 0))]
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
number_of_children = {} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
if head not in number_of_children: |
|
number_of_children[head] = 1 |
|
else: |
|
number_of_children[head] += 1 |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
|
|
def add_distance_from_the_root(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
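    """Auxiliary task: label every token with its depth in the dependency
    tree, i.e. the number of arcs on the path from the token to the root."""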
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'poetry' not in file and 'prose' not in file and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and 'poetry' not in file and 'prose' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '0'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
depth = 1 |
|
node = str(idx + 1) |
|
while tree_dict[node] != '0': |
|
node = tree_dict[node] |
|
depth += 1 |
|
lines[idx].append(str(depth)) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_relative_pos_based(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
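    """Auxiliary task: label every token with the signed relative position of
    its head among the words sharing the head's clustered POS tag, e.g.
    '+2_NN' means the head is the second NN-like word to the token's right."""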
|
|
|
|
|
def pos_cluster(pos): |
|
|
|
if pos[0] == 'V': |
|
pos = 'VB' |
|
elif pos == 'NNS': |
|
pos = 'NN' |
|
elif pos == 'NNPS': |
|
pos = 'NNP' |
|
elif 'JJ' in pos: |
|
pos = 'JJ' |
|
elif pos[:2] == 'RB' or pos == 'WRB' or pos == 'RP': |
|
pos = 'RB' |
|
elif pos[:3] == 'PRP': |
|
pos = 'PRP' |
|
elif pos in ['.', ':', ',', "''", '``']: |
|
pos = '.' |
|
elif pos[0] == '-': |
|
pos = '-RB-' |
|
elif pos[:2] == 'WP': |
|
pos = 'WP' |
|
return pos |
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'poetry' not in file and 'prose' not in file and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and 'poetry' not in file and 'prose' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '+0_XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
info_about_head = lines[head] |
|
if idx < head: |
|
relative_position_head = 1 |
|
postag_head = pos_cluster(info_about_head[2]) |
|
|
|
for x in range(idx + 1, head): |
|
another_word = lines[x] |
|
postag_word_before_head = pos_cluster(another_word[2]) |
|
if postag_word_before_head == postag_head: |
|
relative_position_head += 1 |
|
                            label = '+' + str(relative_position_head) + '_' + postag_head
|
lines[idx].append(label) |
|
|
|
|
|
elif idx > head: |
|
relative_position_head = 1 |
|
postag_head = pos_cluster(info_about_head[2]) |
|
for x in range(head + 1, idx): |
|
another_word = lines[x] |
|
postag_word_before_head = pos_cluster(another_word[2]) |
|
if postag_word_before_head == postag_head: |
|
relative_position_head += 1 |
|
                            label = '-' + str(relative_position_head) + '_' + postag_head
|
lines[idx].append(label) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_language_model(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
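    """Auxiliary task: label every token with the next word in the sentence
    (a language-modeling objective); the last token is labeled with END."""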
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
if idx < len_sent - 1: |
|
lines[idx].append(lines[idx+1][1]) |
|
else: |
|
lines[idx].append(END) |
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [lines[0][1]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_relative_TAG(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
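    """Auxiliary task: like add_relative_pos_based, but relative positions are
    counted among words sharing the head's dependency (arc) tag rather than
    its POS cluster."""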
|
|
|
|
|
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '+0_XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
|
|
if idx < head: |
|
relative_position_head = 1 |
|
tag_head = info_about_head[5] |
|
|
|
for x in range(idx + 1, head): |
|
another_word = lines[x] |
|
tag_word_before_head = another_word[5] |
|
if tag_word_before_head == tag_head: |
|
relative_position_head += 1 |
|
                            label = '+' + str(relative_position_head) + '_' + tag_head
|
lines[idx].append(label) |
|
|
|
|
|
elif idx > head: |
|
relative_position_head = 1 |
|
tag_head = info_about_head[5] |
|
for x in range(head + 1, idx): |
|
another_word = lines[x] |
|
tag_word_before_head = another_word[5] |
|
if tag_word_before_head == tag_head: |
|
relative_position_head += 1 |
|
                            label = '-' + str(relative_position_head) + '_' + tag_head
|
lines[idx].append(label) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
|
|
def add_head(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
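    """Auxiliary task: label every token with the surface form of its head word."""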
|
|
|
|
|
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '+0_XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
                        head_word = info_about_head[1]
                        lines[idx].append(head_word)
                    if len(lines) > 0:
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
def get_modified_coarse(ma): |
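    """Map a morphological-analysis string to its coarse category via the
    coarse_to_ma_dict.json lookup table; returns None when nothing matches."""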
|
ma = ma.replace('sgpl','sg').replace('sgdu','sg') |
|
with open('/home/jivnesh/DCST_scratch/utils/io_/coarse_to_ma_dict.json', 'r') as fh: |
|
coarse_dict = json.load(fh) |
|
    for key, mas in coarse_dict.items():
        if ma in mas:
            return key
    return None
|
def add_head_coarse_pos(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
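    """Auxiliary task: label every token with the POS tag of its head word."""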
|
|
|
|
|
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', 'O'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
postag_head = info_about_head[2] |
|
lines[idx].append(postag_head) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_head_ma(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
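    """Auxiliary task: label every token with the clustered POS tag of its
    head word (see pos_cluster)."""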
|
|
|
|
|
def pos_cluster(pos): |
|
|
|
if pos[0] == 'V': |
|
pos = 'VB' |
|
elif pos == 'NNS': |
|
pos = 'NN' |
|
elif pos == 'NNPS': |
|
pos = 'NNP' |
|
elif 'JJ' in pos: |
|
pos = 'JJ' |
|
elif pos[:2] == 'RB' or pos == 'WRB' or pos == 'RP': |
|
pos = 'RB' |
|
elif pos[:3] == 'PRP': |
|
pos = 'PRP' |
|
elif pos in ['.', ':', ',', "''", '``']: |
|
pos = '.' |
|
elif pos[0] == '-': |
|
pos = '-RB-' |
|
elif pos[:2] == 'WP': |
|
pos = 'WP' |
|
return pos |
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', 'XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
postag_head = pos_cluster(info_about_head[2]) |
|
lines[idx].append(postag_head) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_label(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
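    """Auxiliary task: label every token with its own dependency (arc) tag."""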
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print('############ Add Label Task #################') |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(lines[idx][5]) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[5]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
|
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def predict_ma_tag_of_modifier(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
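    """Auxiliary task: label every token with its cleaned morphological
    analysis (MA) tag, obtained by running clean_ma over the NER/MA column."""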
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
    print('############ Predict MA Tag of Modifier Task #################')
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(clean_ma(lines[idx][3])) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[3]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def predict_coarse_of_modifier(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
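    """Auxiliary task: label every token with the raw contents of its NER/MA column."""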
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
    print('############ Predict Coarse Tag of Modifier Task #################')
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(lines[idx][3]) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[3]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
def get_case(ma): |
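    """Extract the grammatical case from a morphological-analysis string
    (e.g. get_case('nom. sg.') returns 'nom'); when no case is present,
    fall back to a coarse category such as 'adv', 'Ind', 'IV' or 'FV'."""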
|
    indeclinable = ['ind', 'prep', 'interj', 'conj', 'part']
|
case_list = ['nom','voc','acc','i','inst','dat','abl','g','loc'] |
|
gender_list = ['n','f','m','*'] |
|
person_list = ['1','2','3'] |
|
no_list = ['du','sg','pl'] |
|
pops = [' ac',' ps'] |
|
    ma = ma.replace('sgpl', 'sg').replace('sgdu', 'sg')
    # Raw strings avoid invalid escape sequences in the pattern and replacement.
    temp = re.sub(r"([\(\[]).*?([\)\]])", r"\g<1>\g<2>", ma).replace('[] ', '').strip(' []')
|
temp = temp.split('.') |
|
if temp[-1] == '': |
|
temp.pop(-1) |
|
|
|
case='' |
|
no='' |
|
person='' |
|
gender='' |
|
tense='' |
|
coarse='' |
|
    # Scan over copies of the list: popping from a list while iterating over
    # it skips elements and silently dropped features in the original code.
    temp = [t for t in temp if t not in pops]

    for t in list(temp):
        if t.strip() in gender_list:
            gender = t.strip()
            temp.remove(t)

    for t in list(temp):
        if t.strip() in case_list:
            case = t.strip()
            temp.remove(t)
    if case != '':
        coarse = 'Noun'

    for t in list(temp):
        if t.strip() in person_list:
            person = t.strip()
            temp.remove(t)

    for t in list(temp):
        if t.strip() in no_list:
            no = t.strip()
            temp.remove(t)
|
|
|
    for b in temp:
        tense = tense + ' ' + b.strip()
    tense = tense.strip()
|
|
|
|
|
if tense == 'adv': |
|
coarse = 'adv' |
|
for ind in indeclinable: |
|
if tense == ind: |
|
coarse = 'Ind' |
|
if tense == 'abs' or tense == 'ca abs': |
|
coarse = 'IV' |
|
    if tense != '' and coarse == '':
        if person != '' or no != '':
            coarse = 'FV'
        else:
            coarse = 'IV'
|
if case == 'i': |
|
return 'inst' |
|
|
|
    if case != '':
        return case
    else:
        return coarse
|
def clean_ma(ma): |
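    """Normalize a morphological-analysis string: drop bracketed segments and
    voice markers, collapse sgpl/sgdu to sg, and strip dots and spaces,
    e.g. clean_ma('nom. sg.') returns 'nomsg'."""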
|
    ma = re.sub(r"([\(\[]).*?([\)\]])", r"\g<1>\g<2>", ma).replace('[] ', '').strip(' []')
    ma = ma.replace(' ac', '').replace(' ps', '').replace('sgpl', 'sg').replace('sgdu', 'sg')
    ma = ma.replace('i.', 'inst.').replace('.', '').replace(' ', '')
|
return ma |
|
def predict_case_of_modifier(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
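    """Auxiliary task: label every token with the grammatical case (or coarse
    fallback) extracted from its NER/MA column by get_case."""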
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
    print('############ Predict Case of Modifier Task #################')
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(get_case(lines[idx][3])) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[3]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
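
# The functions below do not generate auxiliary labels on the fly; they simply
# return paths to pre-generated task files on disk. Their unused arguments keep
# the signatures interchangeable with the label generators above.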
|
|
|
def Multitask_case_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/ud_pos_ner_dp_train_san_case' |
|
writing_paths['dev'] = 'data/ud_pos_ner_dp_dev_san_case' |
|
writing_paths['test'] = 'data/ud_pos_ner_dp_test_san_case' |
|
return writing_paths |
|
|
|
def Multitask_POS_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
|
|
|
|
|
|
writing_paths['train'] = 'data/ud_pos_ner_dp_train_san_POS' |
|
writing_paths['dev'] = 'data/ud_pos_ner_dp_dev_san_POS' |
|
writing_paths['test'] = 'data/ud_pos_ner_dp_test_san_POS' |
|
return writing_paths |
|
|
|
def Multitask_coarse_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Multitask_coarse_predict_train_san' |
|
writing_paths['dev'] = 'data/Multitask_coarse_predict_dev_san' |
|
writing_paths['test'] = 'data/Multitask_coarse_predict_test_san' |
|
return writing_paths |
|
|
|
def Multitask_label_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Multitask_label_predict_train_san' |
|
writing_paths['dev'] = 'data/Multitask_label_predict_dev_san' |
|
writing_paths['test'] = 'data/Multitask_label_predict_test_san' |
|
return writing_paths |
|
|
|
|
|
def MRL_case(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_case' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_case' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_case' |
|
return writing_paths |
|
|
|
def MRL_POS(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_POS' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_POS' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_POS' |
|
return writing_paths |
|
|
|
def MRL_label(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_dep' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_dep' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_dep' |
|
return writing_paths |
|
|
|
def MRL_no(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_no' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_no' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_no' |
|
return writing_paths |
|
|
|
def MRL_Person(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_per' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_per' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_per' |
|
return writing_paths |
|
def MRL_Gender(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_gen' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_gen' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_gen' |
|
return writing_paths |