|
import os
import json
import re
|
from .prepare_data import ROOT, END |
|
|
def get_split(path): |
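    """Infer the data split ('extra_train', 'train', 'extra_dev', 'dev' or 'test') from a file path."""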
|
if 'train' in path: |
|
if 'extra_train' in path: |
|
split = 'extra_train' |
|
else: |
|
split = 'train' |
|
elif 'dev' in path: |
|
if 'extra_dev' in path: |
|
split = 'extra_dev' |
|
else: |
|
split = 'dev' |
|
else: |
|
split = 'test' |
|
return split |
|
|
|
def add_number_of_children(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
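    """Auxiliary task: label every token with the number of children its node
    has in the (predicted or gold) dependency tree, then write the labeled
    splits to disk. Returns a dict mapping split names to output paths."""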
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and 'poetry' not in file and 'prose' not in file and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
number_of_children = {} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
node = str(idx + 1) |
|
if node not in number_of_children: |
|
lines[idx].append('0') |
|
else: |
|
lines[idx].append(str(number_of_children[node])) |
|
if len(lines) > 0: |
|
                        tmp_root_line = root_line + [str(number_of_children.get('0', 0))]
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
number_of_children = {} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
if head not in number_of_children: |
|
number_of_children[head] = 1 |
|
else: |
|
number_of_children[head] += 1 |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
|
|
def add_distance_from_the_root(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
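    """Auxiliary task: label every token with its depth in the dependency
    tree, i.e. the number of arcs on the path from the token to the root."""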
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'poetry' not in file and 'prose' not in file and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and 'poetry' not in file and 'prose' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '0'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
depth = 1 |
|
node = str(idx + 1) |
|
while tree_dict[node] != '0': |
|
node = tree_dict[node] |
|
depth += 1 |
|
lines[idx].append(str(depth)) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_relative_pos_based(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
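    """Auxiliary task: label every token with the signed relative position of
    its head among the words sharing the head's clustered POS tag, e.g.
    '+2_NN' means the head is the second NN-like word to the token's right."""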
|
|
|
|
|
def pos_cluster(pos): |
|
|
|
if pos[0] == 'V': |
|
pos = 'VB' |
|
elif pos == 'NNS': |
|
pos = 'NN' |
|
elif pos == 'NNPS': |
|
pos = 'NNP' |
|
elif 'JJ' in pos: |
|
pos = 'JJ' |
|
elif pos[:2] == 'RB' or pos == 'WRB' or pos == 'RP': |
|
pos = 'RB' |
|
elif pos[:3] == 'PRP': |
|
pos = 'PRP' |
|
elif pos in ['.', ':', ',', "''", '``']: |
|
pos = '.' |
|
elif pos[0] == '-': |
|
pos = '-RB-' |
|
elif pos[:2] == 'WP': |
|
pos = 'WP' |
|
return pos |
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'poetry' not in file and 'prose' not in file and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'poetry' not in file and 'prose' not in file and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and 'poetry' not in file and 'prose' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '+0_XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
info_about_head = lines[head] |
|
if idx < head: |
|
relative_position_head = 1 |
|
postag_head = pos_cluster(info_about_head[2]) |
|
|
|
for x in range(idx + 1, head): |
|
another_word = lines[x] |
|
postag_word_before_head = pos_cluster(another_word[2]) |
|
if postag_word_before_head == postag_head: |
|
relative_position_head += 1 |
|
                            label = '+' + str(relative_position_head) + '_' + postag_head
|
lines[idx].append(label) |
|
|
|
|
|
elif idx > head: |
|
relative_position_head = 1 |
|
postag_head = pos_cluster(info_about_head[2]) |
|
for x in range(head + 1, idx): |
|
another_word = lines[x] |
|
postag_word_before_head = pos_cluster(another_word[2]) |
|
if postag_word_before_head == postag_head: |
|
relative_position_head += 1 |
|
                            label = '-' + str(relative_position_head) + '_' + postag_head
|
lines[idx].append(label) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_language_model(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
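    """Auxiliary task: label every token with the next word in the sentence
    (a language-modeling objective); the last token is labeled with END."""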
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
if idx < len_sent - 1: |
|
lines[idx].append(lines[idx+1][1]) |
|
else: |
|
lines[idx].append(END) |
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [lines[0][1]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_relative_TAG(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
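    """Auxiliary task: like add_relative_pos_based, but relative positions are
    counted among words sharing the head's dependency (arc) tag rather than
    its POS cluster."""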
|
|
|
|
|
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '+0_XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
|
|
if idx < head: |
|
relative_position_head = 1 |
|
tag_head = info_about_head[5] |
|
|
|
for x in range(idx + 1, head): |
|
another_word = lines[x] |
|
tag_word_before_head = another_word[5] |
|
if tag_word_before_head == tag_head: |
|
relative_position_head += 1 |
|
                            label = '+' + str(relative_position_head) + '_' + tag_head
|
lines[idx].append(label) |
|
|
|
|
|
elif idx > head: |
|
relative_position_head = 1 |
|
tag_head = info_about_head[5] |
|
for x in range(head + 1, idx): |
|
another_word = lines[x] |
|
tag_word_before_head = another_word[5] |
|
if tag_word_before_head == tag_head: |
|
relative_position_head += 1 |
|
                            label = '-' + str(relative_position_head) + '_' + tag_head
|
lines[idx].append(label) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
|
|
def add_head(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
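    """Auxiliary task: label every token with the surface form of its head word."""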
|
|
|
|
|
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', '+0_XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
                        head_word = info_about_head[1]
                        lines[idx].append(head_word)
                    if len(lines) > 0:
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
def get_modified_coarse(ma): |
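    """Map a morphological-analysis string to its coarse category via the
    coarse_to_ma_dict.json lookup table; returns None when nothing matches."""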
|
ma = ma.replace('sgpl','sg').replace('sgdu','sg') |
|
with open('/home/jivnesh/DCST_scratch/utils/io_/coarse_to_ma_dict.json', 'r') as fh: |
|
coarse_dict = json.load(fh) |
|
    for key, mas in coarse_dict.items():
        if ma in mas:
            return key
    return None
|
def add_head_coarse_pos(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
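    """Auxiliary task: label every token with the POS tag of its head word."""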
|
|
|
|
|
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', 'O'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
postag_head = info_about_head[2] |
|
lines[idx].append(postag_head) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_head_ma(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
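    """Auxiliary task: label every token with the clustered POS tag of its
    head word (see pos_cluster)."""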
|
|
|
|
|
def pos_cluster(pos): |
|
|
|
if pos[0] == 'V': |
|
pos = 'VB' |
|
elif pos == 'NNS': |
|
pos = 'NN' |
|
elif pos == 'NNPS': |
|
pos = 'NNP' |
|
elif 'JJ' in pos: |
|
pos = 'JJ' |
|
elif pos[:2] == 'RB' or pos == 'WRB' or pos == 'RP': |
|
pos = 'RB' |
|
elif pos[:3] == 'PRP': |
|
pos = 'PRP' |
|
elif pos in ['.', ':', ',', "''", '``']: |
|
pos = '.' |
|
elif pos[0] == '-': |
|
pos = '-RB-' |
|
elif pos[:2] == 'WP': |
|
pos = 'WP' |
|
return pos |
|
|
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
        if not use_unlabeled_data and not use_labeled_data:
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root', 'XX'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
tree_dict = {'0': '0'} |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
|
|
for line in file: |
|
|
|
|
|
|
|
line = line.strip() |
|
if len(line) == 0: |
|
for idx in range(len_sent): |
|
info_of_a_word = lines[idx] |
|
|
|
head = int(info_of_a_word[4]) - 1 |
|
if head == -1: |
|
info_about_head = root_line |
|
else: |
|
|
|
info_about_head = lines[head] |
|
postag_head = pos_cluster(info_about_head[2]) |
|
lines[idx].append(postag_head) |
|
if len(lines) > 0: |
|
sentences_list.append(root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
tree_dict = {'0': '0'} |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
tree_dict[idx] = head |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def add_label(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
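    """Auxiliary task: label every token with its own dependency (arc) tag."""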
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
print('############ Add Label Task #################') |
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(lines[idx][5]) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[5]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
|
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def predict_ma_tag_of_modifier(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
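    """Auxiliary task: label every token with its cleaned morphological
    analysis (MA) tag, obtained by running clean_ma over the NER/MA column."""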
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
    print('############ Predict MA Tag of Modifier Task #################')
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(clean_ma(lines[idx][3])) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[3]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
|
def predict_coarse_of_modifier(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
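    """Auxiliary task: label every token with the raw contents of its NER/MA column."""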
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
    print('############ Predict Coarse Tag of Modifier Task #################')
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(lines[idx][3]) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[3]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
|
|
def get_case(ma): |
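    """Extract the grammatical case from a morphological-analysis string
    (e.g. get_case('nom. sg.') returns 'nom'); when no case is present,
    fall back to a coarse category such as 'adv', 'Ind', 'IV' or 'FV'."""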
|
    indeclinable = ['ind', 'prep', 'interj', 'conj', 'part']
|
case_list = ['nom','voc','acc','i','inst','dat','abl','g','loc'] |
|
gender_list = ['n','f','m','*'] |
|
person_list = ['1','2','3'] |
|
no_list = ['du','sg','pl'] |
|
pops = [' ac',' ps'] |
|
    ma = ma.replace('sgpl', 'sg').replace('sgdu', 'sg')
    # Raw strings avoid invalid escape sequences in the pattern and replacement.
    temp = re.sub(r"([\(\[]).*?([\)\]])", r"\g<1>\g<2>", ma).replace('[] ', '').strip(' []')
|
temp = temp.split('.') |
|
if temp[-1] == '': |
|
temp.pop(-1) |
|
|
|
case='' |
|
no='' |
|
person='' |
|
gender='' |
|
tense='' |
|
coarse='' |
|
    # Scan over copies of the list: popping from a list while iterating over
    # it skips elements and silently dropped features in the original code.
    temp = [t for t in temp if t not in pops]

    for t in list(temp):
        if t.strip() in gender_list:
            gender = t.strip()
            temp.remove(t)

    for t in list(temp):
        if t.strip() in case_list:
            case = t.strip()
            temp.remove(t)
    if case != '':
        coarse = 'Noun'

    for t in list(temp):
        if t.strip() in person_list:
            person = t.strip()
            temp.remove(t)

    for t in list(temp):
        if t.strip() in no_list:
            no = t.strip()
            temp.remove(t)
|
|
|
    for b in temp:
        tense = tense + ' ' + b.strip()
    tense = tense.strip()
|
|
|
|
|
if tense == 'adv': |
|
coarse = 'adv' |
|
for ind in indeclinable: |
|
if tense == ind: |
|
coarse = 'Ind' |
|
if tense == 'abs' or tense == 'ca abs': |
|
coarse = 'IV' |
|
    if tense != '' and coarse == '':
        if person != '' or no != '':
            coarse = 'FV'
        else:
            coarse = 'IV'
|
if case == 'i': |
|
return 'inst' |
|
|
|
    if case != '':
        return case
    else:
        return coarse
|
def clean_ma(ma): |
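    """Normalize a morphological-analysis string: drop bracketed segments and
    voice markers, collapse sgpl/sgdu to sg, and strip dots and spaces,
    e.g. clean_ma('nom. sg.') returns 'nomsg'."""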
|
    ma = re.sub(r"([\(\[]).*?([\)\]])", r"\g<1>\g<2>", ma).replace('[] ', '').strip(' []')
    ma = ma.replace(' ac', '').replace(' ps', '').replace('sgpl', 'sg').replace('sgdu', 'sg')
    ma = ma.replace('i.', 'inst.').replace('.', '').replace(' ', '')
|
return ma |
|
def predict_case_of_modifier(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
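    """Auxiliary task: label every token with the grammatical case (or coarse
    fallback) extracted from its NER/MA column by get_case."""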
|
if src_domain == tgt_domain: |
|
pred_paths = [] |
|
if use_unlabeled_data: |
|
pred_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("pred.txt") and 'extra' in file and tgt_domain in file] |
|
|
|
gold_paths = [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' not in file] |
|
if use_labeled_data: |
|
gold_paths += [file for file in os.listdir(parser_path) if |
|
file.endswith("gold.txt") and 'extra' not in file and tgt_domain in file and 'train' in file] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
else: |
|
pred_paths = [file for file in os.listdir(parser_path) if file.endswith("pred.txt") and tgt_domain in file] |
|
|
|
gold_paths = [] |
|
if use_labeled_data: |
|
gold_paths = ['data/onto_pos_ner_dp_train_' + src_domain] |
|
|
|
if not use_unlabeled_data and not use_labeled_data: |
|
            raise ValueError("at least one of use_unlabeled_data / use_labeled_data must be True")
|
|
|
paths = pred_paths + gold_paths |
|
    print('############ Predict Case of Modifier Task #################')
|
print("Adding labels to paths: %s" % ', '.join(paths)) |
|
root_line = ['0', ROOT, 'XX', 'O', '0', 'root'] |
|
writing_paths = {} |
|
sentences = {} |
|
for path in paths: |
|
if tgt_domain in path: |
|
reading_path = parser_path + path |
|
writing_path = model_path + 'parser_' + path |
|
split = get_split(writing_path) |
|
else: |
|
reading_path = path |
|
writing_path = model_path + 'parser_' + 'domain_' + src_domain + '_train_model_domain_' + src_domain + '_data_domain_' + src_domain + '_gold.txt' |
|
split = 'extra_train' |
|
writing_paths[split] = writing_path |
|
len_sent = 0 |
|
lines = [] |
|
sentences_list = [] |
|
with open(reading_path, 'r') as file: |
|
for line in file: |
|
|
|
line = line.strip() |
|
|
|
if len(line) == 0: |
|
|
|
for idx in range(len_sent): |
|
lines[idx].append(get_case(lines[idx][3])) |
|
|
|
if len(lines) > 0: |
|
tmp_root_line = root_line + [root_line[3]] |
|
sentences_list.append(tmp_root_line) |
|
for line_ in lines: |
|
sentences_list.append(line_) |
|
sentences_list.append([]) |
|
lines = [] |
|
len_sent = 0 |
|
continue |
|
tokens = line.split('\t') |
|
idx = tokens[0] |
|
word = tokens[1] |
|
pos = tokens[2] |
|
ner = tokens[3] |
|
head = tokens[4] |
|
arc_tag = tokens[5] |
|
lines.append([idx, word, pos, ner, head, arc_tag]) |
|
len_sent += 1 |
|
sentences[split] = sentences_list |
|
|
|
train_sentences = [] |
|
if 'train' in sentences: |
|
train_sentences = sentences['train'] |
|
else: |
|
writing_paths['train'] = writing_paths['extra_train'].replace('extra_train', 'train') |
|
if 'extra_train' in sentences: |
|
train_sentences += sentences['extra_train'] |
|
del writing_paths['extra_train'] |
|
if 'extra_dev' in sentences: |
|
train_sentences += sentences['extra_dev'] |
|
del writing_paths['extra_dev'] |
|
with open(writing_paths['train'], 'w') as f: |
|
for sent in train_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
for split in ['dev', 'test']: |
|
if split in sentences: |
|
split_sentences = sentences[split] |
|
with open(writing_paths[split], 'w') as f: |
|
for sent in split_sentences: |
|
f.write('\t'.join(sent) + '\n') |
|
return writing_paths |
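
# The functions below do not generate auxiliary labels on the fly; they simply
# return paths to pre-generated task files on disk. Their unused arguments keep
# the signatures interchangeable with the label generators above.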
|
|
|
def Multitask_case_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/ud_pos_ner_dp_train_san_case' |
|
writing_paths['dev'] = 'data/ud_pos_ner_dp_dev_san_case' |
|
writing_paths['test'] = 'data/ud_pos_ner_dp_test_san_case' |
|
return writing_paths |
|
|
|
def Multitask_POS_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
|
|
|
|
|
|
writing_paths['train'] = 'data/ud_pos_ner_dp_train_san_POS' |
|
writing_paths['dev'] = 'data/ud_pos_ner_dp_dev_san_POS' |
|
writing_paths['test'] = 'data/ud_pos_ner_dp_test_san_POS' |
|
return writing_paths |
|
|
|
def Multitask_coarse_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Multitask_coarse_predict_train_san' |
|
writing_paths['dev'] = 'data/Multitask_coarse_predict_dev_san' |
|
writing_paths['test'] = 'data/Multitask_coarse_predict_test_san' |
|
return writing_paths |
|
|
|
def Multitask_label_predict(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Multitask_label_predict_train_san' |
|
writing_paths['dev'] = 'data/Multitask_label_predict_dev_san' |
|
writing_paths['test'] = 'data/Multitask_label_predict_test_san' |
|
return writing_paths |
|
|
|
|
|
def MRL_case(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_case' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_case' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_case' |
|
return writing_paths |
|
|
|
def MRL_POS(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_POS' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_POS' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_POS' |
|
return writing_paths |
|
|
|
def MRL_label(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_dep' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_dep' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_dep' |
|
return writing_paths |
|
|
|
def MRL_no(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_no' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_no' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_no' |
|
return writing_paths |
|
|
|
def MRL_Person(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_per' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_per' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_per' |
|
return writing_paths |
|
def MRL_Gender(model_path, parser_path, src_domain, tgt_domain, use_unlabeled_data=True, use_labeled_data=True): |
|
writing_paths = {} |
|
|
|
writing_paths['train'] = 'data/Prep_MRL/ud_pos_ner_dp_train_'+src_domain+'_gen' |
|
writing_paths['dev'] = 'data/Prep_MRL/ud_pos_ner_dp_dev_'+src_domain+'_gen' |
|
writing_paths['test'] = 'data/Prep_MRL/ud_pos_ner_dp_test_'+src_domain+'_gen' |
|
return writing_paths |