import re
from typing import List, Dict, Set

import numpy as np
import torch
from ufal.chu_liu_edmonds import chu_liu_edmonds

# Universal Dependencies relation labels predicted by the label parser.
DEPENDENCY_RELATIONS = [
    "acl",
    "advcl",
    "advmod",
    "amod",
    "appos",
    "aux",
    "case",
    "cc",
    "ccomp",
    "conj",
    "cop",
    "csubj",
    "det",
    "iobj",
    "mark",
    "nmod",
    "nsubj",
    "nummod",
    "obj",
    "obl",
    "parataxis",
    "punct",
    "root",
    "vocative",
    "xcomp",
]

INDEX2TAG = {idx: tag for idx, tag in enumerate(DEPENDENCY_RELATIONS)}
TAG2INDEX = {tag: idx for idx, tag in enumerate(DEPENDENCY_RELATIONS)}
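
# For example, given the list above, TAG2INDEX["nsubj"] == 16 and
# INDEX2TAG[16] == "nsubj".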

def preprocess_text(text: str) -> List[str]:
    """Split raw text into tokens, separating punctuation from adjacent words."""
    text = text.strip()
    # Insert a space before/after any punctuation mark that is not already
    # separated by whitespace, then split on whitespace.
    text = re.sub("(?<! )(?=[.,!?()·;:])|(?<=[.,!?()·;:])(?! )", r" ", text)
    return text.split()
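
# Illustrative behaviour of the punctuation rule above (not an exhaustive test):
#   preprocess_text("Hello, world!")  ->  ["Hello", ",", "world", "!"]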

def batched_index_select(
    input: torch.Tensor, dim: int, index: torch.Tensor
) -> torch.Tensor:
    """Gather entries along `dim` of a batched tensor, using a per-batch index."""
    # Reshape the index so it broadcasts against `input`: keep the batch
    # dimension, put the indices on `dim`, and use size 1 everywhere else.
    views = [input.shape[0]] + [
        1 if i != dim else -1 for i in range(1, len(input.shape))
    ]
    expanse = list(input.shape)
    expanse[0] = -1
    expanse[dim] = -1
    index = index.view(views).expand(expanse)
    return torch.gather(input, dim, index)
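
# Shape sketch (hypothetical sizes): with `input` of shape (B, S, H), dim=1 and
# `index` of shape (B, K), batched_index_select returns a (B, K, H) tensor in
# which out[b, k] == input[b, index[b, k]].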

def get_relevant_tokens(tokenized: torch.Tensor, start_ids: Set[int]) -> List[int]:
    # Keep only the entries at word-initial (first-subword) token positions.
    return [tokenized[idx].item() for idx in range(len(tokenized)) if idx in start_ids]

def resolve(
    edmonds_head: List[int], word_ids: List[int], parent_probs_table: torch.Tensor
) -> torch.Tensor:
    """If the decoded tree attaches several words to ROOT, keep only one of them."""
    multiple_roots = [i for i, x in enumerate(edmonds_head) if x == 0]
    if len(multiple_roots) > 1:
        # Keep the root candidate that heads the most tokens, and zero out the
        # root-attachment probability of the remaining candidates so that a
        # second decoding pass attaches them elsewhere.
        main_root = max(multiple_roots, key=edmonds_head.count)
        secondary_roots = set(multiple_roots) - {main_root}
        for root in secondary_roots:
            parent_probs_table[0][word_ids.index(root)][0] = 0
    return parent_probs_table

def apply_chu_liu_edmonds(
    parent_probs_table: torch.Tensor, tokenized_input: Dict, start_ids: Set[int]
) -> List[int]:
    # The decoder expects a square score matrix, so drop the extra column if the
    # table is not square.
    parent_probs_table = (
        parent_probs_table
        if parent_probs_table.shape[1] == parent_probs_table.shape[2]
        else parent_probs_table[:, :, 1:]
    )
    # Decode the maximum spanning tree; the decoder returns one head per token
    # and marks the root with -1, which is remapped to position 0 below.
    edmonds_heads, _ = chu_liu_edmonds(
        parent_probs_table.squeeze(0).cpu().numpy().astype("double")
    )
    edmonds_heads = torch.tensor(edmonds_heads).unsqueeze(0)
    edmonds_heads[edmonds_heads == -1] = 0
    tokenized_input["head_labels"] = edmonds_heads
    return get_relevant_tokens(edmonds_heads[0], start_ids)
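
# Note: the list returned by apply_chu_liu_edmonds has one entry per word
# (first-subword positions only), namely the token position of that word's
# predicted head, with 0 standing for the ROOT slot.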

def get_word_endings(tokenized_input):
    """Map each token position to (its word's end token position, 1-based word index)."""
    word_ids = tokenized_input.word_ids(batch_index=0)
    start_ids = set()
    # Position 0 (the leading special token) acts as the ROOT slot.
    word_endings = {0: (1, 0)}
    for word_id in word_ids:
        if word_id is not None:
            start, end = tokenized_input.word_to_tokens(
                batch_or_word_index=0, word_index=word_id
            )
            start_ids.add(start)
            word_endings[start] = (end, word_id + 1)
            for a in range(start + 1, end + 1):
                word_endings[a] = (end, word_id + 1)
    return word_endings, start_ids, word_ids
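
# Illustrative output (assuming a subword tokenizer that keeps the first word
# whole and splits the second word into two pieces, i.e.
# word_ids == [None, 0, 1, 1, None]):
#   start_ids    == {1, 2}
#   word_endings == {0: (1, 0), 1: (2, 1), 2: (4, 2), 3: (4, 2), 4: (4, 2)}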

def get_dependencies(
    dependency_parser,
    label_parser,
    tokenizer,
    collator,
    labels: bool,
    sentence: List[str],
) -> Dict:
    tokenized_input = tokenizer(
        sentence, truncation=True, is_split_into_words=True, add_special_tokens=True
    )
    dep_dict: Dict[str, List[Dict[str, str]]] = {
        "words": [{"text": "ROOT", "tag": ""}],
        "arcs": [],
    }
    word_endings, start_ids, word_ids = get_word_endings(tokenized_input)
    tokenized_input = collator([tokenized_input])
    _, _, parent_probs_table = dependency_parser(**tokenized_input)
    # Mask out token positions that are neither word-initial nor the ROOT slot so
    # they cannot take part in the tree.
    irrelevant = torch.tensor(
        [
            idx.item()
            for idx in torch.arange(parent_probs_table.size(1))
            if idx.item() not in start_ids and idx.item() != 0
        ]
    )
    if irrelevant.nelement() > 0:
        parent_probs_table.index_fill_(1, irrelevant, torch.nan)
        parent_probs_table.index_fill_(2, irrelevant, torch.nan)
    # Decode once, force a single root, then decode again.
    edmonds_head = apply_chu_liu_edmonds(parent_probs_table, tokenized_input, start_ids)
    parent_probs_table = resolve(edmonds_head, word_ids, parent_probs_table)
    edmonds_head = apply_chu_liu_edmonds(parent_probs_table, tokenized_input, start_ids)
    if labels:
        # Predict a dependency relation label for every word-initial token.
        predictions_labels = np.argmax(
            label_parser(**tokenized_input).logits.detach().cpu().numpy(), axis=-1
        )
        predicted_relations = get_relevant_tokens(predictions_labels[0], start_ids)
        predicted_relations = [
            INDEX2TAG[predicted_relations[idx]] for idx in range(len(sentence))
        ]
    else:
        predicted_relations = [""] * len(sentence)
    # Build displaCy-style arcs; word indices are 1-based because ROOT occupies
    # position 0.
    for idx, head in enumerate(edmonds_head):
        arc = {
            "start": min(idx + 1, word_endings[head][1]),
            "end": max(idx + 1, word_endings[head][1]),
            "label": predicted_relations[idx],
            "dir": "left" if idx + 1 < word_endings[head][1] else "right",
        }
        dep_dict["arcs"].append(arc)
    return dep_dict
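
# Minimal usage sketch (hypothetical: assumes a head-scoring `dependency_parser`,
# a token-classification `label_parser`, their `tokenizer`, and a padding
# `collator` are loaded elsewhere; none of these objects are defined here):
#
#     words = preprocess_text("The cat sat on the mat.")
#     dep_dict = get_dependencies(
#         dependency_parser, label_parser, tokenizer, collator, labels=True, sentence=words
#     )
#
# The returned dictionary mirrors displaCy's "manual" format ("words" plus "arcs"),
# so it can be rendered with spacy.displacy.render(dep_dict, style="dep", manual=True)
# once the "words" list has been extended with the sentence's tokens.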