Spaces:

BengaliNLP
/

bertspace

Runtime error

bertspace / server /utils /token_processing.py

Create token_processing.py

017380c verified 6 months ago

1.42 kB

	import numpy as np
	from transformers.tokenization_bert import BertTokenizer
	from .f import flatten_, assoc, memoize, GetAttr

	from typing import List

	def fix_byte_spaces(toks: List[str]) -> List[str]:
	return [t.replace("\u0120", " ").replace("\u010A", "\\n") for t in toks]

	@memoize
	def get_bpe(bpe_pretrained_name_or_path):
	return BertTokenizer.from_pretrained(bpe_pretrained_name_or_path)

	# [String] -> [String]
	def remove_CLS_SEP(toks):
	return [t for t in toks if t not in set(["[CLS]", "[SEP]"])]

	# torch.Tensor -> np.Array
	def process_hidden_tensors(t):
	"""Embeddings are returned from the BERT model in a non-ideal embedding shape:
	- unnecessary batch dimension
	- Undesired second sentence "[SEP]".

	Drop the unnecessary information and just return what we need for the first sentence
	"""
	# Drop unnecessary batch dim and second sent
	t = t.squeeze(0)[:-1]

	# Drop second sentence sep ??
	t = t[1:-1]

	# Convert to numpy
	return t.data.numpy()


	# np.Array -> np.Array
	def normalize(a):
	"""Divide each head by its norm"""
	norms = np.linalg.norm(a, axis=-1, keepdims=True)
	return a / norms


	# np.Array:<a,b,c,d> -> np.Array<a,b,c*d>
	def reshape(a):
	"""Combine the last two dimensions of a numpy array"""
	all_head_size = a.shape[-2] * a.shape[-1]
	new_shape = a.shape[:-2] + (all_head_size,)
	return a.reshape(new_shape)