Update fincat_utils.py
fincat_utils.py (+4 -38)
@@ -29,26 +29,7 @@ def extract_context_words(x, window = 6):
 """The following functions have been created with inspiration from https://github.com/arushiprakash/MachineLearning/blob/main/BERT%20Word%20Embeddings.ipynb"""
 
 def bert_text_preparation(text, tokenizer):
-    """
-
-    Takes a string argument and performs
-    pre-processing like adding special tokens,
-    tokenization, tokens to ids, and tokens to
-    segment ids. All tokens are mapped to seg-
-    ment id = 1.
-
-    Args:
-        text (str): Text to be converted
-        tokenizer (obj): Tokenizer object
-            to convert text into BERT-re-
-            adable tokens and ids
-
-    Returns:
-        list: List of BERT-readable tokens
-        obj: Torch tensor with token ids
-        obj: Torch tensor segment ids
-
-    """
+
     marked_text = "[CLS] " + text + " [SEP]"
     tokenized_text = tokenizer.tokenize(marked_text)
     indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
@@ -61,22 +42,7 @@ def bert_text_preparation(text, tokenizer):
     return tokenized_text, tokens_tensor, segments_tensors
 
 def get_bert_embeddings(tokens_tensor, segments_tensors, model):
-    """
-
-    Args:
-        tokens_tensor (obj): Torch tensor size [n_tokens]
-            with token ids for each token in text
-        segments_tensors (obj): Torch tensor size [n_tokens]
-            with segment ids for each token in text
-        model (obj): Embedding model to generate embeddings
-            from token and segment ids
-
-    Returns:
-        list: List of list of floats of size
-            [n_tokens, n_embedding_dimensions]
-            containing embeddings for each token
-    """
-
+
     # Gradient calculation id disabled
     # Model is in inference mode
     with torch.no_grad():
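
The body of get_bert_embeddings between these hunks is not shown in the diff. Based on the notebook credited at the top of the file, the torch.no_grad() block typically runs a BERT model with hidden states exposed and pools a per-token vector from them; a sketch under that assumption, continuing from the previous snippet (checkpoint name and pooling choice are assumptions, not confirmed by this commit):

    import torch
    from transformers import BertModel

    model = BertModel.from_pretrained("bert-base-uncased",
                                      output_hidden_states=True)
    model.eval()  # inference mode; pairs with the torch.no_grad() block

    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        hidden_states = outputs.hidden_states  # tuple of [1, n_tokens, 768]

    # Collapse the batch dim, regroup as [n_tokens, n_layers, hidden],
    # then sum the last four layers per token, as the credited notebook does.
    token_embeddings = torch.stack(hidden_states, dim=0).squeeze(1)
    token_embeddings = token_embeddings.permute(1, 0, 2)
    token_vecs = [torch.sum(tok[-4:], dim=0) for tok in token_embeddings]
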
@@ -106,5 +72,5 @@ def bert_embedding_extract(context_text, word):
             word_embedding_all.append(word_embedding)
         word_embedding_mean = np.array(word_embedding_all).mean(axis=0)
         return word_embedding_mean
-
-
+    except:
+        return ['None']
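
The added clause presumably pairs with a try: already wrapping the body of bert_embedding_extract (the unchanged indentation of the context lines suggests one was there before this commit), and it converts any failure into the sentinel list ['None'] instead of an exception. Callers therefore need an explicit guard before treating the result as a vector; a hypothetical caller-side check (`samples` and `word_vectors` are illustrative names):

    word_vectors = []
    for context_text, word in samples:
        embedding = bert_embedding_extract(context_text, word)
        # isinstance must come first: == on an np.ndarray is elementwise
        # and would not produce a single boolean.
        if isinstance(embedding, list) and embedding == ['None']:
            continue  # the bare except swallowed a failure; skip this word
        word_vectors.append(embedding)  # np.ndarray mean embedding

As a design note, the bare except keeps the pipeline running but masks real bugs, and the string sentinel forces the awkward comparison above; catching Exception and returning None would be the more conventional choice.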