#import nlpaug
#import nlpaug.augmenter.word as naw
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import itertools
import os
import os.path
import pickle
import re
import statistics
import string
import time

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from ast import literal_eval
from nltk import pos_tag
from nltk.corpus import stopwords
from pathos.multiprocessing import ProcessingPool as Pool
from sklearn import metrics
from torch.nn import CrossEntropyLoss
from torch.nn.functional import softmax
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from transformers import BertConfig, BertModel, BertTokenizer, get_linear_schedule_with_warmup
from gensim.models import Word2Vec
#from gensim.models.phrases import Phrases, Phraser
#from anytree import Node, RenderTree, PreOrderIter
#import parmap

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

#device = torch.device('cuda')
device = torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_SEQ_LEN = 256
MASK_TOKEN = '[MASK]'
BATCH_SIZE = 32
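# Illustrative sketch (not in the original file): under these settings one
# tokenizer call returns fixed-length tensors, e.g.
#   enc = tokenizer('shares of $AAPL surged', max_length=MAX_SEQ_LEN,
#                   padding='max_length', truncation=True, return_tensors='pt')
#   enc['input_ids'].shape  # -> torch.Size([1, 256])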
def generate_production_batch(batch):
    # collate_fn: join each instance's BERT tokens back into a string and
    # encode the whole batch in one tokenizer call
    tok = [' '.join(instance.tokens) for instance in batch]
    encoded = tokenizer(tok, add_special_tokens=True,
                        max_length=MAX_SEQ_LEN, padding='max_length',
                        truncation=True, return_tensors='pt')
    input_ids = encoded['input_ids']
    attn_mask = encoded['attention_mask']
    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])
    return input_ids, attn_mask, entity_indices, batch
def indices_for_entity_ranges(ranges):
    # Build gather indices so every entity span is padded to the longest span
    # in the batch; positions past an entity's end are clamped to its last
    # token. HIDDEN_OUTPUT_FEATURES is defined further down in this file.
    max_e_len = max(end - start for start, end in ranges)
    indices = torch.tensor([[[min(t, end)] * HIDDEN_OUTPUT_FEATURES
                             for t in range(start, start + max_e_len + 1)]
                            for start, end in ranges])
    return indices
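# Worked example (assumed, not from the original): for two entity ranges the
# returned tensor has shape (batch, max_e_len + 1, HIDDEN_OUTPUT_FEATURES),
# ready for torch.gather over the BERT hidden states:
#   indices_for_entity_ranges([(2, 4), (5, 5)])
#   -> shape (2, 3, 768); row one indexes tokens 2, 3, 4, while row two
#      repeats the clamped token index 5 three times.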
# pickle5 re-binds pickle so protocol-5 files load on older Python versions
import pickle5 as pickle

# label set and label -> index map produced at training time
with open('./labels.pkl', 'rb') as f:
    LABELS = pickle.load(f)
NUM_CLASSES = len(LABELS)
with open('./labels_map.pkl', 'rb') as f:
    LABEL_MAP = pickle.load(f)
class EntityDataset(Dataset):
    def __init__(self, df, size=None):
        # keep only rows that yield a usable instance
        self.df = df[df.apply(lambda x: EntityDataset.instance_from_row(x) is not None, axis=1)]
        print(len(self.df))
        # sample data if a size is specified
        if size is not None and size < len(self):
            self.df = self.df.sample(size, replace=False)

    @staticmethod
    def from_df(df, size=None):
        dataset = EntityDataset(df, size=size)
        print('Obtained dataset of size', len(dataset))
        return dataset

    @staticmethod
    def instance_from_row(row):
        unpacked_arr = literal_eval(row['entityMentions']) if isinstance(row['entityMentions'], str) else row['entityMentions']
        entity = unpacked_arr[0]['text']
        text = row['sentText']
        return EntityDataset.get_instance(text, entity)

    @staticmethod
    def get_instance(text, entity, label=None):
        tokens = tokenizer.tokenize(text)
        # the entity span is not located in the token sequence here; a fixed
        # placeholder range is used instead
        entity_range = (0, 100)
        return PairRelInstance(tokens, entity, entity_range, None, text)

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        return EntityDataset.instance_from_row(self.df.iloc[idx])
class PairRelInstance:
    # lightweight record for one (sentence, entity) pair
    def __init__(self, tokens, entity, entity_range, label, text):
        self.tokens = tokens
        self.entity = entity
        self.entity_range = entity_range
        self.label = label
        self.text = text

TRAINED_WEIGHTS = 'bert-base-uncased'
HIDDEN_OUTPUT_FEATURES = 768  # hidden size of bert-base
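# Minimal usage sketch (assumed, not from the original file): build a one-row
# dataset from raw text the same way input_text() does below.
#   df = pd.DataFrame([('AAPL hit a new high.', [{'text': 'AAPL hit a new high.'}])],
#                     columns=['sentText', 'entityMentions'])
#   ds = EntityDataset.from_df(df)
#   print(ds[0].tokens, ds[0].entity_range)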
def input_text_format(text):
    # wrap raw text into the (sentText, entityMentions) shape EntityDataset expects
    if text is not None:
        return text, [{'text': text}]
    return None

def prep(s):
    return s.replace('_', ' ').lower()
class BertEntityExtractor:
    def __init__(self):
        self.net = EntityBertNet()

    @staticmethod
    def load_saved(path):
        extr = BertEntityExtractor()
        extr.net = EntityBertNet()
        # map weights to CPU so the model also runs without a GPU
        extr.net.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
        extr.net.eval()
        return extr

    @staticmethod
    def load_trained_model():
        entity_extractor_path = './TickerExtraction/entity_model2.pt'
        return BertEntityExtractor.load_saved(entity_extractor_path)
    def input_text(self, texts):
        # wrap the raw text and drop any None entries
        mapping1 = [input_text_format(texts)]
        entity_texts = [t for t in mapping1 if t is not None]
        df = pd.DataFrame(entity_texts, columns=['sentText', 'entityMentions'])
        df['sentText'] = str(df['sentText'][0])
        data = EntityDataset.from_df(df)
        return data, df
    def extract_entity_probabilities(self, file_path=None, dataset=None, size=None):
        # load data
        if file_path is not None:
            # NOTE: EntityDataset.from_file is not defined in this file
            data, _ = EntityDataset.from_file(file_path, size=size)
        else:
            if dataset is None:
                raise AttributeError('file_path and dataset cannot both be None')
            data = dataset
        loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
                            collate_fn=generate_production_batch)
        self.net.to(device)
        self.net.eval()
        probs = []
        with torch.no_grad():
            for input_ids, attn_mask, entity_indices, instances in loader:
                # send batch to the target device
                input_ids, attn_mask, entity_indices = tuple(i.to(device) for i in [input_ids, attn_mask,
                                                                                    entity_indices])
                # forward pass; softmax turns logits into per-class probabilities
                output_scores = softmax(self.net(input_ids, attn_mask, entity_indices), dim=1)
                probs.extend(output_scores.tolist())
        return probs
class EntityBertNet(nn.Module):
    def __init__(self):
        super(EntityBertNet, self).__init__()
        config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, NUM_CLASSES)

    def forward(self, input_ids, attn_mask, entity_indices):
        # BERT
        bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask, return_dict=False)
        # max pooling over the hidden states at the entity token positions
        entity_pooled_output = EntityBertNet.pooled_output(bert_output, entity_indices)
        # fc layer (softmax activation done in loss function)
        x = self.fc(entity_pooled_output)
        return x

    @staticmethod
    def pooled_output(bert_output, indices):
        # gather the per-token hidden vectors for each entity span, then max-pool
        outputs = torch.gather(input=bert_output, dim=1, index=indices)
        pooled_output, _ = torch.max(outputs, dim=1)
        return pooled_output
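# End-to-end usage sketch (assumed; the example sentence and the
# LABELS-based lookup are illustrative, not from the original code):
#   extractor = BertEntityExtractor.load_trained_model()
#   data, df = extractor.input_text('AAPL hit a new high today.')
#   probs = extractor.extract_entity_probabilities(dataset=data)
#   best = int(np.argmax(probs[0]))
#   print(LABELS[best], probs[0][best])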