Spaces:

Markins
/

Using_BERT_Models_for_Sequential_Text_Classification_in_Biomedical_Abstracts

Sleeping

App Files Files Community

Using_BERT_Models_for_Sequential_Text_Classification_in_Biomedical_Abstracts / MakePredictions.py

Markins

Resolved Bug :: Improper data shape getting validated before passing into a batch

9dba45d over 1 year ago

raw

history blame

4.2 kB

	import numpy as np
	from spacy.lang.en import English
	import pandas as pd

	import nltk
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer
	import re

	import torch
	import torch.nn.functional as F

	from Dataset import CustomDataSetManager

	# nltk.download("stopwords")
	# STOPWORDS = stopwords.words("english")
	# porter = PorterStemmer()

	def download_stopwords():
	    """Fetch the NLTK stopword corpus and build the text helpers.

	    Returns:
	        A ``(stopword_list, stemmer)`` tuple: the English stopwords as
	        returned by ``stopwords.words("english")`` and a fresh
	        ``PorterStemmer`` instance.
	    """
	    # Make sure the corpus is present locally before reading from it.
	    nltk.download("stopwords")
	    english_stopwords = stopwords.words("english")
	    stemmer = PorterStemmer()
	    return english_stopwords, stemmer

	def preprocess(text, stopwords):
	    """Normalize one line of text for the classifier.

	    The steps run in this order: lowercase, remove stopwords, remove
	    parenthesized spans, replace every run of non-alphanumeric characters
	    with a single space, and trim the ends.

	    Args:
	        text: The raw string to clean.
	        stopwords: Iterable of words to delete. Matching happens after
	            lowercasing, so the words are expected to be lowercase. Inside
	            this function the name shadows the module-level ``stopwords``
	            import.

	    Returns:
	        The cleaned string (possibly empty).
	    """
	    # Lower
	    text = text.lower()

	    # Remove stopwords. The words are joined with a plain "|" so the regex
	    # alternates between them; an escaped "\|" would only match a literal
	    # pipe character and leave every stopword in place. Each word is
	    # escaped so regex metacharacters in it are matched literally.
	    stopword_alternatives = "|".join(
	        re.escape(word) for word in stopwords if word
	    )
	    # With nothing to remove, skip the substitution: an empty group would
	    # match at every word boundary and swallow the whitespace after it.
	    if stopword_alternatives:
	        pattern = re.compile(r"\b(" + stopword_alternatives + r")\b\s*")
	        text = pattern.sub("", text)

	    # Remove words in parentheses
	    text = re.sub(r"\([^)]*\)", "", text)

	    # Spacing and filters
	    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
	    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
	    text = re.sub(" +", " ", text)  # remove multiple spaces
	    text = text.strip()

	    return text

	def spacy_function(abstract):
	    """Split an abstract into sentences with spaCy's sentencizer.

	    Args:
	        abstract: The abstract text to split.

	    Returns:
	        A list of plain ``str`` sentences, in document order.
	    """
	    # Set up a blank English pipeline.
	    nlp = English()

	    # add_pipe with the factory name both creates and registers the
	    # sentence splitter, so no separate create_pipe call is needed.
	    nlp.add_pipe("sentencizer")

	    # Parse the abstract into a doc with sentence boundaries.
	    doc = nlp(abstract)

	    # Return the detected sentences as strings (not spaCy span objects).
	    return [str(sent) for sent in doc.sents]

	# ---------------------------------------------------------------------------------------------------------------------------

	def model_prediction(model, dataloader):
	    """Run the model over every batch and return class probabilities.

	    Args:
	        model: The network to evaluate. It is switched to eval mode and
	            called as ``model(batch)``; softmax is taken over ``dim=1`` of
	            whatever it returns.
	        dataloader: Iterable yielding batches. Each batch is passed to the
	            model unchanged (no device placement is done here).

	    Returns:
	        A NumPy array with one row of softmax probabilities per input row,
	        stacked across all batches.

	    Raises:
	        ValueError: If ``dataloader`` yields no batches, because
	            ``np.vstack`` cannot stack an empty list.
	    """
	    # Set model to eval mode
	    model.eval()
	    y_probs = []
	    # Inference only: disable autograd so no graph is built per batch.
	    with torch.no_grad():
	        for batch in dataloader:
	            # Forward pass w/ inputs
	            z = model(batch)
	            # Store outputs
	            y_prob = F.softmax(z, dim=1).detach().cpu().numpy()
	            y_probs.extend(y_prob)
	    return np.vstack(y_probs)

	# ---------------------------------------------------------------------------------------------------------------------------

	def make_predictions(text, model, tokenizer, label_encoder, batch_size=2):
	    """Classify every sentence of an abstract.

	    The abstract is split into sentences, each sentence is preprocessed and
	    tokenized, and the model's most probable class per sentence is decoded
	    with ``label_encoder``.

	    Args:
	        text: The raw abstract.
	        model: Trained network, called through ``model_prediction``.
	        tokenizer: Object exposing ``texts_to_sequences(texts=...)``.
	        label_encoder: Object exposing ``decode(indices)``.
	        batch_size: Batch size handed to the dataset's
	            ``create_dataloader``; defaults to the previous fixed value 2.

	    Returns:
	        ``(abstract_lines, pred)``: the original (unprocessed) sentences and
	        the decoded labels from ``label_encoder.decode``. For an abstract
	        with no sentences, ``pred`` is an empty list.
	    """
	    # Get all lines separated from the abstract
	    abstract_lines = spacy_function(text)

	    # Nothing to classify: an empty frame would have no "text" column and
	    # the steps below would fail, so return early.
	    if not abstract_lines:
	        return abstract_lines, []

	    # Get total number of lines
	    total_lines_in_sample = len(abstract_lines)

	    # One feature dict per line; "total_lines" is stored as count - 1.
	    sample_lines = [
	        {
	            "text": str(line),
	            "line_number": i,
	            "total_lines": total_lines_in_sample - 1,
	        }
	        for i, line in enumerate(abstract_lines)
	    ]

	    # Convert the sample line list into a pandas DataFrame
	    df = pd.DataFrame(sample_lines)

	    # Get stopwords (the stemmer returned alongside is not used here)
	    STOPWORDS, _ = download_stopwords()

	    # Apply the preprocessing function to each line
	    df["text"] = df["text"].apply(lambda x: preprocess(x, STOPWORDS))

	    # Convert texts into numerical sequences
	    text_seq = tokenizer.texts_to_sequences(texts=df['text'])

	    # Create the dataset
	    dataset = CustomDataSetManager(text_seq=text_seq, line_num=df['line_number'], total_line=df['total_lines'])

	    # Create the dataloader
	    dataloader = dataset.create_dataloader(batch_size=batch_size)

	    # Set model into evaluation mode
	    model.eval()

	    # Get predictions
	    y_pred = model_prediction(model, dataloader)

	    # Convert predictions into label classes
	    pred = y_pred.argmax(axis=1)
	    pred = label_encoder.decode(pred)

	    return abstract_lines, pred