import ast
import string

import joblib
import nltk
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Build the lemmatizer and stemmer once at module level instead of
# re-instantiating them for every token inside word2features.
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()


class getsentence(object):
    '''
    Groups the token-per-row dataset back into sentences.
    Converts from BIO (token-per-row) format to sentences using their
    sentence numbers.
    '''

    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("sentence_num").apply(self._agg_func)
        self.sentences = [s for s in self.grouped]

    def _agg_func(self, s):
        # Pair each token with its POS tag: one (token, pos_tag) tuple
        # per word in the sentence.
        return [(w, p) for w, p in zip(s["token"].values.tolist(),
                                       s["pos_tag"].values.tolist())]
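
# A minimal sketch of what getsentence produces, using a made-up dataframe
# (the values here are illustrative, not from the dataset):
#     sentence_num  token      pos_tag
#     1             "Acme"     "NNP"
#     1             "settled"  "VBD"
#     2             "Courts"   "NNS"
# getsentence(df).sentences ->
#     [[("Acme", "NNP"), ("settled", "VBD")], [("Courts", "NNS")]]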


def word2features(sent, i):
    '''
    Extracts features for the word at position i in the sentence.
    The main features are:
    - word.lower(): the word in lowercase
    - word.isdigit(): whether the word is a digit
    - word.punct(): whether the word is punctuation
    - postag: the POS tag of the word
    - word.lemma(): the lemma of the word
    - word.stem(): the stem of the word
    A subset of these features is also extracted for the four previous
    and the four next words.
    '''
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isdigit()': word.isdigit(),
        'word.punct()': word in string.punctuation,
        'postag': postag,
        'word.lemma()': wordnet_lemmatizer.lemmatize(word),
        'word.stem()': porter_stemmer.stem(word),
    }
    # Context features for up to four preceding words; the sentence-initial
    # word is flagged with BOS instead.
    if i > 0:
        for offset in range(1, 5):
            if i - offset < 0:
                break
            prev_word, prev_postag = sent[i - offset]
            features.update({
                f'-{offset}:word.lower()': prev_word.lower(),
                f'-{offset}:word.isdigit()': prev_word.isdigit(),
                f'-{offset}:word.punct()': prev_word in string.punctuation,
                f'-{offset}:postag': prev_postag,
            })
    else:
        features['BOS'] = True

    # Context features for up to four following words; the sentence-final
    # word is flagged with EOS instead.
    if i < len(sent) - 1:
        for offset in range(1, 5):
            if i + offset >= len(sent):
                break
            next_word, next_postag = sent[i + offset]
            features.update({
                f'+{offset}:word.lower()': next_word.lower(),
                f'+{offset}:word.isdigit()': next_word.isdigit(),
                f'+{offset}:word.punct()': next_word in string.punctuation,
                f'+{offset}:postag': next_postag,
            })
    else:
        features['EOS'] = True

    return features
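
# For illustration, with a hypothetical sentence (not from the dataset):
#     sent = [("The", "DT"), ("fine", "NN"), ("was", "VBD"), ("$5", "CD")]
# word2features(sent, 0) would contain, among other keys:
#     {'bias': 1.0, 'word.lower()': 'the', 'word.isdigit()': False,
#      'word.punct()': False, 'postag': 'DT', 'BOS': True, ...}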


def sent2features(sent):
    '''
    Extracts the feature dicts for every word in the sentence.
    '''
    return [word2features(sent, i) for i in range(len(sent))]
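
# Usage sketch (the tokens here are illustrative): a sentence of n
# (token, pos_tag) pairs maps to a list of n feature dicts, the
# per-sentence input format consumed by crf.predict below.
#     feats = sent2features([("Acme", "NNP"), ("Inc.", "NNP")])
#     len(feats) == 2 and feats[0]['postag'] == 'NNP'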


print("Evaluating the model...")

df_eval = pd.read_excel("testset_NER_LegalLens.xlsx")
print("Read the evaluation dataset.")

# The "tokens" column stores each token list as a string, so parse it
# back into a real Python list, then POS-tag the tokens.
df_eval["tokens"] = df_eval["tokens"].apply(ast.literal_eval)
df_eval["pos_tags"] = df_eval["tokens"].apply(
    lambda x: [tag[1] for tag in pos_tag(x)])

# Flatten to one row per token, keeping the sentence number so that
# getsentence can regroup the tokens into sentences.
data_eval = []
for i in range(len(df_eval)):
    for j in range(len(df_eval["tokens"][i])):
        data_eval.append(
            {
                "sentence_num": i + 1,
                "id": df_eval["id"][i],
                "token": df_eval["tokens"][i][j],
                "pos_tag": df_eval["pos_tags"][i][j],
            }
        )
data_eval = pd.DataFrame(data_eval)
print("Dataframe created.")

getter = getsentence(data_eval)
sentences_eval = getter.sentences
X_eval = [sent2features(s) for s in sentences_eval]
print("Predicting the NER tags...")

# Load the trained CRF model (presumably an sklearn_crfsuite.CRF, given
# the list-of-dicts feature format) and predict one tag sequence per
# sentence.
crf = joblib.load("../models/crf.pkl")
y_pred_eval = crf.predict(X_eval)
print("NER tags predicted.")
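
# Sanity check: groupby("sentence_num") yields one sentence per dataframe
# row (sentence_num was assigned as i + 1 above), so the CRF should return
# exactly one tag sequence per row before they are written back.
assert len(y_pred_eval) == len(df_eval)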

df_eval["ner_tags"] = y_pred_eval
df_eval.drop(columns=["pos_tags"], inplace=True)
print("Saving the predictions...")
df_eval.to_csv("predictions_NERLens.csv", index=False)
print("Predictions saved.")