"""
Prediction script for a CRF-based legal NER (token classification) model,
built with scikit-learn. Language: English.
"""
import ast
import pandas as pd
import joblib
import nltk
from nltk import pos_tag
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Download the required NLTK resources (skipped if already up to date)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
class getsentence(object):
    '''
    Groups the token-per-row dataframe back into sentences.
    Each sentence becomes a list of (token, pos_tag) tuples, grouped
    by its sentence number.
    '''
    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("sentence_num").apply(self._agg_func)
        self.sentences = [s for s in self.grouped]

    def _agg_func(self, s):
        return [(w, p) for w, p in zip(s["token"].values.tolist(),
                                       s["pos_tag"].values.tolist())]
def word2features(sent, i):
    '''
    Extracts features for the word at position i in the sentence.
    The main features are:
    - word.lower(): the word in lowercase
    - word.isdigit(): whether the word is a digit
    - word.punct(): whether the word is a punctuation character
    - postag: the POS tag of the word
    - word.lemma(): the lemma of the word
    - word.stem(): the stem of the word
    A subset of these features is also extracted for the four previous
    and four next words.
    '''
    # Instantiated on every call; hoisting these to module scope would
    # avoid re-creating them for each token.
    wordnet_lemmatizer = WordNetLemmatizer()
    porter_stemmer = PorterStemmer()
    word = sent[i][0]
    postag = sent[i][1]
features = {
'bias': 1.0,
'word.lower()': word.lower(),
'word.isdigit()': word.isdigit(),
        # Whether the word is a single punctuation character
        'word.punct()': word in string.punctuation,
'postag': postag,
# Lemma of the word
'word.lemma()': wordnet_lemmatizer.lemmatize(word),
# Stem of the word
'word.stem()': porter_stemmer.stem(word)
}
if i > 0:
word1 = sent[i-1][0]
postag1 = sent[i-1][1]
features.update({
'-1:word.lower()': word1.lower(),
'-1:word.isdigit()': word1.isdigit(),
'-1:word.punct()': word1 in string.punctuation,
'-1:postag': postag1
})
if i - 2 >= 0:
features.update({
'-2:word.lower()': sent[i-2][0].lower(),
'-2:word.isdigit()': sent[i-2][0].isdigit(),
'-2:word.punct()': sent[i-2][0] in string.punctuation,
'-2:postag': sent[i-2][1]
})
if i - 3 >= 0:
features.update({
'-3:word.lower()': sent[i-3][0].lower(),
'-3:word.isdigit()': sent[i-3][0].isdigit(),
'-3:word.punct()': sent[i-3][0] in string.punctuation,
'-3:postag': sent[i-3][1]
})
if i - 4 >= 0:
features.update({
'-4:word.lower()': sent[i-4][0].lower(),
'-4:word.isdigit()': sent[i-4][0].isdigit(),
'-4:word.punct()': sent[i-4][0] in string.punctuation,
'-4:postag': sent[i-4][1]
})
else:
features['BOS'] = True
if i < len(sent)-1:
word1 = sent[i+1][0]
postag1 = sent[i+1][1]
features.update({
'+1:word.lower()': word1.lower(),
'+1:word.isdigit()': word1.isdigit(),
'+1:word.punct()': word1 in string.punctuation,
'+1:postag': postag1
})
if i + 2 < len(sent):
features.update({
'+2:word.lower()': sent[i+2][0].lower(),
'+2:word.isdigit()': sent[i+2][0].isdigit(),
'+2:word.punct()': sent[i+2][0] in string.punctuation,
'+2:postag': sent[i+2][1]
})
if i + 3 < len(sent):
features.update({
'+3:word.lower()': sent[i+3][0].lower(),
'+3:word.isdigit()': sent[i+3][0].isdigit(),
'+3:word.punct()': sent[i+3][0] in string.punctuation,
'+3:postag': sent[i+3][1]
})
if i + 4 < len(sent):
features.update({
'+4:word.lower()': sent[i+4][0].lower(),
'+4:word.isdigit()': sent[i+4][0].isdigit(),
'+4:word.punct()': sent[i+4][0] in string.punctuation,
'+4:postag': sent[i+4][1]
})
else:
features['EOS'] = True
return features
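# A minimal illustration with a hypothetical two-token sentence:
#   word2features([("The", "DT"), ("court", "NN")], 0)
# returns a dict including entries such as
#   {'bias': 1.0, 'word.lower()': 'the', 'word.isdigit()': False,
#    'word.punct()': False, 'postag': 'DT', ..., 'BOS': True}
# plus the '+1:' context features for "court".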
def sent2features(sent):
    '''
    Extracts features for every word position in the sentence.
    '''
    return [word2features(sent, i) for i in range(len(sent))]
print("Evaluating the model...")
# Load file from your directory
df_eval = pd.read_excel("testset_NER_LegalLens.xlsx")
print("Read the evaluation dataset.")
df_eval["tokens"] = df_eval["tokens"].apply(ast.literal_eval)
df_eval['pos_tags'] = df_eval['tokens'].apply(lambda x: [tag[1]
for tag in pos_tag(x)])
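# pos_tag takes a token list and returns (token, tag) pairs, e.g.
#   pos_tag(["The", "court"]) -> [('The', 'DT'), ('court', 'NN')]
# so the lambda above keeps only the tag of each pair.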
data_eval = []
for i in range(len(df_eval)):
for j in range(len(df_eval["tokens"][i])):
data_eval.append(
{
"sentence_num": i+1,
"id": df_eval["id"][i],
"token": df_eval["tokens"][i][j],
"pos_tag": df_eval["pos_tags"][i][j],
}
)
data_eval = pd.DataFrame(data_eval)
print("Dataframe created.")
getter = getsentence(data_eval)
sentences_eval = getter.sentences
X_eval = [sent2features(s) for s in sentences_eval]
print("Predicting the NER tags...")
# Load model from your direction
crf = joblib.load("../models/crf.pkl")
y_pred_eval = crf.predict(X_eval)
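# crf.predict returns one list of NER tags per sentence, aligned with the
# token lists in df_eval["tokens"], so it can be stored as a column below.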
print("NER tags predicted.")
df_eval["ner_tags"] = y_pred_eval
df_eval.drop(columns=["pos_tags"], inplace=True)
print("Saving the predictions...")
df_eval.to_csv("predictions_NERLens.csv", index=False)
print("Predictions saved.")