In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install seqeval

In [None]:
import pandas as pd
import re
from transformers import BertTokenizer, BertForTokenClassification, AdamW, BertTokenizerFast
from nltk.tokenize import sent_tokenize, word_tokenize
import torch.nn as nn
import torch
import tqdm

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook asks for: the 'punkt' tokenizer
# data and the stop-word lists.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words collected into a set (extend it here if needed).
stop_words = set(stopwords.words('english'))

In [None]:
# Excel sheet holding the annotated test sentences (read with pd.read_excel).
test_file = '/kaggle/input/miimansa/G1.xlsx'
# Saved state dict that is loaded into the token-classification model.
model_path = '/kaggle/input/ner_model/pytorch/default/1/model_weights1.pth'

In [None]:
# Load the annotated test set into a DataFrame.
df = pd.read_excel(io=test_file)

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# BIO tag -> integer class id for the four entity types.
label_map = {
    "O": 0,
    "B-treatment": 1,
    "I-treatment": 2,
    "B-chronic_disease": 3,
    "I-chronic_disease": 4,
    "B-cancer": 5,
    "I-cancer": 6,
    "B-allergy_name": 7,
    "I-allergy_name": 8,
}

# Size of the classification head and the fixed (padded/truncated) sequence length.
num_labels = len(label_map)
max_sent_len = 256

# Hyperparameters
batch_size = 16

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer: the fast variant is used because preprocess_data requests
# offset mappings (return_offsets_mapping=True).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Token-classification model with one output class per entry of label_map.
# NOTE: `model` is instantiated again in the cell that loads the saved weights.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_labels,
)

In [None]:
def preprocess_data(df):
    """Tokenize every annotated row of ``df`` and derive BIO label ids per token.

    Reads the ``'text'`` and ``'tags'`` columns. ``tags`` is a comma-separated
    list of ``start:end:label`` entries; ``start`` and ``end`` are each
    decremented by one before being compared with the tokenizer's character
    offsets (presumably the annotations are 1-based -- confirm against the
    data). Rows where either column is missing are skipped.

    Relies on the module-level ``tokenizer``, ``max_sent_len`` and
    ``label_map``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'text'`` and ``'tags'`` columns.

    Returns
    -------
    dict
        ``{"tokens": ..., "attention_mask": ..., "labels": ...}`` -- three
        parallel lists with one entry per kept row. Each entry is padded or
        truncated to ``max_sent_len``; ``labels`` holds ``label_map`` ids and
        defaults to ``'O'`` for every token not matched by an annotation.

    Raises
    ------
    ValueError
        If a tag entry does not split into exactly three ``:``-separated
        fields, or its offsets are not integers.
    KeyError
        If ``B-<label>`` / ``I-<label>`` is not a key of ``label_map``.
    """
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for _, row in df.iterrows():
        text = row['text']
        tags = row['tags']

        # Rows lacking either the annotations or the text are skipped entirely.
        if pd.isna(tags) or pd.isna(text):
            continue

        # Drop empty / whitespace-only fragments (e.g. left by a trailing comma).
        spans = [span for span in tags.split(',') if span.strip()]

        tokenized_input = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_sent_len,
            return_offsets_mapping=True,
        )
        input_ids = tokenized_input['input_ids']
        attention_mask = tokenized_input['attention_mask']
        offset_mapping = tokenized_input['offset_mapping']

        # Everything is "outside" until an annotation claims it.
        labels = ['O'] * len(input_ids)

        for span in spans:
            start_idx, end_idx, label = span.split(':')
            start_idx, end_idx = int(start_idx) - 1, int(end_idx) - 1
            # int() tolerates surrounding whitespace but the label_map lookup
            # does not, so normalise the label the same way.
            label = label.strip()

            entity_started = False
            for idx, (start, end) in enumerate(offset_mapping):
                # `end != 0` excludes tokens with a zero-length offset at 0
                # (presumably the special and padding tokens).
                if start_idx <= start < end_idx and end != 0:
                    if not entity_started:
                        labels[idx] = f"B-{label}"
                        entity_started = True
                    else:
                        labels[idx] = f"I-{label}"
                elif end < start_idx:
                    # Token ends before the annotated span begins.
                    entity_started = False

        all_input_ids.append(input_ids)
        all_attention_masks.append(attention_mask)
        all_labels.append([label_map[label] for label in labels])

    # Get processed data
    processed_data = {
        "tokens": all_input_ids,
        "attention_mask": all_attention_masks,
        "labels": all_labels
    }
    return processed_data

test_processed_data = preprocess_data(df)

In [None]:
# Reverse lookup (class id -> tag name); evaluation() uses it to decode ids.
id2label = {class_id: tag for tag, class_id in label_map.items()}

# Spot-check one preprocessed example: its word pieces next to its labels.
sample_idx = 4
input_ids = test_processed_data['tokens'][sample_idx]
labels = test_processed_data['labels'][sample_idx]

print("Tokens:", tokenizer.convert_ids_to_tokens(input_ids))
print("Labels:", labels)
print("Label names:", [id2label[label] for label in labels])

In [None]:
from torch.utils.data import DataLoader, Dataset

class NERDataset(Dataset):
    """Dataset over already-tokenized NER examples.

    Holds three parallel sequences (token ids, attention masks, label ids)
    and serves one example at a time as a dict of tensors.
    """

    def __init__(self, encodings, attention_masks, labels):
        """Store the three parallel sequences without copying them."""
        self.encodings = encodings
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the length of ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` with each field converted to a tensor."""
        return {
            'input_ids': torch.tensor(self.encodings[idx]),
            'labels': torch.tensor(self.labels[idx]),
            'attention_mask': torch.tensor(self.attention_masks[idx]),
        }
    
# Create the test dataset and dataloader
ner_dataset = NERDataset(test_processed_data["tokens"], test_processed_data["attention_mask"], test_processed_data["labels"])
# Evaluation does not benefit from shuffling; a fixed order keeps the
# collected references/predictions reproducible from run to run.
test_dataloader = DataLoader(ner_dataset, batch_size=batch_size, shuffle=False)

In [None]:
from sklearn.metrics import f1_score
import numpy as np
from seqeval.metrics import classification_report

def evaluation(test_dataloaders, model):
    """Run ``model`` over a dataloader and report entity-level F1 scores.

    Relies on the module-level ``device`` and ``id2label``.

    Parameters
    ----------
    test_dataloaders : iterable of dict
        Batches holding ``'input_ids'`` and ``'labels'`` tensors and,
        optionally, an ``'attention_mask'`` tensor.
    model
        Model whose forward output exposes ``.logits``; it is switched to
        eval mode here.

    Returns
    -------
    tuple
        ``(entity_f1_scores, weighted_avg_f1)`` where ``entity_f1_scores``
        maps each of the four entity types to its F1 (NaN when the report
        has no entry for that type) and ``weighted_avg_f1`` is the report's
        ``'weighted avg'`` F1.
    """
    model.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_dataloaders):
            input_ids = batch['input_ids'].to(device)

            # Pass the attention mask so padded positions are not attended to.
            # NOTE(review): confirm the checkpoint was fine-tuned with the
            # attention mask as well.
            mask = batch.get('attention_mask')
            attention_mask = mask.to(device) if mask is not None else None

            outputs = model(input_ids, attention_mask=attention_mask)
            # Get predictions by taking the argmax of the logits
            predictions = torch.argmax(outputs.logits, dim=-1).cpu()
            labels = batch['labels'].cpu()

            # Score only real positions: skip labels marked -100 and, when a
            # mask is available, every position the mask flags as padding.
            keep = labels != -100
            if mask is not None:
                keep &= mask.cpu().bool()

            for label_row, pred_row, keep_row in zip(labels, predictions, keep):
                y_true.append([id2label[l] for l in label_row[keep_row].tolist()])
                y_pred.append([id2label[p] for p in pred_row[keep_row].tolist()])

    print(classification_report(y_true, y_pred))
    print("*"*40)

    report = classification_report(y_true, y_pred, output_dict=True)

    # Extracting F1 scores for each entity type. A type can be missing from
    # the report (presumably when it occurs in neither the references nor the
    # predictions); record NaN for it instead of raising KeyError.
    entity_f1_scores = {}
    for label in ['treatment', 'chronic_disease', 'cancer', 'allergy_name']:
        entity_f1_scores[label] = report.get(label, {}).get('f1-score', float('nan'))

    weighted_avg_f1 = report['weighted avg']['f1-score']

    print("Entity-wise F1 scores:")
    for entity, score in entity_f1_scores.items():
        print(f"{entity}: {score:.4f}")
    print(f"Weighted Average F1 score: {weighted_avg_f1:.4f}")

    return (entity_f1_scores, weighted_avg_f1)

In [None]:
# Rebuild the architecture, then overwrite its parameters with the state dict
# stored at model_path.
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label_map))
# map_location keeps the load working on a CPU-only session even if the
# checkpoint was saved from a GPU.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)

T1_results = evaluation(test_dataloader, model)

In [None]:
def get_all_scores(results):
    """Collect several evaluation results into one DataFrame.

    Parameters
    ----------
    results : iterable
        Each element is indexable, with a ``{entity: score}`` mapping at
        position 0 and the weighted-average score at position 1.

    Returns
    -------
    pandas.DataFrame
        One row per result; a ``'Weighted Average'`` column followed by one
        column per entity in first-seen order.
    """
    columns = {'Weighted Average': []}
    for result in results:
        per_entity, weighted = result[0], result[1]
        for name, value in per_entity.items():
            columns.setdefault(name, []).append(value)
        columns['Weighted Average'].append(weighted)
    return pd.DataFrame(columns)

In [None]:
# One row per evaluated run, then flipped so each metric becomes a row.
scores_by_run = get_all_scores([T1_results])
all_scores_df = scores_by_run.T
all_scores_df.columns = ["Performance on the test set"]
all_scores_df

In [None]:
# Persist the score table (index included) to the working directory.
all_scores_df.to_csv(path_or_buf='all_scores_df.csv')