rahulkiitk commited on
Commit
320f492
·
verified ·
1 Parent(s): 5d3a408

Upload 10 files

Browse files
Assignment.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Evaluation.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":9661596,"sourceType":"datasetVersion","datasetId":5902909},{"sourceId":140348,"sourceType":"modelInstanceVersion","isSourceIdPinned":true,"modelInstanceId":118867,"modelId":142118}],"dockerImageVersionId":30787,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# NER evaluation\n\nLoads a fine-tuned `BertForTokenClassification` checkpoint and reports entity-level F1 scores (seqeval) on an annotated test file.\n\nSet `test_file` and `model_path` in the configuration cell, then run all cells from a fresh kernel.","metadata":{}},{"cell_type":"code","source":"%pip install -q seqeval==1.2.2","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import pandas as pd\nimport torch\nfrom seqeval.metrics import classification_report\nfrom torch.utils.data import DataLoader, Dataset\nfrom tqdm.auto import tqdm\nfrom transformers import BertForTokenClassification, BertTokenizerFast","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Configuration","metadata":{}},{"cell_type":"code","source":"# Paths: point these at the annotated test file and the fine-tuned weights.\ntest_file = \"/kaggle/input/miimansa/G1.xlsx\"\nmodel_path = \"/kaggle/input/ner_model/pytorch/default/1/model_weights1.pth\"\n\n# Label name -> label id\nlabel_map = {\n    \"O\": 0,\n    \"B-treatment\": 1, \"I-treatment\": 2,\n    \"B-chronic_disease\": 3, \"I-chronic_disease\": 4,\n    \"B-cancer\": 5, \"I-cancer\": 6,\n    \"B-allergy_name\": 7, \"I-allergy_name\": 8\n}\n# Label id -> label name (used to decode gold labels and predictions)\nid2label = {v: k for k, v in label_map.items()}\nnum_labels = len(label_map)\n\n# Label id given to [CLS]/[SEP]/[PAD] positions so they are excluded from scoring\nIGNORE_LABEL_ID = -100\n\nmax_sent_len = 256\nbatch_size = 16\n\n# Use the GPU when available, else the CPU\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n\ntokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Load and preprocess the test data","metadata":{}},{"cell_type":"code","source":"# Rows with any missing value are dropped; reading and filtering in one step keeps the cell re-runnable.\ndf = pd.read_excel(test_file).dropna()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def preprocess_data(df):\n    \"\"\"Tokenize every row and align its character-span annotations to token labels.\n\n    Expects the columns 'text' and 'tags', where 'tags' is a comma-separated list of\n    'start:end:label' items. Returns a dict with the keys 'tokens', 'attention_mask'\n    and 'labels'; each holds one list of length max_sent_len per processed row.\n    Special and padding tokens get IGNORE_LABEL_ID so they are left out of the metrics.\n    Raises KeyError if a tag uses a label that is not in label_map.\n    \"\"\"\n    all_input_ids = []\n    all_attention_masks = []\n    all_labels = []\n\n    for _, row in df.iterrows():\n        text = row['text']\n        tags = row['tags']\n        if pd.isna(tags) or pd.isna(text):\n            continue\n\n        # Drop empty items left behind by stray commas.\n        spans = [item for item in tags.split(',') if item.strip()]\n\n        encoded = tokenizer(text, truncation=True, padding='max_length',\n                            max_length=max_sent_len, return_offsets_mapping=True)\n        offsets = encoded['offset_mapping']\n\n        # Offsets ending at 0 belong to [CLS]/[SEP]/[PAD]; every real token starts as 'O'.\n        label_ids = [IGNORE_LABEL_ID if end == 0 else label_map['O'] for _, end in offsets]\n\n        for span in spans:\n            start_idx, end_idx, label = (part.strip() for part in span.split(':'))\n            # Both annotation offsets are shifted down by one before being compared with the\n            # tokenizer's character offsets (assumes 1-based offsets in the file -- TODO confirm).\n            start_idx, end_idx = int(start_idx) - 1, int(end_idx) - 1\n\n            entity_started = False\n            for idx, (start, end) in enumerate(offsets):\n                if end != 0 and start_idx <= start < end_idx:\n                    # First token inside the span is B-, the following ones are I-.\n                    prefix = 'I' if entity_started else 'B'\n                    label_ids[idx] = label_map[f'{prefix}-{label}']\n                    entity_started = True\n\n        all_input_ids.append(encoded['input_ids'])\n        all_attention_masks.append(encoded['attention_mask'])\n        all_labels.append(label_ids)\n\n    return {\n        'tokens': all_input_ids,\n        'attention_mask': all_attention_masks,\n        'labels': all_labels\n    }","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_processed_data = preprocess_data(df)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Show one encoded example (row 4 when the set is large enough)\nsample_idx = min(4, len(test_processed_data['tokens']) - 1)\n\nsample_input_ids = test_processed_data['tokens'][sample_idx]\nprint('Tokens:', tokenizer.convert_ids_to_tokens(sample_input_ids))\n\nsample_labels = test_processed_data['labels'][sample_idx]\nprint('Labels:', sample_labels)\nprint('Label names:', [id2label.get(label, 'IGNORED') for label in sample_labels])","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class NERDataset(Dataset):\n    \"\"\"Serves pre-tokenized input ids, attention masks and label ids as tensors.\"\"\"\n\n    def __init__(self, encodings, attention_masks, labels):\n        self.encodings = encodings\n        self.attention_masks = attention_masks\n        self.labels = labels\n\n    def __getitem__(self, idx):\n        return {\n            'input_ids': torch.tensor(self.encodings[idx]),\n            'labels': torch.tensor(self.labels[idx]),\n            'attention_mask': torch.tensor(self.attention_masks[idx])\n        }\n\n    def __len__(self):\n        return len(self.labels)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Evaluation needs no shuffling; a fixed order keeps runs comparable.\nner_dataset = NERDataset(test_processed_data['tokens'], test_processed_data['attention_mask'], test_processed_data['labels'])\ntest_dataloader = DataLoader(ner_dataset, batch_size=batch_size, shuffle=False)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Evaluate","metadata":{}},{"cell_type":"code","source":"def evaluation(test_dataloaders, model):\n    \"\"\"Run the model over the dataloader and report entity-level F1 with seqeval.\n\n    Prints the seqeval classification report and the per-entity scores.\n    Returns (entity_f1_scores, weighted_avg_f1); entity_f1_scores maps each entity\n    type to its F1, or 0.0 when the type is missing from the seqeval report.\n    \"\"\"\n    model.eval()\n\n    y_true = []\n    y_pred = []\n\n    with torch.no_grad():\n        for batch in tqdm(test_dataloaders):\n            input_ids = batch['input_ids'].to(device)\n            attention_mask = batch['attention_mask'].to(device)\n\n            # Sequences are padded to max_length, so the mask must be passed to keep\n            # padding out of self-attention.\n            outputs = model(input_ids, attention_mask=attention_mask)\n            # Predicted label id per token = argmax over the logits\n            predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()\n            labels = batch['labels'].numpy()\n\n            for label, pred in zip(labels, predictions):\n                # Score only real tokens; special/padding positions carry IGNORE_LABEL_ID.\n                keep = label != IGNORE_LABEL_ID\n                y_true.append([id2label[l] for l in label[keep].tolist()])\n                y_pred.append([id2label[p] for p in pred[keep].tolist()])\n\n    print(classification_report(y_true, y_pred))\n    print('*' * 40)\n\n    report = classification_report(y_true, y_pred, output_dict=True)\n\n    # F1 per entity type; a type absent from the report is scored 0.0 instead of raising KeyError.\n    entity_f1_scores = {\n        name: report.get(name, {}).get('f1-score', 0.0)\n        for name in ['treatment', 'chronic_disease', 'cancer', 'allergy_name']\n    }\n    weighted_avg_f1 = report['weighted avg']['f1-score']\n\n    print('Entity-wise F1 scores:')\n    for entity, score in entity_f1_scores.items():\n        print(f'{entity}: {score:.4f}')\n    print(f'Weighted Average F1 score: {weighted_avg_f1:.4f}')\n\n    return (entity_f1_scores, weighted_avg_f1)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)\n# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.\n# NOTE: torch.load unpickles the file -- only load checkpoints you trust.\nmodel.load_state_dict(torch.load(model_path, map_location=device))\nmodel.to(device)\n\nT1_results = evaluation(test_dataloader, model)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def get_all_scores(results):\n    \"\"\"Collect (entity_f1_scores, weighted_avg_f1) tuples into a DataFrame, one row per result.\"\"\"\n    score_dict = {'Weighted Average': []}\n    for entity_scores, weighted_avg in results:\n        for entity, score in entity_scores.items():\n            score_dict.setdefault(entity, []).append(score)\n        score_dict['Weighted Average'].append(weighted_avg)\n    return pd.DataFrame(score_dict)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"all_scores_df = get_all_scores([T1_results]).T\nall_scores_df.columns = [\"Performance on the test set\"]\nall_scores_df","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"all_scores_df.to_csv('all_scores_df.csv')","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}
README.md CHANGED
@@ -10,4 +10,10 @@ pinned: false
10
  short_description: Continual Learning Task for Named Entity Recognition (NER)
11
  ---
12
 
 
 
 
 
 
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
10
  short_description: Continual Learning Task for Named Entity Recognition (NER)
11
  ---
12
 
13
+
14
+ To train the model use -> Assignment.ipynb
15
+ To evaluate the model, use -> Evaluation.ipynb
16
+
17
+ Just set the data file name and the model path at the beginning of the notebook.
18
+
19
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Report.pdf ADDED
Binary file (88.9 kB). View file
 
results/.~lock.all_scores_df.csv# ADDED
@@ -0,0 +1 @@
 
 
1
+ ,rahul,rahul-HP,19.10.2024 17:20,file:///home/rahul/.config/libreoffice/4;
results/all_scores_df.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ,Performance on the test set of T1,Performance on the test set of T1 and T2.,"Performance on the test set of T1, T2 and T3.",Performance on combined G1+G2+G3
2
+ Weighted Average,0.5067888144317206,0.6141782856955111,0.6169180510853507,0.5598100308567419
3
+ treatment,0.5415005436752447,0.6139972554401097,0.6263488080301128,0.5325735407361775
4
+ chronic_disease,0.46567164179104475,0.631209818819404,0.6110091743119266,0.5763137254901961
5
+ cancer,0.5764854614412137,0.5851703406813628,0.6100746268656717,0.6278755074424899
6
+ allergy_name,0.029850746268656716,0.47000000000000003,0.580281690140845,0.3860759493670886
results/model_weights1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75155e2529c2aad289060a48d9d3362131c33b8d2b519ef7328a42df3251527
3
+ size 430988657
results/model_weights2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:913451b2ee7afa008a837ee5bbaacb6c42351586e57880766dd9e9a87b6895b5
3
+ size 430988657
results/model_weights3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da7f365cdd7bb975dbb898f9d4416e4bb95ad084d39f0171597a70bc8fd3ca9d
3
+ size 430988657
results/model_weights4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0cf63471c01edc5aa5080f5b074e7ce9374aa0bea17a8f982e4cbc8ad8101a5
3
+ size 430988657