Spaces: Build error

BecomeAllan committed • Commit 6b71499 • Parent(s): 4a0a4f7

update

Browse files:
- .vscode/settings.json +7 -0
- ML-SLRC/Info.json +1 -0
- ML-SLRC/ML_SLRC.py +574 -0
- ML-SLRC/Util_funs.py +740 -0
- ML-SLRC/__init__.py +4 -0
- ML-SLRC/model.pt +3 -0
- app.py +0 -1
.vscode/settings.json
ADDED
@@ -0,0 +1,7 @@
{
    "workbench.colorCustomizations": {
        "activityBar.background": "#590F35",
        "titleBar.activeBackground": "#7C154B",
        "titleBar.activeForeground": "#FEFCFD"
    }
}
ML-SLRC/Info.json
ADDED
@@ -0,0 +1 @@
{"inner_print": 2, "bert_layers": 4, "max_seq_length": 512, "meta_epoch": 20, "k_spt": 8, "k_qry": 8, "outer_batch_size": 5, "inner_batch_size": 4, "outer_update_lr": 5e-05, "inner_update_lr": 5e-05, "inner_update_step": 4, "inner_update_step_eval": 4, "num_task_train": 20, "pos_weight": 1.5, "tresh": 0.9, "model": "allenai/scibert_scivocab_uncased"}
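
These hyperparameters are consumed as a plain dict by the training utilities added below. A minimal sketch of how the file might be loaded and paired with its tokenizer; the variable names and the use of the transformers Auto classes are illustrative assumptions, not part of this commit:

import json
from transformers import AutoTokenizer, AutoModel  # assumed available in the Space

with open("ML-SLRC/Info.json") as fp:
    Info = json.load(fp)

# "model" names the pre-trained checkpoint (SciBERT); the tokenizer is typically
# added to the same dict before it is handed to the data/meta-learning helpers.
Info["tokenizer"] = AutoTokenizer.from_pretrained(Info["model"])
backbone = AutoModel.from_pretrained(Info["model"])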
ML-SLRC/ML_SLRC.py
ADDED
@@ -0,0 +1,574 @@
from torch import nn
import torch
import numpy as np
from copy import deepcopy
import re
import unicodedata
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler
from sklearn.model_selection import train_test_split
from torch.optim import Adam
import gc
from torchmetrics import functional as fn
import random


# Pre-trained model
class Encoder(nn.Module):
    def __init__(self, layers, freeze_bert, model):
        super(Encoder, self).__init__()

        # Dummy Parameter
        self.dummy_param = nn.Parameter(torch.empty(0))

        # Pre-trained model
        self.model = deepcopy(model)

        # Freezing bert parameters
        if freeze_bert:
            for param in self.model.parameters():
                param.requires_grad = False

        # Selecting hidden layers of the pre-trained model
        old_model_encoder = self.model.encoder.layer
        new_model_encoder = nn.ModuleList()

        for i in layers:
            new_model_encoder.append(old_model_encoder[i])

        self.model.encoder.layer = new_model_encoder

    # Feed forward
    def forward(self, **x):
        return self.model(**x)['pooler_output']

# Complete model
class SLR_Classifier(nn.Module):
    def __init__(self, **data):
        super(SLR_Classifier, self).__init__()

        # Dummy Parameter
        self.dummy_param = nn.Parameter(torch.empty(0))

        # Loss function
        # Binary Cross Entropy with logits reduced to mean
        self.loss_fn = nn.BCEWithLogitsLoss(reduction='mean',
                                            pos_weight=torch.FloatTensor([data.get("pos_weight", 2.5)]))

        # Pre-trained model
        self.Encoder = Encoder(layers=data.get("bert_layers", range(12)),
                               freeze_bert=data.get("freeze_bert", False),
                               model=data.get("model"),
                               )

        # Feature Map Layer
        self.feature_map = nn.Sequential(
            # nn.LayerNorm(self.Encoder.model.config.hidden_size),
            nn.BatchNorm1d(self.Encoder.model.config.hidden_size),
            # nn.Dropout(data.get("drop", 0.5)),
            nn.Linear(self.Encoder.model.config.hidden_size, 200),
            nn.Dropout(data.get("drop", 0.5)),
        )

        # Classifier Layer
        self.classifier = nn.Sequential(
            # nn.LayerNorm(self.Encoder.model.config.hidden_size),
            # nn.Dropout(data.get("drop", 0.5)),
            # nn.BatchNorm1d(self.Encoder.model.config.hidden_size),
            # nn.Dropout(data.get("drop", 0.5)),
            nn.Tanh(),
            nn.Linear(200, 1)
        )

        # Initializing layer parameters
        nn.init.normal_(self.feature_map[1].weight, mean=0, std=0.00001)
        nn.init.zeros_(self.feature_map[1].bias)

    # Feed forward
    def forward(self, input_ids, attention_mask, token_type_ids, labels):

        predict = self.Encoder(**{"input_ids": input_ids,
                                  "attention_mask": attention_mask,
                                  "token_type_ids": token_type_ids})
        feature = self.feature_map(predict)
        logit = self.classifier(feature)

        predict = torch.sigmoid(logit)

        # Loss function
        loss = self.loss_fn(logit.to(torch.float), labels.to(torch.float).unsqueeze(1))

        return [loss, [feature, logit], predict]

# Undesirable patterns within texts
patterns = {
    'CONCLUSIONS AND IMPLICATIONS': '',
    'BACKGROUND AND PURPOSE': '',
    'EXPERIMENTAL APPROACH': '',
    'KEY RESULTS AEA': '',
    '©': '',
    '®': '',
    'μ': '',
    '(C)': '',
    'OBJECTIVE:': '',
    'MATERIALS AND METHODS:': '',
    'SIGNIFICANCE:': '',
    'BACKGROUND:': '',
    'RESULTS:': '',
    'METHODS:': '',
    'CONCLUSIONS:': '',
    'AIM:': '',
    'STUDY DESIGN:': '',
    'CLINICAL RELEVANCE:': '',
    'CONCLUSION:': '',
    'HYPOTHESIS:': '',
    'Questions/Purposes:': '',
    'Introduction:': '',
    'PURPOSE:': '',
    'PATIENTS AND METHODS:': '',
    'FINDINGS:': '',
    'INTERPRETATIONS:': '',
    'FUNDING:': '',
    'PROGRESS:': '',
    'CONTEXT:': '',
    'MEASURES:': '',
    'DESIGN:': '',
    'BACKGROUND AND OBJECTIVES:': '',
    '<p>': '',
    '</p>': '',
    '<<ETX>>': '',
    '+/-': '',
    r'\(.+\)': '',
    r'\[.+\]': '',
    r' \d ': '',
    '<': '',
    '>': '',
    '- ': '',
    ' +': ' ',
    ', ,': ',',
    ',,': ',',
    '%': ' percent',
    'per cent': ' percent'
}

patterns = {x.lower(): y for x, y in patterns.items()}


LABEL_MAP = {'negative': 0,
             'not included': 0,
             '0': 0,
             0: 0,
             'excluded': 0,
             'positive': 1,
             'included': 1,
             '1': 1,
             1: 1,
             }

class SLR_DataSet(Dataset):
    def __init__(self, treat_text=None, **args):
        self.tokenizer = args.get('tokenizer')
        self.data = args.get('data')
        self.max_seq_length = args.get("max_seq_length", 512)
        self.INPUT_NAME = args.get("input", 'x')
        self.LABEL_NAME = args.get("output", 'y')
        self.treat_text = treat_text

    # Tokenizing and processing text
    def encode_text(self, example):
        comment_text = example[self.INPUT_NAME]
        if self.treat_text:
            comment_text = self.treat_text(comment_text)

        try:
            labels = LABEL_MAP[example[self.LABEL_NAME].lower()]
        except:
            labels = -1

        encoding = self.tokenizer.encode_plus(
            (comment_text, "It is great text"),
            add_special_tokens=True,
            max_length=self.max_seq_length,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return tuple((
            encoding["input_ids"].flatten(),
            encoding["attention_mask"].flatten(),
            encoding["token_type_ids"].flatten(),
            torch.tensor([torch.tensor(labels).to(int)])
        ))

    def __len__(self):
        return len(self.data)

    # Returning data
    def __getitem__(self, index: int):
        data_row = self.data.reset_index().iloc[index]
        temp_data = self.encode_text(data_row)
        return temp_data


class Learner(nn.Module):

    def __init__(self, **args):
        """
        :param args: training hyperparameters (see Info.json)
        """
        super(Learner, self).__init__()

        self.inner_print = args.get('inner_print')
        self.inner_batch_size = args.get('inner_batch_size')
        self.outer_update_lr = args.get('outer_update_lr')
        self.inner_update_lr = args.get('inner_update_lr')
        self.inner_update_step = args.get('inner_update_step')
        self.inner_update_step_eval = args.get('inner_update_step_eval')
        self.model = args.get('model')
        self.device = args.get('device')

        # Outer optimizer
        self.outer_optimizer = Adam(self.model.parameters(), lr=self.outer_update_lr)
        self.model.train()

    def forward(self, batch_tasks, training=True, valid_train=True):
        """
        batch = [(support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset)]

        # support = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        """
        task_accs = []
        task_f1 = []
        task_recall = []
        sum_gradients = []
        num_task = len(batch_tasks)
        num_inner_update_step = self.inner_update_step if training else self.inner_update_step_eval

        # Outer loop tasks
        for task_id, task in enumerate(batch_tasks):
            support = task[0]
            query = task[1]
            name = task[2]

            # Copying model
            fast_model = deepcopy(self.model)
            fast_model.to(self.device)

            # Inner trainer optimizer
            inner_optimizer = Adam(fast_model.parameters(), lr=self.inner_update_lr)

            # Creating training data loaders
            if len(support) % self.inner_batch_size == 1:
                support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                                batch_size=self.inner_batch_size,
                                                drop_last=True)
            else:
                support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                                batch_size=self.inner_batch_size,
                                                drop_last=False)

            # steps_per_epoch = len(support) // self.inner_batch_size
            # total_training_steps = steps_per_epoch * 5
            # warmup_steps = total_training_steps // 3

            # scheduler = get_linear_schedule_with_warmup(
            #     inner_optimizer,
            #     num_warmup_steps=warmup_steps,
            #     num_training_steps=total_training_steps
            # )

            fast_model.train()

            # Inner loop training epoch (support set)
            if valid_train:
                print('----Task', task_id, ":", name, '----')

            for i in range(0, num_inner_update_step):
                all_loss = []

                # Inner loop training batch (support set)
                for inner_step, batch in enumerate(support_dataloader):
                    batch = tuple(t.to(self.device) for t in batch)
                    input_ids, attention_mask, token_type_ids, label_id = batch

                    # Feed forward
                    loss, _, _ = fast_model(input_ids, attention_mask, token_type_ids=token_type_ids, labels=label_id)

                    # Computing gradients
                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(fast_model.parameters(), max_norm=1)

                    # Updating inner training parameters
                    inner_optimizer.step()
                    inner_optimizer.zero_grad()

                    # Appending losses
                    all_loss.append(loss.item())

                    del batch, input_ids, attention_mask, label_id
                    torch.cuda.empty_cache()

                if valid_train:
                    if (i + 1) % self.inner_print == 0:
                        print("Inner Loss: ", np.mean(all_loss))

            fast_model.to(torch.device('cpu'))

            # Inner training phase weights
            if training:
                meta_weights = list(self.model.parameters())
                fast_weights = list(fast_model.parameters())

                # Appending gradients
                gradients = []
                for i, (meta_params, fast_params) in enumerate(zip(meta_weights, fast_weights)):
                    gradient = meta_params - fast_params
                    if task_id == 0:
                        sum_gradients.append(gradient)
                    else:
                        sum_gradients[i] += gradient

            # Inner test (query set)
            fast_model.to(self.device)
            fast_model.eval()

            if valid_train:
                with torch.no_grad():
                    # Data loader
                    query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
                    query_batch = next(iter(query_dataloader))
                    query_batch = tuple(t.to(self.device) for t in query_batch)
                    q_input_ids, q_attention_mask, q_token_type_ids, q_label_id = query_batch

                    # Feed forward
                    _, _, pre_label_id = fast_model(q_input_ids, q_attention_mask, q_token_type_ids, labels=q_label_id)

                    # Predictions
                    pre_label_id = pre_label_id.detach().cpu().squeeze()
                    # Labels
                    q_label_id = q_label_id.detach().cpu()

                    # Calculating metrics
                    acc = fn.accuracy(pre_label_id, q_label_id).item()
                    recall = fn.recall(pre_label_id, q_label_id).item()
                    f1 = fn.f1_score(pre_label_id, q_label_id).item()

                    # Appending metrics
                    task_accs.append(acc)
                    task_f1.append(f1)
                    task_recall.append(recall)

                fast_model.to(torch.device('cpu'))

            del fast_model, inner_optimizer
            torch.cuda.empty_cache()

        print("\n")
        print("f1:", np.mean(task_f1))
        print("recall:", np.mean(task_recall))

        # Updating outer training parameters
        if training:
            # Mean of gradients
            for i in range(0, len(sum_gradients)):
                sum_gradients[i] = sum_gradients[i] / float(num_task)

            # Indexing parameters to model
            for i, params in enumerate(self.model.parameters()):
                params.grad = sum_gradients[i]

            # Updating parameters
            self.outer_optimizer.step()
            self.outer_optimizer.zero_grad()

            del sum_gradients
            gc.collect()
            torch.cuda.empty_cache()

        if valid_train:
            return np.mean(task_accs)
        else:
            return np.array(0)


# Creating Meta Tasks
class MetaTask(Dataset):
    def __init__(self, examples, num_task, k_support, k_query,
                 tokenizer, training=True, max_seq_length=512,
                 treat_text=None, **args):
        """
        :param examples: DataFrame of samples with text/label/domain columns
        :param num_task: number of training tasks
        :param k_support: number of support samples per class, per task
        :param k_query: number of query samples per class, per task
        """
        self.examples = examples

        self.num_task = num_task
        self.k_support = k_support
        self.k_query = k_query
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.treat_text = treat_text

        # Randomly generating tasks
        self.create_batch(self.num_task, training)

    # Creating batch
    def create_batch(self, num_task, training):
        self.supports = []  # support set
        self.queries = []  # query set
        self.task_names = []  # name of task
        self.supports_indexs = []  # index of supports
        self.queries_indexs = []  # index of queries
        self.num_task = num_task

        # Available tasks
        domains = self.examples['domain'].unique()

        # If not training, create all tasks
        if not(training):
            self.task_names = domains
            num_task = len(self.task_names)
            self.num_task = num_task

        for b in range(num_task):  # For each task,
            total_per_class = self.k_support + self.k_query
            task_size = 2 * self.k_support + 2 * self.k_query

            # Select a task at random
            if training:
                domain = random.choice(domains)
                self.task_names.append(domain)
            else:
                domain = self.task_names[b]

            # Task data
            domainExamples = self.examples[self.examples['domain'] == domain]

            # Minimal label quantity
            min_per_class = min(domainExamples['label'].value_counts())

            if total_per_class > min_per_class:
                total_per_class = min_per_class

            # Select k_support + k_query task examples
            # Sample (n) from each label (class)
            selected_examples = domainExamples.groupby("label").sample(total_per_class, replace=False)

            # Split data into support (training) and query (testing) sets
            s, q = train_test_split(selected_examples,
                                    stratify=selected_examples["label"],
                                    test_size=2 * self.k_query / task_size,
                                    shuffle=True)

            # Shuffling data
            s = s.sample(frac=1)
            q = q.sample(frac=1)

            # Appending indexes
            if not(training):
                self.supports_indexs.append(s.index)
                self.queries_indexs.append(q.index)

            # Creating list of support (training) and query (testing) tasks
            self.supports.append(s.to_dict('records'))
            self.queries.append(q.to_dict('records'))

    # Creating task tensors
    def create_feature_set(self, examples):
        all_input_ids = torch.empty(len(examples), self.max_seq_length, dtype=torch.long)
        all_attention_mask = torch.empty(len(examples), self.max_seq_length, dtype=torch.long)
        all_token_type_ids = torch.empty(len(examples), self.max_seq_length, dtype=torch.long)
        all_label_ids = torch.empty(len(examples), dtype=torch.long)

        for _id, e in enumerate(examples):
            all_input_ids[_id], all_attention_mask[_id], all_token_type_ids[_id], all_label_ids[_id] = self.encode_text(e)

        return TensorDataset(
            all_input_ids,
            all_attention_mask,
            all_token_type_ids,
            all_label_ids
        )

    # Data encoding
    def encode_text(self, example):
        comment_text = example["text"]

        if self.treat_text:
            comment_text = self.treat_text(comment_text)

        labels = LABEL_MAP[example["label"]]

        encoding = self.tokenizer.encode_plus(
            (comment_text, "It is a great text."),
            add_special_tokens=True,
            max_length=self.max_seq_length,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return tuple((
            encoding["input_ids"].flatten(),
            encoding["attention_mask"].flatten(),
            encoding["token_type_ids"].flatten(),
            torch.tensor([torch.tensor(labels).to(int)])
        ))

    # Returns data upon calling
    def __getitem__(self, index):
        support_set = self.create_feature_set(self.supports[index])
        query_set = self.create_feature_set(self.queries[index])
        name = self.task_names[index]
        return support_set, query_set, name

    def __len__(self):
        return self.num_task


class treat_text:
    def __init__(self, patterns):
        self.patterns = patterns

    def __call__(self, text):
        text = unicodedata.normalize("NFKD", str(text))
        text = multiple_replace(self.patterns, text.lower())
        text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )', '', text)
        text = re.sub(r'( +)', ' ', text)
        text = re.sub(r'(, ,)|(,,)', ',', text)
        text = re.sub(r'(%)|(per cent)', ' percent', text)
        return text


# Regex multiple replace function
def multiple_replace(patterns_dict, text):

    # Building regex from dict keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, patterns_dict.keys())))

    # Substitution
    return regex.sub(lambda mo: patterns_dict[mo.string[mo.start():mo.end()]], text)
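
A rough usage sketch of the classes above, assuming the SciBERT checkpoint named in Info.json; the wiring below is illustrative and not part of the committed files:

from transformers import AutoModel

# Text cleaner built from the regex patterns defined in this module
clean = treat_text(patterns)

# Backbone trimmed to the first four encoder layers, with the 200-d feature map
# and single-logit head on top (pos_weight matches Info.json)
backbone = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")
model = SLR_Classifier(model=backbone, bert_layers=[0, 1, 2, 3], pos_weight=1.5)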
ML-SLRC/Util_funs.py
ADDED
@@ -0,0 +1,740 @@
from ML_SLRC import *

import os
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from torch.optim import Adam

import gc
from torchmetrics import functional as fn

import random

from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import warnings
import torch

import time
from sklearn.manifold import TSNE
from copy import deepcopy
import seaborn as sns
import json
from pathlib import Path

import re
from collections import defaultdict

# SEED = 2222
# gen_seed = torch.Generator().manual_seed(SEED)


# Random seed function
def random_seed(value):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)
    np.random.seed(value)
    random.seed(value)

# Tasks for the meta-learner
def create_batch_of_tasks(taskset, is_shuffle=True, batch_size=4):
    idxs = list(range(0, len(taskset)))
    if is_shuffle:
        random.shuffle(idxs)
    for i in range(0, len(idxs), batch_size):
        yield [taskset[idxs[i]] for i in range(i, min(i + batch_size, len(taskset)))]


# Prepare data to be processed by the domain-learner
def prepare_data(data, batch_size, tokenizer, max_seq_length,
                 input='text', output='label',
                 train_size_per_class=5, global_datasets=False,
                 treat_text_fun=None):
    data = data.reset_index().drop("index", axis=1)

    if global_datasets:
        global data_train, data_test

    # Sample task for training
    data_train = data.groupby('label').sample(train_size_per_class, replace=False)
    idex = data.index.isin(data_train.index)

    # The test set to be labeled by the model
    data_test = data

    # Transform into datasets for the model
    ## Train
    dataset_train = SLR_DataSet(
        data=data_train.sample(frac=1),
        input=input,
        output=output,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        treat_text=treat_text_fun)

    ## Test
    dataset_test = SLR_DataSet(
        data=data_test,
        input=input,
        output=output,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        treat_text=treat_text_fun)

    # Dataloaders
    ## Train
    data_train_loader = DataLoader(dataset_train,
                                   shuffle=True,
                                   batch_size=batch_size['train']
                                   )

    ## Test
    if len(dataset_test) % batch_size['test'] == 1:
        data_test_loader = DataLoader(dataset_test,
                                      batch_size=batch_size['test'],
                                      drop_last=True)
    else:
        data_test_loader = DataLoader(dataset_test,
                                      batch_size=batch_size['test'],
                                      drop_last=False)

    return data_train_loader, data_test_loader, data_train, data_test


# Meta trainer
def meta_train(data, model, device, Info,
               print_epoch=True,
               Test_resource=None,
               treat_text_fun=None):

    # Meta-learner model
    learner = Learner(model=model, device=device, **Info)

    # Testing tasks
    if isinstance(Test_resource, pd.DataFrame):
        test = MetaTask(Test_resource, num_task=0, k_support=10, k_query=10,
                        training=False, treat_text=treat_text_fun, **Info)

    torch.clear_autocast_cache()
    gc.collect()
    torch.cuda.empty_cache()

    # Meta epoch (outer epoch)
    for epoch in tqdm(range(Info['meta_epoch']), desc="Meta epoch ", ncols=80):

        # Train tasks
        train = MetaTask(data,
                         num_task=Info['num_task_train'],
                         k_support=Info['k_qry'],
                         k_query=Info['k_spt'],
                         treat_text=treat_text_fun, **Info)

        # Batch of train tasks
        db = create_batch_of_tasks(train, is_shuffle=True, batch_size=Info["outer_batch_size"])

        if print_epoch:
            # Outer loop batch training
            for step, task_batch in enumerate(db):
                print("\n-----------------Training Mode", "Meta_epoch:", epoch, "-----------------\n")

                # Meta feed-forward (outer feed-forward)
                acc = learner(task_batch, valid_train=print_epoch)
                print('Step:', step, '\ttraining Acc:', acc)

                if isinstance(Test_resource, pd.DataFrame):
                    # Validating model
                    if ((epoch + 1) % 4) + step == 0:
                        random_seed(123)
                        print("\n-----------------Testing Mode-----------------\n")

                        # Batch of test tasks
                        db_test = create_batch_of_tasks(test, is_shuffle=False, batch_size=1)
                        acc_all_test = []

                        # Looping over testing tasks
                        for test_batch in db_test:
                            acc = learner(test_batch, training=False)
                            acc_all_test.append(acc)

                        print('Test acc:', np.mean(acc_all_test))
                        del acc_all_test, db_test

                        # Restarting training randomly
                        random_seed(int(time.time() % 10))

        else:
            for step, task_batch in enumerate(db):
                # Meta feed-forward (outer feed-forward)
                acc = learner(task_batch, print_epoch, valid_train=print_epoch)

    torch.clear_autocast_cache()
    gc.collect()
    torch.cuda.empty_cache()


def train_loop(data_train_loader, data_test_loader, model, device, epoch=4, lr=1, print_info=True, name='name', weight_decay=1):
    # Start from a copy of the model's parameters
    model_meta = deepcopy(model)
    optimizer = Adam(model_meta.parameters(), lr=lr, weight_decay=weight_decay)

    model_meta.to(device)
    model_meta.train()

    # Task epoch (inner epoch)
    for i in range(0, epoch):
        all_loss = []

        # Inner training batch (support set)
        for inner_step, batch in enumerate(data_train_loader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, q_token_type_ids, label_id = batch

            # Inner feed-forward
            loss, _, _ = model_meta(input_ids, attention_mask, q_token_type_ids, labels=label_id.squeeze())

            # Compute grads
            loss.backward()

            # Update parameters
            optimizer.step()
            optimizer.zero_grad()

            all_loss.append(loss.item())

        if (i % 2 == 0) & print_info:
            print("Loss: ", np.mean(all_loss))

    # Test evaluation
    model_meta.eval()
    all_loss = []
    all_acc = []
    features = []
    labels = []
    predi_logit = []

    with torch.no_grad():
        # Test batch loop
        for inner_step, batch in enumerate(tqdm(data_test_loader,
                                                desc="Test validation | " + name,
                                                ncols=80)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, q_token_type_ids, label_id = batch

            # Predictions
            _, feature, _ = model_meta(input_ids, attention_mask, q_token_type_ids, labels=label_id.squeeze())

            # Collect logits, hidden features and labels so the caller can unpack
            # (logits, X_embedded, labels, features), as diagnosis and
            # pipeline_simulation expect
            logit = feature[1].detach().cpu()
            feature_lat = feature[0].detach().cpu()
            label_id = label_id.detach().cpu()

            labels.append(label_id.numpy().squeeze())
            features.append(feature_lat.numpy())
            predi_logit.append(logit.numpy())

            del input_ids, attention_mask, label_id, batch

        if print_info and len(all_acc) > 0:
            print("acc:", np.mean(all_acc))

    model_meta.to('cpu')
    gc.collect()
    torch.cuda.empty_cache()

    del model_meta, optimizer

    # Project features with t-SNE and return (logits, X_embedded, labels, features)
    return map_feature_tsne(features, labels, predi_logit)

# Process predictions and map the feature space with t-SNE
def map_feature_tsne(features, labels, predi_logit):

    features = np.concatenate(np.array(features, dtype=object))
    features = torch.tensor(features.astype(np.float32)).detach().clone()

    labels = np.concatenate(np.array(labels, dtype=object))
    labels = torch.tensor(labels.astype(int)).detach().clone()

    logits = np.concatenate(np.array(predi_logit, dtype=object))
    logits = torch.tensor(logits.astype(np.float32)).detach().clone()

    # Dimension reduction
    X_embedded = TSNE(n_components=2, learning_rate='auto',
                      init='random').fit_transform(features.detach().clone())

    return logits.detach().clone(), X_embedded, labels.detach().clone(), features.detach().clone()

def wss_calc(logit, labels, trsh=0.5):

    # Predicted label given the threshold
    predict_trash = torch.sigmoid(logit).squeeze() >= trsh

    # Compute confusion matrix values
    CM = confusion_matrix(labels, predict_trash.to(int))
    tn, fp, fne, tp = CM.ravel()

    P = (tp + fne)
    N = (tn + fp)
    recall = tp / (tp + fne)

    # WSS (work saved over sampling)
    wss = (tn + fne) / len(labels) - (1 - recall)

    # AWSS (adjusted WSS)
    awss = (tn / N - fne / P)

    return {
        "wss": round(wss, 4),
        "awss": round(awss, 4),
        "R": round(recall, 4),
        "CM": CM
    }


# Compute the metrics
def plot(logits, X_embedded, labels, threshold, show=True,
         namefig="plot", make_plot=True, print_stats=True, save=True):
    col = pd.MultiIndex.from_tuples([
        ("Predict", "0"),
        ("Predict", "1")
    ])
    index = pd.MultiIndex.from_tuples([
        ("Real", "0"),
        ("Real", "1")
    ])

    predict = torch.sigmoid(logits).detach().clone()

    # ROC curve
    fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())

    # Given by a recall of 95% (threshold evaluation)
    ## WSS
    ### Index to recall
    idx_wss95 = sum(tpr < 0.95)
    ### Threshold
    thresholds95 = thresholds[idx_wss95]

    ### Compute the metrics
    wss95_info = wss_calc(logits, labels, thresholds95)
    acc_wss95 = fn.accuracy(predict, labels, threshold=thresholds95)
    f1_wss95 = fn.f1_score(predict, labels, threshold=thresholds95)

    # Given by a fixed threshold (recall evaluation)
    ### Compute the metrics
    wss_info = wss_calc(logits, labels, threshold)
    acc_wssR = fn.accuracy(predict, labels, threshold=threshold)
    f1_wssR = fn.f1_score(predict, labels, threshold=threshold)

    metrics = {
        # WSS
        "WSS@95": wss95_info['wss'],
        "AWSS@95": wss95_info['awss'],
        "WSS@R": wss_info['wss'],
        "AWSS@R": wss_info['awss'],
        # Recall
        "Recall_WSS@95": wss95_info['R'],
        "Recall_WSS@R": wss_info['R'],
        # Accuracy
        "acc@95": acc_wss95.item(),
        "acc@R": acc_wssR.item(),
        # F1
        "f1@95": f1_wss95.item(),
        "f1@R": f1_wssR.item(),
        # Threshold at 95% recall
        "threshold@95": thresholds95
    }

    # Print stats
    if print_stats:
        wss95 = f"WSS@95:{wss95_info['wss']}, R: {wss95_info['R']}"
        wss95_adj = f"AWSS@95:{wss95_info['awss']}"
        print(wss95)
        print(wss95_adj)
        print('Acc.:', round(acc_wss95.item(), 4))
        print('F1-score:', round(f1_wss95.item(), 4))
        print(f"threshold to wss95: {round(thresholds95, 4)}")
        cm = pd.DataFrame(wss95_info['CM'],
                          index=index,
                          columns=col)

        print("\nConfusion matrix:")
        print(cm)
        print("\n---Metrics with threshold:", threshold, "----\n")
        wss = f"WSS@R:{wss_info['wss']}, R: {wss_info['R']}"
        print(wss)
        wss_adj = f"AWSS@R:{wss_info['awss']}"
        print(wss_adj)
        print('Acc.:', round(acc_wssR.item(), 4))
        print('F1-score:', round(f1_wssR.item(), 4))
        cm = pd.DataFrame(wss_info['CM'],
                          index=index,
                          columns=col)

        print("\nConfusion matrix:")
        print(cm)

    # Plots
    if make_plot:

        fig, axes = plt.subplots(1, 4, figsize=(25, 10))
        alpha = torch.squeeze(predict).numpy()

        # t-SNE
        p1 = sns.scatterplot(x=X_embedded[:, 0],
                             y=X_embedded[:, 1],
                             hue=labels,
                             alpha=alpha, ax=axes[0]).set_title('Predictions-TSNE', size=20)

        # WSS@95
        t_wss = predict >= thresholds95
        t_wss = t_wss.squeeze().numpy()
        p2 = sns.scatterplot(x=X_embedded[t_wss, 0],
                             y=X_embedded[t_wss, 1],
                             hue=labels[t_wss],
                             alpha=alpha[t_wss], ax=axes[1]).set_title('WSS@95', size=20)

        # WSS@R
        t = predict >= threshold
        t = t.squeeze().numpy()
        p3 = sns.scatterplot(x=X_embedded[t, 0],
                             y=X_embedded[t, 1],
                             hue=labels[t],
                             alpha=alpha[t], ax=axes[2]).set_title(f'Predictions-threshold {threshold}', size=20)

        # ROC curve
        roc_auc = auc(fpr, tpr)
        lw = 2
        axes[3].plot(
            fpr,
            tpr,
            color="darkorange",
            lw=lw,
            label="ROC curve (area = %0.2f)" % roc_auc)
        axes[3].plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
        axes[3].axhline(y=0.95, color='r', linestyle='-')
        axes[3].legend(loc="lower right")
        axes[3].set_title(label="ROC", size=20)
        axes[3].set_ylabel("True Positive Rate", fontsize=15)
        axes[3].set_xlabel("False Positive Rate", fontsize=15)

    if show:
        plt.show()

    if save:
        fig.savefig(namefig, dpi=fig.dpi)

    return metrics


def auc_plot(logits, labels, color="darkorange", label="test"):
    predict = torch.sigmoid(logits).detach().clone()
    fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())
    roc_auc = auc(fpr, tpr)
    lw = 2

    label = label + str(round(roc_auc, 2))

    plt.plot(
        fpr,
        tpr,
        color=color,
        lw=lw,
        label=label
    )
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.axhline(y=0.95, color='r', linestyle='-')

# Interactive interface for evaluation
class diagnosis():
    def __init__(self, names, Valid_resource, batch_size_test,
                 model, Info, device, treat_text_fun=None, start=0):
        self.names = names
        self.Valid_resource = Valid_resource
        self.batch_size_test = batch_size_test
        self.model = model
        self.start = start
        self.Info = Info
        self.device = device
        self.treat_text_fun = treat_text_fun

        # Box inputs
        self.value_trash = widgets.FloatText(
            value=0.95,
            description='threshold',
            disabled=False
        )
        self.valueb = widgets.IntText(
            value=10,
            description='size',
            disabled=False
        )

        # Buttons
        self.train_b = widgets.Button(description="Train")
        self.next_b = widgets.Button(description="Next")
        self.eval_b = widgets.Button(description="Evaluation")

        self.hbox = widgets.HBox([self.train_b, self.valueb])

        # Button click callbacks
        self.next_b.on_click(self.Next_button)
        self.train_b.on_click(self.Train_button)
        self.eval_b.on_click(self.Evaluation_button)

    # Next button
    def Next_button(self, p):
        clear_output()
        self.i = self.i + 1

        # Select the domain data
        self.domain = self.names[self.i]
        self.data = self.Valid_resource[self.Valid_resource['domain'] == self.domain]

        print("Name:", self.domain)
        print(self.data['label'].value_counts())
        display(self.hbox)
        display(self.next_b)

    # Train button
    def Train_button(self, y):
        clear_output()
        print(self.domain)

        # Prepare data for training (domain-learner)
        self.data_train_loader, self.data_test_loader, self.data_train, self.data_test = prepare_data(
            self.data,
            train_size_per_class=self.valueb.value,
            batch_size={'train': self.Info['inner_batch_size'],
                        'test': self.batch_size_test},
            max_seq_length=self.Info['max_seq_length'],
            tokenizer=self.Info['tokenizer'],
            input="text",
            output="label",
            treat_text_fun=self.treat_text_fun)

        # Train the model and predict on the test set
        self.logits, self.X_embedded, self.labels, self.features = train_loop(
            self.data_train_loader, self.data_test_loader,
            self.model, self.device,
            epoch=self.Info['inner_update_step'],
            lr=self.Info['inner_update_lr'],
            print_info=True,
            name=self.domain)

        tresh_box = widgets.HBox([self.eval_b, self.value_trash])
        display(self.hbox)
        display(tresh_box)
        display(self.next_b)

    # Evaluation button
    def Evaluation_button(self, te):
        clear_output()
        tresh_box = widgets.HBox([self.eval_b, self.value_trash])

        print(self.domain)
        print("-------Train data-------")
        print(self.data_train['label'].value_counts())
        print("-------Test data-------")
        print(self.data_test['label'].value_counts())

        display(self.next_b)
        display(tresh_box)
        display(self.hbox)

        # Compute metrics
        metrics = plot(self.logits, self.X_embedded, self.labels,
                       threshold=self.Info['threshold'], show=True,
                       namefig='test',
                       make_plot=True,
                       print_stats=True,
                       save=False)

    def __call__(self):
        self.i = self.start - 1
        clear_output()
        display(self.next_b)


# Simulation attempts of the domain learner
def pipeline_simulation(Valid_resource, names_to_valid, path_save,
                        model, Info, device, initializer_model,
                        treat_text_fun=None):
    n_attempt = 5
    batch_test = 100

    # Create a directory to save information
    for name in names_to_valid:
        name = re.sub(r"\.csv", "", name)
        Path(path_save + name + "/img").mkdir(parents=True, exist_ok=True)

    # Dict to save ROC curves
    roc_stats = defaultdict(lambda: defaultdict(
        lambda: defaultdict(
            list
        )
    )
    )

    all_metrics = []
    # Loop over a list of domains
    for name in names_to_valid:

        # Select a domain dataset
        data = Valid_resource[Valid_resource['domain'] == name].reset_index().drop("index", axis=1)

        # Attempts simulation
        for attempt in range(n_attempt):
            print("---" * 4, "attempt", attempt, "---" * 4)

            # Prepare data to pass to the model
            data_train_loader, data_test_loader, _, _ = prepare_data(
                data,
                train_size_per_class=Info['k_spt'],
                batch_size={'train': Info['inner_batch_size'],
                            'test': batch_test},
                max_seq_length=Info['max_seq_length'],
                tokenizer=Info['tokenizer'],
                input="text",
                output="label",
                treat_text_fun=treat_text_fun)

            # Train the model and evaluate on the test set of the domain
            logits, X_embedded, labels, features = train_loop(
                data_train_loader, data_test_loader,
                model, device,
                epoch=Info['inner_update_step'],
                lr=Info['inner_update_lr'],
                print_info=False,
                name=name)

            name_domain = re.sub(r"\.csv", "", name)

            # Compute the metrics
            metrics = plot(logits, X_embedded, labels,
                           threshold=Info['threshold'], show=False,
                           namefig=path_save + name_domain + "/img/" + str(attempt) + 'plots',
                           make_plot=True, print_stats=False, save=True)

            # Compute the ROC curve
            fpr, tpr, _ = roc_curve(labels, torch.sigmoid(logits).squeeze())

            # Save the corresponding information of the domain
            metrics['name'] = name_domain
            metrics['layer_size'] = Info['bert_layers']
            metrics['attempt'] = attempt
            roc_stats[name_domain][str(Info['bert_layers'])]['fpr'].append(fpr.tolist())
            roc_stats[name_domain][str(Info['bert_layers'])]['tpr'].append(tpr.tolist())
            all_metrics.append(metrics)

            # Save the metrics and the ROC curve of the attempt
            pd.DataFrame(all_metrics).to_csv(path_save + "metrics.csv")
            roc_path = path_save + "roc_stats.json"
            with open(roc_path, 'w') as fp:
                json.dump(roc_stats, fp)

            del fpr, tpr, logits, X_embedded, labels
            del features, metrics, _

    # Save the information used to evaluate the validation resource
    save_info = Info.copy()
    save_info['model'] = initializer_model.tokenizer.name_or_path
    save_info.pop("tokenizer")
    save_info.pop("bert_layers")

    info_path = path_save + "info.json"
    with open(info_path, 'w') as fp:
        json.dump(save_info, fp)


# Loading dataset statistics
def load_data_statistics(paths, names):
    size = []
    pos = []
    neg = []
    for p in paths:
        data = pd.read_csv(p)
        data = data.dropna()
        # Dataset size
        size.append(len(data))
        # Number of positive labels
        pos.append(data['labels'].value_counts()[1])
        # Number of negative labels
        neg.append(data['labels'].value_counts()[0])
        del data

    info_load = pd.DataFrame({
        "size": size,
        "pos": pos,
        "neg": neg,
        "names": names,
        "paths": paths})
    return info_load

# Loading the datasets
def load_data(train_info_load):

    col = ['abstract', 'title', 'labels', 'domain']

    data_train = pd.DataFrame(columns=col)
    for p in train_info_load['paths']:
        data_temp = pd.read_csv(p).loc[:, ['labels', 'title', 'abstract']]
        data_temp['domain'] = os.path.basename(p)
        data_train = pd.concat([data_train, data_temp])

    data_train['text'] = data_train['title'] + data_train['abstract'].replace(np.nan, '')

    return (data_train
            .replace({"labels": {0: "negative", 1: 'positive'}})
            .rename({"labels": "label"}, axis=1)
            .loc[:, ("text", "domain", "label")]
            )
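
A hedged sketch of how these helpers might be wired together for meta-training; `data` is assumed to be the DataFrame produced by load_data() (text/domain/label columns), the checkpoint name comes from Info.json, and the small epoch/task counts are only for a quick illustrative run:

import torch
from transformers import AutoModel, AutoTokenizer

checkpoint = "allenai/scibert_scivocab_uncased"
Info = {
    "meta_epoch": 2, "num_task_train": 4, "k_spt": 8, "k_qry": 8,
    "outer_batch_size": 2, "inner_batch_size": 4,
    "outer_update_lr": 5e-5, "inner_update_lr": 5e-5,
    "inner_update_step": 4, "inner_update_step_eval": 4,
    "inner_print": 2, "max_seq_length": 512,
    "tokenizer": AutoTokenizer.from_pretrained(checkpoint),
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SLR_Classifier(model=AutoModel.from_pretrained(checkpoint), bert_layers=[0, 1, 2, 3])

# Outer loop: samples MetaTask batches from `data` and updates the shared weights
meta_train(data, model, device, Info, print_epoch=True)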
ML-SLRC/__init__.py
ADDED
@@ -0,0 +1,4 @@
from . import Util_funs
ML-SLRC/model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a859f39dc8ff55df919ef6794dcfc3ca08f873ae11fd0fd78c50d65089a6f019
size 213540902
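
model.pt is tracked through Git LFS (about 214 MB). How it deserializes depends on how it was saved, which this commit does not show; a cautious inspection sketch, assuming the LFS object has been fetched:

import torch

# The pointer file above resolves to a torch checkpoint once LFS fetches it.
obj = torch.load("ML-SLRC/model.pt", map_location="cpu")
print(type(obj))  # a state_dict (dict) or a pickled nn.Module, depending on how it was saved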
app.py
CHANGED
@@ -174,7 +174,6 @@ def treat_data_input(data, etailment_txt):
 
 import gc
 from torch.optim import Adam
-from scipy.stats import entropy
 
 def treat_train_evaluate(dataload_train, dataload_remain):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')