npops committed
Commit 765e08e
1 Parent(s): b31c45a
.gitignore ADDED
@@ -0,0 +1,5 @@
+ venv/
+ .DS_Store
+ /transformers/
+ /data/
+ /examples/
README.md CHANGED
@@ -1,12 +1,36 @@
  ---
- title: Stoke
- emoji: 😻
- colorFrom: blue
- colorTo: indigo
+ title: STOKE playground demo
+ emoji: 🐢
+ colorFrom: gray
+ colorTo: red
  sdk: streamlit
- sdk_version: 1.32.2
- app_file: app.py
+ sdk_version: 1.31.1
+ app_file: stoke/playground/app.py
  pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # STOKE: A Toolkit for Streaming Token Classification
+
+ [Huggingface Space](https://huggingface.co/spaces/nicpopovic/stoke)
+
+ [Related publication](https://arxiv.org/abs/2403.11747)
+
+ *Note: This code is still being cleaned up.*
+
+ ## Quick start
+ You can use pip to install the required dependencies (including the transformers fork):
+ ```
+ python3 -m venv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
+ streamlit run stoke/playground/app.py
+ ```
+
+ This will launch the playground, shown below:
+
+ ![](stoke/docs/images/playground.png)
+
+ ## Get the custom transformers fork
+ ```
+ git clone -b STOKE https://github.com/nicpopovic/transformers.git
+ ```
example_generate.py ADDED
@@ -0,0 +1,26 @@
+ from stoke.src.data.util import GenerationConfig, split_data, conll_prompts
+ from stoke.src.data.generation import DataGenerator, FlairNERModel
+
+ # generation parameters
+ generation_kwargs = {"max_new_tokens": 100, "repetition_penalty": 1.2}
+
+ # create GenerationConfig object with default values
+ config = GenerationConfig(language_model="gpt2", output_path="data/", dataset_name="test", cuda=False, generation_kwargs=generation_kwargs)
+
+ # create annotation model
+ reference_model = FlairNERModel(config.language_model, "flair/ner-english-ontonotes-large")
+
+ # create DataGenerator
+ generator = DataGenerator(config, reference_model)
+
+ # run generator
+ generated_texts = generator.generate_text(conll_prompts()[:10], generation_kwargs)
+
+ # annotate text with reference model
+ annotated_texts = generator.annotate_text(generated_texts)
+
+ # save data in correct format
+ generator.save_data(annotated_texts)
+
+ # split dataset
+ split_data(config.path_data)
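For orientation, `generator.save_data` writes a plain JSON list with one record per generated text. Judging from `annotate_text` in `stoke/src/data/generation.py` below, each record stores tokenizer ids under `tokens`, one prefix-free tag per token under `ner_tags`, and inclusive `[start, end]` token spans under `mentions`; the concrete values here are made up for illustration:

```
[
 {
  "tokens": [464, 1578, 4916, 318, 257, 1499, 13],
  "ner_tags": ["O", "GPE", "GPE", "O", "O", "O", "O"],
  "mentions": [[1, 2]]
 }
]
```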
example_train.py ADDED
@@ -0,0 +1,16 @@
+ # imports
+ from stoke.src.trainer.util import TrainConfig
+ from stoke.src.trainer.trainer import Trainer
+ from stoke.src.selection.simple import create_config_for_path
+
+ # create TrainConfig object with default values
+ config = TrainConfig('data/gpt2/test', n_steps_per_epoch=10, n_epochs=10)
+
+ # create Trainer
+ trainer = Trainer(config)
+
+ # run training
+ trainer.train()
+
+ # create basic config for playground
+ create_config_for_path(config.path, "basic")
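Taken together, the two example scripts should leave behind roughly the following directory layout (inferred from `GenerationConfig`, `TrainConfig`, the trainer's checkpointing code, and `create_config_for_path` below; the checkpoint identifiers are random strings):

```
data/gpt2/test/
├── config.json                  # written by DataGenerator
├── data.json                    # full generated dataset
├── data_train.json              # splits written by split_data
├── data_validation.json
├── data_test.json
├── data_<split>_stats.json      # per-split mention counts
├── checkpoints/
│   ├── token_classifier/<id>/   # checkpoint.pt, config.json, config_train.json
│   └── span_classifier/<id>/
└── stoke_config.json            # written by create_config_for_path
```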
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ git+https://github.com/nicpopovic/transformers.git@STOKE
+ streamlit
+ torch
+ matplotlib
+ flair
+ nltk
+ datasets
+ torcheval
stoke/__init__.py ADDED
File without changes
stoke/docs/images/playground.png ADDED
stoke/src/classifier/__init__.py ADDED
File without changes
stoke/src/classifier/probes.py ADDED
@@ -0,0 +1,38 @@
+ import torch
+
+
+ class MLP(torch.nn.Module):
+     """Probe that flattens all dimensions after the batch dimension."""
+     def __init__(self, input_dim, output_dim, hidden_dim=1024, cuda=False):
+         super(MLP, self).__init__()
+         self.fc1 = torch.nn.Linear(input_dim, hidden_dim)   # input layer to hidden layer
+         self.fc3 = torch.nn.Linear(hidden_dim, output_dim)  # hidden layer to output layer
+         if cuda:
+             self.device = "cuda"
+         else:
+             self.device = "cpu"
+         self.to(self.device)
+
+     def forward(self, x):
+         x = torch.flatten(x, start_dim=1)
+         x = torch.relu(self.fc1(x))
+         x = self.fc3(x)
+         return x
+
+
+ class MLPProbe(torch.nn.Module):
+     """Probe applied position-wise (no flattening of the input)."""
+     def __init__(self, input_dim, output_dim, hidden_dim=1024, cuda=False):
+         super(MLPProbe, self).__init__()
+         self.fc1 = torch.nn.Linear(input_dim, hidden_dim)   # input layer to hidden layer
+         self.fc3 = torch.nn.Linear(hidden_dim, output_dim)  # hidden layer to output layer
+         if cuda:
+             self.device = "cuda"
+         else:
+             self.device = "cpu"
+         self.to(self.device)
+
+     def forward(self, x):
+         x = torch.relu(self.fc1(x))
+         x = self.fc3(x)
+         return x
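A minimal sketch of the shape difference between the two probes (all dimensions here are made up for the example): `MLPProbe` is applied position-wise, while `MLP` flattens everything after the batch dimension first.

```
import torch
from stoke.src.classifier.probes import MLP, MLPProbe

hidden = torch.randn(4, 16, 768)   # (batch, seq_len, hidden_size)
token_probe = MLPProbe(768, 5)     # one prediction per token position
print(token_probe(hidden).shape)   # torch.Size([4, 16, 5])

flat_probe = MLP(16 * 768, 5)      # flattens to (batch, 16*768) before fc1
print(flat_probe(hidden).shape)    # torch.Size([4, 5])
```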
stoke/src/data/__init__.py ADDED
File without changes
stoke/src/data/generation.py ADDED
@@ -0,0 +1,288 @@
+ from transformers import pipeline
+ from tqdm import tqdm
+ import json
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from flair.models import SequenceTagger
+ from flair.data import Sentence
+
+
+ class AnnotationModel:
+     def __init__(self, model_id_for_tokenizer):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id_for_tokenizer, use_fast=True)
+         self.pipe = pipeline("token-classification", model="FacebookAI/xlm-roberta-large-finetuned-conll03-english", aggregation_strategy="simple")
+
+     def annotate_text(self, text):
+         iob_tags = ['O'] * len(text)
+         mentions = []
+         text_str = self.tokenizer.decode(text)
+         ner_tags = self.pipe(text_str)
+
+         # character offset at which each token starts in the decoded string
+         offsets = []
+         offset = 0
+         for i, token_id in enumerate(text):
+             offsets.append(offset)
+             offset = len(self.tokenizer.decode(text[:i+1]))
+         offsets.append(offset)
+
+         for tag in ner_tags:
+             try:
+                 start = self.get_token_for_char(tag["start"], offsets)
+                 end = self.get_token_for_char(tag["end"]-1, offsets)
+                 mentions.append([start, end])
+                 # tags are stored without B-/I- prefixes
+                 for i in range(start, end+1):
+                     iob_tags[i] = tag["entity_group"]
+             except Exception as e:
+                 print(e)
+
+         return {"tokens": text, "ner_tags": iob_tags, "mentions": mentions}
+
+     def get_token_for_char(self, i, offsets):
+         for off in range(len(offsets)):
+             if i < offsets[off]:
+                 return off - 1
+         return len(offsets) - 1
+
+
+ class FlairNERModel:
+     def __init__(self, model_id_for_tokenizer, flair_model_name):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id_for_tokenizer, use_fast=True)
+         self.tagger = SequenceTagger.load(flair_model_name)
+         self.name = flair_model_name  # recorded by DataGenerator in the dataset config
+
+     def annotate_text(self, text):
+         iob_tags = ['O'] * len(text)
+         mentions = []
+         text_str = self.tokenizer.decode(text)
+         sentence = Sentence(text_str)
+
+         # predict NER tags
+         self.tagger.predict(sentence)
+         ner_tags = sentence.get_spans('ner')
+
+         offsets = []
+         offset = 0
+         for i, token_id in enumerate(text):
+             offsets.append(offset)
+             offset = len(self.tokenizer.decode(text[:i+1]))
+         offsets.append(offset)
+
+         for tag in ner_tags:
+             try:
+                 start = self.get_token_for_char(tag.start_position, offsets)
+                 end = self.get_token_for_char(tag.end_position-1, offsets)
+                 mentions.append([start, end])
+                 # tags are stored without B-/I- prefixes
+                 for i in range(start, end+1):
+                     iob_tags[i] = tag.get_labels('ner')[0].to_dict()['value']
+             except Exception as e:
+                 print(tag)
+                 print(e)
+
+         return {"tokens": text, "ner_tags": iob_tags, "mentions": mentions}
+
+     def get_token_for_char(self, i, offsets):
+         for off in range(len(offsets)):
+             if i < offsets[off]:
+                 return off - 1
+         return len(offsets) - 1
+
+
+ class FlairChunkingModel:
+     def __init__(self, model_id_for_tokenizer, flair_model_name):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id_for_tokenizer, use_fast=True)
+         self.tagger = SequenceTagger.load(flair_model_name)
+         self.name = flair_model_name  # recorded by DataGenerator in the dataset config
+
+     def annotate_text(self, text):
+         iob_tags = ['O'] * len(text)
+         mentions = []
+         text_str = self.tokenizer.decode(text)
+         sentence = Sentence(text_str)
+
+         # predict chunk tags
+         self.tagger.predict(sentence)
+         ner_tags = sentence.get_spans('np')
+
+         offsets = []
+         offset = 0
+         for i, token_id in enumerate(text):
+             offsets.append(offset)
+             offset = len(self.tokenizer.decode(text[:i+1]))
+         offsets.append(offset)
+
+         for tag in ner_tags:
+             try:
+                 start = self.get_token_for_char(tag.start_position, offsets)
+                 end = self.get_token_for_char(tag.end_position-1, offsets)
+                 mentions.append([start, end])
+                 # tags are stored without B-/I- prefixes
+                 for i in range(start, end+1):
+                     iob_tags[i] = tag.get_labels('np')[0].to_dict()['value']
+             except Exception as e:
+                 print(tag)
+                 print(e)
+
+         return {"tokens": text, "ner_tags": iob_tags, "mentions": mentions}
+
+     def get_token_for_char(self, i, offsets):
+         for off in range(len(offsets)):
+             if i < offsets[off]:
+                 return off - 1
+         return len(offsets) - 1
+
+
+ class FlairFrameModel:
+     def __init__(self, model_id_for_tokenizer, flair_model_name):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id_for_tokenizer, use_fast=True)
+         self.tagger = SequenceTagger.load(flair_model_name)
+         self.name = flair_model_name  # recorded by DataGenerator in the dataset config
+
+     def annotate_text(self, text):
+         iob_tags = ['O'] * len(text)
+         mentions = []
+         text_str = self.tokenizer.decode(text)
+         sentence = Sentence(text_str)
+
+         # predict frame tags
+         self.tagger.predict(sentence)
+         ner_tags = sentence.get_labels('frame')
+
+         offsets = []
+         offset = 0
+         for i, token_id in enumerate(text):
+             offsets.append(offset)
+             offset = len(self.tokenizer.decode(text[:i+1]))
+         offsets.append(offset)
+
+         for tag in ner_tags:
+             try:
+                 start = self.get_token_for_char(tag.data_point.start_position, offsets)
+                 end = self.get_token_for_char(tag.data_point.end_position-1, offsets)
+                 mentions.append([start, end])
+                 # tags are stored without B-/I- prefixes
+                 for i in range(start, end+1):
+                     iob_tags[i] = tag.to_dict()['value']
+             except Exception as e:
+                 print(tag)
+                 print(e)
+
+         return {"tokens": text, "ner_tags": iob_tags, "mentions": mentions}
+
+     def get_token_for_char(self, i, offsets):
+         for off in range(len(offsets)):
+             if i < offsets[off]:
+                 return off - 1
+         return len(offsets) - 1
+
+
+ class FlairPOSModel:
+     def __init__(self, model_id_for_tokenizer, flair_model_name):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id_for_tokenizer, use_fast=True)
+         self.tagger = SequenceTagger.load(flair_model_name)
+         self.name = flair_model_name  # recorded by DataGenerator in the dataset config
+
+     def annotate_text(self, text):
+         iob_tags = ['O'] * len(text)
+         mentions = []
+         text_str = self.tokenizer.decode(text)
+         sentence = Sentence(text_str)
+
+         # predict POS tags
+         self.tagger.predict(sentence)
+         ner_tags = sentence.get_labels('pos')
+
+         offsets = []
+         offset = 0
+         for i, token_id in enumerate(text):
+             offsets.append(offset)
+             offset = len(self.tokenizer.decode(text[:i+1]))
+         offsets.append(offset)
+
+         for tag in ner_tags:
+             try:
+                 start = self.get_token_for_char(tag.data_point.start_position, offsets)
+                 end = self.get_token_for_char(tag.data_point.end_position-1, offsets)
+                 mentions.append([start, end])
+                 # tags are stored without B-/I- prefixes
+                 for i in range(start, end+1):
+                     iob_tags[i] = tag.to_dict()['value']
+             except Exception as e:
+                 print(tag)
+                 print(e)
+
+         return {"tokens": text, "ner_tags": iob_tags, "mentions": mentions}
+
+     def get_token_for_char(self, i, offsets):
+         for off in range(len(offsets)):
+             if i < offsets[off]:
+                 return off - 1
+         return len(offsets) - 1
+
+
+ class DataGenerator(object):
+     def __init__(self, config, reference_model):
+         self.config = config
+         self.model_id = self.config.language_model
+         self.reference_model = reference_model
+         self.output_path = self.config.path_data
+         self.tokenizer = AutoTokenizer.from_pretrained(self.config.language_model, use_fast=True)
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         if self.config.cuda:
+             device = "cuda"
+         else:
+             device = "cpu"
+
+         self.model = AutoModelForCausalLM.from_pretrained(self.config.language_model).to(device)
+
+         json.dump({
+             "generation_kwargs": self.config.generation_kwargs,
+             "model_id": self.config.language_model,
+             "flair_model_name": reference_model.name,
+         }, open(self.config.path_config, "w"), indent=1)
+
+     def generate_text(self, prompts, generation_kwargs):
+         generated_texts = []
+         for prompt in tqdm(prompts, desc="Generating text"):
+             input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
+             generated_text_ids = self.model.generate(input_ids=input_ids.to(self.model.device), pad_token_id=self.tokenizer.pad_token_id, **generation_kwargs)
+             generated_text = generated_text_ids[0].tolist()
+             prompt_token_ids = input_ids[0].tolist()
+             generated_texts.append({"prompt": prompt_token_ids, "full": generated_text})
+         return generated_texts
+
+     def annotate_text(self, texts):
+         annotated_texts = []
+         for text in tqdm(texts, desc="Annotating text"):
+             annotated_text = self.reference_model.annotate_text(text["full"])
+             annotated_texts.append(annotated_text)
+         return annotated_texts
+
+     def save_data(self, data):
+         with open(self.output_path, 'w') as f:
+             json.dump(data, f, indent=1)
+
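The subtle part of the annotation models above is the char-to-token alignment: token start offsets are recovered by repeatedly decoding prefixes of the id sequence, and `get_token_for_char` then maps a character position back to a token index. A toy illustration with made-up offsets:

```
# offsets[i] is the character position where token i starts;
# the final entry is the total string length.
offsets = [0, 4, 9, 15]

def get_token_for_char(i, offsets):
    for off in range(len(offsets)):
        if i < offsets[off]:
            return off - 1
    return len(offsets) - 1

assert get_token_for_char(0, offsets) == 0    # char 0 is in token 0
assert get_token_for_char(5, offsets) == 1    # char 5 is in token 1
assert get_token_for_char(14, offsets) == 2   # char 14 is in the last token
```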
stoke/src/data/util.py ADDED
@@ -0,0 +1,233 @@
+ import re
+ from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
+ from torch.utils.data import Dataset
+ import json
+ import torch
+ import torch.nn.functional as F
+ import os
+ import datasets
+ import random
+
+
+ class Detokenizer(object):
+     # https://stackoverflow.com/a/46311499
+     def __init__(self) -> None:
+         self.detokenizer = Detok()
+
+     def __call__(self, tokens, return_offsets=False):
+         text = self.detokenizer.detokenize(tokens)
+         text = re.sub(r'\s*,\s*', ', ', text)
+         text = re.sub(r'\s*\.\s*', '. ', text)
+         text = re.sub(r'\s*\?\s*', '? ', text)
+         text = text.replace(" --", "--")
+
+         if return_offsets:
+             offsets = [0]
+             for i in range(1, len(tokens)):
+                 offsets.append(len(self(tokens[:i])))
+
+             """
+             # verify offsets
+             for i, offset in enumerate(offsets):
+                 if i == 0:
+                     continue
+                 check = text[:offset]
+                 target = self(tokens[:i])
+                 try:
+                     assert target == check
+                 except AssertionError:
+                     print(tokens)
+                     print(f"'{check}' != '{target}'")
+                     raise AssertionError
+             """
+
+             return text.strip(), offsets
+         return text.strip()
+
+
+ class JSONDataset(Dataset):
+
+     def __init__(self, path):
+         super().__init__()
+         self.samples = json.load(open(path, "r"))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         return self.samples[idx]
+
+
+ def create_mask_for_len(seq_len, pad_to=None, skip_start=0, window=None):
+     # lower-triangular (causal) mask of candidate span positions
+     mask = (-1 * (torch.triu(torch.ones(seq_len, seq_len), diagonal=1) - 1)).bool()
+
+     if skip_start != 0:
+         mask[:skip_start, :] = False
+         mask[:, :skip_start] = False
+
+     if window is not None:
+         for i in range(window, seq_len):
+             mask[i, :max(i-window, 0)] = False
+
+     if pad_to is None:
+         return mask
+
+     return F.pad(mask, (0, pad_to-seq_len, 0, pad_to-seq_len))
+
+
+ def collate_function_with_label_map(batch, label_map):
+     # prepare token ids
+     sequences = [torch.tensor(x['tokens']) for x in batch]
+     input_ids = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
+
+     # prepare labels
+     labels_tokens = []
+     for batchitem in batch:
+         labels_tokens.append(torch.tensor([label_map.index(x) for x in batchitem['ner_tags']]))
+     labels_tokens = torch.nn.utils.rnn.pad_sequence(labels_tokens, batch_first=True, padding_value=0).to(torch.long)
+
+     labels_spans = torch.zeros((len(batch), input_ids.shape[-1], input_ids.shape[-1]))
+     for i, batchitem in enumerate(batch):
+         for mnt in batchitem['mentions']:
+             start, end = mnt
+             try:
+                 labels_spans[i, end+1, start] = 1.0
+             except IndexError:
+                 # mention ends at the final token, so there is no position after it
+                 pass
+
+     # prepare masks
+     masks_tokens = [torch.tensor([False]+([True]*(len(x)-1))) for x in sequences]
+     masks_tokens = torch.nn.utils.rnn.pad_sequence(masks_tokens, batch_first=True, padding_value=False)
+     mask_spans = torch.stack([create_mask_for_len(len(x['tokens']), input_ids.shape[-1], skip_start=0, window=15) for x in batch])
+
+     # mask labels
+     labels_tokens = torch.masked_select(labels_tokens, masks_tokens).long()
+     labels_spans = torch.masked_select(labels_spans, mask_spans).long()
+
+     return {
+         'input_ids': input_ids,
+         'labels_tokens': labels_tokens,
+         'labels_spans': labels_spans,
+         'mask_tokens': masks_tokens.unsqueeze(-1),
+         'mask_spans': mask_spans.unsqueeze(-1)
+     }
+
+
+ def print_metric(metric, class_labels, return_classwise=False, verbose=False):
+
+     f_ner = metric.compute()
+     p_ner = torch.nan_to_num(metric.num_tp / metric.num_prediction)
+     r_ner = torch.nan_to_num(metric.num_tp / metric.num_label)
+
+     if verbose:
+         print(f"{' '.ljust(10)}   P      R      F      S")
+
+     sum_support = 0
+     weighted_scores = [0, 0, 0]
+
+     classwise = {}
+     for ner_class, p, r, f, s in zip(class_labels, p_ner, r_ner, f_ner, metric.num_label):
+         if ner_class in ("NONE", "O", "no_relation", "no_span"):
+             continue
+         if verbose:
+             print(f"{ner_class.ljust(10)} - {p:.2f} - {r:.2f} - {f:.2f} - {int(s)}")
+         weighted_scores[0] += p*s
+         weighted_scores[1] += r*s
+         weighted_scores[2] += f*s
+         sum_support += s
+
+         classwise[ner_class] = {"p": p.item(), "r": r.item(), "f": f.item(), "s": s.item()}
+
+     p_micro = weighted_scores[0]/sum_support
+     r_micro = weighted_scores[1]/sum_support
+     f_micro = weighted_scores[2]/sum_support
+
+     classwise["macro"] = {"p": torch.mean(p_ner[1:]).item(), "r": torch.mean(r_ner[1:]).item(), "f": torch.mean(f_ner[1:]).item()}
+
+     if verbose:
+         print("")
+         print(f"MICRO - {p_micro:.2f} - {r_micro:.2f} - {f_micro:.2f}")
+         print(f"MACRO - {torch.mean(p_ner[1:]):.2f} - {torch.mean(r_ner[1:]):.2f} - {torch.mean(f_ner[1:]):.2f}")
+         print("")
+
+     if return_classwise:
+         return (p_micro.item(), r_micro.item(), f_micro.item()), classwise
+
+     return p_micro.item(), r_micro.item(), f_micro.item()
+
+
+ class GenerationConfig:
+
+     def __init__(self, language_model, output_path, dataset_name, cuda=False, generation_kwargs=None):
+         self.language_model = language_model
+         self.output_path = output_path
+         self.dataset_name = dataset_name
+         self.cuda = cuda
+         self.generation_kwargs = generation_kwargs if generation_kwargs is not None else {}
+
+         self.path_data = os.path.join(output_path, f"{language_model}/{dataset_name}/data.json")
+         self.path_config = os.path.join(output_path, f"{language_model}/{dataset_name}/config.json")
+
+         if not os.path.exists(os.path.join(output_path, f"{language_model}/{dataset_name}")):
+             os.makedirs(os.path.join(output_path, f"{language_model}/{dataset_name}"))
+
+
+ def conll_prompts():
+     ds = datasets.load_dataset("conll2003")["validation"]
+     dtk = Detokenizer()
+     prompts = [dtk(x["tokens"]) for x in ds]
+     ds = datasets.load_dataset("conll2003")["train"]
+     prompts += [dtk(x["tokens"]) for x in ds]
+     return prompts
+
+
+ def partition_dataset(data, split_sizes):
+     random.shuffle(data)
+
+     total_size = len(data)
+     split_points = [int(total_size * size) for size in split_sizes[:-1]]
+
+     parts = []  # renamed from `datasets` to avoid shadowing the imported module
+     start_idx = 0
+     for split_point in split_points:
+         parts.append(data[start_idx: start_idx + split_point])
+         start_idx += split_point
+     parts.append(data[start_idx:])
+
+     return parts
+
+
+ def stats(ds, keys=None):
+     mentions_total = 0
+     mentions_per_type = {}
+     if keys is not None:
+         for key in keys:
+             mentions_per_type[key] = 0
+     for sample in ds:
+         mentions_total += len(sample['mentions'])
+         for mnt in sample['mentions']:
+             tag = sample['ner_tags'][mnt[0]]
+             if tag not in mentions_per_type.keys():
+                 mentions_per_type[tag] = 0
+             mentions_per_type[tag] += 1
+     return len(mentions_per_type.keys()), mentions_total, mentions_per_type
+
+
+ def split_data(path_to_data, split_names=["train", "validation", "test"], split_sizes=[0.8, 0.1, 0.1]):
+     with open(path_to_data, 'r') as file:
+         data = json.load(file)
+
+     annotation_types = sorted(list(stats(data)[-1].keys()))
+     splits = partition_dataset(data, split_sizes)
+
+     for i, dataset in enumerate(splits):
+         ds = []
+         for x in dataset:
+             out = {}
+             out["tokens"] = x["tokens"]
+             out["ner_tags"] = [y.replace("I-", "").replace("B-", "") for y in x["ner_tags"]]
+             out["mentions"] = x["mentions"]
+             ds.append(out)
+
+         print(f"Size of dataset {split_names[i]}: {len(ds)}")
+         json.dump(ds, open(f"{path_to_data.split('.json')[0]}_{split_names[i]}.json", "w"))
+         json.dump(stats(dataset, annotation_types)[-1], open(f"{path_to_data.split('.json')[0]}_{split_names[i]}_stats.json", "w"), indent=1)
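For intuition, `create_mask_for_len` marks which (end, start) position pairs count as candidate spans: a lower-triangular mask, optionally restricted to a window of recent positions. A small check (run from the repository root):

```
import torch
from stoke.src.data.util import create_mask_for_len

print(create_mask_for_len(5, window=2).int())
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0],
#         [1, 1, 1, 0, 0],
#         [0, 1, 1, 1, 0],
#         [0, 0, 1, 1, 1]])
```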
stoke/src/playground/__init__.py ADDED
File without changes
stoke/src/playground/app.py ADDED
@@ -0,0 +1,442 @@
+ import streamlit as st
+ from transformers import AutoModelForCausalLM, AutoTokenizer, STOKEStreamer
+ from threading import Thread
+ import json
+ import torch
+ import matplotlib.pyplot as plt
+ from matplotlib.colors import to_hex
+ import numpy as np
+ import os
+ import urllib.request
+ import zipfile
+
+
+ class MLP(torch.nn.Module):
+     def __init__(self, input_dim, output_dim, hidden_dim=1024, layer_id=0, cuda=False):
+         super(MLP, self).__init__()
+         self.fc1 = torch.nn.Linear(input_dim, hidden_dim)   # input layer to hidden layer
+         self.fc3 = torch.nn.Linear(hidden_dim, output_dim)  # hidden layer to output layer
+         self.layer_id = layer_id
+         if cuda:
+             self.device = "cuda"
+         else:
+             self.device = "cpu"
+         self.to(self.device)
+
+     def forward(self, x):
+         x = torch.flatten(x, start_dim=1)
+         x = torch.relu(self.fc1(x))
+         x = self.fc3(x)
+         return torch.argmax(x, dim=-1).cpu().detach(), torch.softmax(x, dim=-1).cpu().detach()
+
+
+ def map_value_to_color(value, colormap_name='tab20c'):
+     """
+     Map a value between 0 and 1 to a CSS color using a Python colormap.
+
+     Args:
+         value (float): A value between 0 and 1.
+         colormap_name (str): The name of the colormap to use (e.g., 'viridis').
+
+     Returns:
+         str: A CSS hex color string with an alpha component appended.
+     """
+     # Ensure the value is within the range [0, 1]
+     value = np.clip(value, 0.0, 1.0)
+
+     # Get the colormap
+     colormap = plt.get_cmap(colormap_name)
+
+     # Map the value to a color
+     rgba_color = colormap(value)
+
+     # Convert the RGBA color to CSS hex format and append alpha
+     css_color = to_hex(rgba_color)
+
+     return css_color + "88"
+
+
+ @st.cache_resource
+ def get_model_and_tokenizer(name):
+     # Load pre-trained model and tokenizer
+     tok = AutoTokenizer.from_pretrained(name)
+     model = AutoModelForCausalLM.from_pretrained(name)
+     return model, tok
+
+
+ @st.cache_resource
+ def get_classifiers_for_model(att_size, emb_size, device, config_paths):
+     config = {
+         "classifier_token": json.load(open(os.path.join(config_paths["classifier_token"], "config.json"), "r")),
+         "classifier_span": json.load(open(os.path.join(config_paths["classifier_span"], "config.json"), "r"))
+     }
+
+     layer_id = config["classifier_token"]["layer"]
+
+     classifier_span = MLP(att_size, 2, hidden_dim=config["classifier_span"]["classifier_dim"]).to(device)
+     classifier_span.load_state_dict(torch.load(os.path.join(config_paths["classifier_span"], "checkpoint.pt"), map_location=device))
+
+     classifier_token = MLP(emb_size, len(config["classifier_token"]["label_map"]), layer_id=layer_id, hidden_dim=config["classifier_token"]["classifier_dim"]).to(device)
+     classifier_token.load_state_dict(torch.load(os.path.join(config_paths["classifier_token"], "checkpoint.pt"), map_location=device))
+
+     print(sum(p.numel() for p in classifier_span.parameters()), sum(p.numel() for p in classifier_token.parameters()))
+
+     return classifier_span, classifier_token, config["classifier_token"]["label_map"]
+
+
+ def get_available_models():
+     available_models = []
+     for model_name in ["gpt2", "gpt2-xl"]:
+         if os.path.isfile(f"checkpoints/{model_name}/config.json"):
+             available_models.append(model_name)
+     return available_models
+
+
+ def get_available_datasets(model_name):
+     available_datasets = []
+     config_path = f"checkpoints/{model_name}/config.json"
+     if os.path.isfile(config_path):
+         with open(config_path, "r") as f:
+             config = json.load(f)
+         # Datasets are keys in config.json
+         available_datasets = list(config.keys())
+     return available_datasets
+
+
+ def download_and_extract_zip(url, extract_dir):
+     # Determine the parent directory
+     parent_dir = os.path.split(os.path.dirname(extract_dir))[0]
+     print(parent_dir)
+
+     # Download the zip file to the parent directory
+     zip_file_path = os.path.join(parent_dir, "data.zip")
+     urllib.request.urlretrieve(url, zip_file_path)
+
+     # Extract the zip file
+     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+         zip_ref.extractall(parent_dir)
+
+     # Remove the zip file
+     os.remove(zip_file_path)
+
+
+ def find_datasets_and_model_ids(root_dir):
+     datasets = {}
+
+     # If the root directory doesn't exist, download and unpack the demo data
+     if not os.path.exists(root_dir):
+         print("Root directory doesn't exist. Downloading zip file...")
+         url = "https://drive.usercontent.google.com/download?id=1dHjH_J0zuPS-SDVrh49tMpIx5ramu_hc&export=download&authuser=0&confirm=t&uuid=4efcec77-571c-44c7-82f1-f39ddae50eb5&at=APZUnTW8g-Ab4PUT0-B9mh4jQSc-%3A1711040271924"  # Replace with your actual download URL
+         download_and_extract_zip(url, root_dir)
+         print("Zip file downloaded and unpacked successfully.")
+
+     for root, dirs, files in os.walk(root_dir):
+         if 'config.json' in files and 'stoke_config.json' in files:
+             config_path = os.path.join(root, 'config.json')
+             stoke_config_path = os.path.join(root, 'stoke_config.json')
+
+             with open(config_path, 'r') as f:
+                 config_data = json.load(f)
+             model_id = config_data.get('model_id')
+
+             if model_id:
+                 with open(stoke_config_path, 'r') as f:
+                     stoke_config_data = json.load(f)
+                 dataset_name = os.path.basename(os.path.dirname(stoke_config_path))
+                 datasets.setdefault(model_id, {})[dataset_name] = stoke_config_data
+
+     return datasets
+
+
+ # Main content
+ st.title("Playground")
+
+ # Sidebar for model and dataset selection
+ with st.sidebar:
+     st.subheader("Model and Dataset Selection")
+     datasets = find_datasets_and_model_ids("data/")
+     available_models = datasets.keys()
+     if available_models:
+         model_selection = st.selectbox("Select Model", available_models)
+     else:
+         st.error("No models available. Please check the file paths.")
+
+     # Select dataset based on selected model
+     available_datasets = datasets[model_selection]
+     if available_datasets:
+         dataset_selection = st.selectbox("Select Dataset", available_datasets)
+     else:
+         st.error("No datasets available for the selected model.")
+
+     # Select config based on selected dataset
+     available_configs = datasets[model_selection][dataset_selection]
+     if available_configs:
+         config_selection = st.selectbox("Select Config", available_configs.keys())
+     else:
+         st.error("No configs available for the selected dataset.")
+
+ # Load model and streamer based on selections
+ model, tok = get_model_and_tokenizer(model_selection)
+ if torch.cuda.is_available():
+     model.cuda()
+ classifier_span, classifier_token, label_map = get_classifiers_for_model(model.config.n_head*model.config.n_layer, model.config.n_embd, model.device, datasets[model_selection][dataset_selection][config_selection])
+ streamer = STOKEStreamer(tok, classifier_token, classifier_span)
+
+ new_tags = label_map
+
+
+ def filter_spans(spans_and_values):
+     if spans_and_values == []:
+         return [], []
+     # Create a dictionary to store spans based on their end indices
+     span_dict = {}
+
+     spans, values = [x[0] for x in spans_and_values], [x[1] for x in spans_and_values]
+
+     # Iterate through the spans and keep the highest value per end index
+     for span, value in zip(spans, values):
+         start, end = span
+         if start > end or end - start > 15 or start == 0:
+             continue
+         current_value = span_dict.get(end, None)
+
+         if current_value is None or current_value[1] < value:
+             span_dict[end] = (span, value)
+
+     if span_dict == {}:
+         return [], []
+     # Extract the filtered spans and values
+     filtered_spans, filtered_values = zip(*span_dict.values())
+
+     return list(filtered_spans), list(filtered_values)
+
+
+ def remove_overlapping_spans(spans):
+     # Sort the spans based on their end points
+     sorted_spans = sorted(spans, key=lambda x: x[0][1])
+
+     non_overlapping_spans = []
+     last_end = float('-inf')
+
+     # Iterate through the sorted spans
+     for span in sorted_spans:
+         start, end = span[0]
+         value = span[1]
+
+         # If the current span does not overlap with the previous one
+         if start >= last_end:
+             non_overlapping_spans.append(span)
+             last_end = end
+         else:
+             # If it overlaps, choose the one with the highest value
+             existing_span_index = -1
+             for i, existing_span in enumerate(non_overlapping_spans):
+                 if existing_span[0][1] <= start:
+                     existing_span_index = i
+                     break
+             if existing_span_index != -1 and non_overlapping_spans[existing_span_index][1] < value:
+                 non_overlapping_spans[existing_span_index] = span
+
+     return non_overlapping_spans
+
+
+ def generate_html_no_overlap(tokenized_text, spans):
+     current_index = 0
+     html_content = ""
+
+     for (span_start, span_end), value in spans:
+         # Add text before the span
+         html_content += "".join(tokenized_text[current_index:span_start])
+
+         # Add the span with underlining
+         html_content += "<b><u>"
+         html_content += "".join(tokenized_text[span_start:span_end])
+         html_content += "</u></b> "
+
+         current_index = span_end
+
+     # Add any remaining text after the last span
+     html_content += "".join(tokenized_text[current_index:])
+
+     return html_content
+
+
+ css = """
+ <style>
+ .highlight {
+     display: inline;
+ }
+ .highlight::after {
+     background-color: var(data-color);
+ }
+ .spanhighlight {
+     padding: 2px 5px;
+     border-radius: 5px;
+ }
+ .tooltip {
+     position: relative;
+     display: inline-block;
+ }
+
+ .tooltip::after {
+     content: attr(data-tooltip-text); /* Set content from data-tooltip-text attribute */
+     display: none;
+     position: absolute;
+     background-color: #333;
+     color: #fff;
+     padding: 5px;
+     border-radius: 5px;
+     bottom: 100%; /* Position it above the element */
+     left: 50%;
+     transform: translateX(-50%);
+     width: auto;
+     min-width: 120px;
+     margin: 0 auto;
+     text-align: center;
+ }
+
+ .tooltip:hover::after {
+     display: block; /* Show the tooltip on hover */
+ }
+
+ .small-text {
+     padding: 2px 5px;
+     background-color: white;
+     border-radius: 5px;
+     font-size: xx-small;
+     margin-left: 0.5em;
+     vertical-align: 0.2em;
+     font-weight: bold;
+     color: grey;
+ }
+ </style>"""
+
+
+ def generate_html_spanwise(token_strings, tokenwise_preds, spans, tokenizer):
+
+     # spanwise annotated text, built back to front
+     annotated = []
+     span_ends = -1
+     in_span = False
+
+     out_of_span_tokens = []
+     for i in reversed(range(len(tokenwise_preds))):
+
+         if in_span:
+             if i >= span_ends:
+                 continue
+             else:
+                 in_span = False
+
+         predicted_class = ""
+         style = ""
+
+         span = None
+         for s in spans:
+             if s[1] == i+1:
+                 span = s
+
+         if tokenwise_preds[i] != 0 and span is not None:
+             predicted_class = "highlight spanhighlight"
+             style = f"background-color: {map_value_to_color((tokenwise_preds[i]-1)/(len(new_tags)-1))}"
+             if tokenizer.convert_tokens_to_string([token_strings[i]]).startswith(" "):
+                 annotated.append("Ġ")
+
+             span_opener = f"Ġ<span class='{predicted_class}' data-tooltip-text='{new_tags[tokenwise_preds[i]]}' style='{style}'>".replace(" ", "Ġ")
+             span_end = f"<span class='small-text'>{new_tags[tokenwise_preds[i]]}</span></span>"
+             annotated.extend(out_of_span_tokens)
+             out_of_span_tokens = []
+             span_ends = span[0]
+             in_span = True
+             annotated.append(span_end)
+             annotated.extend([token_strings[x] for x in reversed(range(span[0], span[1]))])
+             annotated.append(span_opener)
+         else:
+             out_of_span_tokens.append(token_strings[i])
+
+     annotated.extend(out_of_span_tokens)
+
+     return [x for x in reversed(annotated)]
+
+
+ # Define function to generate text based on input
+ def generate_text(generation_kwargs, output_field):
+
+     # Function to generate text in a separate thread
+     def generate_async():
+         model.generate(**generation_kwargs)
+
+     # Start text generation in a separate thread
+     thread = Thread(target=generate_async)
+     thread.start()
+
+     # Display generated text as it becomes available
+     text_tokenwise = ""
+     text_spans = ""
+     removed_spans = ""
+     tags = []
+     spans = []
+     for new_text in streamer:
+         if new_text[1] is not None and new_text[2] != ['']:
+             text_tokenwise = ""
+             tags.extend(new_text[1])
+             spans.extend(new_text[-1])
+
+             # Tokenwise classification
+             for tk, pred in zip(new_text[2], tags):
+                 if pred != 0:
+                     style = f"background-color: {map_value_to_color((pred-1)/(len(new_tags)-1))}"
+                     if tk.startswith(" "):
+                         text_tokenwise += " "
+                     text_tokenwise += f"<span class='tooltip highlight' data-tooltip-text='{new_tags[pred]}' style='{style}'>{tk}</span>"
+                 else:
+                     text_tokenwise += tk
+
+             # Span classification
+             text_spans = ""
+             if len(spans) > 0:
+                 filtered_spans = remove_overlapping_spans(spans)
+                 text_spans = generate_html_no_overlap(new_text[2], filtered_spans)
+                 if len(spans) - len(filtered_spans) > 0:
+                     removed_spans = f"{len(spans) - len(filtered_spans)} span(s) hidden due to overlap."
+             else:
+                 for tk in new_text[2]:
+                     text_spans += f"{tk}"
+
+             # Spanwise classification
+             annotated_tokens = generate_html_spanwise(new_text[2], tags, [x for x in filter_spans(spans)[0]], tok)
+             generated_text_spanwise = tok.convert_tokens_to_string(annotated_tokens).replace("<|endoftext|>", "")
+
+             output_field.empty()
+             output = f"{css}"
+             output += generated_text_spanwise.replace("\n", " ").replace("$", "\\$") + "\n<br>"
+             output += "<details><summary>Show tokenwise classification</summary>\n" + text_tokenwise.replace("\n", " ").replace("$", "\\$")
+             if removed_spans != "":
+                 output += f"<br><br><i>({removed_spans})</i>"
+             output += "</details>"
+             output_field.write(output, unsafe_allow_html=True)
+
+
+ # Input field
+ input_text = st.text_area("Enter prompt for completion", "")
+
+ # Sidebar for customizing generation parameters
+ with st.sidebar:
+     st.subheader("Generation Parameters")
+     max_new_tokens = st.slider("Max New Tokens", min_value=1, max_value=100, value=30)
+     repetition_penalty = st.slider("Repetition Penalty", min_value=1.0, max_value=2.0, value=1.2)
+     do_sample = st.checkbox("Do Sample", value=True)
+     temperature = st.slider("Temperature", min_value=0.1, max_value=2.0, value=1.0)
+     top_p = st.slider("Top-p", min_value=0.1, max_value=1.0, value=0.3)
+     top_k = st.slider("Top-k", min_value=10, max_value=100, value=50)
+     typical_p = st.slider("Typical P", min_value=0.1, max_value=1.0, value=1.0)
+
+ # Button to generate text
+ if st.button("Generate"):
+     if input_text:
+         output_field = st.empty()
+         inputs = tok([" " + input_text], return_tensors="pt").to(model.device)
+         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens,
+                                  repetition_penalty=repetition_penalty, temperature=temperature,
+                                  top_p=top_p, top_k=top_k, do_sample=do_sample, typical_p=typical_p)
+         generate_text(generation_kwargs, output_field)
+     else:
+         st.warning("Please enter some text first.")
stoke/src/selection/__init__.py ADDED
File without changes
stoke/src/selection/simple.py ADDED
@@ -0,0 +1,60 @@
+ import json
+ import os
+
+
+ def find_best_checkpoint(path):
+     checkpoints_path = os.path.join(path, "checkpoints")
+     token_classifier_path = os.path.join(checkpoints_path, "token_classifier")
+     span_classifier_path = os.path.join(checkpoints_path, "span_classifier")
+
+     best_token_checkpoint = find_best_checkpoint_in_folder(token_classifier_path)
+     best_span_checkpoint = find_best_checkpoint_in_folder(span_classifier_path)
+
+     return best_token_checkpoint, best_span_checkpoint
+
+
+ def find_best_checkpoint_in_folder(folder_path):
+     best_checkpoint = None
+     best_f1_validation = -1
+
+     for subfolder in os.listdir(folder_path):
+         subfolder_path = os.path.join(folder_path, subfolder)
+         config_path = os.path.join(subfolder_path, "config.json")
+         checkpoint_path = os.path.join(subfolder_path, "checkpoint.pt")
+
+         if os.path.exists(config_path) and os.path.exists(checkpoint_path):
+             with open(config_path, 'r') as config_file:
+                 config_data = json.load(config_file)
+             if "best_f1_validation" in config_data:
+                 f1_validation = config_data["best_f1_validation"]
+                 if f1_validation > best_f1_validation:
+                     best_f1_validation = f1_validation
+                     best_checkpoint = subfolder_path
+
+     return best_checkpoint
+
+
+ def create_config_for_path(path, name="default"):
+
+     best_token_checkpoint, best_span_checkpoint = find_best_checkpoint(path)
+
+     print("Best token classifier checkpoint:", best_token_checkpoint)
+     print("Best span classifier checkpoint:", best_span_checkpoint)
+
+     config = {
+         "classifier_token": best_token_checkpoint,
+         "classifier_span": best_span_checkpoint
+     }
+
+     configs_path = os.path.join(path, "stoke_config.json")
+
+     if os.path.exists(configs_path):
+         with open(configs_path, 'r') as configs_file:
+             existing_configs = json.load(configs_file)
+     else:
+         existing_configs = {}
+
+     existing_configs[name] = config
+
+     with open(configs_path, 'w') as configs_file:
+         json.dump(existing_configs, configs_file, indent=4)
+
+     print(f"Config '{name}' saved successfully.")
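The resulting `stoke_config.json` maps each config name to the two selected checkpoint directories; the playground reads it via `find_datasets_and_model_ids`. An illustrative example (paths and identifiers are invented):

```
{
    "basic": {
        "classifier_token": "data/gpt2/test/checkpoints/token_classifier/aB3xY9QwZ1",
        "classifier_span": "data/gpt2/test/checkpoints/span_classifier/Kp2Lm8Rt0V"
    }
}
```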
stoke/src/trainer/__init__.py ADDED
File without changes
stoke/src/trainer/trainer.py ADDED
@@ -0,0 +1,284 @@
+ from .util import TrainConfig
+ from ..data.util import JSONDataset, collate_function_with_label_map, print_metric
+ from ..classifier.probes import MLPProbe as MLP
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+ from torch.utils.data import DataLoader
+ from torcheval.metrics import MulticlassF1Score
+ from transformers.optimization import get_linear_schedule_with_warmup
+ from torch.optim import AdamW
+ import json
+ import os
+ import string
+ import random
+ from tqdm import tqdm
+
+
+ class Trainer:
+
+     def __init__(self, config: TrainConfig):
+         self.config = config
+         self._load_model()
+         self._load_data()
+         self._load_probes_and_optimizers()
+         print("Trainer is ready.")
+
+     def _load_model(self):
+         "Loads language model and tokenizer"
+         print(f"Loading model '{self.config.config_dataset['model_id']}'")
+
+         # check if a custom huggingface cache was selected
+         kwds = {}
+         if self.config.hfcache != "":
+             kwds["cache_dir"] = self.config.hfcache
+
+         # load model and tokenizer
+         self.model = AutoModelForCausalLM.from_pretrained(self.config.config_dataset['model_id'], output_attentions=True, output_hidden_states=True, return_dict=True, device_map="auto", **kwds).half()
+         self.tokenizer = AutoTokenizer.from_pretrained(self.config.config_dataset['model_id'], use_fast=True, **kwds)
+
+         if self.config.cuda:
+             self.model.cuda()
+
+         print("model and tokenizer loaded")
+
+     def _load_data(self):
+         "Loads datasets"
+         def collate_function(batch):
+             return collate_function_with_label_map(batch, self.config.label_map)
+
+         datasets = {}
+         self.dataloaders = {}
+         num_classes = None
+         self.config.label_map = []
+         for split in self.config.splits:
+             datasets[split] = JSONDataset(os.path.join(self.config.path, f"data_{split}.json"))
+             shuffle = False
+             if split == "train":
+                 shuffle = True
+             self.dataloaders[split] = DataLoader(datasets[split], batch_size=self.config.batch_size, shuffle=shuffle, collate_fn=collate_function)
+
+             dataset_classes = json.load(open(os.path.join(self.config.path, f"data_{split}_stats.json"), "r"))
+             if num_classes is None:
+                 num_classes = len(dataset_classes.keys())
+                 self.config.label_map = ["O"] + list(dataset_classes.keys())
+             else:
+                 assert len(dataset_classes.keys()) == num_classes
+             print(f"Loaded {split} dataset with {len(datasets[split])} samples and {num_classes} classes")
+
+     def _load_probes_and_optimizers(self):
+         "Loads probes and optimizers."
+         print("Preparing probes and optimizers")
+         n_layers = self.model.config.num_hidden_layers
+         n_heads = self.model.config.num_attention_heads
+         dim_hidden = self.model.config.hidden_size
+
+         print(f"Model has {n_layers} layers, hidden state size {dim_hidden}, and {n_heads} attention heads per layer")
+
+         if self.config.layers is None:
+             self.config.layers = [x for x in range(n_layers)]
+
+         print(f"Training tokenwise classifiers for {len(self.config.layers)} layer(s), {len(self.config.learning_rates)} learning rate(s), {len(self.config.classifier_dims)} hidden dim(s).")
+
+         self.classifier_device = "cpu"
+         if self.config.cuda:
+             self.classifier_device = "cuda"
+
+         if self.config.balance_loss:
+             # count class frequencies over the raw dataset samples
+             # (collated batches no longer carry 'ner_tags')
+             class_frequency = [0 for _ in self.config.label_map]
+             for sample in self.dataloaders["train"].dataset:
+                 labels = [self.config.label_map.index(x) for x in sample['ner_tags']]
+                 for x in labels:
+                     class_frequency[x] += 1
+
+             class_weights = [sum(class_frequency)/x for x in class_frequency]
+         else:
+             class_weights = [1.0 for _ in self.config.label_map]
+
+         self.token_classifiers = []
+         for layer in self.config.layers:
+             for lr in self.config.learning_rates:
+                 for dim_c in self.config.classifier_dims:
+                     # set up classifier, optimizer, scheduler, and config
+                     _classifier = MLP(dim_hidden, len(self.config.label_map), hidden_dim=dim_c, cuda=self.config.cuda)
+                     _optimizer = AdamW(_classifier.parameters(), lr=lr, eps=1e-6)
+                     _scheduler = get_linear_schedule_with_warmup(_optimizer, self.config.n_steps_per_epoch, self.config.n_epochs*self.config.n_steps_per_epoch)
+                     _config = {
+                         "layer": layer,
+                         "model": self.config.config_dataset['model_id'],
+                         "type": "token_classifier",
+                         "label_map": self.config.label_map,
+                         "learning_rate": lr,
+                         "classifier_dim": dim_c,
+                         "loss_weights": class_weights,
+                         "identifier": ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10)),
+                         "best_f1_validation": -1,
+                         "best_f1_validation_classwise": 0,
+                     }
+                     self.token_classifiers.append({
+                         "config_train": self.config,
+                         "config": _config,
+                         "classifier": _classifier,
+                         "optimizer": _optimizer,
+                         "lr_scheduler": _scheduler,
+                         "metric": MulticlassF1Score(num_classes=len(self.config.label_map), average=None, device=_classifier.device),
+                         "criterion": torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(self.classifier_device))
+                     })
+
+         print(f"Total tokenwise classifiers: {len(self.token_classifiers)}")
+
+         print(f"Training span detectors for {len(self.config.learning_rates)} learning rate(s), {len(self.config.loss_weights_span)} loss weight(s), {len(self.config.classifier_dims)} hidden dim(s).")
+
+         self.span_classifiers = []
+         for lr in self.config.learning_rates:
+             for dim_c in self.config.classifier_dims:
+                 for loss_weight in self.config.loss_weights_span:
+                     # set up classifier, optimizer, scheduler, and config
+                     _classifier = MLP(n_layers*n_heads, 2, hidden_dim=dim_c, cuda=self.config.cuda)
+                     _optimizer = AdamW(_classifier.parameters(), lr=lr, eps=1e-6)
+                     _scheduler = get_linear_schedule_with_warmup(_optimizer, self.config.n_steps_per_epoch, self.config.n_epochs*self.config.n_steps_per_epoch)
+                     _config = {
+                         "model": self.config.config_dataset['model_id'],
+                         "type": "span_classifier",
+                         "label_map": ["no_span", "span"],
+                         "learning_rate": lr,
+                         "classifier_dim": dim_c,
+                         "loss_weights": loss_weight,
+                         "identifier": ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10)),
+                         "best_f1_validation": -1,
+                         "best_f1_validation_classwise": 0,
+                     }
+                     self.span_classifiers.append({
+                         "config_train": self.config,
+                         "config": _config,
+                         "classifier": _classifier,
+                         "optimizer": _optimizer,
+                         "lr_scheduler": _scheduler,
+                         "metric": MulticlassF1Score(num_classes=2, average=None, device=_classifier.device),
+                         "criterion": torch.nn.CrossEntropyLoss(weight=torch.tensor(loss_weight).to(self.classifier_device))
+                     })
+         print(f"Total span detectors: {len(self.span_classifiers)}")
+
+     def train(self):
+         data_iter_train = iter(self.dataloaders["train"])
+
+         self.best_f1 = {"token_classifier": -1, "span_classifier": -1}
+         self.best_config = {"token_classifier": None, "span_classifier": None}
+
+         for epoch in range(self.config.n_epochs):
+
+             # TRAIN
+             for item in self.token_classifiers + self.span_classifiers:
+                 item['classifier'].train()
+                 item['metric'].reset()
+
+             for step in tqdm(range(self.config.n_steps_per_epoch)):
+
+                 # Get data
+                 try:
+                     sample = next(data_iter_train)
+                 except StopIteration:
+                     data_iter_train = iter(self.dataloaders["train"])
+                     sample = next(data_iter_train)
+
+                 with torch.no_grad():
+                     input_ids = sample['input_ids']
+                     labels_tokens = sample['labels_tokens']
+                     labels_spans = sample['labels_spans']
+
+                     outputs = self.model(input_ids.to(self.model.device), output_hidden_states=True, output_attentions=True)
+
+                     hidden_states = {}
+                     for layer in self.config.layers:
+                         hidden_states[layer] = outputs.hidden_states[layer].to(self.classifier_device)
+
+                     # get attentions and labels
+                     attentions = torch.stack(outputs.attentions).swapaxes(0, 1)
+                     attentions = attentions.reshape(attentions.size(0), -1, attentions.size(-2), attentions.size(-1)).permute(0, 2, 3, 1)
+                     attentions = torch.masked_select(attentions, sample['mask_spans'].to(self.classifier_device)).view(-1, attentions.size(-1))
+
+                 # training step for each classifier
+                 for item in self.span_classifiers + self.token_classifiers:
+                     if item['config']['type'] == "span_classifier":
+                         _preds = item['classifier'](attentions.to(item['classifier'].fc1.weight.dtype).to(self.classifier_device))
+                         _labels = labels_spans.to(self.classifier_device)
+                     elif item['config']['type'] == "token_classifier":
+                         _preds = item['classifier'](hidden_states[item['config']['layer']].to(item['classifier'].fc1.weight.dtype))
+                         _preds = torch.masked_select(_preds, sample['mask_tokens'].to(self.classifier_device))
+                         _labels = labels_tokens.to(self.classifier_device)
+
+                     loss = item['criterion'](_preds.view(-1, len(item['config']['label_map'])), _labels.view(-1))
+                     item['metric'].update(_preds.view(-1, len(item['config']['label_map'])), _labels.view(-1))
+
+                     item['optimizer'].zero_grad(set_to_none=True)
+                     loss.backward()
+                     item['optimizer'].step()
+                     item['lr_scheduler'].step()
+
+             hidden_states = {}
+             attentions = None
+
+             # EVAL
+             for item in self.span_classifiers + self.token_classifiers:
+                 item['classifier'].eval()
+                 item['metric'].reset()
+
+             with torch.no_grad():
+
+                 for sample in tqdm(self.dataloaders["validation"]):
+                     input_ids = sample['input_ids']
+                     labels_tokens = sample['labels_tokens']
+                     labels_spans = sample['labels_spans']
+
+                     # language model forward pass
+                     outputs = self.model(input_ids.to(self.model.device), output_hidden_states=True, output_attentions=True)
+
+                     # get internal representations into correct shapes
+                     for layer in self.config.layers:
+                         hidden_states[layer] = outputs.hidden_states[layer].to(self.classifier_device)
+                     attentions = torch.stack(outputs.attentions).swapaxes(0, 1)
+                     attentions = attentions.reshape(attentions.size(0), -1, attentions.size(-2), attentions.size(-1)).permute(0, 2, 3, 1)
+                     attentions = torch.masked_select(attentions, sample['mask_spans'].to(self.classifier_device)).view(-1, attentions.size(-1))
+
+                     # classifier inference
+                     for item in self.span_classifiers + self.token_classifiers:
+                         if item['config']['type'] == "span_classifier":
+                             _preds = item['classifier'](attentions.to(item['classifier'].fc1.weight.dtype).to(self.classifier_device))
+                             _labels = labels_spans.to(self.classifier_device)
+                         elif item['config']['type'] == "token_classifier":
+                             _preds = item['classifier'](hidden_states[item['config']['layer']].to(item['classifier'].fc1.weight.dtype))
+                             _preds = torch.masked_select(_preds, sample['mask_tokens'].to(self.classifier_device))
+                             _labels = labels_tokens.to(self.classifier_device)
+                         item['metric'].update(_preds.view(-1, len(item['config']['label_map'])), _labels.view(-1))
+
+             # logging and saving of checkpoints
+             for item in self.span_classifiers + self.token_classifiers:
+                 (p_micro, r_micro, f_micro), classwise = print_metric(item['metric'], item['config']['label_map'], return_classwise=True, verbose=False)
+                 if f_micro > item['config']['best_f1_validation']:
+                     item['config']['best_f1_validation'] = f_micro
+                     item['config']['best_f1_validation_classwise'] = classwise
+
+                     ckp_path = os.path.join(self.config.checkpoint_path, f"{item['config']['type']}/{item['config']['identifier']}/")
+                     os.makedirs(ckp_path, exist_ok=True)
+                     torch.save(item['classifier'].state_dict(), os.path.join(ckp_path, "checkpoint.pt"))
+                     json.dump(item['config'], open(os.path.join(ckp_path, "config.json"), "w"), indent=1)
+                     json.dump(item['config_train'].to_dict(), open(os.path.join(ckp_path, "config_train.json"), "w"), indent=1)
+
+                     if f_micro > self.best_f1[item['config']['type']]:
+                         self.best_f1[item['config']['type']] = f_micro
+                         self.best_config[item['config']['type']] = item['config']
+
+             # print current best for each classifier type
+             for clf_type in self.best_config.keys():
+                 print(f"--- Best {clf_type} config after epoch {epoch+1} ---")
+                 if self.best_config[clf_type] is not None:
+                     for key, value in self.best_config[clf_type].items():
+                         print(key, value)
+         return self.best_config
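The span features above are built by stacking the per-layer attention maps and flattening layers and heads into one feature axis, so every (query, key) position pair gets a `layers*heads`-dimensional vector. A shape walkthrough under assumed dimensions (2 layers, 3 heads, sequence length 5):

```
import torch

batch, layers, heads, seq = 1, 2, 3, 5
# outputs.attentions is a tuple of per-layer tensors of shape (batch, heads, seq, seq)
attentions = torch.stack([torch.rand(batch, heads, seq, seq) for _ in range(layers)])
attentions = attentions.swapaxes(0, 1)                # (batch, layers, heads, seq, seq)
attentions = attentions.reshape(batch, -1, seq, seq)  # (batch, layers*heads, seq, seq)
attentions = attentions.permute(0, 2, 3, 1)           # (batch, seq, seq, layers*heads)
print(attentions.shape)                               # torch.Size([1, 5, 5, 6])
```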
stoke/src/trainer/util.py ADDED
@@ -0,0 +1,44 @@
+ import os
+ import time
+ import json
+
+
+ class TrainConfig:
+
+     def __init__(self, path, splits=['train', 'validation'],
+                  layers=[9, 10, 11], hfcache='', classifier_dims=[4096], learning_rates=[1e-4],
+                  cuda=False, n_steps_per_epoch=1000, n_epochs=2, batch_size=8, balance_loss=False,
+                  loss_weights_span=[[1.0, 1.0], [1.0, 50.0], [1.0, 100.0]]):
+         self.path = path
+         self.checkpoint_path = os.path.join(self.path, "checkpoints/")
+         self.splits = splits
+         self.layers = layers
+         self.hfcache = hfcache
+         self.classifier_dims = classifier_dims
+         self.learning_rates = learning_rates
+         self.cuda = cuda
+         self.n_steps_per_epoch = n_steps_per_epoch
+         self.n_epochs = n_epochs
+         self.batch_size = batch_size
+         self.balance_loss = balance_loss
+         self.loss_weights_span = loss_weights_span
+         self.time = time.time()
+         self.config_dataset = json.load(open(os.path.join(path, "config.json"), "r"))
+
+     def to_dict(self):
+         return {
+             "path": self.path,
+             "splits": self.splits,
+             "layers": self.layers,
+             "hfcache": self.hfcache,
+             "classifier_dims": self.classifier_dims,
+             "learning_rates": self.learning_rates,
+             "cuda": self.cuda,
+             "n_steps_per_epoch": self.n_steps_per_epoch,
+             "n_epochs": self.n_epochs,
+             "batch_size": self.batch_size,
+             "balance_loss": self.balance_loss,
+             "loss_weights_span": self.loss_weights_span,
+             "time": self.time,
+             "config_dataset": self.config_dataset
+         }
stoke/tests/__init__.py ADDED
File without changes