import glob
import json
import os
import random

import torch
from tqdm import tqdm


def open_content(path):
    # Load the train/dev/test splits and the label set from the JSON files
    # found in the dataset directory.
    paths = glob.glob(os.path.join(path, "*.json"))
    train, dev, test, labels = None, None, None, None
    for p in paths:
        if "train" in p:
            with open(p, "r") as f:
                train = json.load(f)
        elif "dev" in p:
            with open(p, "r") as f:
                dev = json.load(f)
        elif "test" in p:
            with open(p, "r") as f:
                test = json.load(f)
        elif "labels" in p:
            with open(p, "r") as f:
                labels = json.load(f)
    return train, dev, test, labels
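# Illustrative directory layout expected by `open_content` (file names only
# need to contain the split name; "conll2003" is a placeholder):
#
#   data/conll2003/train.json
#   data/conll2003/dev.json
#   data/conll2003/test.json
#   data/conll2003/labels.json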


def process(data):
    # Convert a character-level annotated sentence into a word-level sample.
    words = data['sentence'].split()
    entities = []  # list of (start_word, end_word, type) spans

    for entity in data['entities']:
        start_char, end_char = entity['pos']

        # Word indices of the entity span, recovered from the character
        # offsets below.
        start_word = None
        end_word = None

        # Walk through the words, tracking the running character offset;
        # this assumes the sentence is single-space separated.
        char_count = 0
        for i, word in enumerate(words):
            word_length = len(word)
            if char_count == start_char:
                start_word = i
            if char_count + word_length == end_char:
                end_word = i
                break
            char_count += word_length + 1  # +1 for the space

        entities.append((start_word, end_word, entity['type']))

    sample = {
        "tokenized_text": words,
        "ner": entities
    }

    return sample
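# Illustration of the alignment above on a hypothetical input:
#
#   process({"sentence": "Barack Obama visited Paris",
#            "entities": [{"pos": [0, 12], "type": "person"}]})
#   -> {"tokenized_text": ["Barack", "Obama", "visited", "Paris"],
#       "ner": [(0, 1, "person")]}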


def create_dataset(path):
    # Build word-level datasets for each split, plus the label set.
    train, dev, test, labels = open_content(path)
    train_dataset = [process(data) for data in train]
    dev_dataset = [process(data) for data in dev]
    test_dataset = [process(data) for data in test]
    return train_dataset, dev_dataset, test_dataset, labels


@torch.no_grad()
def get_for_one_path(path, model):
    # Evaluate the model on the test split of a single dataset directory.
    _, _, test_dataset, entity_types = create_dataset(path)

    data_name = path.split("/")[-1]  # dataset directory name, e.g. "CrossNER_AI"

    # ACE-, GENIA- and Corpus-style datasets contain nested entities, so
    # allow non-flat predictions for them.
    flat_ner = True
    if any(i in data_name for i in ["ACE", "GENIA", "Corpus"]):
        flat_ner = False

    results, f1 = model.evaluate(test_dataset, flat_ner=flat_ner, threshold=0.5,
                                 batch_size=12, entity_types=entity_types)
    return data_name, results, f1


def get_for_all_path(model, steps, log_dir, data_paths):
    # Evaluate the model on every dataset directory under `data_paths`,
    # appending per-dataset results and summary tables to the log files.
    all_paths = sorted(glob.glob(f"{data_paths}/*"))

    # Put the model in evaluation mode on its current device.
    device = next(model.parameters()).device
    model.to(device)
    model.eval()

    save_path = os.path.join(log_dir, "results.txt")

    with open(save_path, "a") as f:
        f.write("##############################################\n")
        f.write("step: " + str(steps) + "\n")

    # Datasets held out for zero-shot evaluation.
    zero_shot_benc = ["mit-movie", "mit-restaurant", "CrossNER_AI", "CrossNER_literature",
                      "CrossNER_music", "CrossNER_politics", "CrossNER_science"]

    zero_shot_benc_results = {}
    all_results = {}

    for p in tqdm(all_paths):
        if "sample_" not in p:  # "sample_" paths are not evaluation datasets
            data_name, results, f1 = get_for_one_path(p, model)

            with open(save_path, "a") as f:
                f.write(data_name + "\n")
                f.write(str(results) + "\n")

            if data_name in zero_shot_benc:
                zero_shot_benc_results[data_name] = f1
            else:
                all_results[data_name] = f1

    avg_all = sum(all_results.values()) / len(all_results)
    avg_zs = sum(zero_shot_benc_results.values()) / len(zero_shot_benc_results)

    save_path_table = os.path.join(log_dir, "tables.txt")

    # F1 table for the supervised datasets.
    table_bench_all = ""
    for k, v in all_results.items():
        table_bench_all += f"{k:20}: {v:.1%}\n"
    table_bench_all += f"{'Average':20}: {avg_all:.1%}"

    # F1 table for the zero-shot benchmark.
    table_bench_zeroshot = ""
    for k, v in zero_shot_benc_results.items():
        table_bench_zeroshot += f"{k:20}: {v:.1%}\n"
    table_bench_zeroshot += f"{'Average':20}: {avg_zs:.1%}"

    with open(save_path_table, "a") as f:
        f.write("##############################################\n")
        f.write("step: " + str(steps) + "\n")
        f.write("Table for all datasets except the zero-shot benchmark\n")
        f.write(table_bench_all + "\n\n")
        f.write("Table for zero-shot benchmark\n")
        f.write(table_bench_zeroshot + "\n")
        f.write("##############################################\n\n")
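# Each call appends a block like this to tables.txt (names and scores are
# illustrative):
#
#   ##############################################
#   step: 1000
#   Table for all datasets except the zero-shot benchmark
#   conll2003           : 91.2%
#   Average             : 91.2%
#
#   Table for zero-shot benchmark
#   CrossNER_AI         : 57.3%
#   Average             : 57.3%
#   ##############################################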


def sample_train_data(data_paths, sample_size=10000):
    # Build a mixed training set: up to `sample_size` shuffled examples from
    # every dataset that is not held out for zero-shot evaluation.
    all_paths = sorted(glob.glob(f"{data_paths}/*"))

    # Held-out datasets that must not leak into training.
    zero_shot_benc = ["CrossNER_AI", "CrossNER_literature", "CrossNER_music",
                      "CrossNER_politics", "CrossNER_science", "ACE 2004"]

    new_train = []

    for p in tqdm(all_paths):
        if any(i in p for i in zero_shot_benc):
            continue
        train, dev, test, labels = create_dataset(p)

        # Attach the dataset's label set to each training example.
        for i in range(len(train)):
            train[i]["label"] = labels

        random.shuffle(train)
        train = train[:sample_size]
        new_train.extend(train)

    return new_train
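

if __name__ == "__main__":
    # Minimal usage sketch: assumes the GLiNER package is installed and that
    # the checkpoint named below exists; any model exposing the `evaluate`
    # call used above works. "logs" and "data" are placeholder paths.
    from gliner import GLiNER

    model = GLiNER.from_pretrained("urchade/gliner_base")  # assumed checkpoint
    get_for_all_path(model, steps=0, log_dir="logs", data_paths="data")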