|
import pandas as pd |
|
import csv |
|
|
|
from datasets import Dataset, DatasetDict |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
# --- Configuration ---
model_name = 't5-literary-coreference'  # fine-tuned seq2seq checkpoint loaded below

device = 'cuda'  # assumes a CUDA GPU is available -- TODO confirm / add 'cpu' fallback

print("Loading in data")

# Input CSV is expected to have "input" and "output" text columns -- the
# preprocessing and generation steps below index those keys.
df = pd.read_csv('example_input.csv')

# Shuffle the rows (unseeded, so order differs between runs).
df = df.sample(frac=1)

# preserve_index=False: after the shuffle the pandas index is scrambled;
# without this flag Dataset.from_pandas() would carry it along as a stray
# "__index_level_0__" column in the dataset.
to_annotate = Dataset.from_pandas(df, preserve_index=False)

speech_excerpts = DatasetDict({"annotate": to_annotate})
|
|
|
print("Loading models")

# Tokenizer comes from the base "t5-3b" checkpoint (the fine-tuned model
# named above presumably shares its vocabulary -- TODO confirm).
# model_max_length=500 caps sequence length for downstream truncation.
tokenizer = AutoTokenizer.from_pretrained("t5-3b", model_max_length=500)
|
|
|
def preprocess_function(examples, input_text="input", output_text="output", max_length=500):
    """Tokenize a batch of examples for seq2seq use.

    Args:
        examples: batch dict (from ``datasets.map``) with text columns.
        input_text: name of the source-text column (default ``"input"``).
        output_text: name of the target-text column (default ``"output"``).
        max_length: truncation length for both source and target tokens;
            defaults to 500 to match the tokenizer's ``model_max_length``
            (previously hard-coded in two places).

    Returns:
        The tokenized inputs dict, with a ``"labels"`` key holding the
        tokenized targets' ``input_ids``.
    """
    model_inputs = tokenizer(examples[input_text], max_length=max_length, truncation=True)

    # NOTE(review): targets are tokenized with a plain tokenizer call; for
    # T5 this matches the recommended `text_target=` path -- confirm if the
    # tokenizer is ever swapped for one with special target handling.
    targets = tokenizer(examples[output_text], max_length=max_length, truncation=True)

    model_inputs["labels"] = targets["input_ids"]

    return model_inputs
|
|
|
# Batched tokenization over every split in the DatasetDict.
# NOTE(review): tokenized_speech_excerpts is never used below -- the
# generation loop re-tokenizes each example itself; confirm whether this
# map (and its cost) is still needed.
tokenized_speech_excerpts = speech_excerpts.map(preprocess_function, batched=True)

# Load the fine-tuned seq2seq model and move it to the target device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device=device)
|
|
|
print("Begin creating annotations")

# Output CSV schema: original text plus the model's decoded generation.
header = ["input", "model_output"]

rows = []

# Annotate one example at a time (batch size 1 -- fine for small inputs).
for item in speech_excerpts["annotate"]:
    # Truncate to the 500-token limit. The original call had no
    # truncation, so over-length inputs could exceed the model's maximum
    # sequence length (the tokenizer only warns at model_max_length).
    input_ids = tokenizer(
        item["input"], return_tensors="pt", truncation=True, max_length=500
    ).input_ids

    result = model.generate(input_ids.to(device=device), max_length=500)

    # result[0]: the single sequence in this batch of one.
    rows.append([item["input"], tokenizer.decode(result[0], skip_special_tokens=True)])
|
|
|
# Write the annotations. newline="" is required by the csv module so row
# terminators are not mangled (doubled blank lines on Windows); the with
# block guarantees the file is flushed and closed even if a write fails
# (the original open()/close() pair leaked the handle on error).
with open("results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

print("Finished")
|
|