# note-ner-demo / process.py
# Repository page header (avatar caption): andrewgleave's picture
# Latest commit: "Add case to output" (cfb3ccc)
import argparse
import csv
import json
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
# Identifier of the pretrained token-classification (NER) checkpoint that the
# tokenizer and model below are loaded from.
MODEL = "d4data/biomedical-ner-all"
# Both objects are created at import time, so importing this module triggers
# the (potentially slow) load as a side effect.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)
# Shared NER pipeline used by process(). With aggregation_strategy="simple"
# each result carries an "entity_group" key (read by process()) instead of
# per-token labels.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def process(*args):
    """Run the NER pipeline over a CSV of notes and write the results as JSON.

    Every row of the input CSV must provide the columns ``text``, ``score``,
    ``student_id`` and ``case``. The output file holds a JSON list with one
    object per row: the row's ``score``, ``student_id`` and ``case`` copied
    through, plus an ``entities`` list describing what the module-level
    ``pipe`` found in ``text``. The note text itself is not written out.

    Args:
        *args: Command-line style arguments, e.g.
            ``"--notes", "notes.csv", "--out", "entities.json"``. When no
            arguments are passed, ``sys.argv[1:]`` is parsed instead.

    Raises:
        ValueError: If the notes path does not end in ``.csv`` or the output
            path does not end in ``.json``.
        KeyError: If a row lacks one of the required columns.
        SystemExit: Raised by ``argparse`` when a required option is missing
            or an unknown option is supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--notes', help='Notes CSV', required=True)
    parser.add_argument('--out', help='Output', required=True)
    # Parse the arguments the caller actually passed. ``None`` makes argparse
    # fall back to sys.argv[1:], which keeps a bare ``process()`` call working.
    options = parser.parse_args(args or None)

    filepath = options.notes
    outpath = options.out
    if not filepath.endswith(".csv"):
        raise ValueError("Filepath must be a .csv file.")
    if not outpath.endswith(".json"):
        raise ValueError("Output path must be a .json file.")

    processed = []
    # newline="" lets the csv module handle line endings inside quoted fields
    # itself; "utf-8-sig" reads plain UTF-8 and also strips a leading BOM that
    # would otherwise become part of the first column name.
    with open(filepath, "r", newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            raw = pipe(row["text"])
            processed.append(
                {
                    "score": row["score"],
                    "student_id": row["student_id"],
                    "case": row["case"],
                    "entities": [
                        {
                            "entity": x["entity_group"],
                            "word": x["word"],
                            # float() yields a built-in float before rounding;
                            # presumably the pipeline returns NumPy scalars,
                            # which json cannot serialize -- TODO confirm.
                            "score": round(float(x["score"]), 2),
                            "start": x["start"],
                            "end": x["end"],
                        }
                        for x in raw
                    ],
                }
            )

    with open(outpath, "w", encoding="utf-8") as f:
        json.dump(processed, f)
# Script entry point: hand the command-line arguments (without the program
# name) to process() as positional arguments.
if __name__ == "__main__":
    import sys

    cli_arguments = sys.argv[1:]
    process(*cli_arguments)