"""Run biomedical NER over clinical notes in a CSV and write entities to JSON.

Input CSV is expected to have at least the columns: text, score,
student_id, case (assumption based on the keys read below — confirm
against the producing pipeline).
"""

import argparse
import csv
import json
import sys

from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Pre-trained biomedical NER model. Loaded once at import time so repeated
# calls to process() reuse the same pipeline instead of re-downloading.
MODEL = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


def process(*args):
    """Parse CLI arguments, run NER over each note row, and write JSON output.

    Args:
        *args: Optional argument strings (e.g. ``sys.argv[1:]``). When no
            arguments are given, argparse falls back to ``sys.argv`` as before.

    Raises:
        ValueError: If ``--notes`` is not a ``.csv`` path or ``--out`` is not
            a ``.json`` path.
    """
    parser = argparse.ArgumentParser(description="Biomedical NER over a notes CSV")
    parser.add_argument('--notes', help='Notes CSV', required=True)
    parser.add_argument('--out', help='Output', required=True)
    # BUG FIX: the original discarded *args (it was immediately shadowed by
    # parser.parse_args(), which always reads sys.argv). Forward the supplied
    # arguments so process() is also callable programmatically; passing None
    # preserves the old sys.argv fallback when called with no arguments.
    parsed = parser.parse_args(list(args) or None)
    filepath = parsed.notes
    outpath = parsed.out

    if not filepath.endswith(".csv"):
        raise ValueError("Filepath must be a .csv file.")
    if not outpath.endswith(".json"):
        raise ValueError("Output path must be a .json file.")

    processed = []
    # newline="" is the documented requirement for files handed to the csv
    # module; explicit encoding avoids platform-dependent defaults.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            raw = pipe(row["text"])
            processed.append({
                "score": row["score"],
                "student_id": row["student_id"],
                "case": row["case"],
                "entities": [
                    {
                        "entity": ent["entity_group"],
                        "word": ent["word"],
                        # pipeline scores are numpy float32 — not JSON
                        # serializable — so convert and round for output.
                        "score": round(float(ent["score"]), 2),
                        "start": ent["start"],
                        "end": ent["end"],
                    }
                    for ent in raw
                ],
            })

    with open(outpath, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII note text readable in the output.
        json.dump(processed, f, ensure_ascii=False)


if __name__ == "__main__":
    process(*sys.argv[1:])