"""Run biomedical NER over clinical notes in a CSV and write entities to JSON.

Input CSV is expected to have at least the columns: text, score,
student_id, case (assumption based on the keys read below — confirm
against the producing pipeline).
"""

import argparse
import csv
import json
import sys

from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Pre-trained biomedical NER model. Loaded once at import time so repeated
# calls to process() reuse the same pipeline instead of re-downloading.
MODEL = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


def process(*args):
    """Parse CLI arguments, run NER over each note row, and write JSON output.

    Args:
        *args: Optional argument strings (e.g. ``sys.argv[1:]``). When no
            arguments are given, argparse falls back to ``sys.argv`` as before.

    Raises:
        ValueError: If ``--notes`` is not a ``.csv`` path or ``--out`` is not
            a ``.json`` path.
    """
    parser = argparse.ArgumentParser(description="Biomedical NER over a notes CSV")
    parser.add_argument('--notes', help='Notes CSV', required=True)
    parser.add_argument('--out', help='Output', required=True)
    # BUG FIX: the original discarded *args (it was immediately shadowed by
    # parser.parse_args(), which always reads sys.argv). Forward the supplied
    # arguments so process() is also callable programmatically; passing None
    # preserves the old sys.argv fallback when called with no arguments.
    parsed = parser.parse_args(list(args) or None)
    filepath = parsed.notes
    outpath = parsed.out

    if not filepath.endswith(".csv"):
        raise ValueError("Filepath must be a .csv file.")
    if not outpath.endswith(".json"):
        raise ValueError("Output path must be a .json file.")

    processed = []
    # newline="" is the documented requirement for files handed to the csv
    # module; explicit encoding avoids platform-dependent defaults.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            raw = pipe(row["text"])
            processed.append({
                "score": row["score"],
                "student_id": row["student_id"],
                "case": row["case"],
                "entities": [
                    {
                        "entity": ent["entity_group"],
                        "word": ent["word"],
                        # pipeline scores are numpy float32 — not JSON
                        # serializable — so convert and round for output.
                        "score": round(float(ent["score"]), 2),
                        "start": ent["start"],
                        "end": ent["end"],
                    }
                    for ent in raw
                ],
            })

    with open(outpath, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII note text readable in the output.
        json.dump(processed, f, ensure_ascii=False)


if __name__ == "__main__":
    process(*sys.argv[1:])