import os

import gradio as gr
import numpy as np
import pandas as pd
from transformers import (DistilBertForTokenClassification,
                          DistilBertTokenizerFast, pipeline)

# The fine-tuned model is hosted privately on the Hugging Face Hub, so an
# access token is read from the environment. (Newer transformers versions
# name this from_pretrained parameter `token` instead of `use_auth_token`.)
auth_token = os.environ["TOKEN"]

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model_path = "inesani/ner-log"
model_loaded = DistilBertForTokenClassification.from_pretrained(
    model_path, use_auth_token=auth_token)
ner_pipeline = pipeline("ner", model=model_loaded, tokenizer=tokenizer,
                        aggregation_strategy='simple')

title = 'Hi, my name is NER! I am an ML model that detects IPs, key-value pairs and timestamps in logs.'
description = """
I have been trained on a log corpus of only 10,000 logs... I am a work in progress :)
Paste any log you want to test below, or use one of the provided examples.
"""

examples = [
    'John Doe, 37 - Google - 42 Wallaby Way, Sydney - 500000',
    'campo1,campo2,campo3,"campo4,campo5"',
    '188.210.113.80 - - [26/Jan/2019:20:17:17 +0330] "GET /image/4158/productModel/200x200 HTTP/1.1" 200 4022 "https://www.zanbil.ir/m/browse/electric-heaters/%D8%A8%D8%AE%D8%A7%D8%B1%DB%8C-%D8%A8%D8%B1%D9%82%DB%8C" "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" "-"',
    'Nov 19 16:43:19 campo1 campo2 ahora pongo un kv start=Apr 29 2022 16:43:19 deviceExternalId=FMG3HFT718900147 ad.vd=root ad.itime=1651244591 ad.idseq=186791712926990336 ',
    '1,2022/04/29 17:34:21,012501007018,CONFIG,0,0,2022/04/29 17:34:21,10.16.10.37,,validate-all,Panorama-FWTECSIS04,Panorama,Succeeded,,6966313332068319615,0x8000000000000000,0,0,0,0,,CD1FW-GICINT01A,0,',
    '1331901007 C36a282Jljz7BsbGH 192.168.202.76 137 udp 57398 WPAD 1 C_INTERNET 32 NB - - F F T F 1 - - F',
    'Jun 9 06:06:20 combo kernel: On node 0 totalpages: 32430',
    '2022-04-28T16:30:29Z S12KROVA1 LEEF:1.0|Cyber-Ark|Vault|11.5.0003|51|sev=6 Action=Retrieve File EventMessage=Retrieve File OSUser= usrName=PasswordManager src=127.0.0.0 SourceUser= TargetUser= File=Root\\Policy.ini Safe=PasswordManagerShared Location= Category= RequestId= Reason= ExtraDetails= GatewayStation= CAPolicy=',
    'x.x.x.90 - - [13/Sep/2006:06:58:52 -0700] "PROPFIND /svn/[xxxx]/Extranet/branches/SOW-101 HTTP/1.1" 401 587 key1=value1 key2=value2 key3=value3 key4=value4',
]


def aggregate_entities(pipeline_output):
    """Merge consecutive entities of the same group whose character spans touch.

    Even with aggregation_strategy='simple', the pipeline can split one logical
    entity (e.g. an IP address) into adjacent fragments; this pass glues such
    fragments back together, averaging their scores pairwise. See the commented
    worked example at the bottom of this file.
    """
    reference_entity = pipeline_output[0]
    aggregated_output = [reference_entity]
    for entity in pipeline_output[1:]:
        if (entity['entity_group'] == reference_entity['entity_group']
                and entity['start'] == reference_entity['end']):
            result_entity = {
                'entity_group': reference_entity['entity_group'],
                'score': np.round((reference_entity['score'] + entity['score']) / 2, 3),
                'word': reference_entity['word'] + entity['word'],
                'start': reference_entity['start'],
                'end': entity['end'],
            }
            # Replace the last emitted entity with the merged span.
            aggregated_output[-1] = result_entity
            reference_entity = result_entity
        else:
            aggregated_output.append(entity)
            reference_entity = entity
    return aggregated_output


def ner(text):
    output = ner_pipeline(text)
    if len(output) != 0:
        output = aggregate_entities(output)
    # gr.HighlightedText expects the key 'entity', not 'entity_group'.
    for i in output:
        i['entity'] = i.pop('entity_group')
    df = pd.DataFrame(
        [{'Word': text[i['start']:i['end']],
          'Entity': i['entity'],
          'Probability': np.round(i['score'], 3)} for i in output],
        columns=['Word', 'Entity', 'Probability'])
    return [{"text": text, "entities": output}, df]


demo = gr.Interface(
    ner,
    gr.Textbox(label='Log', placeholder="Enter your log here!"),
    [gr.HighlightedText(label='NER output'),
     gr.Dataframe(label='', headers=["Word", "Entity", "Probability"],
                  datatype=["str", "str", "number"], wrap=True)],
    title=title,
    description=description,
    examples=examples,
    allow_flagging='never')

demo.launch()