import os

import gradio as gr
import numpy as np
import pandas as pd
from transformers import (DistilBertForTokenClassification,
                          DistilBertTokenizerFast, pipeline)

# The fine-tuned model is hosted privately on the Hugging Face Hub, so an
# access token is read from the environment. (Newer transformers versions
# name this from_pretrained parameter `token` instead of `use_auth_token`.)
auth_token = os.environ["TOKEN"]

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model_path = "inesani/ner-log"
model_loaded = DistilBertForTokenClassification.from_pretrained(
    model_path, use_auth_token=auth_token)
ner_pipeline = pipeline("ner", model=model_loaded, tokenizer=tokenizer,
                        aggregation_strategy='simple')

title = 'Hi, my name is NER! I am an ML model that detects IPs, key-value pairs and timestamps in logs.'
description = """
I have been trained on a log corpus of only 10,000 logs... I am a work in progress :)
Paste any log you want to test below, or use one of the provided examples.
"""

examples = [
    'John Doe, 37 - Google - 42 Wallaby Way, Sydney - 500000',
    'campo1,campo2,campo3,"campo4,campo5"',
    '188.210.113.80 - - [26/Jan/2019:20:17:17 +0330] "GET /image/4158/productModel/200x200 HTTP/1.1" 200 4022 "https://www.zanbil.ir/m/browse/electric-heaters/%D8%A8%D8%AE%D8%A7%D8%B1%DB%8C-%D8%A8%D8%B1%D9%82%DB%8C" "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" "-"',
    'Nov 19 16:43:19 campo1 campo2 ahora pongo un kv start=Apr 29 2022 16:43:19 deviceExternalId=FMG3HFT718900147 ad.vd=root ad.itime=1651244591 ad.idseq=186791712926990336 ',
    '1,2022/04/29 17:34:21,012501007018,CONFIG,0,0,2022/04/29 17:34:21,10.16.10.37,,validate-all,Panorama-FWTECSIS04,Panorama,Succeeded,,6966313332068319615,0x8000000000000000,0,0,0,0,,CD1FW-GICINT01A,0,',
    '1331901007 C36a282Jljz7BsbGH 192.168.202.76 137 udp 57398 WPAD 1 C_INTERNET 32 NB - - F F T F 1 - - F',
    'Jun 9 06:06:20 combo kernel: On node 0 totalpages: 32430',
    '2022-04-28T16:30:29Z S12KROVA1 LEEF:1.0|Cyber-Ark|Vault|11.5.0003|51|sev=6 Action=Retrieve File EventMessage=Retrieve File OSUser= usrName=PasswordManager src=127.0.0.0 SourceUser= TargetUser= File=Root\\Policy.ini Safe=PasswordManagerShared Location= Category= RequestId= Reason= ExtraDetails= GatewayStation= CAPolicy=',
    'x.x.x.90 - - [13/Sep/2006:06:58:52 -0700] "PROPFIND /svn/[xxxx]/Extranet/branches/SOW-101 HTTP/1.1" 401 587 key1=value1 key2=value2 key3=value3 key4=value4',
]


def aggregate_entities(pipeline_output):
    """Merge consecutive entities of the same group whose character spans touch.

    Even with aggregation_strategy='simple', the pipeline can split one logical
    entity (e.g. an IP address) into adjacent fragments; this pass glues such
    fragments back together, averaging their scores pairwise. See the commented
    worked example at the bottom of this file.
    """
    reference_entity = pipeline_output[0]
    aggregated_output = [reference_entity]
    for entity in pipeline_output[1:]:
        if (entity['entity_group'] == reference_entity['entity_group']
                and entity['start'] == reference_entity['end']):
            result_entity = {
                'entity_group': reference_entity['entity_group'],
                'score': np.round((reference_entity['score'] + entity['score']) / 2, 3),
                'word': reference_entity['word'] + entity['word'],
                'start': reference_entity['start'],
                'end': entity['end'],
            }
            # Replace the last emitted entity with the merged span.
            aggregated_output[-1] = result_entity
            reference_entity = result_entity
        else:
            aggregated_output.append(entity)
            reference_entity = entity
    return aggregated_output


def ner(text):
    output = ner_pipeline(text)
    if len(output) != 0:
        output = aggregate_entities(output)
    # gr.HighlightedText expects the key 'entity', not 'entity_group'.
    for i in output:
        i['entity'] = i.pop('entity_group')
    df = pd.DataFrame(
        [{'Word': text[i['start']:i['end']],
          'Entity': i['entity'],
          'Probability': np.round(i['score'], 3)} for i in output],
        columns=['Word', 'Entity', 'Probability'])
    return [{"text": text, "entities": output}, df]


demo = gr.Interface(
    ner,
    gr.Textbox(label='Log', placeholder="Enter your log here!"),
    [gr.HighlightedText(label='NER output'),
     gr.Dataframe(label='', headers=["Word", "Entity", "Probability"],
                  datatype=["str", "str", "number"], wrap=True)],
    title=title,
    description=description,
    examples=examples,
    allow_flagging='never')

demo.launch()