from io import StringIO, BytesIO

import gradio as gr
import numpy as np
import pandas as pd
from pdfminer.high_level import extract_text
from transformers import pipeline

# Token-classification pipeline, created once at import time so every request
# reuses the same loaded model.
nlp = pipeline("ner", model="carblacac/ner-investing", tokenizer="carblacac/ner-investing")


class Group:
    """Hand out a running id for consecutive runs of equal labels.

    Feeding labels one at a time to :meth:`getgroup` returns the same id for
    as long as the label repeats and a new, larger id whenever it changes.
    """

    def __init__(self):
        self.id = 0      # id of the current run (0 until a label differs from '')
        self.text = ''   # label of the current run

    def getgroup(self, text):
        """Return the run id for ``text``, starting a new run if it changed."""
        if self.text != text:
            self.id += 1
            self.text = text
        return self.id


# Kept for backward compatibility with code that imports it; highlight_text
# uses a fresh Group per call so requests do not share grouping state.
grp_gen = Group()


def entities_to_df(entities):
    """Convert the pipeline's entity dicts into a DataFrame.

    The first two characters of every ``entity`` label are dropped
    (presumably a ``B-``/``I-`` style tagging prefix -- confirm against the
    model's label set).

    Args:
        entities: list of dicts as returned by ``nlp``; may be empty.

    Returns:
        A DataFrame with one row per dict. When ``entities`` is empty the
        frame is empty and has no ``entity`` column.
    """
    df = pd.DataFrame(entities)
    if 'entity' not in df.columns:
        # Nothing was recognised: there is no label column to normalise.
        return df
    df['entity'] = df['entity'].str[2:]
    return df


def highlight_text(fileObj):
    """Extract the text of a PDF and tag its named entities.

    Consecutive tokens that carry the same (prefix-stripped) label are merged
    into a single span running from the first token's ``start`` to the last
    token's ``end``.

    Args:
        fileObj: raw bytes of the uploaded PDF.

    Returns:
        ``{"text": <extracted text>, "entities": [...]}`` where every entity
        is a dict with ``start``, ``end``, ``entity`` and ``word`` keys.
        ``entities`` is empty when the model recognises nothing.
    """
    text = extract_text(BytesIO(fileObj))
    df = entities_to_df(nlp(text))
    if df.empty:
        return {"text": text, "entities": []}

    # A fresh counter per call keeps run ids independent between requests.
    df['group'] = df['entity'].apply(Group().getgroup)
    merged = df.groupby(by='group').agg({
        'start': 'min',
        'end': 'max',
        # Every row of a group has the same label by construction, so the
        # first one is the group's label.
        'entity': 'first',
        'word': lambda words: " ".join(words),
    })

    entities = merged.to_dict('records')
    for span in entities:
        # Plain ints keep the payload serialisable regardless of how pandas
        # boxes the aggregated offsets.
        span['start'] = int(span['start'])
        span['end'] = int(span['end'])
    return {"text": text, "entities": entities}


gr.Interface(
    highlight_text,
    gr.inputs.File(file_count="single", type="bytes"),
    gr.HighlightedText(),
).launch()