# Spaces: Sleeping
| from io import StringIO, BytesIO | |
| import gradio as gr | |
| from pdfminer.high_level import extract_text | |
| from transformers import pipeline | |
| import pandas as pd | |
| import numpy as np | |
# Token-classification ("ner") pipeline; the same checkpoint name is used for
# both the model weights and the tokenizer.
_NER_CHECKPOINT = "carblacac/ner-investing"
nlp = pipeline("ner", model=_NER_CHECKPOINT, tokenizer=_NER_CHECKPOINT)
class Group:
    """Hand out a running id for consecutive runs of identical labels.

    Each call to ``getgroup`` compares the given label with the one seen on
    the previous call: an equal label reuses the current id, a different
    label advances the id by one first.
    """

    def __init__(self):
        # No label seen yet: id 0 is paired with the empty label.
        self.id, self.text = 0, ''

    def getgroup(self, text):
        """Return the run id for ``text``, starting a new run if it changed."""
        if self.text != text:
            self.text = text
            self.id += 1
        return self.id


# Module-level instance; its counter and last label persist between calls.
grp_gen = Group()
def entities_to_df(entities):
    """Merge consecutive token predictions of the same type into spans.

    Args:
        entities: List of per-token dicts carrying at least the keys
            ``entity`` (label whose first two characters are dropped),
            ``start``, ``end`` and ``word``.

    Returns:
        A DataFrame indexed by ``group`` (1-based run number) with columns
        ``start`` (minimum of the run), ``end`` (maximum of the run),
        ``entity`` (array of the run's unique labels) and ``word`` (the
        run's words joined by a single space). An empty input yields an
        empty frame with the same columns instead of raising ``KeyError``.
    """
    columns = ['start', 'end', 'entity', 'word']
    df = pd.DataFrame(entities)
    if df.empty:
        # No predictions: there is no 'entity' column to work with.
        return pd.DataFrame(columns=columns, index=pd.Index([], name='group'))

    # Drop the two-character label prefix. The column is kept as plain
    # Python objects because the 'entity' aggregate below returns an array
    # per group.
    labels = pd.Series(
        [label[2:] for label in df['entity']],
        index=df.index,
        dtype=object,
    )
    df['entity'] = labels

    # Number the runs of equal neighbouring labels locally, so the result
    # does not depend on state shared between calls.
    run_id = labels.ne(labels.shift()).cumsum().rename('group')

    return df.groupby(run_id).agg({
        'start': 'min',
        'end': 'max',
        'entity': np.unique,
        'word': lambda words: " ".join(words),
    })
def transform_entity_type(entities):
    """Replace each record's ``entity`` value with its first element.

    The records are modified in place and the same list object that was
    passed in is returned.
    """
    for record in entities:
        record.update(entity=record['entity'][0])
    return entities
def highlight_text(fileObj):
    """Extract text from an uploaded PDF and tag its named entities.

    Args:
        fileObj: Raw bytes of the uploaded PDF. ``None`` or empty bytes
            (nothing uploaded) produce an empty result instead of an error.

    Returns:
        A dict ``{"text": ..., "entities": [...]}``; each entity record has
        the keys ``start``, ``end``, ``entity`` and ``word``.
    """
    if not fileObj:
        # Nothing to parse; skip the PDF reader and the model entirely.
        return {"text": "", "entities": []}

    # The buffer is only needed while the text is being extracted.
    with BytesIO(fileObj) as pdf_stream:
        text = extract_text(pdf_stream)

    entities = nlp(text)
    if not entities:
        # No predictions: return the plain text with nothing highlighted.
        return {"text": text, "entities": []}

    spans = entities_to_df(entities).to_dict('records')
    return {"text": text, "entities": transform_entity_type(spans)}
# Sample document offered in the UI's examples list.
examples = ['Beiersdorf sees slower sales this year after bumper 2022 By Reuters.pdf']

# Build the UI: one uploaded file, delivered to highlight_text as raw bytes,
# rendered as highlighted text. gr.File replaces the deprecated
# gr.inputs.File namespace and matches the top-level gr.HighlightedText
# already used for the output.
demo = gr.Interface(
    fn=highlight_text,
    inputs=gr.File(file_count="single", type="bytes"),
    outputs=gr.HighlightedText(),
    examples=examples,
)
demo.launch()