carblacac committed on
Commit
6085bca
1 Parent(s): 2b67bc6

format entities

Browse files
Files changed (2) hide show
  1. app.py +40 -0
  2. requirements.txt +2 -0
app.py CHANGED
@@ -3,14 +3,54 @@ from io import StringIO, BytesIO
3
  import gradio as gr
4
  from pdfminer.high_level import extract_text
5
  from transformers import pipeline
 
 
6
 
7
  nlp = pipeline("ner", model="carblacac/ner-investing", tokenizer="carblacac/ner-investing")
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def highlight_text(fileObj):
11
  path = BytesIO(fileObj)
12
  text = extract_text(path)
13
  entities = nlp(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  return {"text": text, "entities": entities}
15
 
16
  gr.Interface(highlight_text,
 
3
  import gradio as gr
4
  from pdfminer.high_level import extract_text
5
  from transformers import pipeline
6
+ import pandas as pd
7
+ import numpy as np
8
 
9
  nlp = pipeline("ner", model="carblacac/ner-investing", tokenizer="carblacac/ner-investing")
10
 
11
+ # group the label
12
class Group:
    """Number runs of identical consecutive labels.

    The instance remembers the last label it saw and a counter. Feeding it
    the same label again returns the current counter; feeding it a different
    label bumps the counter first, so every run of equal labels shares one
    number.
    """

    def __init__(self):
        self.id = 0      # number of the current run
        self.text = ''   # label of the current run

    def getgroup(self, text):
        """Return the run number for ``text``, opening a new run if it changed."""
        if self.text != text:
            self.id += 1
            self.text = text
        return self.id
24
+
25
+ grp_gen = Group()
26
+
27
def entities_to_df(entities):
    """Turn a list of token-level NER predictions into a DataFrame.

    Each prediction is expected to be a dict with at least an ``"entity"``
    key. A two-character tagging prefix such as ``"B-"`` or ``"I-"`` is
    removed from the label; labels without such a prefix are left as they
    are instead of being truncated.

    Args:
        entities: Sequence of prediction dicts (may be empty).

    Returns:
        A DataFrame with one row per prediction. For empty input, an empty
        frame that still has the ``entity``, ``word``, ``start`` and ``end``
        columns, so callers can select them without a ``KeyError``.
    """
    df = pd.DataFrame(entities)
    if df.empty:
        # A frame built from no records has no columns at all.
        return pd.DataFrame(columns=['entity', 'word', 'start', 'end'])

    # Only strip when the label really carries a "<tag>-" prefix.
    df['entity'] = df['entity'].map(
        lambda label: label[2:] if label[1:2] == '-' else label
    )
    return df
31
 
32
  def highlight_text(fileObj):
33
  path = BytesIO(fileObj)
34
  text = extract_text(path)
35
  entities = nlp(text)
36
+ df = entities_to_df(entities)
37
+
38
+ df['group'] = df['entity'].apply(grp_gen.getgroup)
39
+ group_tag = df.groupby(by='group')
40
+
41
+ img_tagging = group_tag.agg({
42
+
43
+ 'start':min,
44
+ 'end':max,
45
+ 'entity':np.unique,
46
+ 'word':lambda x: " ".join(x)
47
+
48
+ })
49
+
50
+ entities = img_tagging.to_dict('records')
51
+ for d in entities:
52
+ d['entity'] = d['entity'][0]
53
+
54
  return {"text": text, "entities": entities}
55
 
56
  gr.Interface(highlight_text,
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  transformers
2
  pdfminer.six
3
  torch
 
 
 
1
  transformers
2
  pdfminer.six
3
  torch
4
+ pandas
5
+ numpy