Spaces:
Running
Running
Create annotator.py
Browse files- annotator.py +96 -0
annotator.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import transformers
|
3 |
+
from sklearn import metrics
|
4 |
+
import pandas as pd
|
5 |
+
import streamlit as st
|
6 |
+
def ignitor_load():
    """Read ``tinyignitorfile.json`` with pandas and return the resulting frame.

    The path is relative, so it is resolved against the process's current
    working directory.
    """
    return pd.read_json('tinyignitorfile.json')
|
9 |
+
|
10 |
+
def appendor(thex):
    """Return the seed rows with *thex* stored in the ``text`` cell at label 21.

    The seed frame comes from ``ignitor_load()``. Only the ``text`` and
    ``index`` columns are kept, for row labels up to and including 21
    (``.loc`` slicing is label-based and end-inclusive).

    Args:
        thex: Value written to ``[21, 'text']``.

    Returns:
        A new DataFrame that is independent of the frame that was loaded.
    """
    seed = ignitor_load()
    # Take an explicit copy: assigning into the bare result of a .loc
    # selection is ambiguous (view vs. copy) and makes pandas emit
    # SettingWithCopyWarning.
    shortt = seed.loc[:21, ['text', 'index']].copy()
    shortt.loc[21, 'text'] = thex
    return shortt
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
# Tokenizer for the 'distilbert-base-uncased' checkpoint; its encode() output
# is what gets fed to `modell` when embeddings are computed.
tokenizerr = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
|
19 |
+
# TFDistilBertModel loaded from the same checkpoint name as the tokenizer; it is
# called on token-id arrays and its first output is averaged into embeddings.
modell = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
|
20 |
+
|
21 |
+
|
22 |
+
# Module-level buffer holding one embedding per row of the frame most recently
# passed to allll(); kept at module scope so existing readers still find it.
encod = []


def _mean_token_embedding(text):
    """Return the mean of the model's hidden states for *text*.

    *text* is passed unchanged to ``tokenizerr.encode``; the ids are fed to
    ``modell`` as a batch of one. The first and last positions of the first
    output are dropped before averaging over the remaining positions
    (presumably these are the special start/end tokens added by ``encode`` --
    TODO confirm against the tokenizer configuration).
    """
    token_ids = np.array(tokenizerr.encode(text))[np.newaxis, :]
    hidden_states = np.array(modell(token_ids)[0][0][1:-1])
    return np.array(hidden_states.mean(0))


def allll(df):
    """Score every row of *df* against the label groups and tag the last row.

    Each ``text`` value is embedded, compared by cosine similarity with the
    embedding of every label group, and the similarities are rescaled per
    label using the reference values stored in ``st.secrets``.

    Args:
        df: Frame with a ``text`` column whose rows are addressable by the
            integer labels ``0 .. len(df) - 1``.

    Returns:
        The string ``'None'`` when no rescaled score of the last row exceeds
        ``st.secrets["threshhold"]``; otherwise the ``np.argmax`` position of
        the best-scoring label group in the insertion order of ``labs``
        (0 = SALARY, 1 = COLLEAGUES, 2 = SUPERVISION, 3 = TIMEDAY,
        4 = TIMEDAYNOMONDAY).
    """
    # Rows are looked up by label, exactly as before, so the row that ends up
    # "last" is the one labelled len(df) - 1.
    text_embeddings = [
        _mean_token_embedding(df.loc[i, 'text']) for i in range(len(df))
    ]
    # Replace the buffer's contents instead of appending: appending made every
    # call normalise over the embeddings left behind by earlier calls, so the
    # returned tag depended on call history and the list grew without bound.
    encod[:] = text_embeddings

    labs = {
        "SALARY": ['underpay', 'underpaid', 'overpay', 'overpaying', 'payments',
                   'wage', 'payroll', 'pay', 'paycheck'],
        "COLLEAGUES": ['colleague', 'employee', 'staff', 'coworker',
                       'co-worker', 'colleagues'],
        "SUPERVISION": ['boss', 'supervisors', 'manager', 'supervisor'],
        "TIMEDAY": ['monday', 'weekday', 'day', 'weekend'],
        "TIMEDAYNOMONDAY": ['weekday', 'day', 'weekend'],
    }
    # NOTE(review): each group is handed to the tokenizer as a list of words;
    # confirm that encode() treats list input the way this code expects.
    label_vectors = [
        _mean_token_embedding(words).reshape(1, -1) for words in labs.values()
    ]

    # One (rows, 1) similarity column per label, stacked to (labels, rows, 1).
    sim = np.array([
        metrics.pairwise.cosine_similarity(encod, vector)
        for vector in label_vectors
    ])

    referirv = [
        float(st.secrets[name])
        for name in ("cyr1", "cyr2", "cyr3", "cyr4", "cyr5")
    ]
    for label_scores, reference in zip(sim, referirv):
        # label_scores is a view into sim, so both steps update sim in place.
        # Step 1: make the label's scores sum to one across rows.
        label_scores /= label_scores.sum()
        # Step 2: rescale so the label's mean score equals its reference value.
        label_scores *= reference / label_scores.mean()

    threshhold = float(st.secrets["threshhold"])
    # Only the last row's tag is returned, so only that row is evaluated.
    last_row_scores = sim[:, -1, 0]
    if np.amax(last_row_scores) <= threshhold:
        return 'None'
    return np.argmax(last_row_scores)
|