"""Tag a free-text comment with the workplace topic it is most similar to.

A small set of seed comments is loaded from ``tinyignitorfile.json``, the
comment to classify is stored as the last row, every row is embedded with
DistilBERT, and the rows are compared (cosine similarity) against embeddings
of a few keyword lists.  Calibration weights and the decision threshold are
read from Streamlit secrets.
"""

import numpy as np
import pandas as pd
import streamlit as st
import transformers
from sklearn import metrics


def ignitor_load():
    """Load ``tinyignitorfile.json`` into a DataFrame and return it."""
    return pd.read_json('tinyignitorfile.json')


def appendor(thex):
    """Return the seed rows with ``thex`` stored as the text of row 21.

    Args:
        thex: The text to classify.

    Returns:
        A DataFrame holding the ``text`` and ``index`` columns for the row
        labels up to and including 21, with row 21's ``text`` set to
        ``thex``.
    """
    seed = ignitor_load()
    # ``.loc`` slices are label-inclusive.  Take an explicit copy so the
    # assignment below writes to an independent frame instead of a slice of
    # ``seed`` (avoids pandas' chained-assignment warning/ambiguity).
    shortt = seed.loc[:21, ['text', 'index']].copy()
    shortt.loc[21, 'text'] = thex
    return shortt


# Tokenizer and model are created once, at import time.
tokenizerr = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
modell = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Row embeddings produced by the most recent ``allll`` call.
encod = []


def _embed(text):
    """Return the mean model output vector for ``text``.

    ``text`` is passed straight to ``tokenizerr.encode`` (a string for data
    rows, a list of keywords for labels).  The first and last positions of
    the encoded sequence are dropped before averaging -- presumably the
    special tokens ``encode`` adds at both ends.
    """
    token_ids = np.array(tokenizerr.encode(text))[np.newaxis, :]
    # First model output, first (only) sequence in the batch.
    hidden = modell(token_ids)[0][0]
    return np.array(np.array(hidden[1:-1]).mean(0))


def allll(df):
    """Classify the last row of ``df`` against the keyword labels.

    Args:
        df: DataFrame with a ``text`` column whose row labels are
            ``0 .. len(df) - 1`` (rows are read with ``df.loc[i, 'text']``).

    Returns:
        The string ``'None'`` when the best adjusted score of the last row
        is at or below the ``threshhold`` secret; otherwise the integer
        position (as returned by ``np.argmax``) of the best-scoring label,
        in the order the labels are declared below (0 = SALARY,
        1 = COLLEAGUES, 2 = SUPERVISION, 3 = TIMEDAY, 4 = TIMEDAYNOMONDAY).

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # Refresh the shared list on every call; appending to whatever a
    # previous call left behind would add stale rows to the similarity
    # matrix and distort the per-label normalisation below.
    encod[:] = [_embed(df.loc[i, 'text']) for i in range(len(df))]
    if not encod:
        raise ValueError("allll() needs at least one row to classify")

    labs = {
        "SALARY": ['underpay', 'underpaid', 'overpay', 'overpaying', 'payments',
                   'wage', 'payroll', 'pay', 'paycheck'],
        "COLLEAGUES": ['colleague', 'employee', 'staff', 'coworker', 'co-worker',
                       'colleagues'],
        "SUPERVISION": ['boss', 'supervisors', 'manager', 'supervisor'],
        "TIMEDAY": ['monday', 'weekday', 'day', 'weekend'],
        "TIMEDAYNOMONDAY": ['weekday', 'day', 'weekend'],
    }
    emblabs = {key: _embed(words) for key, words in labs.items()}

    # One (rows, 1) similarity column per label -> shape (labels, rows, 1).
    sim = np.array([
        metrics.pairwise.cosine_similarity(encod, label_vec.reshape(1, -1))
        for label_vec in emblabs.values()
    ])

    # Per-label reference values, in the same order as ``labs``.
    referirv = [float(st.secrets[f"cyr{k}"]) for k in range(1, 6)]

    # Normalise each label's scores to sum to 1 over the rows, then rescale
    # so that the label's mean score equals its reference value.
    meanss = []
    for i, label_scores in enumerate(sim):
        sim[i] = label_scores / sum(label_scores)
        meanss.append(sim[i].mean())
    zarayeb = [ref / mean for ref, mean in zip(referirv, meanss)]
    for i, factor in enumerate(zarayeb):
        sim[i] = sim[i] * factor

    threshhold = float(st.secrets["threshhold"])

    # Only the last row's tag is returned, so only that row is examined.
    last_row_scores = sim[:, -1, 0]
    if np.amax(last_row_scores) <= threshhold:
        return 'None'
    return np.argmax(last_row_scores)