# -*- coding: utf-8 -*-
"""
Created on Sat Dec 17 11:27:28 2022

@author: anas

TEC: a TF-IDF + SVM / logistic-regression text classifier that maps a
free-text description to a technique label.
"""
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
# import spacy
# from spacy import displacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score
# from googletrans import Translator
import requests


class ModelPums:
    """Plain container for the artifacts stored in the pickled model file."""

    def __init__(self):
        self.obData = None    # raw training texts used to refit the TF-IDF vectorizer
        self.obVec = None     # TfidfVectorizer
        self.obSVML = None    # linear SVM
        self.obSVMK = None    # kernel SVM
        self.oblogict = None  # logistic regression
        self.X = None         # training feature matrix
        self.Y = None         # training labels (class indices)
        self.Classes = None   # class index -> technique name


class TEC:
    # Template for the per-model output lists used during verification.
    __Outs = {'svmL': None, 'svmK': None, 'logstick': None, 'target': None}

    @staticmethod
    def createOuts():
        ob = TEC.__Outs.copy()
        for key in ob:
            ob[key] = []
        return ob

    def __init__(self, typemodel="svmL", model_setting=None, spt=0.3, nlp=None):
        # typemodel and spt are currently unused: loadmodel() always starts with
        # the linear SVM, and ChangeModel() can switch models afterwards.
        self.model_setting = model_setting
        self.isLoad = False
        self.nlp = nlp

    def toclean(self, txt):
        """Clean text through the remote NLP endpoint; return "$" on any failure."""
        try:
            url = "https://ansaltwyl256.pythonanywhere.com/api/nlp/" + txt
            response = requests.get(url)
            return response.json()['description']
        except:
            return "$"

    def loadmodel(self):
        """Load the pickled artifacts and rebuild the TF-IDF vectorizer."""
        mm = ModelPums()
        matrck = pickle.load(open(self.model_setting.path_model, 'rb'))
        self.DES = pickle.load(open(self.model_setting.path_Qwords, 'rb'))
        mm.obData = matrck['obData']
        mm.obVec = TfidfVectorizer(norm='l2')
        mm.obVec.fit(mm.obData)
        # mm.obVec = matrck['obVec']
        mm.obSVML = matrck['obSVML']
        mm.obSVMK = matrck['obSVMK']
        mm.oblogict = matrck['oblogict']
        mm.X = matrck['X']
        mm.Y = matrck['Y']
        mm.Classes = matrck['Classes']
        self.obMP = mm
        if self.model_setting.path_Qwords != "":
            self.Qwords = {}  # pickle.load(open(model_setting.path_Qwords, 'rb'))
        else:
            self.Qwords = {}
        # self.detector = Translator()
        # self.Splits(spt)
        self.Model = self.obMP.obSVML
        self.name = self.model_setting.name
        self.pipeline = None
        self.isLoad = True

    def getLables(self):
        return self.obMP.Classes

    def is_found(self, words, ob):
        """Look up ob in words: exact match -> (1, word), similar match -> (2, word), else (0, '')."""
        sms = []
        ob = ob.lower().strip()
        for w in words:
            if w == ob:
                return 1, w
            sms.append(self.similarity(w, ob))
        if len(sms) == 0:  # empty Qwords: nothing to compare against
            return 0, ''
        index = np.argmax(sms)
        if sms[index] > 0.7:
            return 2, list(words)[index]
        return 0, ''

    def is_found_K(self, words, ob):
        """Exact-match-only variant: (1, word) if found, (-1, '') otherwise."""
        ob = ob.lower().strip()
        for w in words:
            if w == ob:
                return 1, w
        return -1, ''

    def Training(self):
        self.obMP.obSVMK.fit(self.obMP.X, self.obMP.Y)

    def clean_dataT(self, data, typw='', is_input=False):
        # Cleaning is delegated to the remote API; the spaCy-based path below
        # is kept for reference but is unreachable because of the early return.
        d, _ = self.clean_dataAPI(data)
        return d
        datac = data
        dock = []
        is_found = self.is_found if is_input == True else self.is_found_K
        reg = r"([0-9])|(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        for datac in data:
            strr = str(datac)
            strr = re.sub(reg, "", strr)
            doc = self.nlp(strr)
            disc = ""
            for token in doc:
                if not token.is_punct and not token.is_stop and not token.is_space and token.is_alpha:
                    if token.pos_ == 'ADJ' or token.pos_ == 'NOUN' or token.pos_ == 'VERB':  # token.pos_ == typw:
                        qk, key = is_found(self.Qwords, token.lemma_)
                        if qk == 1:
                            disc = disc + self.Qwords[token.lemma_] + " "
                        elif qk == 2:
                            disc = disc + self.Qwords[key] + " "
                        elif qk == -1:
                            disc = disc + token.lemma_ + " "
            disc = disc.lower().strip()
            if len(disc) > 0:
                dock.append(disc)
        return dock

    def similarity(self, ob1, ob2):
        """spaCy vector similarity between two strings."""
        nob1 = self.nlp(ob1)
        return nob1.similarity(self.nlp(ob2))

    # -----------------#
    def clean_data(self, data, typw='', is_input=False):
        # Cleaning is delegated to the remote API; the spaCy-based path below
        # is kept for reference but is unreachable because of the early return.
        return self.clean_dataAPI(data)
        datac = data
        dock = []
        labels = []
        is_found = self.is_found if is_input == True else self.is_found_K
        reg = r"([0-9])|(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        for (datac, label) in data:
            strr = str(datac)
            strr = re.sub(reg, "", strr)
            doc = self.nlp(strr)
            disc = ""
            for token in doc:
                if not token.is_punct and not token.is_stop and not token.is_space and token.is_alpha:
                    if True:  # token.pos_ == 'ADJ' or token.pos_ == 'NOUN' or token.pos_ == 'VERB':
                        # disc = disc + token.lemma_ + " "
                        qk, key = is_found(self.Qwords, token.lemma_)
                        # print(qk, key)
                        if qk == 1:
                            disc = disc + self.Qwords[key] + " "
                        elif qk == 2:
                            disc = disc + self.Qwords[key] + " "
                        elif qk == -1:
                            disc = disc + token.lemma_ + " "
            if len(disc) > 2:
                dock.append(disc.strip())
                labels.append(label)
        return dock, labels

    # -----------------#
    def clean_dataAPI(self, data, typw='', is_input=False):
        """Clean via the remote endpoint; fall back to a local regex strip if it is unreachable."""
        txt = self.toclean(data)
        if txt != "$":
            return [txt], 6
        reg = r"([0-9])|(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        strr = str(data)
        strr = re.sub(reg, "", strr)
        return [strr], 6

    def setPipeline(self, model=None):
        self.pipeline = model

    def to_tran(self, text, dest='en'):
        # Requires self.detector (a googletrans Translator), which is currently
        # commented out in loadmodel(); after 20 failed attempts the original
        # text is returned unchanged.
        ff = True
        c = 0
        while ff:
            try:
                t = self.detector.translate(str(text), dest=dest)
                ff = False
                text = t.text
            except:
                c += 1
                if c == 20:
                    ff = False
                    print('no connection (translation skipped)')
        return text

    def Predict_ALL(self, description, istrans=False):
        """Run the optional upstream pipeline, append its outputs to the text, then predict."""
        if istrans:
            description = self.to_tran(description)
        if self.pipeline is not None:
            text_output, _, outs = self.pipeline.Predict_ALL(description)
            for key in outs:
                text_output = text_output + '--' + outs[key]
        else:
            text_output = ''
        mx, outs = self.get_ptedict_proba(description + ' ' + text_output)
        return text_output, mx, outs

    def predictAPI(self, description, is_input=False, istrans=False):
        """Predict a technique name and its stored description for one input text."""
        if istrans:
            description = self.to_tran(description)
        clean_description, _ = self.clean_dataAPI(description, '', is_input=is_input)
        try:
            features = self.obMP.obVec.transform(clean_description)
            yp = self.Model.predict(features)
            txttec = self.obMP.Classes[yp[0]]
            dis = self.DES[txttec]
        except:
            txttec = 'No'
            dis = " ....."
        return txttec, dis

    def predict(self, description, is_input=False, istrans=False):
        if istrans:
            description = self.to_tran(description)
        clean_description, _ = self.clean_data([(description, ' ')], '', is_input=is_input)
        try:
            features = self.obMP.obVec.transform(clean_description)
            yp = self.Model.predict(features)
            txttec = self.obMP.Classes[yp[0]]
        except:
            txttec = 'No'
        return txttec

    def predict_ids(self, description, is_input=False, istrans=False):
        """Predict class indices for already-cleaned texts (no cleaning step)."""
        if istrans:
            description = self.to_tran(description)
        # clean_description, _ = self.clean_data([(description, ' ')], '', is_input=is_input)
        try:
            features = self.obMP.obVec.transform(description)
            yp = self.Model.predict(features)
            # txttec = self.obMP.Classes[yp[0]]
        except:
            yp = 'No'
        return yp

    def ptedict_proba(self, description, mx=[], is_input=False, istrans=False):
        if istrans:
            description = self.to_tran(description)
        clean_description, _ = self.clean_data([(description, ' ')], '', is_input=is_input)
        try:
            features = self.obMP.obVec.transform(clean_description)
            mx = self.Model.predict_proba(features)
            print(np.int16(mx * 100))
            yp = np.argmax(mx)
            print(yp)
            txttec = 'Technique : ' + self.obMP.Classes[yp]
        except:
            txttec = 'No technique found ...! (^_^)'
        return txttec

    def get_ptedict_proba(self, description, mx=[]):
        """Return the linear SVM probabilities plus the kernel SVM's predicted class name."""
        clean_description, _ = self.clean_data([(description, ' ')], '')
        try:
            features = self.obMP.obVec.transform(clean_description)
            mx = self.obMP.obSVML.predict_proba(features)
            yk = self.obMP.obSVMK.predict(features)
            # yp = np.argmax(mx)
            outputs = {'svmK': self.obMP.Classes[yk[0]]}
        except:
            txttec = 'No technique found ...! (^_^)'
            outputs = {}
        return mx, outputs

    def get_ptedict_threemodel(self, description):
        """Predict with all three models and return their class names."""
        clean_description, _ = self.clean_data([(description, ' ')], '')
        try:
            features = self.obMP.obVec.transform(clean_description)
            yl = self.obMP.obSVML.predict(features)
            yk = self.obMP.obSVMK.predict(features)
            ym = self.obMP.oblogict.predict(features)
            outputs = {'svmL': self.obMP.Classes[yl[0]],
                       'svmK': self.obMP.Classes[yk[0]],
                       'logstick': self.obMP.Classes[ym[0]]}
        except:
            print('No technique found ...! (^_^)')
            outputs = {}
        return outputs

    def verification(self, inputs=[], outputs=[]):
        """Score all three models against labelled (inputs, outputs) pairs."""
        out_predict = TEC.createOuts()
        unprocess = 0
        meta = {"tf": TEC.createOuts(), "num": TEC.createOuts()}
        names = list(self.obMP.Classes)
        for i in range(len(outputs)):
            try:
                outs = self.get_ptedict_threemodel(inputs[i])
                target = outputs[i].strip()
                names.index(target)  # raises if the target label is unknown
                outs['target'] = target
                for key in outs:
                    out_predict[key].append(outs[key])
                    meta['tf'][key].append(int(outs[key] == target))
                    meta['num'][key].append(names.index(outs[key]))
            except:
                unprocess += 1
        scores = {}
        for key in meta['num']:
            if key != 'target':
                scores[key] = self.valmodel(meta['num']['target'], meta['num'][key], ' model ' + key)
        return out_predict, meta, scores

    def valmodel(self, y, yp, title=" "):
        """Print a classification report and return macro-averaged scores."""
        print('---------------' + title + '------------------------')
        cr = classification_report(y, yp)
        print(cr)
        scores = {}
        scores['accuracy'] = accuracy_score(y, yp)
        scores['f1_score'] = f1_score(y, yp, average="macro")
        scores['precision'] = precision_score(y, yp, average="macro")
        scores['recall'] = recall_score(y, yp, average="macro")
        return {'smmray': cr, 'scores': scores}

    # ----------------------
    def ChangeModel(self, ob=None):
        """Switch the active model: pass a fitted estimator or one of 'svmL', 'svmK'; anything else selects logistic."""
        if ob is None:
            return
        if type(ob) is not str:
            self.Model = ob
        else:
            if ob == 'svmL':
                self.Model = self.obMP.obSVML
            elif ob == 'svmK':
                self.Model = self.obMP.obSVMK
            else:
                self.Model = self.obMP.oblogict

    # ------------------------------
    def Search(self):
        txt = input('Enter any text :')
        print('Technique:' + self.predict(txt))

    def Info_Models(self):
        """Print dataset shape, class count and per-class sample counts (two per row)."""
        print('Number of data samples:', self.obMP.X.shape)
        print('Number of classes :', len(self.obMP.Classes))
        print('--------- samples -------------------')
        n = len(self.obMP.Classes)
        f = n % 2
        n = n - f
        for i in range(0, n, 2):
            print((self.obMP.Classes[i], np.sum(np.int16(self.obMP.Y == i))), '------------------',
                  (self.obMP.Classes[i + 1], np.sum(np.int16(self.obMP.Y == i + 1))))
        if f == 1:
            print((self.obMP.Classes[i + 2], np.sum(np.int16(self.obMP.Y == i + 2))))

    def DlistModel(self):
        print('SVC(kernel=\'linear\') -> svmL')
        print('LinearSVC(C=1.0) -> svmK')
        print('LogisticRegression() -> logic')
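

# ----------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes a
# simple settings object exposing the attributes that loadmodel() reads
# (path_model, path_Qwords, name); DemoSetting and the file paths below are
# placeholders and must point at real pickled artifacts for this to run.
# ----------------------------------------------------------------------
if __name__ == "__main__":

    class DemoSetting:
        # Hypothetical settings container; only the attributes used by
        # TEC.loadmodel() are defined here.
        path_model = "model.pkl"    # pickled dict with obData/obSVML/obSVMK/oblogict/X/Y/Classes
        path_Qwords = "qwords.pkl"  # pickled {technique: description} used by predictAPI
        name = "demo"

    tec = TEC(typemodel="svmL", model_setting=DemoSetting())
    tec.loadmodel()    # rebuilds the TF-IDF vectorizer and loads the classifiers
    tec.Info_Models()  # prints dataset shape and per-class sample counts

    # Single prediction through the remote-cleaning path.
    technique, description = tec.predictAPI("example input text")
    print(technique, description)

    # Compare the linear SVM, kernel SVM and logistic models on labelled data;
    # the inputs/targets here are placeholders and targets must appear in Classes.
    # outs, meta, scores = tec.verification(inputs=["text 1", "text 2"],
    #                                       outputs=["label 1", "label 2"])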