"""Build per-message part-of-speech tag counts from ``spam.csv``.

The script reads ``spam.csv`` (first row skipped as a header, column 0 used as
the label, column 1 as the message text), tokenizes each message with NLTK,
tags the tokens with ``nltk.pos_tag`` and records, for every message, how many
tokens received each of the tags listed in ``lst``.

Module-level results:

* ``pd_dict``   -- column name -> list of per-message values (message, label,
  1/0 spam flag, one count column per tag).
* ``tok_dict``  -- tag -> every token that received that tag, over all messages.
* ``tok_dict1`` -- tag -> number of tokens, restricted to tags that occur at
  least ``MIN_TAG_TOKENS`` times.
* ``del_lst``   -- tags that were dropped for being too rare.
* ``lst``       -- rebound at the end to the tags kept in ``tok_dict1``.
* ``df``        -- ``pandas.DataFrame`` built from ``pd_dict``.

NOTE(review): the NLTK tokenizer and tagger presumably need their data
packages installed (``nltk.download``) -- confirm for the target environment.
"""

import csv

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

# Tags whose total token count is below this value are removed from tok_dict1.
MIN_TAG_TOKENS = 100

# tag -> list of all tokens that were given that tag.
tok_dict = {}

# Tags that get their own count column in the DataFrame.
lst = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD',
       'DT', 'TO', 'VB', '.', '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP',
       'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP',
       'WDT', '#', "''"]

# Set view of the count columns: O(1) membership tests, and it keeps the
# non-count keys ('msg', 'label', 'label_no') out of the tag lookup.
tag_columns = frozenset(lst)

# Column order matters for the DataFrame: metadata first, then one column per
# tag in the order given by ``lst``.
pd_dict = {'msg': [], 'label': [], 'label_no': [], **{tag: [] for tag in lst}}

# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how spam.csv was written.
with open("spam.csv", 'r', newline='') as csv_file:
    csvreader = csv.reader(csv_file)
    next(csvreader, None)  # skip the first row (treated as the header)

    for row in csvreader:
        # csv.reader yields [] for blank lines; such rows have no message.
        if len(row) < 2:
            continue

        label, text = row[0], row[1]
        pd_dict['msg'].append(text)
        pd_dict['label'].append(label)
        pd_dict['label_no'].append(1 if label == 'spam' else 0)

        # Open a fresh zero counter for this message in every tag column.
        for tag in lst:
            pd_dict[tag].append(0)

        for word, tag in nltk.pos_tag(word_tokenize(text)):
            tok_dict.setdefault(tag, []).append(word)
            if tag in tag_columns:
                # [-1] is the counter appended for the current message above.
                pd_dict[tag][-1] += 1

# Total number of tokens seen per tag.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}

del_lst = []
for tag, count in tok_dict1.items():
    print(tag, " ", count)
    if count < MIN_TAG_TOKENS:
        del_lst.append(tag)
print(del_lst)

# Remove the rare tags after the scan so the dict is not mutated while
# it is being iterated.
for tag in del_lst:
    tok_dict1.pop(tag)
print(tok_dict1)

# ``lst`` is rebound to the surviving tags, as in the original script.
lst = list(tok_dict1)
print(lst, len(lst))

df = pd.DataFrame(pd_dict)
print(df.head())