# Spaces: Sleeping (page-status banner captured with the file; not part of the program)
import csv

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
# Tokens collected per POS tag over the whole corpus: tag -> list of words.
tok_dict = {}

# POS tags (as produced by nltk.pos_tag) that get a per-message count column.
lst = [
    'NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT',
    'TO', 'VB', '.', '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG',
    'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''",
]

# Column-oriented table: message text, its label, a numeric label, and then
# one count column per tag. The tag columns are derived from `lst` so the two
# can never drift apart; each column gets its own fresh list.
pd_dict = {'msg': [], 'label': [], 'label_no': []}
pd_dict.update((tag, []) for tag in lst)
# Read the labelled messages from spam_db.csv (column 0 = label, column 1 =
# message text) and fill pd_dict with one row per message, counting how often
# each tracked POS tag occurs in that message. Every tagged word is also
# collected in tok_dict under its tag.
# NOTE(review): word_tokenize / pos_tag need the NLTK tokenizer and tagger
# data to be installed beforehand - confirm for the deployment environment.
# newline='' is what the csv module expects, so that newlines embedded in
# quoted fields are parsed correctly.
with open("spam_db.csv", 'r', newline='') as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the first (header) row; no-op on an empty file
    k = 0  # index of the current message inside every pd_dict column
    for row in csvreader:
        if len(row) < 2:
            # Blank or truncated line: there is no label/message pair to record.
            continue
        category, text = row[0], row[1]
        pd_dict['msg'].append(text)
        pd_dict['label'].append(category)
        pd_dict['label_no'].append(1 if category == 'spam' else 0)
        # Start every tag column at zero for this message.
        for tag_name in lst:
            pd_dict[tag_name].append(0)
        tokens = word_tokenize(text)
        tokens_tagged = nltk.pos_tag(tokens)
        for word, tag in tokens_tagged:
            tok_dict.setdefault(tag, []).append(word)
            # Only tags that have a column are counted; others are ignored here.
            if tag in pd_dict:
                pd_dict[tag][k] += 1
        k += 1
# Disabled scratch code: tags one hard-coded text and adds its words to tok_dict.
# text=""
# tokens=word_tokenize(text)
# tokens_tagged=nltk.pos_tag(tokens)
# print(tokens_tagged,end='\n\n')
# for i in tokens_tagged:
#     if i[1] in tok_dict:
#         tok_dict[i[1]].append(i[0])
#     else:
#         tok_dict[i[1]]=[i[0]]
# print(tok_dict, end="\n\n")
# Count the tokens collected for each POS tag, report the counts, and keep
# only the tags that were seen at least MIN_TAG_COUNT times.
MIN_TAG_COUNT = 100

# tag -> number of tokens recorded for it in tok_dict.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}

del_lst = []
for tag, count in tok_dict1.items():
    print(tag, " ", count)
    if count < MIN_TAG_COUNT:
        del_lst.append(tag)
print(del_lst)

# Removal happens in a second pass because a dict must not change size while
# it is being iterated.
for tag in del_lst:
    tok_dict1.pop(tag)
print(tok_dict1)

# NOTE: from here on `lst` no longer holds the fixed tag-column list; it is
# rebound to the tags that survived the frequency filter.
lst = list(tok_dict1)
print(lst, len(lst))
# Turn the column-oriented pd_dict into a DataFrame (one column per key) and
# print its first rows as a quick sanity check.
df = pd.DataFrame(pd_dict)
print(df.head())