File size: 2,062 Bytes
b986fa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import csv


# POS tag -> list of every token that was labelled with that tag.
tok_dict = dict()

# POS tags that each get a per-message count column in pd_dict.
lst = [
    'NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB',
    'CD', 'DT', 'TO', 'VB', '.', '(', ')', 'CC', 'POS', 'VBP',
    'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH',
    'RP', 'WP', 'WDT', '#', "''",
]

# Column-oriented table: the message text, its label, a numeric label,
# followed by one count column per tag in lst (same order as lst).
# Every column starts as its own, independent empty list.
pd_dict = {column: [] for column in ['msg', 'label', 'label_no', *lst]}

# Read spam_db.csv and, for every data row, record the message (column 1),
# its label (column 0), a numeric label (1 for 'spam', 0 otherwise) and the
# number of tokens carrying each POS tag listed in lst. Every tagged token is
# also collected in tok_dict under its tag.
#
# newline='' is what the csv module requires of the file object so that
# newlines embedded in quoted fields are parsed correctly.
with open("spam_db.csv", 'r', newline='') as file:
    csvreader = csv.reader(file)
    # The first row is the header; the default keeps an empty file from raising.
    next(csvreader, None)
    for row in csvreader:
        # csv.reader yields [] for blank lines; there is nothing to record.
        # Rows that have a label but no message still raise IndexError below.
        if not row:
            continue
        label, text = row[0], row[1]
        pd_dict['msg'].append(text)
        pd_dict['label'].append(label)
        pd_dict['label_no'].append(1 if label == 'spam' else 0)

        # Per-row tag counts, appended to the columns once the row is tagged.
        tag_counts = dict.fromkeys(lst, 0)
        for word, tag in nltk.pos_tag(word_tokenize(text)):
            tok_dict.setdefault(tag, []).append(word)
            # Tags outside lst are tracked in tok_dict but have no column.
            if tag in tag_counts:
                tag_counts[tag] += 1
        for tag in lst:
            pd_dict[tag].append(tag_counts[tag])
        



#text=""
#tokens=word_tokenize(text)
#tokens_tagged=nltk.pos_tag(tokens)
#print(tokens_tagged,end='\n\n')
#for i in tokens_tagged:
#    if i[1] in tok_dict:
#        tok_dict[i[1]].append(i[0])
#    else:
#        tok_dict[i[1]]=[i[0]]

#print(tok_dict, end="\n\n")

# Number of tokens collected for each POS tag.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}

# Report every tag with its count and note the ones seen fewer than 100 times.
del_lst = []
for tag, count in tok_dict1.items():
    print(tag, " ", count)
    if count < 100:
        del_lst.append(tag)

# Drop the rare tags after the scan so the dict is not mutated while iterating.
print(del_lst)
for tag in del_lst:
    del tok_dict1[tag]

print(tok_dict1)

# Remaining (frequent) tags, in dict order; this rebinds the name lst.
lst = list(tok_dict1)

print(lst, len(lst))

# Build the feature table from the column lists and show its first five rows.
df = pd.DataFrame(data=pd_dict)
print(df.head(n=5))