SmitaGautam committed on
Commit
762a449
1 Parent(s): 981d617

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +21 -97
train.py CHANGED
@@ -37,105 +37,29 @@ pos_tags = [ 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
37
  'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'
38
  ]
39
 
40
-
41
def feature_vector(w, scaled_position, pos_tag):
    """Build the 12-slot float32 feature vector for a single token.

    Args:
        w: Token text.
        scaled_position: Value written unchanged into slot 3 (presumably the
            token's position scaled by sentence length -- confirm with caller).
        pos_tag: POS tag of the token, compared against tag-name strings.

    Returns:
        A float32 NumPy array of length 12 laid out as
        [all-caps, length, punctuation, scaled position, stopword, digit,
        verb, proper noun, place, person, country, nationality].
    """
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    # Only proper nouns count here; NN / NNS are deliberately left out.
    proper_noun_tags = ('NNP', 'NNPS')

    # Binary indicators, computed in the same order as the slots they feed.
    all_caps_flag = 1 if w.isupper() else 0
    punct_flag = 1 if w in PUNCT else 0
    stopword_flag = 1 if w.lower() in stopwords else 0
    digit_flag = 1 if w.isdigit() else 0
    verb_flag = 1 if pos_tag in verb_tags else 0
    proper_noun_flag = 1 if pos_tag in proper_noun_tags else 0
    place_flag = 1 if w in places else 0
    person_flag = 1 if w in people else 0
    country_flag = 1 if w in countries else 0
    nationality_flag = 1 if w in nationalities else 0

    slot_values = (
        all_caps_flag,
        len(w),
        punct_flag,
        scaled_position,
        stopword_flag,
        digit_flag,
        verb_flag,
        proper_noun_flag,
        place_flag,
        person_flag,
        country_flag,
        nationality_flag,
    )

    out = np.zeros(12).astype(np.float32)
    for slot, value in enumerate(slot_values):
        out[slot] = value
    return out
116
-
117
-
118
def feature_vector_d(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
    """Build a 116-slot float32 vector: 5 word features plus three one-hot tag segments.

    Layout:
        0       1 if the word is title-cased
        1       1 if the lower-cased word is in ``stopwords``
        2       1 if the word is all upper-case
        3       length of the word
        4       1 if the word consists only of digits
        5+i     one-hot for the previous word's tag index ``i``
        42+i    one-hot for the next word's tag index ``i``
        79+i    one-hot for the current word's tag index ``i``

    Args:
        word: Token text.
        prev_word_pos_tag: Integer tag index of the previous word, or -1 to
            leave that segment all zeros.
        next_word_pos_tag: Same, for the following word.
        current_word_pos_tag: Same, for the word itself.

    Returns:
        A float32 NumPy array of length 116.
    """
    encoded = np.zeros(116).astype('float32')

    # Scalar word-shape features.
    encoded[0] = 1 if word.istitle() else 0
    encoded[1] = 1 if word.lower() in stopwords else 0
    encoded[2] = 1 if word.isupper() else 0
    encoded[3] = len(word)
    encoded[4] = word.isdigit()

    # Each segment starts at a fixed offset; -1 means "no tag available".
    segments = (
        (5, prev_word_pos_tag),
        (42, next_word_pos_tag),
        (79, current_word_pos_tag),
    )
    for base, tag_index in segments:
        if tag_index != -1:
            encoded[base + tag_index] = 1

    return encoded
139
 
140
 
141
  def feature_vector2(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
 
37
  'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'
38
  ]
39
 
40
def feature_vector(word, scaled_position, current_word_pos_tag):
    """Build the 11-entry float32 feature vector for a single token.

    Args:
        word: Token text.
        scaled_position: Value appended unchanged as feature 10 (presumably
            the token's position scaled by sentence length -- confirm with
            caller).
        current_word_pos_tag: Integer tag index of the token. The checks
            below treat 12/13 as NNP/NNPS and 27..32 as verb tags; these look
            like indices into the module's ``pos_tags`` list -- TODO confirm.

    Returns:
        A float32 NumPy array laid out as
        [stopword, all-caps, punctuation, title-case, digit, place, person,
        country, nationality, proper noun, scaled position, verb].
        Word length is not emitted as a feature.
    """
    is_proper_noun = current_word_pos_tag in (12, 13)  # NNP, NNPS
    is_verb = 27 <= current_word_pos_tag <= 32

    values = [
        int(word.lower() in stopwords),
        int(word.isupper()),
        int(word in PUNCT),
        int(word.istitle()),
        int(word.isdigit()),
        int(word in places),
        int(word in people),
        int(word in countries),
        int(word in nationalities),
        1 if is_proper_noun else 0,
        scaled_position,
        1 if is_verb else 0,
    ]
    return np.asarray(values, dtype=np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  def feature_vector2(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):