ahmadardhy committed on
Commit
d776785
1 Parent(s): 4b1c00b

Upload 12 files

Browse files
c45/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .c45 import C45
c45/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (196 Bytes). View file
 
c45/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (172 Bytes). View file
 
c45/__pycache__/c45.cpython-311.pyc ADDED
Binary file (6.23 kB). View file
 
c45/__pycache__/c45.cpython-38.pyc ADDED
Binary file (3.95 kB). View file
 
c45/__pycache__/c45_utils.cpython-311.pyc ADDED
Binary file (14.8 kB). View file
 
c45/__pycache__/c45_utils.cpython-38.pyc ADDED
Binary file (5.9 kB). View file
 
c45/c45.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from xml.dom import minidom
3
+ from xml.etree import ElementTree as ET
4
+
5
+ from sklearn.base import BaseEstimator, ClassifierMixin
6
+ from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
7
+
8
+ from .c45_utils import decision, grow_tree
9
+
10
class C45(BaseEstimator, ClassifierMixin):
    """A C4.5 tree classifier.

    Parameters
    ----------
    attrNames : list, optional (default=None)
        The list of feature names used when printing the tree. If left
        default, attributes will be named attr0, attr1... etc. The names are
        used as XML tag names, so every non-alphanumeric character is
        stripped from them when the estimator is fitted.

    Attributes
    ----------
    attrNames_ : list of str
        Sanitized (or generated) feature names actually used by the fitted
        tree. Set by ``fit``.
    tree_ : str
        The fitted tree serialized as an XML string. Set by ``fit``.
    resultType : type
        Type of the first training label; predictions are converted back to
        this type. Set by ``fit``.
    X_, y_ : ndarray
        The validated training data. Set by ``fit``.

    See also
    --------
    DecisionTreeClassifier

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
    .. [2] https://en.wikipedia.org/wiki/C4.5_algorithm
    .. [3] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
           and Regression Trees", Wadsworth, Belmont, CA, 1984.
    .. [4] J. R. Quinlan, "C4.5: Programs for Machine Learning",
           Morgan Kaufmann Publishers, 1993

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> from c45 import C45
    >>> iris = load_iris()
    >>> clf = C45(attrNames=iris.feature_names)
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)
    ...                             # doctest: +SKIP
    ...
    array([ 1.     ,  0.93...,  0.86...,  0.93...,  0.93...,
            0.93...,  0.93...,  1.     ,  0.93...,  1.      ])
    """

    def __init__(self, attrNames=None):
        # Store the parameter untouched: sklearn.base.clone() rebuilds the
        # estimator from get_params() and rejects constructors that replace a
        # parameter with a different object. Sanitizing happens in fit().
        self.attrNames = attrNames

    def fit(self, X, y):
        """Build the decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self : C45
            The fitted estimator.

        Raises
        ------
        ValueError
            If the number of names in ``attrNames`` differs from the number
            of columns in ``X``.
        """
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        self.resultType = type(y[0])

        n_features = X.shape[1]
        if self.attrNames is None:
            names = [f'attr{x}' for x in range(n_features)]
        else:
            # Names become XML element names, so keep alphanumerics only.
            names = [''.join(ch for ch in name if ch.isalnum())
                     for name in self.attrNames]
        if len(names) != n_features:
            raise ValueError(
                f'attrNames has {len(names)} entries but X has '
                f'{n_features} features'
            )
        self.attrNames_ = names

        # grow_tree expects one list per attribute (column-major) and the
        # class labels as strings.
        data = [list(column) for column in X.T]
        categories = [str(label) for label in y]

        root = ET.Element('DecisionTree')
        grow_tree(data, categories, root, names)
        self.tree_ = ET.tostring(root, encoding="unicode")
        return self

    def predict(self, X):
        """Predict a class label for every row of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        prediction : list
            One label per sample, converted to ``resultType``.
        """
        check_is_fitted(self, ['tree_', 'resultType'])
        X = check_array(X)

        # Estimators fitted (and possibly pickled) before attrNames_ existed
        # kept the fitted names in attrNames; fall back to that.
        names = getattr(self, 'attrNames_', None)
        if names is None:
            names = self.attrNames

        root = minidom.parseString(self.tree_).childNodes[0]
        prediction = []
        for row in X:
            votes = decision(root, row, names, 1)
            # Highest accumulated weight wins; ties go to the first label.
            answer = max(votes, key=votes.get)
            prediction.append(self.resultType(answer))
        return prediction

    def printTree(self):
        """Print the fitted tree as indented XML to standard output."""
        check_is_fitted(self, ['tree_'])
        dom = minidom.parseString(self.tree_)
        print(dom.toprettyxml(newl="\r\n"))
c45/c45_utils.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from xml.etree import ElementTree as ET
3
+
4
+
5
def prettify(elem, level=0):
    """Insert newline/indent whitespace into an ElementTree element in place.

    Parameters
    ----------
    elem : xml.etree.ElementTree.Element
        Element whose ``text``/``tail`` whitespace is adjusted (recursively).
    level : int, optional
        Nesting depth of ``elem``; one space of indent is used per level.

    Returns
    -------
    The same ``elem`` object, for convenience.
    """
    def _blank(text):
        # None, empty, or whitespace-only text may be overwritten safely.
        return not text or not text.strip()

    indent = "\n" + level * " "
    children = list(elem)
    if children:
        if _blank(elem.text):
            elem.text = indent + " "
        for child in children:
            prettify(child, level + 1)
        # The last child closes this element, so it dedents to our level.
        last_child = children[-1]
        if _blank(last_child.tail):
            last_child.tail = indent
    if level and _blank(elem.tail):
        elem.tail = indent
    return elem
17
+
18
def isnum(attr):
    """Tell whether an attribute column is numeric.

    Every value other than the missing-value marker ``"?"`` must be
    convertible with ``float()``. All values are checked, so the answer does
    not depend on which element of the column happens to be examined first.

    Parameters
    ----------
    attr : iterable of hashable
        The values of one attribute across all samples.

    Returns
    -------
    bool
        ``True`` if all known values are numeric (also when there are no
        known values at all), ``False`` otherwise.
    """
    for value in set(attr):
        if value == "?":
            continue
        try:
            float(value)
        except (TypeError, ValueError):
            return False
    return True
27
+
28
def entropy(x):
    """Return the Shannon entropy (base 2) of the values in ``x``.

    Parameters
    ----------
    x : sequence of hashable
        Observed values (class labels or attribute values).

    Returns
    -------
    float or int
        The entropy in bits; ``0`` for an empty sequence.
    """
    # Local import keeps the function self-contained.
    from collections import Counter

    total = len(x)
    ent = 0
    # One counting pass instead of x.count() per distinct value; Counter
    # iterates in first-seen order, so the summation order is reproducible.
    for occurrences in Counter(x).values():
        p_i = occurrences / total
        ent = ent - p_i * math.log(p_i, 2)
    return ent
34
+
35
def gain_ratio(category,attr):
    """Return the gain ratio of splitting on a categorical attribute.

    Samples whose attribute value is the missing marker ``"?"`` are ignored.

    Parameters
    ----------
    category : sequence
        Class label of every sample.
    attr : sequence
        Attribute value of every sample, aligned with ``category``.

    Returns
    -------
    float or int
        Information gain divided by the entropy of the attribute values, or
        ``0`` when that entropy is zero.
    """
    known = [idx for idx, value in enumerate(attr) if value != "?"]
    cat = [category[idx] for idx in known]
    att = [attr[idx] for idx in known]

    # Weighted entropy of the class labels inside each attribute value.
    conditional = 0
    for value in set(att):
        weight = float(att.count(value)) / len(att)
        labels_here = [label for label, seen in zip(cat, att) if seen == value]
        conditional = conditional + weight * entropy(labels_here)

    info_gain = entropy(cat) - conditional
    split_info = entropy(att)
    if split_info == 0:
        return 0
    return info_gain / split_info
56
+
57
def gain(category,attr):
    """Return the gain ratio of the best binary threshold split on a numeric attribute.

    Samples whose attribute value is the missing marker ``"?"`` are ignored.
    The remaining samples are sorted by value and every position where the
    value changes is tried as a cut.

    Parameters
    ----------
    category : sequence
        Class label of every sample.
    attr : sequence
        Attribute value of every sample (convertible with ``float()`` or
        ``"?"``), aligned with ``category``.

    Returns
    -------
    float or int
        Information gain of the best cut divided by the entropy of the
        two-way partition it produces; ``0`` when fewer than two distinct
        known values exist (nothing can be split).
    """
    pairs = sorted(
        ((float(value), category[idx])
         for idx, value in enumerate(attr) if value != "?"),
        key=lambda pair: pair[0],
    )
    att = [value for value, _ in pairs]
    cat = [label for _, label in pairs]

    # Also covers the case of no known values at all, which previously
    # reached min() on an empty list.
    if len(set(att)) <= 1:
        return 0

    total = len(cat)
    best_remainder = None
    best_cut = None
    for i in range(1, total):
        if att[i] != att[i - 1]:
            remainder = (entropy(cat[:i]) * float(i) / total
                         + entropy(cat[i:]) * (1 - float(i) / total))
            # Strict comparison keeps the first of several equally good cuts.
            if best_remainder is None or remainder < best_remainder:
                best_remainder = remainder
                best_cut = i

    info_gain = entropy(cat) - best_remainder
    p_1 = float(best_cut) / total
    # Entropy of the left/right partition sizes (the "split information").
    ent_attr = -p_1 * math.log(p_1, 2) - (1 - p_1) * math.log((1 - p_1), 2)
    return info_gain / ent_attr
80
+
81
def division_point(category,attr):
    """Return the threshold value of the best binary split on a numeric attribute.

    Samples whose attribute value is ``"?"`` are ignored. The known samples
    are sorted by value; each index where the value changes is scored by the
    weighted class entropy of the two sides, and the lowest score wins (the
    first one on ties).

    Parameters
    ----------
    category : sequence
        Class label of every sample.
    attr : sequence
        Attribute value of every sample, aligned with ``category``.

    Returns
    -------
    float
        The attribute value at the first position of the upper side of the
        best cut.

    Raises
    ------
    ValueError
        If there is no position where the sorted values change.
    """
    known = [[float(value), category[idx]]
             for idx, value in enumerate(attr) if value != "?"]
    known.sort(key=lambda row: row[0])

    values = [row[0] for row in known]
    labels = [row[1] for row in known]
    total = len(labels)

    candidates = [
        (entropy(labels[:cut]) * float(cut) / total
         + entropy(labels[cut:]) * (1 - float(cut) / total),
         cut)
        for cut in range(1, total)
        if values[cut] != values[cut - 1]
    ]
    # min() with a key returns the first minimal candidate.
    _, best_cut = min(candidates, key=lambda candidate: candidate[0])
    return values[best_cut]
97
+
98
def _majority_class(category):
    """Return the most frequent label; ties go to the first one met in ``set(category)``."""
    return max(set(category), key=category.count)


def grow_tree(data,category,parent,attrs_names):
    """Recursively build the decision tree below ``parent``.

    Parameters
    ----------
    data : list of list
        One list per attribute, each holding that attribute's value for
        every sample (``"?"`` marks a missing value).
    category : list
        Class label of every sample.
    parent : xml.etree.ElementTree.Element
        Node to extend. Leaves get the predicted label as ``text``; internal
        nodes get one child element per branch, named after the chosen
        attribute and carrying ``value``, ``flag`` (``"l"``/``"r"`` for
        numeric thresholds, ``"m"`` for categorical matches) and ``p`` (the
        share of known samples following that branch).
    attrs_names : sequence
        Name of every attribute, aligned with ``data``.
    """
    # Only one class left: this node is a leaf.
    if len(set(category)) <= 1:
        parent.text = category[0]
        return

    # Score every attribute; columns consisting only of "?" score 0.
    division = []
    for column in data:
        if set(column) == set("?"):
            division.append(0)
        elif isnum(column):
            division.append(gain(category, column))
        else:
            division.append(gain_ratio(category, column))

    best_score = max(division)
    if best_score == 0:
        # No attribute separates the classes: fall back to a majority vote.
        parent.text = _majority_class(category)
        return

    index_selected = division.index(best_score)
    selected = data[index_selected]
    name_selected = str(attrs_names[index_selected])
    known_total = len(selected) - selected.count("?")

    if isnum(selected):
        div_point = division_point(category, selected)
        threshold = float(div_point)
        l_son_data = [[] for _ in data]
        l_son_category = []
        r_son_data = [[] for _ in data]
        r_son_category = []
        for row in range(len(category)):
            value = selected[row]
            if value == "?":
                # Samples with a missing split value are not passed down.
                continue
            if float(value) < threshold:
                side_data, side_category = l_son_data, l_son_category
            else:
                side_data, side_category = r_son_data, r_son_category
            side_category.append(category[row])
            for col, column in enumerate(data):
                side_data[col].append(column[row])

        if l_son_category and r_son_category:
            p_l = float(len(l_son_category)) / known_total
            son = ET.SubElement(parent, name_selected, {'value': str(div_point), "flag": "l", "p": str(round(p_l, 3))})
            grow_tree(l_son_data, l_son_category, son, attrs_names)
            son = ET.SubElement(parent, name_selected, {'value': str(div_point), "flag": "r", "p": str(round(1 - p_l, 3))})
            grow_tree(r_son_data, r_son_category, son, attrs_names)
        else:
            parent.text = _majority_class(category)
        return

    # Categorical attribute: one branch per distinct known value.
    for k in set(selected):
        if k == "?":
            continue
        rows = [row for row in range(len(category)) if selected[row] == k]
        son_category = [category[row] for row in rows]
        son_data = [[column[row] for row in rows] for column in data]
        son = ET.SubElement(parent, name_selected, {'value': k, "flag": "m", 'p': str(round(float(len(son_category)) / known_total, 3))})
        grow_tree(son_data, son_category, son, attrs_names)
165
+
166
def add(d1,d2):
    """Accumulate the values of ``d2`` into ``d1`` key by key.

    Parameters
    ----------
    d1 : dict
        Accumulator; it is updated in place.
    d2 : dict or None
        Values to add. Keys missing from ``d1`` are copied over, keys present
        in both are summed. An empty or ``None`` operand adds nothing.

    Returns
    -------
    dict
        The same object as ``d1``.
    """
    d = d1
    if not d2:
        return d
    for key in d2:
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if key in d:
            d[key] = d[key] + d2[key]
        else:
            d[key] = d2[key]
    return d
174
+
175
def decision(root,obs,attrs_names,p):
    """Classify one observation by walking a minidom decision tree.

    Parameters
    ----------
    root : xml.dom.minidom.Node
        Current tree node. Element children carry ``flag``, ``value`` and
        ``p`` attributes; a text child holds the class label of a leaf.
    obs : sequence
        Attribute values of the observation, aligned with ``attrs_names``.
    attrs_names : list
        Attribute names; a branch element's tag name is looked up here to
        find the matching position in ``obs``.
    p : float
        Weight accumulated on the path to ``root``.

    Returns
    -------
    dict
        Mapping of class label to weight. When the tested value is missing
        (``"?"``) or matches no branch (for instance a categorical value
        that is not in the tree), all branches are followed and their results
        are summed, each scaled by the branch's ``p`` attribute.
    """
    if not root.hasChildNodes():
        return {root.nodeValue: p}

    att_name = root.firstChild.nodeName
    if att_name == "#text":
        # Leaf element: descend into the text node that carries the label.
        return decision(root.firstChild, obs, attrs_names, p)

    att = obs[attrs_names.index(att_name)]
    if att != "?":
        for child in root.childNodes:
            flag = child.getAttribute("flag")
            value = child.getAttribute("value")
            if (flag == "m" and value == att
                    or flag == "l" and float(att) < float(value)
                    or flag == "r" and float(att) >= float(value)):
                return decision(child, obs, attrs_names, p)

    # Missing value, or no branch matched: spread the weight over every
    # branch instead of falling through and returning None.
    merged = {}
    for child in root.childNodes:
        branch = decision(child, obs, attrs_names,
                          p * float(child.getAttribute("p")))
        for label, weight in branch.items():
            if label in merged:
                merged[label] = merged[label] + weight
            else:
                merged[label] = weight
    return merged
model_scratch_2.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:994d3bdd3909faa5a41f6c1a6da49c9e88753333d892feb5178e365032faa974
3
+ size 30366
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ joblib==1.3.2
2
+ pandas==1.4.2
3
+ numpy==1.26.2
4
+ scikit-learn==1.1.3
5
+ streamlit
6
+ mysql-connector-python==8.2.0