Spaces:
Running
Running
Commit
•
d776785
1
Parent(s):
4b1c00b
Upload 12 files
Browse files- c45/__init__.py +1 -0
- c45/__pycache__/__init__.cpython-311.pyc +0 -0
- c45/__pycache__/__init__.cpython-38.pyc +0 -0
- c45/__pycache__/c45.cpython-311.pyc +0 -0
- c45/__pycache__/c45.cpython-38.pyc +0 -0
- c45/__pycache__/c45_utils.cpython-311.pyc +0 -0
- c45/__pycache__/c45_utils.cpython-38.pyc +0 -0
- c45/c45.py +85 -0
- c45/c45_utils.py +195 -0
- model_scratch_2.model +3 -0
- requirements.txt +6 -0
c45/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .c45 import C45
|
c45/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (196 Bytes). View file
|
|
c45/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (172 Bytes). View file
|
|
c45/__pycache__/c45.cpython-311.pyc
ADDED
Binary file (6.23 kB). View file
|
|
c45/__pycache__/c45.cpython-38.pyc
ADDED
Binary file (3.95 kB). View file
|
|
c45/__pycache__/c45_utils.cpython-311.pyc
ADDED
Binary file (14.8 kB). View file
|
|
c45/__pycache__/c45_utils.cpython-38.pyc
ADDED
Binary file (5.9 kB). View file
|
|
c45/c45.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
from xml.dom import minidom
|
3 |
+
from xml.etree import ElementTree as ET
|
4 |
+
|
5 |
+
from sklearn.base import BaseEstimator, ClassifierMixin
|
6 |
+
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
|
7 |
+
|
8 |
+
from .c45_utils import decision, grow_tree
|
9 |
+
|
10 |
+
class C45(BaseEstimator, ClassifierMixin):
    """A C4.5 tree classifier.

    Parameters
    ----------
    attrNames : list, optional (default=None)
        The list of feature names used when printing the tree. If left
        default, attributes will be named attr0, attr1... etc. The names are
        stored untouched; a sanitised copy (alphanumeric characters only) is
        built during ``fit`` and exposed as ``attrNames_``.

    Attributes
    ----------
    attrNames_ : list of str
        Sanitised feature names actually used as XML tag names in the tree.
    tree_ : str
        The fitted tree serialised as an XML string.
    resultType : type
        Type of the first training label; predictions are cast back to it.
    X_, y_ : arrays
        The validated training data passed to ``fit``.

    See also
    --------
    DecisionTreeClassifier

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
    .. [2] https://en.wikipedia.org/wiki/C4.5_algorithm
    .. [3] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
           and Regression Trees", Wadsworth, Belmont, CA, 1984.
    .. [4] J. R. Quinlan, "C4.5: Programs for Machine Learning",
           Morgan Kaufmann Publishers, 1993

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> from c45 import C45
    >>> iris = load_iris()
    >>> clf = C45(attrNames=iris.feature_names)
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)
    ...                             # doctest: +SKIP
    ...
    array([ 1.     ,  0.93...,  0.86...,  0.93...,  0.93...,
            0.93...,  0.93...,  1.     ,  0.93...,  1.     ])
    """

    def __init__(self, attrNames=None):
        # scikit-learn's clone() requires that __init__ stores every
        # parameter exactly as received; any transformation of the value
        # here makes clone() (and hence cross_val_score / GridSearchCV)
        # raise RuntimeError. Sanitising therefore happens in fit().
        self.attrNames = attrNames

    @staticmethod
    def _sanitize_name(name):
        """Return ``name`` reduced to its alphanumeric characters.

        Spaces are dropped as well (they are not alphanumeric), so the
        result contains no whitespace or punctuation.
        """
        return ''.join(ch for ch in name if ch.isalnum())

    def fit(self, X, y):
        """Build the C4.5 tree from the training set ``(X, y)``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of attribute names differs from the number of
            columns in ``X``.
        """
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        self.resultType = type(y[0])

        n_features = X.shape[1]
        if self.attrNames is None:
            names = [f'attr{idx}' for idx in range(n_features)]
        else:
            names = [self._sanitize_name(name) for name in self.attrNames]

        # Explicit exception instead of `assert`, which is stripped under -O.
        if len(names) != n_features:
            raise ValueError(
                f"attrNames has {len(names)} entries but X has "
                f"{n_features} features"
            )
        self.attrNames_ = names

        # grow_tree works column-wise: one list of values per attribute,
        # and class labels as strings (they become XML text nodes).
        data = [list(column) for column in X.T]
        categories = [str(label) for label in y]

        root = ET.Element('DecisionTree')
        grow_tree(data, categories, root, names)
        self.tree_ = ET.tostring(root, encoding="unicode")
        return self

    def _fitted_names(self):
        """Return the attribute names the stored tree was grown with."""
        # Models pickled before `attrNames_` existed kept the sanitised
        # names in `attrNames`; fall back to it so they still predict.
        names = getattr(self, 'attrNames_', None)
        return self.attrNames if names is None else names

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        list
            One label per sample, cast to the type of the training labels.
        """
        check_is_fitted(self, ['tree_', 'resultType'])
        X = check_array(X)
        names = self._fitted_names()
        dom = minidom.parseString(self.tree_)
        root = dom.childNodes[0]
        prediction = []
        for row in X:
            votes = decision(root, row, names, 1)
            # Highest accumulated weight wins; on ties the first label
            # encountered is kept.
            answer = max(votes.items(), key=lambda item: item[1])[0]
            prediction.append((self.resultType)(answer))
        return prediction

    def printTree(self):
        """Print the fitted tree as indented XML to standard output."""
        check_is_fitted(self, ['tree_'])
        dom = minidom.parseString(self.tree_)
        print(dom.toprettyxml(newl="\r\n"))
|
c45/c45_utils.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
from xml.etree import ElementTree as ET
|
3 |
+
|
4 |
+
|
5 |
+
def prettify(elem, level=0):
    """Insert indentation whitespace into an ElementTree element, in place.

    Parameters
    ----------
    elem : xml.etree.ElementTree.Element
        Element whose ``text`` / ``tail`` fields are filled with newlines and
        indentation. Existing non-blank text or tails are left alone.
    level : int, optional
        Nesting depth of ``elem``; one space of indentation per level.

    Returns
    -------
    The same ``elem`` object, for convenient chaining.
    """
    indent = "\n" + level * " "
    if len(elem):
        if not (elem.text and elem.text.strip()):
            elem.text = indent + " "
        for child in elem:
            prettify(child, level + 1)
        # The closing tag of `elem` follows its last child, so that child's
        # tail is pulled back to the parent's indentation.
        last_child = elem[-1]
        if not (last_child.tail and last_child.tail.strip()):
            last_child.tail = indent
    if level and not (elem.tail and elem.tail.strip()):
        elem.tail = indent
    return elem
|
17 |
+
|
18 |
+
def isnum(attr):
    """Tell whether every known value of an attribute column is numeric.

    Parameters
    ----------
    attr : iterable
        Values of one attribute; the string "?" marks a missing value and
        is ignored.

    Returns
    -------
    bool
        True when all non-missing values can be converted with ``float``
        (also True for an empty or all-missing column), False as soon as
        one value cannot.
    """
    for value in set(attr):
        if value == "?":
            continue
        try:
            float(value)
        except ValueError:
            return False
    # Only reached when no value failed the conversion; returning inside the
    # loop would judge the column by a single, arbitrarily ordered value.
    return True
|
27 |
+
|
28 |
+
def entropy(x):
    """Return the Shannon entropy (base 2) of the labels in list ``x``.

    An empty list yields 0.
    """
    total = len(x)
    result = 0
    for label in set(x):
        share = float(x.count(label)) / total
        result -= share * math.log(share, 2)
    return result
|
34 |
+
|
35 |
+
def gain_ratio(category, attr):
    """Gain ratio of splitting on a categorical attribute.

    Parameters
    ----------
    category : list
        Class label of every sample.
    attr : list
        Attribute value of every sample; "?" entries (missing values) are
        left out of the computation together with their labels.

    Returns
    -------
    Information gain divided by the entropy of the attribute itself, or 0
    when that entropy is 0.
    """
    known_values = []
    known_labels = []
    for idx, value in enumerate(attr):
        if not value == "?":
            known_labels.append(category[idx])
            known_values.append(value)

    # Weighted entropy of the labels after partitioning by attribute value.
    conditional = 0
    for value in set(known_values):
        weight = float(known_values.count(value)) / len(known_values)
        subset = [
            label
            for candidate, label in zip(known_values, known_labels)
            if candidate == value
        ]
        conditional = conditional + weight * entropy(subset)

    info_gain = entropy(known_labels) - conditional
    split_info = entropy(known_values)
    if split_info == 0:
        return 0
    return info_gain / split_info
|
56 |
+
|
57 |
+
def gain(category, attr):
    """Gain ratio of the best binary split on a numeric attribute.

    Despite its name, the value returned is the information gain of the
    best threshold divided by the entropy of the resulting two-way split.

    Parameters
    ----------
    category : list
        Class label of every sample.
    attr : list
        Attribute value of every sample; "?" entries (missing values) are
        skipped, every other value is converted with ``float``.

    Returns
    -------
    0 when fewer than two distinct known values exist (including the case
    where every value is missing), otherwise the gain ratio of the best
    threshold.
    """
    pairs = sorted(
        (
            [float(value), category[idx]]
            for idx, value in enumerate(attr)
            if not value == "?"
        ),
        key=lambda pair: pair[0],
    )
    labels = [pair[1] for pair in pairs]
    values = [pair[0] for pair in pairs]

    # `<= 1` also covers an all-missing column, which would otherwise reach
    # min() on an empty list below.
    if len(set(values)) <= 1:
        return 0

    total = len(labels)
    best_conditional = None
    best_cut = None
    for cut in range(1, total):
        # A threshold is only possible between two different values.
        if values[cut] == values[cut - 1]:
            continue
        conditional = (
            entropy(labels[:cut]) * float(cut) / total
            + entropy(labels[cut:]) * (1 - float(cut) / total)
        )
        # Strict comparison keeps the first of several equally good cuts.
        if best_conditional is None or conditional < best_conditional:
            best_conditional = conditional
            best_cut = cut

    info_gain = entropy(labels) - best_conditional
    p_left = float(best_cut) / total
    split_info = (
        -p_left * math.log(p_left, 2)
        - (1 - p_left) * math.log((1 - p_left), 2)
    )
    return info_gain / split_info
|
80 |
+
|
81 |
+
def division_point(category, attr):
    """Return the threshold of the best binary split on a numeric attribute.

    Parameters
    ----------
    category : list
        Class label of every sample.
    attr : list
        Attribute value of every sample; "?" entries are skipped, the rest
        are converted with ``float``.

    Returns
    -------
    float
        The first value of the right-hand partition for the cut with the
        lowest weighted entropy (first such cut on ties).

    Raises
    ------
    ValueError
        When no cut exists, i.e. fewer than two distinct known values.
    """
    pairs = [
        [float(value), category[idx]]
        for idx, value in enumerate(attr)
        if not value == "?"
    ]
    pairs.sort(key=lambda pair: pair[0])
    labels = [pair[1] for pair in pairs]
    values = [pair[0] for pair in pairs]

    total = len(labels)
    scores = []
    cuts = []
    for cut in range(1, total):
        if values[cut] != values[cut - 1]:
            left = entropy(labels[:cut]) * float(cut) / total
            right = entropy(labels[cut:]) * (1 - float(cut) / total)
            scores.append(left + right)
            cuts.append(cut)

    best_cut = cuts[scores.index(min(scores))]
    return values[best_cut]
|
97 |
+
|
98 |
+
def _majority_category(category):
    """Return the label that occurs most often in ``category``."""
    best_count = 0
    for label in set(category):
        count = category.count(label)
        if count > best_count:
            best_count = count
            winner = label
    return winner


def grow_tree(data, category, parent, attrs_names):
    """Recursively grow a decision (sub)tree below the XML node ``parent``.

    Parameters
    ----------
    data : list of lists
        One list per attribute, each holding that attribute's value for
        every sample; "?" marks a missing value.
    category : list
        Class label of every sample.
    parent : xml.etree.ElementTree.Element
        Node that receives either a leaf label (as ``text``) or one child
        element per branch.
    attrs_names : list
        Attribute names, used as tag names of the branch elements.

    Each branch element carries the attributes ``value`` (threshold or
    category value), ``flag`` ("l" / "r" for the two sides of a numeric
    threshold, "m" for a categorical match) and ``p`` (share of the known
    samples that follow the branch, rounded to 3 decimals).
    """
    # Pure node: every sample has the same label.
    if len(set(category)) <= 1:
        parent.text = category[0]
        return

    # Score every attribute; all-missing columns cannot be split on.
    scores = []
    for column in data:
        if set(column) == set("?"):
            scores.append(0)
        elif isnum(column):
            scores.append(gain(category, column))
        else:
            scores.append(gain_ratio(category, column))

    best_score = max(scores)
    if best_score == 0:
        parent.text = _majority_category(category)
        return

    index_selected = scores.index(best_score)
    name_selected = str(attrs_names[index_selected])
    selected = data[index_selected]
    n_attrs = len(data)

    if isnum(selected):
        div_point = division_point(category, selected)
        threshold = float(div_point)
        l_son_data = [[] for _ in range(n_attrs)]
        l_son_category = []
        r_son_data = [[] for _ in range(n_attrs)]
        r_son_category = []
        for row in range(len(category)):
            value = selected[row]
            if value == "?":
                # Samples with a missing split value are dropped here.
                continue
            if float(value) < threshold:
                side_category, side_data = l_son_category, l_son_data
            else:
                side_category, side_data = r_son_category, r_son_data
            side_category.append(category[row])
            for col in range(n_attrs):
                side_data[col].append(data[col][row])

        if l_son_category and r_son_category:
            known = len(selected) - selected.count("?")
            p_l = float(len(l_son_category)) / known
            son = ET.SubElement(
                parent,
                name_selected,
                {'value': str(div_point), "flag": "l", "p": str(round(p_l, 3))},
            )
            grow_tree(l_son_data, l_son_category, son, attrs_names)
            son = ET.SubElement(
                parent,
                name_selected,
                {'value': str(div_point), "flag": "r", "p": str(round(1 - p_l, 3))},
            )
            grow_tree(r_son_data, r_son_category, son, attrs_names)
        else:
            parent.text = _majority_category(category)
        return

    # Categorical attribute: one branch per distinct known value.
    for k in set(selected):
        if k == "?":
            continue
        son_data = [[] for _ in range(n_attrs)]
        son_category = []
        for row in range(len(category)):
            if selected[row] == k:
                son_category.append(category[row])
                for col in range(n_attrs):
                    son_data[col].append(data[col][row])
        known = len(selected) - selected.count("?")
        share = round(float(len(son_category)) / known, 3)
        son = ET.SubElement(
            parent,
            name_selected,
            {'value': k, "flag": "m", 'p': str(share)},
        )
        grow_tree(son_data, son_category, son, attrs_names)
|
165 |
+
|
166 |
+
def add(d1, d2):
    """Merge ``d2`` into ``d1`` by summing the values of shared keys.

    Parameters
    ----------
    d1 : dict
        Accumulator; it is updated in place.
    d2 : dict
        Mapping whose values are added to (or inserted into) ``d1``.

    Returns
    -------
    dict
        ``d1`` itself, after the update.
    """
    for key, value in d2.items():
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if key in d1:
            d1[key] = d1[key] + value
        else:
            d1[key] = value
    return d1
|
174 |
+
|
175 |
+
def decision(root, obs, attrs_names, p):
    """Walk the DOM tree for one observation and collect weighted answers.

    Parameters
    ----------
    root : xml.dom.minidom node
        Current node of the parsed decision tree.
    obs : sequence
        Attribute values of the observation, ordered like ``attrs_names``;
        "?" marks a missing value.
    attrs_names : list
        Attribute names; a branch's tag name is looked up here to find the
        matching position in ``obs``.
    p : number
        Weight accumulated on the path to ``root``.

    Returns
    -------
    dict or None
        Mapping of leaf label to weight. For a missing value all branches
        are followed, each scaled by its ``p`` attribute, and the results
        summed. ``None`` is returned when no branch matches the value.
    """
    # A node without children is a text leaf holding the class label.
    if not root.hasChildNodes():
        return {root.nodeValue: p}

    first = root.firstChild
    att_name = first.nodeName
    if att_name == "#text":
        return decision(first, obs, attrs_names, p)

    att = obs[attrs_names.index(att_name)]
    if att == "?":
        merged = {}
        for child in root.childNodes:
            weight = p * float(child.getAttribute("p"))
            merged = add(merged, decision(child, obs, attrs_names, weight))
        return merged

    for child in root.childNodes:
        flag = child.getAttribute("flag")
        if flag == "m":
            matches = child.getAttribute("value") == att
        elif flag == "l":
            matches = float(att) < float(child.getAttribute("value"))
        elif flag == "r":
            matches = float(att) >= float(child.getAttribute("value"))
        else:
            matches = False
        if matches:
            return decision(child, obs, attrs_names, p)
|
model_scratch_2.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:994d3bdd3909faa5a41f6c1a6da49c9e88753333d892feb5178e365032faa974
|
3 |
+
size 30366
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
joblib==1.3.2
|
2 |
+
pandas==1.4.2
|
3 |
+
numpy==1.26.2
|
4 |
+
scikit-learn==1.1.3
|
5 |
+
streamlit
|
6 |
+
mysql-connector-python==8.2.0
|