Implementing a Simple Decision Tree Algorithm in Python
Published: 2023-12-04 09:14:28
A decision tree is a widely used machine learning algorithm for classification and regression. It recursively partitions the training set to build a tree structure, which can then predict the target value of new data instances.
Below is an example of implementing a simple decision tree in Python.
First, we define a class representing a node of the decision tree.
import numpy as np

class DecisionNode:
    def __init__(self, feature_i=None, threshold=None, value=None,
                 true_branch=None, false_branch=None):
        self.feature_i = feature_i        # index of the feature this node splits on
        self.threshold = threshold        # threshold value for the split
        self.value = value                # predicted value (set only for leaf nodes)
        self.true_branch = true_branch    # subtree for samples where feature <= threshold
        self.false_branch = false_branch  # subtree for samples where feature > threshold
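To see how these fields fit together, here is a tiny hand-built stump (purely illustrative; the names leaf_low, leaf_high, and root are made up for this sketch, and the training code below builds such trees automatically). Internal nodes carry feature_i and threshold; leaves carry only value:

# A hand-built stump: "if feature 0 <= 2.5 predict 0, else predict 1"
leaf_low = DecisionNode(value=0)    # leaf: value is set, both branches stay None
leaf_high = DecisionNode(value=1)
root = DecisionNode(feature_i=0, threshold=2.5,
                    true_branch=leaf_low, false_branch=leaf_high)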
Next, we define the decision tree algorithm itself as a class.
class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=None):
        self.min_samples_split = min_samples_split  # minimum number of samples required to split
        self.max_depth = max_depth                  # maximum tree depth (None means unlimited)

    def fit(self, X, y):
        self.features = X.shape[1]  # number of features
        self.tree = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(x) for x in X]

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))
        # If a stopping condition is met, return a leaf node with the majority class
        if (depth == self.max_depth or num_labels == 1
                or num_samples < self.min_samples_split):
            label_counts = np.bincount(y)
            return DecisionNode(value=np.argmax(label_counts))
        # Find the best split feature and threshold
        best_feature, best_threshold = self._best_criteria(X, y)
        # If no split yields a positive gain, return a leaf node as well
        if best_feature is None:
            label_counts = np.bincount(y)
            return DecisionNode(value=np.argmax(label_counts))
        # Split the dataset
        true_indexes, false_indexes = self._split(X[:, best_feature], best_threshold)
        # Recursively grow the subtrees
        true_branch = self._grow_tree(X[true_indexes, :], y[true_indexes], depth + 1)
        false_branch = self._grow_tree(X[false_indexes, :], y[false_indexes], depth + 1)
        return DecisionNode(feature_i=best_feature, threshold=best_threshold,
                            true_branch=true_branch, false_branch=false_branch)

    def _best_criteria(self, X, y):
        best_gain = 0.0
        best_feature = None
        best_threshold = None
        # Evaluate the Gini gain of every candidate split
        for feature_i in range(self.features):
            thresholds = np.unique(X[:, feature_i])
            for threshold in thresholds:
                gain = self._gain(y, X[:, feature_i], threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_i
                    best_threshold = threshold
        return best_feature, best_threshold

    def _gain(self, y, feature, threshold):
        # Gini gain of splitting `feature` at `threshold`
        true_mask = feature <= threshold
        y_true, y_false = y[true_mask], y[~true_mask]
        # A split that leaves one side empty is useless (and would divide by zero)
        if len(y_true) == 0 or len(y_false) == 0:
            return 0.0
        p = len(y_true) / len(y)
        gini = self._gini(y)
        true_gini = self._gini(y_true)
        false_gini = self._gini(y_false)
        # Gain = parent impurity minus the weighted impurities of the children
        gain = gini - p * true_gini - (1 - p) * false_gini
        return gain

    def _gini(self, y):
        # Gini impurity: 1 minus the sum of squared class probabilities
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        gini = 1 - np.sum(probabilities ** 2)
        return gini

    def _split(self, feature, threshold):
        # Split the dataset, returning the index arrays of the True and False sides
        true_indexes = np.argwhere(feature <= threshold).flatten()
        false_indexes = np.argwhere(feature > threshold).flatten()
        return true_indexes, false_indexes

    def _predict(self, x):
        # Predict the target value of a single instance by walking the tree
        node = self.tree
        while node.value is None:
            if x[node.feature_i] <= node.threshold:
                node = node.true_branch
            else:
                node = node.false_branch
        return node.value
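Before using the tree, it is worth sanity-checking the impurity math with a small worked example (the labels and feature values here are made up for illustration). For y = [0, 0, 1, 1] the Gini impurity is 1 - (0.5^2 + 0.5^2) = 0.5, and a split that separates the two classes perfectly recovers that entire 0.5 as gain:

tree = DecisionTree()
y = np.array([0, 0, 1, 1])
print(tree._gini(y))                              # 0.5
# A split at threshold 1 yields pure children, so the gain equals the parent Gini
print(tree._gain(y, np.array([1, 1, 2, 2]), 1))   # 0.5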
Now we can use the decision tree to solve a classification problem. Suppose we have a dataset with two features, "X1" and "X2", and the corresponding target values "y".
import numpy as np

# Create the dataset
X = np.array([[1, 2], [2, 1], [3, 4], [4, 3]])
y = np.array([0, 0, 1, 1])

# Create the decision tree object
dt = DecisionTree()

# Train the decision tree model
dt.fit(X, y)

# Predict new data instances
new_X = np.array([[5, 6], [6, 5]])
predictions = dt.predict(new_X)
print(predictions)  # [1, 1]
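If scikit-learn happens to be installed, you can cross-check the result against its reference implementation (an optional sanity check, not part of the algorithm above):

from sklearn.tree import DecisionTreeClassifier

sk_tree = DecisionTreeClassifier().fit(X, y)
print(sk_tree.predict(new_X))  # should also predict class 1 for both instances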
That concludes this simple Python implementation of a decision tree. Decision trees are a powerful machine learning method widely applied to both classification and regression; note that this particular implementation handles classification only, since it uses a majority vote at the leaves and Gini impurity for splitting. A regression version would instead predict the mean target value and split by variance reduction.
