A decision tree (Decision Tree) is one of the flagship supervised learning algorithms in machine learning, named for how closely its logic mirrors human decision making. Imagine deciding "whether to go for a picnic": first check "is it raining", then consider "is the temperature comfortable", and finally see "is anyone coming along". This chain of step-by-step judgments is exactly the core idea behind a decision tree.
In machine learning, a decision tree has three key structural parts:
Internal nodes: each internal node tests one feature (e.g. "is it raining")
Branches: the possible outcomes of each test (yes / no, a numeric interval, etc.)
Leaf nodes: the final output, a class label or a regression value
The strengths of decision trees are excellent interpretability (the full decision path can be read off directly), no need for feature standardization, and low sensitivity to outliers; their main weakness is a tendency to overfit (optimization methods are covered later).
The key question in building a decision tree is how to pick the best feature for each node; in essence, every split should increase the "purity" of the resulting subsets, for both classification and regression. Three feature selection criteria are in common use: information gain (ID3), the information gain ratio (C4.5), and the Gini index (CART).
They all build on the class proportions. Information entropy is defined as $H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k$, where $p_k$ is the proportion of class $k$ in dataset $D$; the lower the entropy, the purer $D$ is.
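Before moving on, here is a minimal sketch (the toy labels are hypothetical, not from the original text) of the two impurity measures behind these criteria: information entropy, used by ID3 and C4.5, and the Gini index, used by CART.

import numpy as np

def entropy(y):
    # H = -sum(p_k * log2(p_k)) over the observed class proportions p_k
    ps = np.bincount(y) / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

def gini(y):
    # Gini = 1 - sum(p_k^2); 0 for a pure node, larger when classes are mixed
    ps = np.bincount(y) / len(y)
    return 1.0 - np.sum(ps ** 2)

y_demo = np.array([1, 1, 1, 0, 0])  # hypothetical labels: 3 positives, 2 negatives
print(entropy(y_demo))              # ~0.971
print(gini(y_demo))                 # 0.48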
A CART tree is a binary tree (each node splits into exactly two branches) and can be used for both classification and regression (regression splits minimize the squared error).
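The code later in this post covers the classification side, so as a hedged aside, here is a minimal sketch of the squared-error criterion a CART regression tree would use to score a candidate split (the function names are mine, not from the original text).

import numpy as np

def sse(y):
    # Sum of squared errors around the node mean (0 for an empty node)
    return float(np.sum((y - y.mean()) ** 2)) if len(y) > 0 else 0.0

def sse_reduction(x_col, y, threshold):
    # A regression split is scored by how much it reduces the squared error:
    # SSE(parent) - (SSE(left) + SSE(right)); pick the threshold that maximizes this
    left, right = y[x_col <= threshold], y[x_col > threshold]
    return sse(y) - (sse(left) + sse(right))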
Construction process: starting from the root node, repeatedly select the best feature and threshold, split the samples into two subsets, and build the left and right subtrees recursively.
Termination conditions: the maximum depth is reached, all samples at the node belong to a single class, or the node contains fewer samples than the minimum required for a split (these are exactly the checks in _build_tree below).
Pre-pruning
Principle: stop the tree from growing while it is being built.
Methods: limit the maximum depth, require a minimum number of samples before a node may be split, or require a minimum gain (purity improvement) for a split to be accepted.
Advantages: cheap and fast, and it directly limits model complexity, which lowers the risk of overfitting.
Disadvantages: the stopping rule is greedy, so a split that looks useless now but would enable useful splits deeper down gets discarded, which can lead to underfitting (see the usage sketch right after this list).
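A minimal usage sketch of pre-pruning, assuming the ID3DecisionTree class from the full program below is available (the tiny dataset here is hypothetical): the constructor arguments max_depth and min_samples_split stop the tree from growing early.

import numpy as np

X_demo = np.array([[0, 1], [1, 0], [1, 1], [0, 0], [1, 2], [0, 2]])  # hypothetical features
y_demo = np.array([0, 1, 1, 0, 1, 0])                                # hypothetical labels

pruned_tree = ID3DecisionTree(max_depth=2, min_samples_split=4)  # pre-pruning knobs
pruned_tree.fit(X_demo, y_demo)
print(pruned_tree.predict(X_demo))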
Post-pruning
Principle: build the complete tree first, then prune it from the bottom up.
Methods: reduced-error pruning against a held-out validation set, or cost-complexity pruning as used by CART.
Advantages: pruning decisions are made with the whole tree in view, so it usually generalizes better than pre-pruning and is less prone to underfitting.
Disadvantages: the complete tree has to be grown first, so it costs more time and memory (a simplified reduced-error pruning sketch follows below).
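The classes in the full program below only implement pre-pruning, so here is a minimal sketch of reduced-error post-pruning against a validation set. It is a hypothetical helper built on the TreeNode class defined below, a simplified variant rather than the author's implementation.

import numpy as np
from collections import Counter

def prune_tree(node, X_val, y_val):
    # Recursively replace a subtree with a majority-class leaf whenever the
    # leaf is at least as accurate as the subtree on the validation samples
    # that reach this node
    if node.is_leaf() or len(y_val) == 0:
        return node
    left_mask = X_val[:, node.feature_idx] <= node.threshold
    node.left = prune_tree(node.left, X_val[left_mask], y_val[left_mask])
    node.right = prune_tree(node.right, X_val[~left_mask], y_val[~left_mask])
    subtree_preds = np.array([predict_one(x, node) for x in X_val])
    subtree_acc = np.mean(subtree_preds == y_val)
    majority = Counter(y_val).most_common(1)[0][0]
    leaf_acc = np.mean(y_val == majority)
    if leaf_acc >= subtree_acc:
        return TreeNode(value=majority)  # prune: the simpler leaf is good enough
    return node

def predict_one(x, node):
    # Walk a single sample down the (sub)tree to a leaf
    while not node.is_leaf():
        node = node.left if x[node.feature_idx] <= node.threshold else node.right
    return node.value

After fitting a tree, one would call tree.root = prune_tree(tree.root, X_val, y_val) with a held-out validation split.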
The core goal of decision tree learning is a model with strong generalization, i.e. a tree that still performs well on unseen data. To get there, we need to: pick splits by genuine purity improvement rather than by memorizing the training set, control model complexity with pre- or post-pruning, and evaluate the tree on held-out data, as the short sketch below illustrates.
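A minimal sketch of that trade-off (assuming the ID3DecisionTree class and the X_train / y_train / X_test / y_test arrays prepared in main() below): compare training and test accuracy as the allowed depth grows. Past some depth the training accuracy keeps rising while the test accuracy stalls or drops, which is overfitting.

for depth in [1, 2, 3, 5, 10]:
    tree = ID3DecisionTree(max_depth=depth)
    tree.fit(X_train, y_train)
    train_acc = np.mean(tree.predict(X_train) == y_train)
    test_acc = np.mean(tree.predict(X_test) == y_test)
    print(f"max_depth={depth}: train={train_acc:.3f}  test={test_acc:.3f}")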
The table referenced here (not reproduced in this text) records whether a credit application is approved. Next, based on the table's contents, let's write the code.
def _build_tree(self, X, y, depth=0):
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))
    # Termination conditions: max depth reached, node is pure,
    # or too few samples left to split
    if (self.max_depth is not None and depth >= self.max_depth) or \
            n_classes == 1 or \
            n_samples < self.min_samples_split:
        leaf_value = self._compute_leaf_value(y)
        return TreeNode(value=leaf_value)
    # Otherwise find the best split and recurse into the two halves
    best_feature, best_threshold = self._find_best_split(X, y, n_features)
    if best_feature is None:
        return TreeNode(value=self._compute_leaf_value(y))
    left_idxs = X[:, best_feature] <= best_threshold
    right_idxs = ~left_idxs
    left = self._build_tree(X[left_idxs], y[left_idxs], depth + 1)
    right = self._build_tree(X[right_idxs], y[right_idxs], depth + 1)
    return TreeNode(feature_idx=best_feature, threshold=best_threshold, left=left, right=right)
def _entropy(self, y):
    counts = np.bincount(y)
    ps = counts / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])
def _find_best_split(self, X, y, n_features):
    # Exhaustively try every feature and every observed value as a threshold,
    # keeping the split with the highest information gain
    best_gain = -1
    best_feature = None
    best_threshold = None
    for feature_idx in range(n_features):
        thresholds = np.unique(X[:, feature_idx])
        for threshold in thresholds:
            gain = self._information_gain(X, y, feature_idx, threshold)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_idx
                best_threshold = threshold
    return best_feature, best_threshold
Formula: $\mathrm{Gain}(D, A) = H(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|} H(D^v)$, i.e. the parent entropy minus the weighted entropy of the subsets $D^v$ produced by splitting on feature $A$.
def _information_gain(self, X, y, feature_idx, threshold):
    parent_entropy = self._entropy(y)
    left_idxs = X[:, feature_idx] <= threshold
    right_idxs = ~left_idxs
    if np.sum(left_idxs) == 0 or np.sum(right_idxs) == 0:
        return 0
    n = len(y)
    n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
    e_left = self._entropy(y[left_idxs])
    e_right = self._entropy(y[right_idxs])
    child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
    information_gain = parent_entropy - child_entropy
    return information_gain
Formula: gain ratio $= \dfrac{\mathrm{Gain}(D, A)}{\mathrm{SplitInfo}(D, A)}$, where the split information is the entropy of the partition itself; dividing by it penalizes features that split the data into many small subsets.
def _split_info(self, X, feature_idx, threshold):
    left_idxs = X[:, feature_idx] <= threshold
    right_idxs = ~left_idxs
    n = len(X)
    n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
    if n_left == 0 or n_right == 0:
        return 0
    p_left = n_left / n
    p_right = n_right / n
    return -p_left * np.log2(p_left) - p_right * np.log2(p_right)

def _information_gain_ratio(self, X, y, feature_idx, threshold):
    information_gain = self._information_gain(X, y, feature_idx, threshold)
    split_information = self._split_info(X, feature_idx, threshold)
    if split_information == 0:
        return 0
    return information_gain / split_information
import numpy as np
from collections import Counter


def load_data(file_path):
    # Each line of the file holds comma-separated integer features,
    # with the class label as the last value
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            items = line.strip().split(',')
            items = [int(item) for item in items]
            data.append(items)
    return np.array(data)


class TreeNode:
    def __init__(self, feature_idx=None, threshold=None, value=None, left=None, right=None):
        self.feature_idx = feature_idx  # index of the feature this node tests
        self.threshold = threshold      # split threshold (go left if <=)
        self.value = value              # predicted class for leaf nodes
        self.left = left
        self.right = right

    def is_leaf(self):
        return self.value is not None
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf():
            return node.value
        if x[node.feature_idx] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        # Termination conditions: max depth reached, node is pure,
        # or too few samples left to split
        if (self.max_depth is not None and depth >= self.max_depth) or \
                n_classes == 1 or \
                n_samples < self.min_samples_split:
            leaf_value = self._compute_leaf_value(y)
            return TreeNode(value=leaf_value)
        best_feature, best_threshold = self._find_best_split(X, y, n_features)
        if best_feature is None:
            return TreeNode(value=self._compute_leaf_value(y))
        left_idxs = X[:, best_feature] <= best_threshold
        right_idxs = ~left_idxs
        left = self._build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._build_tree(X[right_idxs], y[right_idxs], depth + 1)
        return TreeNode(feature_idx=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y, n_features):
        # Implemented by the subclasses (ID3 / C4.5)
        raise NotImplementedError

    def _compute_leaf_value(self, y):
        # Implemented by the subclasses (ID3 / C4.5)
        raise NotImplementedError
class ID3DecisionTree(DecisionTree):
    def __init__(self, max_depth=None, min_samples_split=2):
        super().__init__(max_depth, min_samples_split)

    def _entropy(self, y):
        counts = np.bincount(y)
        ps = counts / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _information_gain(self, X, y, feature_idx, threshold):
        parent_entropy = self._entropy(y)
        left_idxs = X[:, feature_idx] <= threshold
        right_idxs = ~left_idxs
        if np.sum(left_idxs) == 0 or np.sum(right_idxs) == 0:
            return 0
        n = len(y)
        n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
        e_left = self._entropy(y[left_idxs])
        e_right = self._entropy(y[right_idxs])
        child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
        information_gain = parent_entropy - child_entropy
        return information_gain

    def _find_best_split(self, X, y, n_features):
        best_gain = -1
        best_feature = None
        best_threshold = None
        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature_idx, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold
        return best_feature, best_threshold

    def _compute_leaf_value(self, y):
        return Counter(y).most_common(1)[0][0]
class C45DecisionTree(DecisionTree):
    def __init__(self, max_depth=None, min_samples_split=2):
        super().__init__(max_depth, min_samples_split)

    def _entropy(self, y):
        counts = np.bincount(y)
        ps = counts / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _split_info(self, X, feature_idx, threshold):
        left_idxs = X[:, feature_idx] <= threshold
        right_idxs = ~left_idxs
        n = len(X)
        n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
        if n_left == 0 or n_right == 0:
            return 0
        p_left = n_left / n
        p_right = n_right / n
        return -p_left * np.log2(p_left) - p_right * np.log2(p_right)

    def _information_gain_ratio(self, X, y, feature_idx, threshold):
        parent_entropy = self._entropy(y)
        left_idxs = X[:, feature_idx] <= threshold
        right_idxs = ~left_idxs
        if np.sum(left_idxs) == 0 or np.sum(right_idxs) == 0:
            return 0
        n = len(y)
        n_left, n_right = np.sum(left_idxs), np.sum(right_idxs)
        e_left = self._entropy(y[left_idxs])
        e_right = self._entropy(y[right_idxs])
        child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
        information_gain = parent_entropy - child_entropy
        split_information = self._split_info(X, feature_idx, threshold)
        if split_information == 0:
            return 0
        gain_ratio = information_gain / split_information
        return gain_ratio

    def _find_best_split(self, X, y, n_features):
        best_gain_ratio = -1
        best_feature = None
        best_threshold = None
        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])
            for threshold in thresholds:
                gain_ratio = self._information_gain_ratio(X, y, feature_idx, threshold)
                if gain_ratio > best_gain_ratio:
                    best_gain_ratio = gain_ratio
                    best_feature = feature_idx
                    best_threshold = threshold
        return best_feature, best_threshold

    def _compute_leaf_value(self, y):
        return Counter(y).most_common(1)[0][0]
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    test_acc = np.sum(test_pred == y_test) / len(y_test)
    print(f"Test set accuracy: {test_acc:.4f}")
    return test_acc


def main():
    train_path = "C:/Users/Administrator/Downloads/dataset.txt"
    test_path = "C:/Users/Administrator/Downloads/testset.txt"
    train_data = load_data(train_path)
    test_data = load_data(test_path)
    # Split features and labels (label is the last column)
    X_train = train_data[:, :-1]
    y_train = train_data[:, -1]
    X_test = test_data[:, :-1]
    y_test = test_data[:, -1]

    print("\nID3 decision tree (information gain):")
    id3_tree = ID3DecisionTree(max_depth=5)
    id3_test_acc = evaluate_model(id3_tree, X_train, y_train, X_test, y_test)

    print("\nC4.5 decision tree (information gain ratio):")
    c45_tree = C45DecisionTree(max_depth=5)
    c45_test_acc = evaluate_model(c45_tree, X_train, y_train, X_test, y_test)


if __name__ == "__main__":
    main()