1. Decision Tree Pruning
There are two ways to prune a decision tree: pre-pruning and post-pruning. For post-pruning, scikit-learn provides Cost Complexity Pruning (CCP), exposed through the `ccp_alpha` parameter. A concrete example:
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
from sklearn import tree
import numpy as np
# -------- Data preparation ----------------------------
iris = load_iris()  # load the dataset
X = iris.data
y = iris.target
# -------- Model training ------------------------------
clf = tree.DecisionTreeClassifier(min_samples_split=10, random_state=0, ccp_alpha=0)
clf = clf.fit(X, y)
# -------- Compute the CCP path ------------------------
pruning_path = clf.cost_complexity_pruning_path(X, y)
# -------- Print the results ---------------------------
print("\n==== CCP path =================")
print("ccp_alphas:", pruning_path['ccp_alphas'])
print("impurities:", pruning_path['impurities'])
# -------- Post-prune the tree with a chosen alpha -----
clf = tree.DecisionTreeClassifier(min_samples_split=10, random_state=0, ccp_alpha=0.1)
clf = clf.fit(X, y)
# -------- Recompute the tree impurity to verify -------
is_leaf = clf.tree_.children_left == -1
tree_impurities = (clf.tree_.impurity[is_leaf] * clf.tree_.n_node_samples[is_leaf] / len(y)).sum()
# -------- Print the result ----------------------------
print("\n== Tree impurity after pruning with alpha=0.1: =========\n", tree_impurities)
After the tree has been fully grown, this method prunes it so as to minimize the cost-complexity function R_alpha(T) = R(T) + alpha * |T|, where R(T) is the total weighted impurity of the leaves and |T| is the number of leaves. What it cannot do is cut specific, caller-chosen branches, so we need a method that prunes exactly the branches we designate. scikit-learn does not provide one, so we modify the source code.
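As a toy numeric illustration of when CCP cuts a branch (all numbers here are invented for the example):
# Toy numbers, for illustration only: a branch rooted at node t with
# R(t) = 0.20 (weighted impurity if t were collapsed into one leaf) and
# leaves whose weighted impurities sum to R(T_t) = 0.05, over 3 leaves.
R_t, R_Tt, n_leaves = 0.20, 0.05, 3
# Collapsing the branch pays off once alpha exceeds its
# "effective alpha" g(t) = (R(t) - R(T_t)) / (|T_t| - 1):
g_t = (R_t - R_Tt) / (n_leaves - 1)
print(g_t)  # 0.075 -> this branch is pruned when ccp_alpha >= 0.075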
2. Decision Tree Source Code Analysis
Download the scikit-learn-1.1.X source from GitHub. The decision tree implementation lives in scikit-learn-1.1.X/sklearn/tree/_tree.pyx and is written in Cython. An analysis of the relevant code follows:
# Entry point for CCP pruning
def _build_pruned_tree_ccp(
    Tree tree, # OUT
    Tree orig_tree,
    DOUBLE_t ccp_alpha):
    """Build a pruned tree from the original tree using cost complexity
    pruning.

    The values and nodes from the original tree are copied into the pruned
    tree.

    Parameters
    ----------
    tree : Tree
        Location to place the pruned tree
    orig_tree : Tree
        Original tree
    ccp_alpha : positive double
        Complexity parameter. The subtree with the largest cost complexity
        that is smaller than ``ccp_alpha`` will be chosen. By default,
        no pruning is performed.
    """
    cdef:
        SIZE_t n_nodes = orig_tree.node_count
        unsigned char[:] leaves_in_subtree = np.zeros(
            shape=n_nodes, dtype=np.uint8)

    pruning_controller = _AlphaPruner(ccp_alpha=ccp_alpha)

    # The CCP computation decides which nodes to prune
    _cost_complexity_prune(leaves_in_subtree, orig_tree, pruning_controller)

    # The routine that performs the actual pruning
    _build_pruned_tree(tree, orig_tree, leaves_in_subtree,
                       pruning_controller.capacity)
cdef class _CCPPruneController:
    """Base class used by build_pruned_tree_ccp and ccp_pruning_path
    to control pruning.
    """
    cdef bint stop_pruning(self, DOUBLE_t effective_alpha) nogil:
        """Return 1 to stop pruning and 0 to continue pruning"""
        return 0

    cdef void save_metrics(self, DOUBLE_t effective_alpha,
                           DOUBLE_t subtree_impurities) nogil:
        """Save metrics when pruning"""
        pass

    cdef void after_pruning(self, unsigned char[:] in_subtree) nogil:
        """Called after pruning"""
        pass

cdef class _AlphaPruner(_CCPPruneController):
    """Use alpha to control when to stop pruning."""
    cdef DOUBLE_t ccp_alpha
    cdef SIZE_t capacity

    def __cinit__(self, DOUBLE_t ccp_alpha):
        self.ccp_alpha = ccp_alpha
        self.capacity = 0

    cdef bint stop_pruning(self, DOUBLE_t effective_alpha) nogil:
        # The subtree on the previous iteration has the greatest ccp_alpha
        # less than or equal to self.ccp_alpha
        return self.ccp_alpha < effective_alpha

    cdef void after_pruning(self, unsigned char[:] in_subtree) nogil:
        """Updates the number of leaves in subtree"""
        for i in range(in_subtree.shape[0]):
            if in_subtree[i]:
                self.capacity += 1

cdef struct CostComplexityPruningRecord:
    SIZE_t node_idx
    SIZE_t parent
cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT
                            Tree orig_tree,
                            _CCPPruneController controller):
    """Perform cost complexity pruning.

    This function takes an already grown tree, `orig_tree` and outputs a
    boolean mask `leaves_in_subtree` which are the leaves in the pruned tree.
    During the pruning process, the controller is passed the effective alpha and
    the subtree impurities. Furthermore, the controller signals when to stop
    pruning.

    Parameters
    ----------
    leaves_in_subtree : unsigned char[:]
        Output for leaves of subtree
    orig_tree : Tree
        Original tree
    ccp_controller : _CCPPruneController
        Cost complexity controller
    """
    cdef:
        SIZE_t i
        SIZE_t n_nodes = orig_tree.node_count
        # prior probability using weighted samples
        DOUBLE_t[:] weighted_n_node_samples = orig_tree.weighted_n_node_samples
        DOUBLE_t total_sum_weights = weighted_n_node_samples[0]
        DOUBLE_t[:] impurity = orig_tree.impurity
        # weighted impurity of each node
        DOUBLE_t[:] r_node = np.empty(shape=n_nodes, dtype=np.float64)

        SIZE_t[:] child_l = orig_tree.children_left
        SIZE_t[:] child_r = orig_tree.children_right
        SIZE_t[:] parent = np.zeros(shape=n_nodes, dtype=np.intp)

        stack[CostComplexityPruningRecord] ccp_stack
        CostComplexityPruningRecord stack_record
        int rc = 0
        SIZE_t node_idx
        stack[SIZE_t] node_indices_stack

        SIZE_t[:] n_leaves = np.zeros(shape=n_nodes, dtype=np.intp)
        DOUBLE_t[:] r_branch = np.zeros(shape=n_nodes, dtype=np.float64)
        DOUBLE_t current_r
        SIZE_t leaf_idx
        SIZE_t parent_idx

        # candidate nodes that can be pruned
        unsigned char[:] candidate_nodes = np.zeros(shape=n_nodes,
                                                    dtype=np.uint8)
        # nodes in subtree
        unsigned char[:] in_subtree = np.ones(shape=n_nodes, dtype=np.uint8)
        DOUBLE_t[:] g_node = np.zeros(shape=n_nodes, dtype=np.float64)
        SIZE_t pruned_branch_node_idx
        DOUBLE_t subtree_alpha
        DOUBLE_t effective_alpha
        SIZE_t child_l_idx
        SIZE_t child_r_idx
        SIZE_t n_pruned_leaves
        DOUBLE_t r_diff
        DOUBLE_t max_float64 = np.finfo(np.float64).max
    # find parent node ids and leaves
    with nogil:
        for i in range(r_node.shape[0]):
            r_node[i] = (
                weighted_n_node_samples[i] * impurity[i] / total_sum_weights)

        # Push the root node
        ccp_stack.push({"node_idx": 0, "parent": _TREE_UNDEFINED})

        while not ccp_stack.empty():
            stack_record = ccp_stack.top()
            ccp_stack.pop()

            node_idx = stack_record.node_idx
            parent[node_idx] = stack_record.parent

            if child_l[node_idx] == _TREE_LEAF:
                # ... and child_r[node_idx] == _TREE_LEAF:
                leaves_in_subtree[node_idx] = 1
            else:
                ccp_stack.push({"node_idx": child_l[node_idx], "parent": node_idx})
                ccp_stack.push({"node_idx": child_r[node_idx], "parent": node_idx})

        # computes number of leaves in all branches and the overall impurity of
        # the branch. The overall impurity is the sum of r_node in its leaves.
        for leaf_idx in range(leaves_in_subtree.shape[0]):
            if not leaves_in_subtree[leaf_idx]:
                continue
            r_branch[leaf_idx] = r_node[leaf_idx]

            # bubble up values to ancestor nodes
            current_r = r_node[leaf_idx]
            while leaf_idx != 0:
                parent_idx = parent[leaf_idx]
                r_branch[parent_idx] += current_r
                n_leaves[parent_idx] += 1
                leaf_idx = parent_idx

        for i in range(leaves_in_subtree.shape[0]):
            candidate_nodes[i] = not leaves_in_subtree[i]

        # save metrics before pruning
        controller.save_metrics(0.0, r_branch[0])

        # while root node is not a leaf
        while candidate_nodes[0]:

            # computes ccp_alpha for subtrees and finds the minimal alpha
            effective_alpha = max_float64
            for i in range(n_nodes):
                if not candidate_nodes[i]:
                    continue
                subtree_alpha = (r_node[i] - r_branch[i]) / (n_leaves[i] - 1)
                if subtree_alpha < effective_alpha:
                    effective_alpha = subtree_alpha
                    pruned_branch_node_idx = i

            if controller.stop_pruning(effective_alpha):
                break

            node_indices_stack.push(pruned_branch_node_idx)

            # descendants of branch are not in subtree
            while not node_indices_stack.empty():
                node_idx = node_indices_stack.top()
                node_indices_stack.pop()

                if not in_subtree[node_idx]:
                    continue  # branch has already been marked for pruning
                candidate_nodes[node_idx] = 0
                leaves_in_subtree[node_idx] = 0
                in_subtree[node_idx] = 0

                if child_l[node_idx] != _TREE_LEAF:
                    # ... and child_r[node_idx] != _TREE_LEAF:
                    node_indices_stack.push(child_l[node_idx])
                    node_indices_stack.push(child_r[node_idx])
            leaves_in_subtree[pruned_branch_node_idx] = 1
            in_subtree[pruned_branch_node_idx] = 1

            # updates number of leaves
            n_pruned_leaves = n_leaves[pruned_branch_node_idx] - 1
            n_leaves[pruned_branch_node_idx] = 0

            # computes the increase in r_branch to bubble up
            r_diff = r_node[pruned_branch_node_idx] - r_branch[pruned_branch_node_idx]
            r_branch[pruned_branch_node_idx] = r_node[pruned_branch_node_idx]

            # bubble up values to ancestors
            node_idx = parent[pruned_branch_node_idx]
            while node_idx != _TREE_UNDEFINED:
                n_leaves[node_idx] -= n_pruned_leaves
                r_branch[node_idx] += r_diff
                node_idx = parent[node_idx]

            controller.save_metrics(effective_alpha, r_branch[0])

        controller.after_pruning(in_subtree)
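The quantities the loop above maintains can be reproduced from plain Python through the public `tree_` arrays. A minimal sketch, assuming `clf` is any fitted `DecisionTreeClassifier` (for instance the unpruned one from section 1, fitted with `ccp_alpha=0`); it computes `r_node` and the effective alpha of the root exactly as the Cython code does for the full tree:

import numpy as np

t = clf.tree_
total_weight = t.weighted_n_node_samples[0]
# r_node[i]: weighted impurity of node i, as in the Cython loop above
r_node = t.weighted_n_node_samples * t.impurity / total_weight

is_leaf = t.children_left == -1            # _TREE_LEAF == -1
r_branch_root = r_node[is_leaf].sum()      # total impurity of the leaves
n_leaves_root = is_leaf.sum()
# effective alpha of the root: (r_node - r_branch) / (n_leaves - 1)
g_root = (r_node[0] - r_branch_root) / (n_leaves_root - 1)
print(g_root)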
# Record type used by the pruning routine below
cdef struct BuildPrunedRecord:
    SIZE_t start
    SIZE_t depth
    SIZE_t parent
    bint is_left

# The pruning routine; leaves_in_subtree marks which nodes become leaves
cdef _build_pruned_tree(
    Tree tree, # OUT: the pruned tree
    Tree orig_tree, # the original tree
    const unsigned char[:] leaves_in_subtree,
    SIZE_t capacity):
"""Build a pruned tree.
Build a pruned tree from the original tree by transforming the nodes in
``leaves_in_subtree`` into leaves.
Parameters
----------
tree : Tree
Location to place the pruned tree
orig_tree : Tree
Original tree
leaves_in_subtree : unsigned char memoryview, shape=(node_count, )
Boolean mask for leaves to include in subtree
capacity : SIZE_t
Number of nodes to initially allocate in pruned tree
"""
tree._resize(capacity)
cdef:
SIZE_t orig_node_id
SIZE_t new_node_id
SIZE_t depth
SIZE_t parent
bint is_left
bint is_leaf
# value_stride for original tree and new tree are the same
SIZE_t value_stride = orig_tree.value_stride
SIZE_t max_depth_seen = -1
int rc = 0
Node* node
double* orig_value_ptr
double* new_value_ptr
stack[BuildPrunedRecord] prune_stack
BuildPrunedRecord stack_record
with nogil:
# push root node onto stack
prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0})
while not prune_stack.empty():
stack_record = prune_stack.top()
prune_stack.pop()
orig_node_id = stack_record.start
depth = stack_record.depth
parent = stack_record.parent
is_left = stack_record.is_left
is_leaf = leaves_in_subtree[orig_node_id]
node = &orig_tree.nodes[orig_node_id]
new_node_id = tree._add_node(
parent, is_left, is_leaf, node.feature, node.threshold,
node.impurity, node.n_node_samples,
node.weighted_n_node_samples)
if new_node_id == SIZE_MAX:
rc = -1
break
# copy value from original tree to new tree
orig_value_ptr = orig_tree.value + value_stride * orig_node_id
new_value_ptr = tree.value + value_stride * new_node_id
memcpy(new_value_ptr, orig_value_ptr, sizeof(double) * value_stride)
if not is_leaf:
# Push right child on stack
prune_stack.push({"start": node.right_child, "depth": depth + 1,
"parent": new_node_id, "is_left": 0})
# push left child on stack
prune_stack.push({"start": node.left_child, "depth": depth + 1,
"parent": new_node_id, "is_left": 1})
if depth > max_depth_seen:
max_depth_seen = depth
if rc >= 0:
tree.max_depth = max_depth_seen
if rc == -1:
raise MemoryError("pruning tree")
Starting from the CCP code, we strip out the CCP computation so that the caller can specify directly which nodes to prune at, and reuse the `_build_pruned_tree` routine. The resulting `prune_tree` method is shown below (a pure-Python rendering of the same mask logic follows the listing):
cdef struct PruningRecord:
    SIZE_t node_idx
    SIZE_t parent
    SIZE_t leaves

def prune_tree(
    Tree tree, # OUT: the pruned tree
    Tree orig_tree,
    list leaves_redoces):
    cdef:
        SIZE_t n_nodes = orig_tree.node_count
        unsigned char[:] leaves_in_subtree = np.zeros(shape=n_nodes, dtype=np.uint8)
        stack[PruningRecord] ccp_stack
        PruningRecord stack_record
        SIZE_t[:] child_l = orig_tree.children_left
        SIZE_t[:] child_r = orig_tree.children_right
        SIZE_t capacity = 0
        SIZE_t leaves = 0

    # pre-mark the nodes whose subtrees should be collapsed into leaves
    for leaf_idx in leaves_redoces:
        leaves_in_subtree[leaf_idx] = 1

    ccp_stack.push({"node_idx": 0, "parent": _TREE_UNDEFINED, "leaves": 0})
    while not ccp_stack.empty():
        stack_record = ccp_stack.top()
        leaves = stack_record.leaves
        ccp_stack.pop()
        node_idx = stack_record.node_idx
        # once a pre-marked node is reached, everything below it
        # inherits the leaf flag
        if leaves_in_subtree[node_idx]:
            leaves = leaves_in_subtree[node_idx]
        if child_l[node_idx] != _TREE_LEAF:
            leaves_in_subtree[node_idx] = leaves
            ccp_stack.push({"node_idx": child_l[node_idx], "parent": node_idx, "leaves": leaves})
            ccp_stack.push({"node_idx": child_r[node_idx], "parent": node_idx, "leaves": leaves})
        else:
            # original leaves always stay leaves
            leaves_in_subtree[node_idx] = 1

    # initial allocation for the rebuilt tree (_add_node grows it if needed)
    for i in range(leaves_in_subtree.shape[0]):
        if leaves_in_subtree[i]:
            capacity += 1
    _build_pruned_tree(tree, orig_tree, leaves_in_subtree, capacity)
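The mask propagation may be easier to follow in pure Python. This sketch is my own illustration, not part of the patch; it builds the same `leaves_in_subtree` mask from a fitted estimator's `tree_` and a list of node ids to collapse:

import numpy as np

def leaf_mask(tree_, prune_at):
    """Mark every node that becomes a leaf of the pruned tree."""
    left, right = tree_.children_left, tree_.children_right
    mask = np.zeros(tree_.node_count, dtype=np.uint8)
    # (node, inherited) pairs: once a node in prune_at is reached,
    # the whole subtree below it inherits the "leaf" flag
    todo = [(0, 0)]
    while todo:
        node, inherited = todo.pop()
        inherited = inherited or (node in prune_at)
        if left[node] == -1:          # an original leaf stays a leaf
            mask[node] = 1
        else:
            mask[node] = inherited    # collapsed branch roots become leaves
            todo.append((left[node], inherited))
            todo.append((right[node], inherited))
    return mask

`_build_pruned_tree` then walks down from the root and stops descending at the first node whose mask bit is set, so only the marked branch roots (plus untouched leaves) survive as leaves; the mask bits set on nodes below a collapsed branch are never visited.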
Since this is a .pyx file, it has to be compiled from source to produce the extension module:
python setup.py build_ext --inplace
Then copy the generated sklearn\tree\_tree.cp38-win_amd64.pyd over the file of the same name in the current Python environment, restart, and test:
#-*- coding: UTF-8 -*-
'''
Created on 2021-03-04
@author: xuconghui
'''
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree._tree import prune_tree, Tree
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import pandas as pd

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
print("Features: " + str(iris.feature_names))

# Create and train a decision tree classifier
tree_classifier = DecisionTreeClassifier(random_state=0)
tree_classifier.fit(X, y)
y_predict = tree_classifier.predict(X)
acc = accuracy_score(y, y_predict)
p_score = precision_score(y, y_predict, average='micro')
r_score = recall_score(y, y_predict, average='micro')
f_score = f1_score(y, y_predict, average='micro')
print("accuracy=" + str(acc) + " precision=" + str(p_score)
      + " recall=" + str(r_score) + " f_score=" + str(f_score))
tree.plot_tree(tree_classifier, filled=True)
plt.show()

# Manually prune the decision tree at the given node ids
def xprune_tree(tree_model, recodes=[7]):
    # Tree's constructor expects the number of classes per output
    # (n_classes_), not the array of class labels (classes_)
    pruned_tree = Tree(tree_model.n_features_in_,
                       np.atleast_1d(tree_model.n_classes_),
                       tree_model.n_outputs_)
    prune_tree(pruned_tree, tree_model.tree_, recodes)
    tree_model.tree_ = pruned_tree

xprune_tree(tree_classifier)

# Export the result
with open("iris.dot", 'w') as f:
    tree.export_graphviz(tree_classifier, out_file=f)
tree.plot_tree(tree_classifier, filled=True)
plt.show()
The decision tree was pruned exactly as expected, but other tools cannot render it and I have not figured out why; I am not very familiar with Cython, so help from someone more experienced would be appreciated. The pruned tree's coefficients do change accordingly, it can still be used for prediction, accuracy can be computed, and PMML export works, so all of the original decision tree functionality is preserved.
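For anyone debugging the rendering issue, a quick structural sanity check (my own addition, reusing the variables from the script above) is to compare node and leaf counts and confirm that prediction still runs on the pruned tree:

# Node/leaf counts should shrink after xprune_tree, and predict should work
t = tree_classifier.tree_
print("node_count:", t.node_count, " n_leaves:", (t.children_left == -1).sum())
print("accuracy after pruning:", accuracy_score(y, tree_classifier.predict(X)))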