"""
Tree evaluation metrics
=========================
Utilities for measuring a tree-based model
"""
import math
from ..convert import convert as convert_model
from ..trees import Wrapper as TreesWrapper
import numpy
[docs]
def get_tree_estimators(estimator):
    """
    Return the individual tree estimators of a model, as a list

    A model that exposes an ``estimators_`` attribute is treated as an
    ensemble and its members are returned in iteration order.
    Any other model is treated as a single tree and is returned
    wrapped in a one-element list.
    """
    if not hasattr(estimator, 'estimators_'):
        # No sub-estimators: the model itself is the only tree
        return [estimator]
    return list(estimator.estimators_)
[docs]
def model_size_nodes(model, a=None, b=None):
    """
    Size of model, in number of decision nodes

    A TreesWrapper instance is used as-is; any other model is first passed
    through convert_model. The count is the length of the first element
    of the converted model's ``forest_``.
    The parameters ``a`` and ``b`` are accepted but not used.
    """
    em = model if isinstance(model, TreesWrapper) else convert_model(model)
    nodes, _roots, _leaves = em.forest_
    return len(nodes)
[docs]
def model_size_leaves(model, a=None, b=None):
    """
    Size of model, in number of leaves

    A TreesWrapper instance is used as-is; any other model is first passed
    through convert_model. The count is the length of the third element
    of the converted model's ``forest_``.
    The parameters ``a`` and ``b`` are accepted but not used.
    """
    em = model if isinstance(model, TreesWrapper) else convert_model(model)
    _nodes, _roots, leaves = em.forest_
    return len(leaves)
[docs]
def model_size_bytes(model, a=None, b=None, node_size=None, leaf_size=None):
    """
    Size of model, in bytes. Counts both decision nodes and leaves

    node_size: bytes used per decision node. Defaults to 8
    leaf_size: bytes used per leaf. Defaults to the converted model's
        ``leaf_bits``, rounded up to whole bytes
    The parameters ``a`` and ``b`` are accepted but not used.
    """
    if node_size is None:
        # An EmlTreesNode holds a feature index, a threshold value and the
        # left/right child indices. That is 56 bits, which ends up as
        # 8 bytes on most platforms due to padding/alignment
        node_size = 8

    em = model if isinstance(model, TreesWrapper) else convert_model(model)

    if leaf_size is None:
        leaf_size = math.ceil(em.leaf_bits / 8)

    n_nodes = model_size_nodes(em)
    n_leaves = model_size_leaves(em)
    return (n_nodes * node_size) + (n_leaves * leaf_size)
[docs]
def tree_depth_average(model, a=None, b=None):
    """
    Average depth of model

    Computed as the mean of ``tree_.max_depth`` over the trees returned
    by get_tree_estimators.
    The parameters ``a`` and ``b`` are accepted but not used.
    """
    max_depths = []
    for tree in get_tree_estimators(model):
        max_depths.append(tree.tree_.max_depth)
    return numpy.mean(max_depths)
[docs]
def tree_depth_difference(model, a=None, b=None):
    """
    Measures how much variation there is in tree depths

    Computed as the largest minus the smallest ``tree_.max_depth`` over
    the trees returned by get_tree_estimators.
    The parameters ``a`` and ``b`` are accepted but not used.
    """
    max_depths = numpy.array(
        [tree.tree_.max_depth for tree in get_tree_estimators(model)]
    )
    return max_depths.max() - max_depths.min()
[docs]
def count_trees(model, a=None, b=None):
    """
    Number of trees in model

    This is the length of the list returned by get_tree_estimators.
    The parameters ``a`` and ``b`` are accepted but not used.
    """
    return len(get_tree_estimators(model))
[docs]
def compute_cost_estimate(model, X, b=None):
    """
    Estimate the compute cost of running the model, under these assumptions:

    - The dataset X is representative of the typical dataset
    - Cost is proportional to the number of decision node evaluations in a tree
    - The cost is added across all trees in the ensemble

    Under this model, the actual compute time can be computed as
    the estimate times a constant C, representing the time that
    a single evaluation of a decision node takes.

    The parameter ``b`` is accepted but not used.
    """
    samples = numpy.array(X)

    estimate = 0.0
    for tree in get_tree_estimators(model):
        # Per-sample count of nodes on the decision path, averaged over X
        # NOTE(review): the path indicator from decision_path may also mark
        # the leaf a sample ends in, not only decision nodes - confirm
        nodes_visited = numpy.sum(tree.decision_path(samples), axis=1)
        estimate += numpy.mean(nodes_visited)
    return estimate