# Source code for emlearn.evaluate.pareto


"""
Pareto-optimal evaluation
=========================
"""

import numpy

def is_pareto_efficient_simple(costs):
    """
    Find the pareto-efficient points (smaller is better)

    A point is dropped as soon as some other point is at least as good
    in every cost. When several identical points are Pareto efficient,
    only the first occurrence is marked as efficient.

    :param costs: An (n_points, n_costs) array, or anything numpy.asarray
        can turn into one (for example a list of lists)
    :return: A (n_points, ) boolean array, indicating whether each point is Pareto efficient

    From https://stackoverflow.com/a/40239615/1967571
    Fairly fast for many datapoints, less fast for many costs, somewhat readable
    """
    # Accept array-likes too; numpy arrays pass through without a copy
    costs = numpy.asarray(costs)

    is_efficient = numpy.ones(costs.shape[0], dtype=bool)
    for i, c in enumerate(costs):
        # Points already ruled out cannot rule out anything new
        if not is_efficient[i]:
            continue
        # Among the remaining candidates, keep those that beat point i
        # in at least one cost
        is_efficient[is_efficient] = numpy.any(costs[is_efficient] < c, axis=1)
        # Point i never beats itself, so it was just cleared: restore it
        is_efficient[i] = True
    return is_efficient


def find_pareto_front(df,
    cost_metric : str = 'mean_test_compute',
    performance_metric : str = 'mean_test_accuracy',
    higher_is_better : bool = True,
    min_performance=None):
    """
    Find the Pareto front

    The input is not modified, and the returned rows carry exactly the
    columns of the input (no helper columns are added or overwritten).

    :param df: DataFrame with one row per candidate model
    :param cost_metric: Column with model compute cost. Lower cost always better
    :param performance_metric: Column with model predictive performance.
    :param higher_is_better: Whether higher or lower is better for @performance_metric
    :param min_performance: Cut datapoints with worse performance than this

    :returns: The rows that make up the Pareto front
    """

    performance = df[performance_metric]

    # is_pareto_efficient_simple treats smaller as better for every column,
    # so negate the performance when higher values are the better ones
    score = -performance if higher_is_better else performance

    # Build the cost matrix outside of the frame. Storing the score in a
    # helper column would leak it into the result, and would clobber an
    # existing column of the same name.
    costs = numpy.column_stack([score.to_numpy(), df[cost_metric].to_numpy()])
    on_front = is_pareto_efficient_simple(costs)

    if min_performance is not None:
        if higher_is_better:
            good_enough = performance >= min_performance
        else:
            good_enough = performance <= min_performance
        on_front = on_front & good_enough.to_numpy()

    # Boolean-mask selection builds a new DataFrame, the input stays untouched
    return df[on_front]

def plot_pareto_front(results,
                      pareto_cut=None,
                      plot_other=True,
                      plot_pareto=True,
                      hue=None,
                      pareto_alpha=0.8,
                      other_alpha=0.3,
                      pareto_global=False,
                      s=100,
                      pareto_s=5,
                      height=8,
                      aspect=1,
                      cost_metric='mean_test_compute',
                      performance_metric='mean_test_accuracy',
                      size_metric='mean_test_size'):
    """
    Utility for plotting performance vs compute cost and size of a model.

    Can also compute and plot the pareto front.

    :param results: DataFrame with one row per evaluated model
    :param pareto_cut: Passed to find_pareto_front as min_performance
    :param plot_other: Whether to draw a scatterplot of all rows in @results
    :param plot_pareto: Whether to compute and draw the Pareto front
    :param hue: Column used as hue for the seaborn.FacetGrid, or None
    :param pareto_alpha: Alpha of the Pareto front line
    :param other_alpha: Alpha of the scatterplot markers
    :param pareto_global: If True, draw the whole front for every hue level.
        If False, draw for each hue level only the rows of the front that
        belong to that level. Without a @hue the whole front is always drawn.
    :param s: Scatter marker size, multiplied with @pareto_s
    :param pareto_s: Scatter marker size factor, multiplied with @s
    :param height: Passed to seaborn.FacetGrid
    :param aspect: Passed to seaborn.FacetGrid
    :param cost_metric: Column plotted on the x axis, and used as the cost for the Pareto front
    :param performance_metric: Column plotted on the y axis, and used as the performance for the Pareto front
    :param size_metric: Column mapped to marker size in the scatterplot

    :returns: The seaborn.FacetGrid that was drawn on
    """

    import seaborn

    front = None
    if plot_pareto:
        # Use the same columns for the front as for the axes of the plot
        front = find_pareto_front(results,
                                  cost_metric=cost_metric,
                                  performance_metric=performance_metric,
                                  min_performance=pareto_cut)
        # The front is drawn as a line, so order the points along the x axis
        front = front.sort_values(cost_metric)

    g = seaborn.FacetGrid(results, hue=hue, height=height, aspect=aspect)

    # plot all data
    if plot_other:
        g.map_dataframe(seaborn.scatterplot,
                        x=cost_metric,
                        y=performance_metric,
                        size=size_metric,
                        legend=False,
                        s=pareto_s*s,
                        alpha=other_alpha,
        )

    def _plot_front(color=None, label=None):
        # FacetGrid.map supplies color= always, but label= only when a hue
        # variable is set, hence the defaults. The color is not forwarded.
        if pareto_global or hue is None:
            sub = front
        else:
            # Compare as strings, in case FacetGrid hands over the hue level
            # converted to a string
            sub = front[front[hue].astype(str) == str(label)]

        seaborn.lineplot(
            data=sub,
            x=cost_metric,
            y=performance_metric,
            label=label,
            alpha=pareto_alpha,
            legend=True,
        )

    # plot the data along Pareto front
    if plot_pareto:
        g.map(_plot_front)

    return g