.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/trees_feature_quantization.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_examples_trees_feature_quantization.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_trees_feature_quantization.py:


Feature data-types for trees
============================

Tree-based models in emlearn support both floating-point and integer datatypes for the features.
This example illustrates how this can impact model size.

.. GENERATED FROM PYTHON SOURCE LINES 11-28

.. code-block:: Python


    import os.path

    import emlearn
    import numpy
    import pandas
    import seaborn
    import matplotlib.pyplot as plt

    # Directory of this example; used to locate the cached results files.
    try:
        # When executed as regular .py script
        here = os.path.dirname(__file__)
    except NameError:
        # When executed as Jupyter notebook / Sphinx Gallery
        # (__file__ is not defined there, so fall back to the working directory)
        here = os.getcwd()


.. GENERATED FROM PYTHON SOURCE LINES 29-33

Train a RandomForest model
--------------------------

The key step is to transform the data into integers that fit the chosen feature datatype.

.. GENERATED FROM PYTHON SOURCE LINES 33-65

.. code-block:: Python

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import cross_val_score, StratifiedKFold

    def train_model(data):
        """Train a RandomForest classifier on quantized (integer) features.

        Parameters:
            data: table with a 'label' column; every other column is used
                as a feature (accessed via ``data.columns`` and ``data[...]``).

        Returns:
            The RandomForestClassifier fitted on the complete quantized dataset.

        Raises:
            AssertionError: if the mean cross-validation accuracy is below 0.60.
        """

        label_column = 'label'
        # Keep the features in their original column order.
        # Going through set() gives no ordering guarantee, so the feature order
        # (and with it the trained trees) could differ from one run to the next,
        # despite the fixed random_state below.
        feature_columns = [c for c in data.columns if c != label_column]
        X = data[feature_columns]
        Y = data[label_column]

        # Rescale to the range 0..127 and truncate to integers (quantize).
        # These values fit in int8, the smallest representation.
        # For larger feature datatypes, such as int16, the scale factor
        # may need to be adapted.
        X = (MinMaxScaler().fit_transform(X) * 127).astype(int)

        model = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=1)

        # sanity check performance
        cv = StratifiedKFold(5, random_state=None, shuffle=False)
        scores = cross_val_score(model, X, Y, cv=cv)
        assert numpy.mean(scores) >= 0.60, numpy.mean(scores)

        # The cross-validation above works on clones; fit the returned model on all data
        model.fit(X, Y)

        return model

    from emlearn.examples.datasets.sonar import load_sonar_dataset

    # Load the example dataset and train the model used in the size measurements below
    data = load_sonar_dataset()
    model = train_model(data)


.. GENERATED FROM PYTHON SOURCE LINES 66-71

Measure how feature datatype impacts program size
-------------------------------------------------

We are testing here on the AVR8 platform, which has no floating-point unit (FPU),
as well as on ARM Cortex-M0 and Cortex-M4F.
Other platforms may show different results.

.. GENERATED FROM PYTHON SOURCE LINES 71-163

.. code-block:: Python


    from emlearn.evaluate.size import get_program_size, check_build_tools

    def check_program_size(dtype, model, platform, mcu):
        """Measure the size of a minimal C program built around the model.

        ``dtype`` is the feature datatype handed to ``emlearn.convert``.
        Two values are treated specially:

        - 'no-model': the program is built without any model code
        - 'loadable': the 'loadable' conversion method is used, with int16_t features

        All other values use the 'inline' conversion method.
        Returns the output of ``get_program_size`` wrapped in a pandas.Series.
        """

        model_enabled = 1 if dtype != 'no-model' else 0
        method = 'loadable' if dtype == 'loadable' else 'inline'
        if method == 'loadable':
            dtype = 'int16_t'
        features_length = model.estimators_[0].n_features_in_

        model_name = f'sizecheck_{method}_{dtype}'
        print(model_name)

        # Generate C code for the model, quantized with the specified dtype.
        # Stays empty when measuring the program without a model.
        model_code = ""
        if model_enabled:
            converted = emlearn.convert(model, dtype=dtype, method=method)
            model_code = converted.save(name=model_name, include_proba=False)

        # The "#if" guards drop the model and the predict call from the no-model build
        test_program = f"""
        // Disable unused features
        #define EML_TREES_REGRESSION_ENABLE 0

        #include <stdint.h>

        #if {model_enabled}
        {model_code}

        static {dtype} features[{features_length}] = {{0, }};
        #endif

        int main()
        {{
            uint8_t pred = 0;
            #if {model_enabled}
            pred = {model_name}_predict(features, {features_length});
            #endif
            int out = pred;
            return out;
        }}
        """
        sizes = get_program_size(test_program, platform=platform, mcu=mcu)
        return pandas.Series(sizes)
    

    def run_experiment(model, platform, mcu):
        """Measure model size for each feature datatype on one platform/mcu.

        When the build tools for ``platform`` are missing, previously stored
        results are loaded from a CSV file next to this example instead.
        Otherwise the measurements are run, written to that CSV file and returned.
        """

        results_file = os.path.join(here, f'trees-feature-quantization-{platform}+{mcu}.csv')

        # Without the compiler toolchain nothing can be built: fall back to stored results.
        # NOTE: results loaded this way have a default integer index, whereas freshly
        # computed results are indexed by (platform, cpu, dtype).
        missing_tools = check_build_tools(platform)
        if missing_tools:
            print(f"WARNING: Compiler toolchain for platform '{platform}' not found. Loading cached results")
            return pandas.read_csv(results_file)

        dtypes = ('no-model', 'loadable', 'float', 'int32_t', 'int16_t', 'int8_t', 'uint8_t')
        experiments = pandas.DataFrame({'dtype': dtypes})
        sizes = experiments['dtype'].apply(check_program_size, model=model, platform=platform, mcu=mcu)
        results = pandas.merge(experiments, sizes, left_index=True, right_index=True).set_index('dtype')

        # The 'no-model' build is the baseline: subtracting it leaves only the model size
        baseline = results.loc['no-model']
        results = (results - baseline).drop(index='no-model')

        # Tag the rows with the target they were measured on
        results['platform'] = platform
        results['cpu'] = mcu
        results = results.reset_index().set_index(['platform', 'cpu', 'dtype'])

        results.to_csv(results_file)
        print("Ran experiments. Results written to", results_file)
        return results


    # Targets to measure, as (platform, cpu) pairs
    platforms = pandas.DataFrame.from_records(
        [
            ('avr', 'atmega2560'),
            ('arm', 'Cortex-M0'),
            ('arm', 'Cortex-M4F'),
        ],
        columns=['platform', 'cpu'],
    )

    # Run (or load) the experiment for each target and stack the results
    results = pandas.concat([
        run_experiment(model, platform=target.platform, mcu=target.cpu)
        for target in platforms.itertuples(index=False)
    ])
    print(results)


.. rst-class:: sphx-glr-script-out

 .. code-block:: none

    WARNING: Compiler toolchain for platform 'avr' not found. Loading cached results
    WARNING: Compiler toolchain for platform 'arm' not found. Loading cached results
    WARNING: Compiler toolchain for platform 'arm' not found. Loading cached results
      platform         cpu     dtype  flash   ram
    0      avr  atmega2560  loadable   3980  1890
    1      avr  atmega2560     float   7160   240
    2      avr  atmega2560   int32_t   6206   240
    3      avr  atmega2560   int16_t   4498   120
    4      avr  atmega2560    int8_t   2486    60
    5      avr  atmega2560   uint8_t   2476    60
    0      arm   Cortex-M0  loadable   4896   144
    1      arm   Cortex-M0     float   4676   240
    2      arm   Cortex-M0   int32_t   2456   240
    3      arm   Cortex-M0   int16_t   2864   120
    4      arm   Cortex-M0    int8_t   2712    60
    5      arm   Cortex-M0   uint8_t   2256    60
    0      arm  Cortex-M4F  loadable   2456   144
    1      arm  Cortex-M4F     float   4608   240
    2      arm  Cortex-M4F   int32_t   1972   240
    3      arm  Cortex-M4F   int16_t   2204   120
    4      arm  Cortex-M4F    int8_t   2204    60
    5      arm  Cortex-M4F   uint8_t   1988    60


.. GENERATED FROM PYTHON SOURCE LINES 164-169

Plot results
-------------------------------

There can be considerable reductions in program memory consumption
by picking a suitable datatype for the platform.

.. GENERATED FROM PYTHON SOURCE LINES 169-195

.. code-block:: Python


    def plot_results(results):
        """Plot flash usage per feature datatype, with one row per platform/cpu.

        Returns the figure of the seaborn grid.
        """
        table = results.reset_index()
        # Combined label, used to give each target its own row in the plot
        table['name'] = table.platform + '/' + table.cpu

        grid = seaborn.catplot(
            data=table,
            kind='bar',
            x='dtype',
            y='flash',
            row='name',
            height=4,
            aspect=2,
        )
        figure = grid.figure
        figure.suptitle("Model size vs feature datatype")

        # Horizontal grid lines, drawn below the bars
        for axes in grid.axes.flat:
            axes.set_axisbelow(True)
            axes.grid(True, which='major', axis='y')

        return figure

    # Create the plot and save it as a PNG (relative path: current working directory)
    fig = plot_results(results)
    fig.savefig('example-trees-feature-quantization.png')


.. image-sg:: /auto_examples/images/sphx_glr_trees_feature_quantization_001.png
   :alt: Model size vs feature datatype, name = avr/atmega2560, name = arm/Cortex-M0, name = arm/Cortex-M4F
   :srcset: /auto_examples/images/sphx_glr_trees_feature_quantization_001.png
   :class: sphx-glr-single-img


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 0.638 seconds)


.. _sphx_glr_download_auto_examples_trees_feature_quantization.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: trees_feature_quantization.ipynb <trees_feature_quantization.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: trees_feature_quantization.py <trees_feature_quantization.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: trees_feature_quantization.zip <trees_feature_quantization.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_