.. DO NOT EDIT. .. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. .. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: .. "auto_examples/trees_feature_quantization.py" .. LINE NUMBERS ARE GIVEN BELOW. .. only:: html .. note:: :class: sphx-glr-download-link-note :ref:`Go to the end <sphx_glr_download_auto_examples_trees_feature_quantization.py>` to download the full example code. .. rst-class:: sphx-glr-example-title .. _sphx_glr_auto_examples_trees_feature_quantization.py: Feature data-types for trees ============================ Tree-based models in emlearn support both float and integer datatypes for the features. This example illustrates how this can impact model size. .. GENERATED FROM PYTHON SOURCE LINES 11-28 .. code-block:: Python import os.path import emlearn import numpy import pandas import seaborn import matplotlib.pyplot as plt try: # When executed as regular .py script here = os.path.dirname(__file__) except NameError: # When executed as Jupyter notebook / Sphinx Gallery here = os.getcwd() .. GENERATED FROM PYTHON SOURCE LINES 29-33 Train a RandomForest model -------------------------- The key thing is to transform the data into integers that fit the chosen feature datatype. .. GENERATED FROM PYTHON SOURCE LINES 33-65 .. 
code-block:: Python


    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import cross_val_score, StratifiedKFold

    def train_model(data):
        """Train a small random forest on integer-quantized features.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a 'label' column. Every other column is used
            as a feature.

        Returns
        -------
        RandomForestClassifier
            The classifier, fitted on the full quantized dataset.

        Raises
        ------
        AssertionError
            If the mean 5-fold cross-validation score is below 0.60.
        """
        label_column = 'label'
        # Keep the DataFrame's own column order. A set difference would yield
        # an order that changes between interpreter runs (string hashing is
        # randomized), making the trained model non-reproducible.
        feature_columns = [c for c in data.columns if c != label_column]
        X = data[feature_columns]
        Y = data[label_column]

        # Rescale and convert to integers (quantize)
        # Here everything is made to fit in int8, the smallest representation
        # it may be needed to adapt to larger ones, such as uint16
        X = (MinMaxScaler().fit_transform(X) * 127).astype(int)

        model = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=1)

        # sanity check performance
        cv = StratifiedKFold(5, random_state=None, shuffle=False)
        scores = cross_val_score(model, X, Y, cv=cv)
        assert numpy.mean(scores) >= 0.60, numpy.mean(scores)

        model.fit(X, Y)

        return model

    from emlearn.examples.datasets.sonar import load_sonar_dataset

    data = load_sonar_dataset()
    model = train_model(data)


.. GENERATED FROM PYTHON SOURCE LINES 66-71

Measure how feature datatype impacts program size
-------------------------------------------------

We are testing here on the AVR8 platform, which has no floating point unit (FPU)
Other platforms may show different results.

.. GENERATED FROM PYTHON SOURCE LINES 71-163

.. 
code-block:: Python


    from emlearn.evaluate.size import get_program_size, check_build_tools

    def check_program_size(dtype, model, platform, mcu):
        """Measure the size of a minimal C program that embeds the model.

        Parameters
        ----------
        dtype : str
            C datatype name used for the features, or one of two special
            values: 'no-model' builds the program without any model
            (used as the baseline), and 'loadable' converts with
            method='loadable' and int16_t features instead of
            method='inline'.
        model :
            Trained ensemble; the feature count is read from its first
            estimator.
        platform, mcu : str
            Passed through to get_program_size.

        Returns
        -------
        pandas.Series
            Whatever get_program_size reports, wrapped in a Series
            (presumably flash and ram sizes, going by the cached results).
        """
        features_length = model.estimators_[0].n_features_in_
        model_enabled = 0 if dtype == 'no-model' else 1
        if dtype == 'loadable':
            dtype = 'int16_t'
            method = 'loadable'
        else:
            method = 'inline'

        model_name = f'sizecheck_{method}_{dtype}'
        print(model_name)

        if model_enabled:
            # Quantize with the specified dtype
            c_model = emlearn.convert(model, dtype=dtype, method=method)
            model_code = c_model.save(name=model_name, include_proba=False)
        else:
            model_code = ""

        # main() uses uint8_t even when the model is compiled out, so
        # <stdint.h> must be included unconditionally.
        test_program = \
        f"""
        // Disable unused features
        #define EML_TREES_REGRESSION_ENABLE 0

        #include <stdint.h>

        #if {model_enabled}
        {model_code}

        static {dtype} features[{features_length}] = {{0, }};
        #endif

        int main()
        {{
            uint8_t pred = 0;
            #if {model_enabled}
            pred = {model_name}_predict(features, {features_length});
            #endif
            int out = pred;
            return out;
        }}
        """

        data = get_program_size(test_program, platform=platform, mcu=mcu)

        return pandas.Series(data)


    def run_experiment(model, platform, mcu):
        """Measure model size for each feature datatype on one platform/MCU.

        If the compiler toolchain for ``platform`` is missing, previously
        saved results are loaded from a CSV file next to this script.
        Otherwise every datatype is built, the 'no-model' baseline is
        subtracted, and the results are written to that CSV file.

        Note that the two paths return differently indexed frames: cached
        results come back from read_csv with a default integer index, while
        freshly computed results are indexed by (platform, cpu, dtype).
        """
        results_file = os.path.join(here, f'trees-feature-quantization-{platform}+{mcu}.csv')

        # check if build tools for the platform are present. If not, just load results from a file
        missing_tools = check_build_tools(platform)

        if missing_tools:
            print(f"WARNING: Compiler toolchain for platform '{platform}' not found. Loading cached results")
            results = pandas.read_csv(results_file)
        else:
            experiments = pandas.DataFrame({
                'dtype': ('no-model', 'loadable', 'float', 'int32_t', 'int16_t', 'int8_t', 'uint8_t'),
            })
            results = experiments['dtype'].apply(check_program_size, model=model, platform=platform, mcu=mcu)
            results = pandas.merge(experiments, results, left_index=True, right_index=True)
            results = results.set_index('dtype')
            # subtract overall program size to get only model size
            results = (results - results.loc['no-model'])
            results = results.drop(index='no-model')
            # add identifying information
            results['platform'] = platform
            results['cpu'] = mcu
            results = results.reset_index().set_index(['platform', 'cpu', 'dtype'])

            results.to_csv(results_file)
            print("Ran experiments. Results written to", results_file)

        return results

    platforms = pandas.DataFrame.from_records([
        ('avr', 'atmega2560'),
        ('arm', 'Cortex-M0'),
        ('arm', 'Cortex-M4F'),
    ], columns=['platform', 'cpu'])

    results = pandas.concat([run_experiment(model, platform=row.platform, mcu=row.cpu) for idx, row in platforms.iterrows()])
    print(results)




.. rst-class:: sphx-glr-script-out

 .. code-block:: none

    WARNING: Compiler toolchain for platform 'avr' not found. Loading cached results
    WARNING: Compiler toolchain for platform 'arm' not found. Loading cached results
    WARNING: Compiler toolchain for platform 'arm' not found. 
Loading cached results
       platform         cpu     dtype  flash   ram
    0       avr  atmega2560  loadable   3980  1890
    1       avr  atmega2560     float   7160   240
    2       avr  atmega2560   int32_t   6206   240
    3       avr  atmega2560   int16_t   4498   120
    4       avr  atmega2560    int8_t   2486    60
    5       avr  atmega2560   uint8_t   2476    60
    0       arm   Cortex-M0  loadable   4896   144
    1       arm   Cortex-M0     float   4676   240
    2       arm   Cortex-M0   int32_t   2456   240
    3       arm   Cortex-M0   int16_t   2864   120
    4       arm   Cortex-M0    int8_t   2712    60
    5       arm   Cortex-M0   uint8_t   2256    60
    0       arm  Cortex-M4F  loadable   2456   144
    1       arm  Cortex-M4F     float   4608   240
    2       arm  Cortex-M4F   int32_t   1972   240
    3       arm  Cortex-M4F   int16_t   2204   120
    4       arm  Cortex-M4F    int8_t   2204    60
    5       arm  Cortex-M4F   uint8_t   1988    60




.. GENERATED FROM PYTHON SOURCE LINES 164-169

Plot results
-------------------------------

There can be considerable reductions in program memory consumption by picking a suitable datatype for the platform.

.. GENERATED FROM PYTHON SOURCE LINES 169-195

.. code-block:: Python


    def plot_results(results):
        """Draw one bar chart of flash usage per datatype for each platform/CPU.

        Returns the matplotlib figure that holds all the sub-plots.
        """
        table = results.reset_index()
        # One facet row per "platform/cpu" combination
        table['name'] = table['platform'] + '/' + table['cpu']

        catplot_options = dict(
            kind='bar',
            x='dtype',
            y='flash',
            row='name',
            height=4,
            aspect=2,
        )
        grid = seaborn.catplot(data=table, **catplot_options)

        figure = grid.figure
        figure.suptitle("Model size vs feature datatype")

        for axis in grid.axes.flat:
            # Horizontal grid lines, drawn behind the bars
            axis.grid(True, which='major', axis='y')
            axis.set_axisbelow(True)

        return figure

    fig = plot_results(results)
    fig.savefig('example-trees-feature-quantization.png')



.. image-sg:: /auto_examples/images/sphx_glr_trees_feature_quantization_001.png
   :alt: Model size vs feature datatype, name = avr/atmega2560, name = arm/Cortex-M0, name = arm/Cortex-M4F
   :srcset: /auto_examples/images/sphx_glr_trees_feature_quantization_001.png
   :class: sphx-glr-single-img





.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 0.638 seconds)


.. _sphx_glr_download_auto_examples_trees_feature_quantization.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. 
container:: sphx-glr-download sphx-glr-download-jupyter :download:`Download Jupyter notebook: trees_feature_quantization.ipynb <trees_feature_quantization.ipynb>` .. container:: sphx-glr-download sphx-glr-download-python :download:`Download Python source code: trees_feature_quantization.py <trees_feature_quantization.py>` .. container:: sphx-glr-download sphx-glr-download-zip :download:`Download zipped: trees_feature_quantization.zip <trees_feature_quantization.zip>` .. only:: html .. rst-class:: sphx-glr-signature `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_