.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "auto_examples/trees_voting.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_auto_examples_trees_voting.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_auto_examples_trees_voting.py:


Voting options for trees
===========================

This example illustrates hard majority voting vs soft voting options for trees.

Hard voting can sometimes give a significant drop in predictive performance,
and using soft voting can be necessary to match the original model.
This makes the model slightly bigger.

.. GENERATED FROM PYTHON SOURCE LINES 14-30

.. code-block:: Python


    import os.path

    import emlearn
    import numpy
    import pandas
    import seaborn
    import matplotlib.pyplot as plt

    try:
        # When executed as regular .py script
        here = os.path.dirname(__file__)
    except NameError:
        # When executed as Jupyter notebook / Sphinx Gallery
        here = os.getcwd()

.. GENERATED FROM PYTHON SOURCE LINES 31-33

Load datasets
----------------

.. GENERATED FROM PYTHON SOURCE LINES 33-49

.. code-block:: Python


    from sklearn.datasets import fetch_openml, load_digits
    from emlearn.examples.datasets.sonar import load_sonar_dataset

    sonar_data = load_sonar_dataset()

    heart_data, y = fetch_openml('heart-statlog', version=1, as_frame=True, return_X_y=True)
    heart_data['label'] = y

    digits_data, y = load_digits(as_frame=True, return_X_y=True)
    digits_data['label'] = y

    print('sonar', len(sonar_data))
    print('heart', len(heart_data))
    print('digits', len(digits_data))

.. rst-class:: sphx-glr-script-out

 .. code-block:: none

    sonar 208
    heart 270
    digits 1797

.. GENERATED FROM PYTHON SOURCE LINES 50-54

Train a RandomForest model
--------------------------

Key thing is to transform the data into integers that fit the int16 range.

.. GENERATED FROM PYTHON SOURCE LINES 54-96

..
code-block:: Python


    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import MinMaxScaler, LabelEncoder
    from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
    #from sklearn.metrics import get_scorer
    from sklearn.metrics import accuracy_score

    def prepare_data(data, label_column='label'):
        """Split a labelled table into quantized features and encoded labels.

        Parameters
        ----------
        data : pandas.DataFrame
            Holds the feature columns plus the column named ``label_column``.
        label_column : str
            Name of the column that holds the class label.

        Returns
        -------
        X : numpy.ndarray
            Integer feature matrix; columns follow the column order of ``data``.
        Y : numpy.ndarray
            Labels encoded as integers by ``LabelEncoder``.
        """
        # Keep the DataFrame's own column order. A set difference yields an
        # arbitrary order that can change between interpreter runs (string
        # hash randomization), which made the trained forest irreproducible
        # even though the classifier itself is seeded.
        feature_columns = [c for c in data.columns if c != label_column]
        X = data[feature_columns]
        Y = data[label_column]

        # Rescale and convert to integers (quantize)
        # Here everything is made to fit in int16.
        # Mind the precedence: this is (scaled * 2**15) - 1, so scaled values
        # in [0, 1] end up in [-1, 32767].
        X = ((MinMaxScaler().fit_transform(X) * 2**15) - 1).astype(int)
        Y = LabelEncoder().fit_transform(Y)

        return X, Y

    def train_model(data, max_depth=5, split_random_state=None):
        """Train a small random forest and sanity-check its accuracy.

        Parameters
        ----------
        data : pandas.DataFrame
            Labelled data, passed to ``prepare_data``.
        max_depth : int
            Maximum depth of each tree in the forest.
        split_random_state : int or None
            Seed for the train/test split. The default (None) keeps the split
            unseeded, so it differs from run to run; pass an integer to get a
            reproducible split.

        Returns
        -------
        model : RandomForestClassifier
            Forest of 10 trees fitted on the 70% training part.
        X_test, Y_test : numpy.ndarray
            The held-out 30% of the data.

        Raises
        ------
        AssertionError
            If the mean cross-validation accuracy on the training part, or the
            accuracy on the test part, is below 0.70.
        """
        X, Y = prepare_data(data)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.30, random_state=split_random_state,
        )

        model = RandomForestClassifier(n_estimators=10, max_depth=max_depth, random_state=1)

        # sanity check performance
        cv = StratifiedKFold(5, random_state=None, shuffle=False)
        scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring='accuracy')
        assert numpy.mean(scores) >= 0.70, (numpy.mean(scores), scores)

        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)
        test_score = accuracy_score(Y_test, Y_pred)
        assert test_score >= 0.70, test_score

        return model, X_test, Y_test

.. GENERATED FROM PYTHON SOURCE LINES 97-101

Experiments with different leaf_bits settings
---------------------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 101-137

..
code-block:: Python


    from emlearn.evaluate.trees import model_size_bytes, model_size_leaves

    def run_experiment(leaf_bits, X_test, Y_test, model):
        """Convert ``model`` with one ``leaf_bits`` setting and measure the effect.

        The converted model is compared against the unconverted ``model`` on
        the same test data.

        Returns a ``pandas.Series`` with the keys ``model_size_bytes``,
        ``model_leaves``, ``leaf_bits``, ``score`` and ``ref_score`` (accuracy
        in percent, rounded to 2 decimals) and ``score_diff``
        (``score - ref_score``).
        """
        converted = emlearn.convert(model, method='loadable', leaf_bits=leaf_bits)

        # Reference: accuracy of the model before conversion
        reference_accuracy = accuracy_score(Y_test, model.predict(X_test))

        # Accuracy of the converted model on the same samples
        converted_accuracy = accuracy_score(Y_test, converted.predict(X_test))

        record = {
            'model_size_bytes': model_size_bytes(converted),
            'model_leaves': model_size_leaves(converted),
            'leaf_bits': leaf_bits,
            'score': round(100*converted_accuracy, 2),
            'ref_score': round(100*reference_accuracy, 2),
        }
        result = pandas.Series(record)
        result['score_diff'] = result['score'] - result['ref_score']

        return result

    # One row per leaf_bits value to try
    experiments = pandas.DataFrame({
        'leaf_bits': [0,2,3,4,5,6,7,8],
    })

.. GENERATED FROM PYTHON SOURCE LINES 138-141

Sonar dataset
-------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 141-148

.. code-block:: Python


    model, X_test, Y_test = train_model(sonar_data, max_depth=6)
    sonar_results = experiments.leaf_bits.apply(
        run_experiment,
        X_test=X_test,
        Y_test=Y_test,
        model=model,
    )
    sonar_results['dataset'] = 'sonar'
    # Model size relative to the smallest model in this sweep
    sonar_sizes = sonar_results['model_size_bytes']
    sonar_results['size_ratio'] = sonar_sizes / sonar_sizes.min()
    print(sonar_results)

.. rst-class:: sphx-glr-script-out

 .. code-block:: none

       model_size_bytes  model_leaves  leaf_bits  ...  score_diff  dataset  size_ratio
    0            1128.0           2.0        0.0  ...       -3.17    sonar    1.000000
    1            1131.0           3.0        2.0  ...       -3.17    sonar    1.002660
    2            1134.0           6.0        3.0  ...       -3.17    sonar    1.005319
    3            1136.0           8.0        4.0  ...       -3.17    sonar    1.007092
    4            1140.0          12.0        5.0  ...       -1.58    sonar    1.010638
    5            1141.0          13.0        6.0  ...        0.00    sonar    1.011525
    6            1143.0          15.0        7.0  ...        0.00    sonar    1.013298
    7            1144.0          16.0        8.0  ...        0.00    sonar    1.014184

    [8 rows x 8 columns]

..
GENERATED FROM PYTHON SOURCE LINES 149-152

Heart disease dataset
-------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 152-159

.. code-block:: Python


    model, X_test, Y_test = train_model(heart_data, max_depth=6)
    heart_results = experiments.leaf_bits.apply(
        run_experiment,
        X_test=X_test,
        Y_test=Y_test,
        model=model,
    )
    heart_results['dataset'] = 'heart'
    # Model size relative to the smallest model in this sweep
    heart_sizes = heart_results['model_size_bytes']
    heart_results['size_ratio'] = heart_sizes / heart_sizes.min()
    print(heart_results)

.. rst-class:: sphx-glr-script-out

 .. code-block:: none

       model_size_bytes  model_leaves  leaf_bits  ...  score_diff  dataset  size_ratio
    0            1992.0           2.0        0.0  ...         0.0    heart    1.000000
    1            1995.0           3.0        2.0  ...         0.0    heart    1.001506
    2            1999.0           7.0        3.0  ...         0.0    heart    1.003514
    3            2005.0          13.0        4.0  ...         0.0    heart    1.006526
    4            2013.0          21.0        5.0  ...         0.0    heart    1.010542
    5            2017.0          25.0        6.0  ...         0.0    heart    1.012550
    6            2019.0          27.0        7.0  ...         0.0    heart    1.013554
    7            2019.0          27.0        8.0  ...         0.0    heart    1.013554

    [8 rows x 8 columns]

.. GENERATED FROM PYTHON SOURCE LINES 160-163

Digits dataset
-------------------------------

.. GENERATED FROM PYTHON SOURCE LINES 163-170

.. code-block:: Python


    model, X_test, Y_test = train_model(digits_data, max_depth=5)
    digits_results = experiments.leaf_bits.apply(
        run_experiment,
        X_test=X_test,
        Y_test=Y_test,
        model=model,
    )
    digits_results['dataset'] = 'digits'
    # Model size relative to the smallest model in this sweep
    digits_sizes = digits_results['model_size_bytes']
    digits_results['size_ratio'] = digits_sizes / digits_sizes.min()
    print(digits_results)

.. rst-class:: sphx-glr-script-out

 .. code-block:: none

       model_size_bytes  model_leaves  leaf_bits  ...  score_diff  dataset  size_ratio
    0            2272.0          10.0        0.0  ...       -2.22   digits    1.000000
    1            2322.0          50.0        2.0  ...       -1.11   digits    1.022007
    2            2421.0         149.0        3.0  ...        0.00   digits    1.065581
    3            2457.0         185.0        4.0  ...        0.74   digits    1.081426
    4            2472.0         200.0        5.0  ...       -0.19   digits    1.088028
    5            2478.0         206.0        6.0  ...        0.00   digits    1.090669
    6            2479.0         207.0        7.0  ...        0.00   digits    1.091109
    7            2479.0         207.0        8.0  ...        0.00   digits    1.091109

    [8 rows x 8 columns]

..
GENERATED FROM PYTHON SOURCE LINES 171-175

Visualize results
-------------------------------

Soft voting gives slightly bigger models, but often good improvements in predictive performance.

.. GENERATED FROM PYTHON SOURCE LINES 175-201

.. code-block:: Python


    def plot_results(results):
        """Plot score difference against relative model size, colored by dataset.

        ``results`` needs the columns ``size_ratio``, ``score_diff`` and
        ``dataset``. Returns the matplotlib figure of the seaborn plot.
        """
        flat = results.reset_index()

        grid = seaborn.relplot(
            data=flat,
            x='size_ratio',
            y='score_diff',
            hue='dataset',
            height=4,
            aspect=1.5,
        )

        figure = grid.figure
        figure.suptitle("Model scores vs size (higher is better)")

        for axes in grid.axes.flat:
            axes.grid(True, which='major', axis='y')
            # Line at zero: same score as before conversion
            axes.axhline(0.0, ls='-', lw=1.5, color='black', alpha=0.5)
            axes.set_axisbelow(True)

        return figure

    combined = pandas.concat([sonar_results, heart_results, digits_results], axis=0)
    fig = plot_results(combined)
    fig.savefig('example-trees-voting.png')



.. image-sg:: /auto_examples/images/sphx_glr_trees_voting_001.png
   :alt: Model scores vs size (higher is better)
   :srcset: /auto_examples/images/sphx_glr_trees_voting_001.png
   :class: sphx-glr-single-img


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** (0 minutes 21.253 seconds)


.. _sphx_glr_download_auto_examples_trees_voting.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: trees_voting.ipynb <trees_voting.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: trees_voting.py <trees_voting.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: trees_voting.zip <trees_voting.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_