Source code for emlearn.common


"""
Utilities
=========================
"""

import os
import sys
import subprocess
import platform
from distutils.ccompiler import new_compiler

import numpy

# Default compiler warning flags (GCC/Clang-style options).
# Used by build_classifier() when the caller does not pass warn_flags.
default_warn_flags = [
    '-Wall',
    '-Wextra',
    '-Wpointer-arith',
    '-Wdouble-promotion',
    '-Wfloat-conversion',
    # the -Wno-unused-* options switch off reporting of unused parameters, variables and functions
    '-Wno-unused-parameter',
    '-Wno-unused-variable',
    '-Wno-unused-function',
]

def check_array(arr):
    """
    Validate an input array using scikit-learn

    Thin wrapper that passes ``arr`` to ``sklearn.utils.check_array``
    and returns its result.
    """
    # scikit-learn is imported lazily, so it is not needed at package build time
    import sklearn.utils
    return sklearn.utils.check_array(arr)


[docs]
def get_include_dir() -> str:
    """
    Return the include directory holding the C headers for emlearn

    This is the directory that contains this Python module.
    """
    module_dir = os.path.dirname(__file__)
    return module_dir



def build_classifier(cmodel, name, temp_dir,
        include_dir, func=None, test_function=None, n_classes=None,
        warn_flags=None,
        warn_error=True,
        ):
    """
    Build a small test executable around generated C model code

    Writes ``cmodel`` to ``<name>.h`` and a generated C program to ``<name>.c``
    inside ``temp_dir``, then compiles and links the program with the host
    C compiler. The generated ``main()`` calls ``test_function`` with ``stdin``
    and a callback. The callback evaluates ``func`` and prints the result
    to stdout as comma-separated lines.

    :param cmodel: C code of the model, written to the header file
    :param name: Base name for the header, source and executable files
    :param temp_dir: Directory for generated files and build output. Created if missing
    :param include_dir: Directory with C headers. The generated program includes ``eml_test.h``
    :param func: C code inserted into the callback. It can refer to the callback
        arguments ``values``, ``length`` and ``row``. With ``n_classes`` it is used as
        a statement, otherwise as an expression whose value is cast to double and printed
    :param test_function: Name of the C function invoked from ``main()``.
        Defaults to ``eml_test_read_csv``
    :param n_classes: If truthy, generate the multi-output program, which prints
        ``row,class_no,value`` for each of the ``n_classes`` entries in ``outputs``.
        Otherwise the program prints ``row,value``
    :param warn_flags: Compiler warning flags. Defaults to ``default_warn_flags``.
        Not used on Windows
    :param warn_error: Whether to add ``-Werror``. Not used on Windows

    :return: Path to the built executable
    """
    # exist_ok avoids the race between checking for the directory and creating it
    os.makedirs(temp_dir, exist_ok=True)

    if test_function is None:
        test_function = 'eml_test_read_csv'

    if warn_flags is None:
        warn_flags = default_warn_flags

    # create a new compiler object
    # force re-compilation even if object files exist (required)
    cc = new_compiler(force=1)

    def_file_name = name+'.h'
    def_file = os.path.join(temp_dir, def_file_name)
    code_file = os.path.join(temp_dir, name+'.c')
    output_filename = cc.executable_filename(name)
    bin_path = os.path.join(temp_dir, output_filename)
    include_dirs = [temp_dir, include_dir]
    if sys.platform.startswith('win'): # Windows
        libraries = []
        cc_args = []
    else : # MacOS and Linux should be the same
        libraries = ["m"] # math library / libm
        # be strict about compile warning
        # a fresh list is built, so the caller's warn_flags is never modified
        cc_args = ["-std=c99", *warn_flags]
        if warn_error:
            cc_args.append('-Werror')

    if n_classes:
        # Program that reads values on stdin, and prints every entry of outputs[] on stdout
        code = """
        #include "{def_file_name}"
        #include <eml_test.h>

        #define N_CLASSES {n_classes}
        float outputs[N_CLASSES] = {{0.0}};

        static void classify_proba(const float *values, int length, int row) {{
            {func}; // TODO: handle error
            for (int class_no=0; class_no<N_CLASSES; class_no++) {{
                const float prob = outputs[class_no];
                printf("%d,%d,%f\\n", row, class_no, (double)prob);
            }}
        }}
        int main() {{
            {test_function}(stdin, classify_proba);
        }}
        """.format(
            def_file_name=def_file_name,
            n_classes=n_classes,
            func=func,
            test_function=test_function,
        )
    else:
        # Trivial program that reads values on stdin, and returns classifications on stdout
        code = """
        #include "{def_file_name}"
        #include <eml_test.h>

        static void classify(const float *values, int length, int row) {{
            printf("%d,%f\\n", row, (double){func});
        }}
        int main() {{
            {test_function}(stdin, classify);
        }}
        """.format(
            def_file_name=def_file_name,
            func=func,
            test_function=test_function,
        )

    with open(def_file, 'w') as f:
        f.write(cmodel)

    with open(code_file, 'w') as f:
        f.write(code)

    objects = cc.compile(sources=[code_file],
        extra_preargs=cc_args, include_dirs=include_dirs)

    cc.link("executable", objects, output_filename=output_filename,
        output_dir=temp_dir, libraries=libraries)

    return bin_path

def run_classifier(bin_path, data, out_dtype='int', float_precision=8):
    """
    Run an executable on ``data`` and parse the values it prints

    Each row of ``data`` is sent on stdin as one CSV line, with every value
    formatted as a fixed-point decimal. Each non-empty line of output is split
    on commas. When a line has exactly two fields the second field is the raw
    value, otherwise the whole list of fields is the raw value.

    :param bin_path: Path to the executable to run
    :param data: Iterable of rows, each an iterable of numbers
    :param out_dtype: How to convert each raw value. ``'int'`` uses ``int(float(v))``,
        ``'float'`` uses ``float(v)``, anything else is called with the raw value
    :param float_precision: Number of decimals used when serializing input values

    :return: List with one converted value per non-empty output line
    """
    # Serialize input data as CSV
    number_format = '{:.{prec}f}'
    csv_rows = [
        ",".join(number_format.format(value, prec=float_precision) for value in sample)
        for sample in data
    ]
    payload = '\n'.join(csv_rows)

    assert len(csv_rows) == len(data), (len(csv_rows), data.shape)

    # Run as subprocess
    raw_output = subprocess.check_output(
        [bin_path],
        input=payload,
        encoding='utf8',
        universal_newlines=True,
    )

    # Select how raw values are converted
    if out_dtype == 'int':
        def convert(raw):
            return int(float(raw))
    elif out_dtype == 'float':
        convert = float
    else:
        convert = out_dtype

    # Parse output
    results = []
    for record in raw_output.split('\n'):
        if not record:
            continue
        fields = record.split(',')
        raw_value = fields[1] if len(fields) == 2 else fields
        results.append(convert(raw_value))

    return results

class CompiledClassifier():
    """
    Model that makes predictions by running compiled C code as a host executable

    The executables are built with build_classifier() on construction,
    and executed with run_classifier() for each call to a predict method.
    """

    def __init__(self, cmodel, name, call=None, include_dir=None, temp_dir='tmp',
            test_function=None,
            out_dtype='int',
            proba_call=None,
            n_classes=None):
        """
        :param cmodel: C code of the model
        :param name: Base name for the generated files and executables
        :param call: C code evaluated per input row by the predict() executable
        :param include_dir: Directory with C headers. Defaults to get_include_dir()
        :param temp_dir: Directory for generated files and build output
        :param test_function: Name of the C function invoked from the generated main()
        :param out_dtype: Output conversion used by predict(), passed to run_classifier()
        :param proba_call: C code evaluated per input row by the predict_proba() executable.
            If None, predict_proba() is not supported
        :param n_classes: Number of outputs per row for predict_proba()
        """
        if include_dir is None:
            include_dir = get_include_dir()
        self.bin_path = build_classifier(cmodel, name,
                include_dir=include_dir, temp_dir=temp_dir, func=call, test_function=test_function)

        # A second executable is only built when a probability call is provided
        self.proba_bin_path = None
        if proba_call is not None:
            self.proba_bin_path = build_classifier(cmodel, name+'_proba',
                    include_dir=include_dir, temp_dir=temp_dir,
                    func=proba_call, n_classes=n_classes,
                    test_function=test_function)

        self._out_dtype = out_dtype
        self.n_classes = n_classes

    def predict(self, X):
        """
        Run the model on each row of X

        :return: List with one output per row
        """
        X = check_array(X)

        out = run_classifier(self.bin_path, X, out_dtype=self._out_dtype)
        assert len(out) == len(X), out
        return out

    def predict_proba(self, X):
        """
        Run the probability executable on each row of X

        :return: Array of shape (len(X), n_classes). An entry that the
            executable did not report is NaN
        :raises ValueError: If the model was constructed without ``proba_call``
        """
        X = check_array(X)

        if self.proba_bin_path is None:
            raise ValueError('predict_proba() not supported')

        def convert_out(raw):
            # each output line has the fields: row, class, probability
            row, cls, prob = raw
            return int(row), int(cls), float(prob)

        result = run_classifier(self.proba_bin_path, X, out_dtype=convert_out)
        # NaN-filled rather than uninitialized memory, so missing entries are detectable
        out = numpy.full((len(X), self.n_classes), numpy.nan)
        for row, cls, prob in result:
            out[row, cls] = prob

        assert len(out) == len(X), out
        return out

    def regress(self, X):
        """
        Run the model on each row of X. Same as predict()
        """
        X = check_array(X)

        return self.predict(X)




[docs]
def compile_executable(code_file : str,
                    out_dir : str,
                    name : str ='main',
                    include_dirs=None):
    """
    Compile C code on the host.

    Useful to integrate small C executables in a Python-based script or notebook.
    Uses distutils.ccompiler, same as what is used to build Python modules.
    Should work portably on all platforms.

    :param code_file: Path to file with C code to compile
    :param out_dir: Path to directory where output executable will be located
    :param name: Base name of the executable
    :param include_dirs: Include directories for C headers. Defaults to no extra directories

    :return: Path to executable
    """

    # None instead of a mutable [] default, which would be shared between calls
    if include_dirs is None:
        include_dirs = []

    # force re-compilation even if object files exist
    cc = new_compiler(force=1)

    output_filename = cc.executable_filename(name)
    bin_path = os.path.join(out_dir, output_filename)

    if sys.platform.startswith('win'): # Windows
        libraries = None
        cc_args = None
    else: # MacOS and Linux should be the same
        libraries = ["m"] # math library / libm
        cc_args = ["-std=c99"]

    objects = cc.compile(
        sources=[code_file],
        extra_preargs=cc_args,
        include_dirs=include_dirs
    )

    cc.link("executable", objects,
        output_filename=output_filename,
        output_dir=out_dir,
        libraries=libraries,
    )

    return bin_path