moorepants · August 12, 2022 16:54
diff --git a/example_results.txt b/example_results.txt
 Testing results.

 Timing the functions.

 Timing: cython
 cython time: 0.00288254904747 s

 Timing: numpy_broadcast
 numpy_broadcast time: 0.00597401690483 s

 Timing: lambdify_individual
 lambdify_individual time: 0.00303873364131 s

 Timing: ufuncify_f2py
 ufuncify_f2py time: 0.00201614236832 s

 Timing: python
 python time: 0.119000189304 s

 Timing: ufuncify_cython
 ufuncify_cython time: 0.00293522365888 s

 Timing: numpy
 numpy time: 0.00303197193146 s

 Timing: c
 c time: 0.0029081483682 s

 Timing: lambdify
 lambdify time: 0.00599523711205 s

 Timing: ufuncify_matrix_cython
 ufuncify_matrix_cython time: 0.00292766968409 s

 Benchmark time: 0.119000189304 s

 Ratios of the timings to the benchmark time:
 --------------------------------------------

 ufuncify_f2py ratio: 59.0237034717
 cython ratio: 41.2829711983
 c ratio: 40.9195729508
 ufuncify_matrix_cython ratio: 40.6467266273
 ufuncify_cython ratio: 40.5421198294
 numpy ratio: 39.2484468836
 lambdify_individual ratio: 39.1611122761
 numpy_broadcast ratio: 19.9196271454
 lambdify ratio: 19.8491214076
diff --git a/main.c b/main.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>

 #include "eval_matrix_h.h"

 /*  This just tests out the C functions in eval_matrix_c.c. */

 int main()
 {
    // generate the a, b and c arrays of doubles

    int n = 10000;

    double a[n];
    double b[n];
    double c[n];

    srand(time(NULL));

    int i;

    for (i = 0; i < n; i++){
        a[i] = (rand() % 100);
        b[i] = (rand() % 100);
        c[i] = (rand() % 100);
    }

    // for (i = 0; i < n; i++){
        // printf("a%d: %f\n", i, a[i]);
        // printf("b%d: %f\n", i, b[i]);
        // printf("c%d: %f\n", i, c[i]);
    // }

    double matrix[n * 4];

    eval_matrix_loop(n, a, b, c, matrix);

    double mat[4];

    for (i = 0; i < n; i++){

        mat[0] = matrix[i * 4];
        mat[1] = matrix[i * 4 + 1];
        mat[2] = matrix[i * 4 + 2];
        mat[3] = matrix[i * 4 + 3];

        printf("matrix[%d] = [%f %f %f %f]\n", i, mat[0], mat[1], mat[2], mat[3]);
    }

    return 0;
 }
diff --git a/multiindex.py b/multiindex.py
 """This is a small rewrite of the ufuncify function in sympy so it supports
 array arguments for all values."""

 import importlib
 import subprocess

 import sympy as sy
 from sympy.utilities.lambdify import implemented_function
 from sympy.utilities.autowrap import autowrap

 _c_template = """\
 #include <math.h>
 #include "{file_prefix}_h.h"

 void {routine_name}(double matrix[{matrix_output_size}], {input_args})
 {{
 {eval_code}
 }}
 """

 _h_template = """\
 void {routine_name}(double matrix[{matrix_output_size}], {input_args});
 """

 _cython_template = """\
 import numpy as np
 cimport numpy as np
 cimport cython

 cdef extern from "{file_prefix}_h.h":
    void {routine_name}(double matrix[{matrix_output_size}], {input_args})

 @cython.boundscheck(False)
 @cython.wraparound(False)
 def {routine_name}_loop(np.ndarray[np.double_t, ndim=2] matrix, {numpy_typed_input_args}):

    cdef int n = matrix.shape[0]

    cdef int i

    for i in range(n):
        {routine_name}(&matrix[i, 0], {indexed_input_args})

    return matrix.reshape(n, {num_rows}, {num_cols})
 """

 _setup_template = """\
 import numpy
 from distutils.core import setup
 from distutils.extension import Extension
 from Cython.Distutils import build_ext

 extension = Extension(name="{file_prefix}",
                      sources=["{file_prefix}.pyx",
                               "{file_prefix}_c.c"],
                      extra_compile_args=[],
                      include_dirs=[numpy.get_include()])

 setup(name="{routine_name}",
      cmdclass={{'build_ext': build_ext}},
      ext_modules=[extension])
 """


 def ufuncify_matrix(args, expr, cse=True):

    matrix_size = expr.shape[0] * expr.shape[1]

    file_prefix_base = 'ufuncify_matrix'
    file_prefix = '{}_{}'.format(file_prefix_base, 0)

    taken = False
    i = 1

    while not taken:
        try:
            open(file_prefix + '.pyx', 'r')
        except IOError:
            taken = True
        else:
            file_prefix = '{}_{}'.format(file_prefix_base, i)
            i += 1

    d = {'routine_name': 'eval_matrix',
         'file_prefix': file_prefix,
         'matrix_output_size': matrix_size,
         'num_rows': expr.shape[0],
         'num_cols': expr.shape[1]}

    matrix_sym = sy.MatrixSymbol('matrix', expr.shape[0], expr.shape[1])

    sub_exprs, simple_mat = sy.cse(expr, sy.numbered_symbols('z_'))

    sub_expr_code = '\n'.join(['double ' + sy.ccode(sub_expr[1], sub_expr[0])
                               for sub_expr in sub_exprs])

    matrix_code = sy.ccode(simple_mat[0], matrix_sym)

    d['eval_code'] = '    ' + '\n    '.join((sub_expr_code + '\n' + matrix_code).split('\n'))

    c_indent = len('void {routine_name}('.format(**d))
    c_arg_spacer = ',\n' + ' ' * c_indent

    d['input_args'] = c_arg_spacer.join(['double {}'.format(a) for a in args])

    cython_indent = len('def {routine_name}_loop('.format(**d))
    cython_arg_spacer = ',\n' + ' ' * cython_indent

    d['numpy_typed_input_args'] = cython_arg_spacer.join(['np.ndarray[np.double_t, ndim=1] {}'.format(a) for a in args])

    d['indexed_input_args'] = ',\n'.join(['{}[i]'.format(a) for a in args])

    files = {}
    files[d['file_prefix'] + '_c.c'] = _c_template.format(**d)
    files[d['file_prefix'] + '_h.h'] = _h_template.format(**d)
    files[d['file_prefix'] + '.pyx'] = _cython_template.format(**d)
    files[d['file_prefix'] + '_setup.py'] = _setup_template.format(**d)

    for filename, code in files.items():
        with open(filename, 'w') as f:
            f.write(code)

    cmd = ['python', d['file_prefix'] + '_setup.py', 'build_ext', '--inplace']
    subprocess.call(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)

    cython_module = importlib.import_module(d['file_prefix'])
    return getattr(cython_module, d['routine_name'] + '_loop')


 def ufuncify(args, expr, **kwargs):
    """
    Generates a binary ufunc-like lambda function for numpy arrays

    ``args``
        Either a Symbol or a tuple of symbols. Specifies the argument sequence
        for the ufunc-like function.

    ``expr``
        A SymPy expression that defines the element wise operation

    ``kwargs``
        Optional keyword arguments are forwarded to autowrap().

    The returned function can only act on one array at a time, as only the
    first argument accept arrays as input.

    .. Note:: a *proper* numpy ufunc is required to support broadcasting, type
       casting and more.  The function returned here, may not qualify for
       numpy's definition of a ufunc.  That why we use the term ufunc-like.

    References
    ==========
    [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html

    Examples
    ========

    >>> from sympy.utilities.autowrap import ufuncify
    >>> from sympy.abc import x, y
    >>> import numpy as np
    >>> f = ufuncify([x, y], y + x**2)
    >>> f([1, 2, 3], [2, 2, 2])
    [ 3.  6.  11.]
    >>> a = f(np.arange(5), 3 * np.ones(5))
    >>> isinstance(a, np.ndarray)
    True
    >>> print a
    [ 3. 4. 7. 12. 19.]

    """
    y = sy.IndexedBase(sy.Dummy('y')) # result array

    i = sy.Dummy('i', integer=True) # index of the array
    m = sy.Dummy('m', integer=True) # length of array (dimension)
    i = sy.Idx(i, m) # index of the array

    l = sy.Lambda(args, expr) # A lambda function that represents the inputs and outputs of the expression
    f = implemented_function('f', l) # maps an UndefinedFunction('f') to the lambda function

    if isinstance(args, sy.Symbol):
        args = [args]
    else:
        args = list(args)

    # For each of the args create an indexed version.
    indexed_args = [sy.IndexedBase(sy.Dummy(str(a))) for a in args]

    # ensure correct order of arguments
    kwargs['args'] = [y] + indexed_args + [m]
    args_with_indices = [a[i] for a in indexed_args]
    return autowrap(sy.Eq(y[i], f(*args_with_indices)), **kwargs)


 if __name__ == '__main__':

    result_array = sy.IndexedBase('y')

    arg_1 = sy.IndexedBase('a')
    arg_2 = sy.IndexedBase('b')

    array_length = sy.symbols('n', integer=True)

    index = sy.symbols('i', integer=True)
    index = sy.Idx(index, array_length)

    a, b = sy.symbols('a, b')

    expr = a**2 + b / 2.0

    lam = sy.Lambda((a, b), expr)

    sym_func = implemented_function('f', lam)

    code_args = [result_array, arg_1, arg_2, array_length]

    equation = sy.Eq(result_array[index], sym_func(arg_1[index], arg_2[index]))

    f_for = autowrap(equation, tempdir='multi-fortran', args=code_args)

    f_cyt = autowrap(equation, language='C', backend='cython', tempdir='multi-cython', args=code_args)
diff --git a/setup.py b/setup.py
 """This file builds the Cython extensions."""

 import numpy
 from distutils.core import setup
 from distutils.extension import Extension
 from Cython.Distutils import build_ext

 extension = Extension(name="eval_matrix",
                      sources=["eval_matrix.pyx",
                               "eval_matrix_c.c"],
                      include_dirs=[numpy.get_include()])

 setup(name="eval_matrix",
      cmdclass={'build_ext': build_ext},
      ext_modules=[extension])
diff --git a/test_matrix_eval.m b/test_matrix_eval.m
 function test_matrix_eval()

    n = 10000;

    a_vals = rand(n, 1);
    b_vals = rand(n, 1);
    c_vals = rand(n, 1);

    f_loop = @() eval_matrices_loop(n, a_vals, b_vals, c_vals);
    f_vectorized = @() eval_matrices_vectorized(n, a_vals, b_vals, c_vals);

    % These are taken manually from the Python results.

    python_loop_time= 0.136805589199;
    python_vectorized_time = 0.00317755591869;

    t_loop = timeit(f_loop);
    t_vectorized = timeit(f_vectorized);

    display('------------------------------------')

    display(sprintf('Mean time to evaluate the loop %1.4f s', t_loop))
    display(sprintf('Ratio to the Python loop benchmark time is %1.2f', ...
                    python_loop_time / t_loop))
    display(sprintf('Ratio to the Python vectorized time is %1.2f', ...
                    python_vectorized_time / t_loop))

    display('------------------------------------')

    display(sprintf('Mean time to evaluate the vectorized loop %1.4f s', ...
                    t_vectorized))
    display(sprintf('Ratio to the Python loop benchmark time is %1.2f', ...
                    python_loop_time / t_vectorized))
    display(sprintf('Ratio to the Python vectorized time is %1.2f', ...
                    python_vectorized_time / t_vectorized))

    display('------------------------------------')

 end

 function result = eval_matrices_loop(n, a_vals, b_vals, c_vals)

    result = zeros(n, 2, 2);

    for i = 1:n
        result(i, 1, 1) = a_vals(i).^2 .* cos(b_vals(i)).^c_vals(i);
        result(i, 1, 2) = tan(b_vals(i)) ./ sin(a_vals(i) + b_vals(i)) + c_vals(i).^4;
        result(i, 2, 1) = a_vals(i).^2 + b_vals(i).^2 - sqrt(c_vals(i));
        result(i, 2, 2) = (((a_vals(i) + b_vals(i) + c_vals(i)) .* (a_vals(i) + b_vals(i))) ./ a_vals(i) .* sin(b_vals(i)));
    end

 end

 function result = eval_matrices_vectorized(n, a_vals, b_vals, c_vals)

    result = zeros(n, 2, 2);

    result(:, 1, 1) = a_vals.^2 .* cos(b_vals).^c_vals;
    result(:, 1, 2) = tan(b_vals) ./ sin(a_vals + b_vals) + c_vals.^4;
    result(:, 2, 1) = a_vals.^2 + b_vals.^2 - sqrt(c_vals);
    result(:, 2, 2) = (((a_vals + b_vals + c_vals) .* (a_vals + b_vals)) ./ a_vals .* sin(b_vals));

 end

diff --git a/test_matrix_eval.py b/test_matrix_eval.py
 #!/usr/bin/env python

 """A common need in my dynamics work is the to evaluate a small list of long
 mathematical expressions (typically less than 100 expressions) a large
 number of times (up to millions) with different scalar arguments each time.
 The number of evaluations is always much greater than the number of
 expressions. The expressions are generally all a function of the same
 scalars but not necessarily and sometimes the expressions have scalars that
 are constant with respect to each evaluations.

 This set of Python, Cython, and C programs explore how to do this as fast as
 possible.

 This requires the current master of SymPy.

 You will need to build the Cython extension modules before running this
 module.

 $ python setup.py build_exit --inplace

 """

 import math
 import timeit

 import numpy as np
 import sympy as sp

 from eval_matrix import eval_matrix_loop_cython as cython_loop
 from eval_matrix import eval_matrix_loop_c as c_loop
 from multiindex import ufuncify, ufuncify_matrix

 """First I create the symbolic form of some sample expressions (these are
 much shorter than my real problems)."""

 a, b, c = sp.symbols('a, b, c')

 expr_00 = a**2 * sp.cos(b)**c
 expr_01 = sp.tan(b) / sp.sin(a + b) + c**4
 expr_10 = a**2 + b**2 - sp.sqrt(c)
 expr_11 = ((a + b + c) * (a + b)) / a * sp.sin(b)

 sym_mat = sp.Matrix([[expr_00, expr_01],
                     [expr_10, expr_11]])

 # These simply set up some large one dimensional arrays that will be used in
 # the expression evaluation.

 n = 10000

 a_vals = np.random.random(n)
 b_vals = np.random.random(n)
 c_vals = np.random.random(n)

 """The mathematical expressions are generally generated symbolically with
 SymPy. The easiet method available in SymPy is to use the lambdify function
 to generate a NumPy backed numerical function.

 I'll try two methods:

    1. lamdify the matrix and rely on numpy's underlying broadcasting
    2. lambdify each expression individually

 """

 f = sp.lambdify((a, b, c), sym_mat, modules='numpy', default_array=True)

 f00 = sp.lambdify((a, b, c), expr_00, modules='numpy', default_array=True)
 f01 = sp.lambdify((a, b, c), expr_01, modules='numpy', default_array=True)
 f10 = sp.lambdify((a, b, c), expr_10, modules='numpy', default_array=True)
 f11 = sp.lambdify((a, b, c), expr_11, modules='numpy', default_array=True)


 def eval_matrix_loop_lambdify():
    """This evaluates the lambdified matrix of expressions all in one
    shot."""
    return f(a_vals, b_vals, c_vals)


 def eval_matrix_loop_lambdify_individual():
    """This allocates a 3D array inserts the evaluated lambdified
    expressions in the correct entries."""

    result = np.empty((n, 2, 2))

    result[:, 0, 0] = f00(a_vals, b_vals, c_vals)
    result[:, 0, 1] = f01(a_vals, b_vals, c_vals)
    result[:, 1, 0] = f10(a_vals, b_vals, c_vals)
    result[:, 1, 1] = f11(a_vals, b_vals, c_vals)

    return result

 """These two methods are explicit Python functions that use NumPy to do
 exactly what lambdify does under the hood."""


 def eval_matrix_loop_numpy_broadcast():
    """This is the same thing as lambdifying the SymPy matrix."""

    result = np.array(
        [[a_vals**2 * np.cos(b_vals)**c_vals,
          np.tan(b_vals) / np.sin(a_vals + b_vals) + c_vals**4],
         [a_vals**2 + b_vals**2 - np.sqrt(c_vals),
          ((a_vals + b_vals + c_vals) * (a_vals + b_vals)) / a_vals *
          np.sin(b_vals)]])

    return result


 def eval_matrix_loop_numpy():
    """Since the number of matrix elements are typically much smaller than
    the number of evaluations, NumPy can be used to compute each of the
    Matrix expressions. This is equivalent to the individual lambdified
    expressions above."""

    result = np.empty((n, 2, 2))

    result[:, 0, 0] = a_vals**2 * np.cos(b_vals)**c_vals
    result[:, 0, 1] = np.tan(b_vals) / np.sin(a_vals + b_vals) + c_vals**4
    result[:, 1, 0] = a_vals**2 + b_vals**2 - np.sqrt(c_vals)
    result[:, 1, 1] = (((a_vals + b_vals + c_vals) * (a_vals + b_vals)) /
                       a_vals * np.sin(b_vals))

    return result


 """The most basic method of building the result array is a simple loop in
 Python. But this will definitely be the slowest due to Python's overhead.
 But this is what we ultimately want to improve with all these methods that
 rely on fast low level code for the loop (vectorizing). This is the speed
 benchmark. All other method will be compared against it."""


 def eval_matrix_loop_python():
    """This is the standard Python method, i.e. loop through each array and
    compute the four matrix entries."""

    result = np.empty((n, 2, 2))

    for i in range(n):
        result[i, 0, 0] = a_vals[i]**2 * math.cos(b_vals[i])**c_vals[i]
        result[i, 0, 1] = (math.tan(b_vals[i]) / math.sin(a_vals[i] +
                           b_vals[i]) + c_vals[i]**4)
        result[i, 1, 0] = a_vals[i]**2 + b_vals[i]**2 - math.sqrt(c_vals[i])
        result[i, 1, 1] = (((a_vals[i] + b_vals[i] + c_vals[i]) * (a_vals[i]
                           + b_vals[i])) / a_vals[i] * math.sin(b_vals[i]))

    return result

 """The next methods utilized hand written C functions and some Cython
 wrappers. I have two flavors. In the cython one the loop is in Cython and
 the expression eval is in C. In the second one, _c, both the loop and the
 expression evals are in C, with just a light Cython wrapper."""


 def eval_matrix_loop_cython():
    """This is equivalent to the naive Python loop but is implemented in a
    lower level as a combination of Cython and C. The loop is in Cython and
    the expression eval is in C."""

    result = np.empty((n, 4))

    return cython_loop(a_vals, b_vals, c_vals, result)


 def eval_matrix_loop_c():
    """This is equivalent to the naive Python loop but is implemented in a
    lower level as a combination of Cython and C. The loop and expression
    evals are all in C."""

    result = np.empty((n * 4))

    return c_loop(a_vals, b_vals, c_vals, result)

 """sympy.utilities.ufuncify automatically generates the broadcasting loop in
 the low level. The default settings use Fortran and f2py. Currently,
 ufuncify only supports scalar expressions and an array for the first
 argument. But I've included a modified version in multiindex.py that
 requires all of the arguments to the function to be arrays of equal length.
 ufuncify currently doesn't support a list of expressions (or sympy matrices)
 so I ufuncify each expression. If all of the expressions were in the low
 level loop then things will likely be faster especially if cse is used and
 other optimizations."""

 g00 = ufuncify((a, b, c), expr_00, language='F95', backend='f2py',
               tempdir='ufunc-fortran-code')
 g01 = ufuncify((a, b, c), expr_01, language='F95', backend='f2py')
 g10 = ufuncify((a, b, c), expr_10, language='F95', backend='f2py')
 g11 = ufuncify((a, b, c), expr_11, language='F95', backend='f2py')


 def eval_matrix_loop_ufuncify_f2py():
    """This creates the result using the Fortran backend."""

    result = np.empty((n, 2, 2))

    result[:, 0, 0] = g00(a_vals, b_vals, c_vals)
    result[:, 0, 1] = g01(a_vals, b_vals, c_vals)
    result[:, 1, 0] = g10(a_vals, b_vals, c_vals)
    result[:, 1, 1] = g11(a_vals, b_vals, c_vals)

    return result

 h00 = ufuncify((a, b, c), expr_00, language='C', backend='Cython',
               tempdir='ufunc-cython-code')
 h01 = ufuncify((a, b, c), expr_01, language='C', backend='Cython')
 h10 = ufuncify((a, b, c), expr_10, language='C', backend='Cython')
 h11 = ufuncify((a, b, c), expr_11, language='C', backend='Cython')


 def eval_matrix_loop_ufuncify_cython():
    """This creates the result using the C/Cython backend."""

    result = np.empty((n, 2, 2))

    result[:, 0, 0] = h00(a_vals, b_vals, c_vals)
    result[:, 0, 1] = h01(a_vals, b_vals, c_vals)
    result[:, 1, 0] = h10(a_vals, b_vals, c_vals)
    result[:, 1, 1] = h11(a_vals, b_vals, c_vals)

    return result

 r = ufuncify_matrix((a, b, c), sym_mat)


 def eval_matrix_loop_ufuncify_matrix_cython():

    result = np.empty((n, 4))

    return r(result, a_vals, b_vals, c_vals)


 def test_results():
    """This makes sure all of the methods return the same result. I opted to
    put the reshaping here which may or may not be a good idea."""

    print('Testing results.\n')

    gold_standard = eval_matrix_loop_python()

    np.testing.assert_allclose(np.transpose(eval_matrix_loop_lambdify(),
                                            (2, 0, 1)),
                               gold_standard)

    np.testing.assert_allclose(eval_matrix_loop_lambdify_individual(),
                               gold_standard)

    np.testing.assert_allclose(np.transpose(eval_matrix_loop_numpy_broadcast(),
                                            (2, 0, 1)),
                               gold_standard)

    np.testing.assert_allclose(eval_matrix_loop_numpy(),
                               gold_standard)

    np.testing.assert_allclose(eval_matrix_loop_cython().reshape(n, 2, 2),
                               gold_standard)

    np.testing.assert_allclose(eval_matrix_loop_c().reshape(n, 2, 2),
                               gold_standard)

    np.testing.assert_allclose(eval_matrix_loop_ufuncify_f2py(),
                               gold_standard)

    np.testing.assert_allclose(eval_matrix_loop_ufuncify_cython(),
                               gold_standard)

    np.testing.assert_allclose(eval_matrix_loop_ufuncify_matrix_cython(),
                               gold_standard)


 def time_functions():

    print('Timing the functions.\n')

    func_suffixes = {'python': 100,
                     'lambdify': 1000,
                     'lambdify_individual': 3000,
                     'numpy_broadcast': 1000,
                     'numpy': 2000,
                     'cython': 3000,
                     'c': 3000,
                     'ufuncify_f2py': 3000,
                     'ufuncify_cython': 3000,
                     'ufuncify_matrix_cython': 3000}

    times = {}
    eval_stmt = 'eval_matrix_loop_{}()'
    import_stmt = 'from __main__ import eval_matrix_loop_{}'

    for suffix, n in func_suffixes.items():
        print('Timing: {}'.format(suffix))
        times[suffix] = (timeit.timeit(eval_stmt.format(suffix),
                                       setup=import_stmt.format(suffix),
                                       number=n) / float(n))
        print('{} time: {} s\n'.format(suffix, times[suffix]))

    benchmark_time = times.pop('python')

    print('Benchmark time: {} s\n'.format(benchmark_time))

    print('Ratios of the timings to the benchmark time:')
    print('--------------------------------------------\n')

    ratios = {}
    for k, v in times.items():
        ratios[k] = benchmark_time / v

    for k in sorted(ratios, key=ratios.__getitem__, reverse=True):
        print('{} ratio: {}'.format(k, ratios[k]))

 if __name__ == "__main__":

    test_results()
    time_functions()
diff --git a/test_pypy.py b/test_pypy.py
 import math
 import random

 n = 10000

 a_vals = random.sample(xrange(1, n + 1), n)
 b_vals = random.sample(xrange(1, n + 1), n)
 c_vals = random.sample(xrange(1, n + 1), n)


 def eval_matrix_loop_pypy():
    """This is the standard Python method, i.e. loop through each array and
    compute the four matrix entries."""

    result = []

    for i in range(n):
        mat = []
        mat.append(a_vals[i]**2 * math.cos(b_vals[i])**c_vals[i])
        mat.append(math.tan(b_vals[i]) / math.sin(a_vals[i] + b_vals[i]) +
                   c_vals[i]**4)
        mat.append(a_vals[i]**2 + b_vals[i]**2 - math.sqrt(c_vals[i]))
        mat.append(((a_vals[i] + b_vals[i] + c_vals[i]) * (a_vals[i] +
                                                           b_vals[i])) /
                   a_vals[i] * math.sin(b_vals[i]))
        result.append(mat)

    return result
	Testing results.

	Timing the functions.

	Timing: cython
	cython time: 0.00288254904747 s

	Timing: numpy_broadcast
	numpy_broadcast time: 0.00597401690483 s

	Timing: lambdify_individual
	lambdify_individual time: 0.00303873364131 s

	Timing: ufuncify_f2py
	ufuncify_f2py time: 0.00201614236832 s

	Timing: python
	python time: 0.119000189304 s

	Timing: ufuncify_cython
	ufuncify_cython time: 0.00293522365888 s

	Timing: numpy
	numpy time: 0.00303197193146 s

	Timing: c
	c time: 0.0029081483682 s

	Timing: lambdify
	lambdify time: 0.00599523711205 s

	Timing: ufuncify_matrix_cython
	ufuncify_matrix_cython time: 0.00292766968409 s

	Benchmark time: 0.119000189304 s

	Ratios of the timings to the benchmark time:
	--------------------------------------------

	ufuncify_f2py ratio: 59.0237034717
	cython ratio: 41.2829711983
	c ratio: 40.9195729508
	ufuncify_matrix_cython ratio: 40.6467266273
	ufuncify_cython ratio: 40.5421198294
	numpy ratio: 39.2484468836
	lambdify_individual ratio: 39.1611122761
	numpy_broadcast ratio: 19.9196271454
	lambdify ratio: 19.8491214076
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	#include "eval_matrix_h.h"

	/* This just tests out the C functions in eval_matrix_c.c. */

	int main()
	{
	// generate the a, b and c arrays of doubles

	int n = 10000;

	double a[n];
	double b[n];
	double c[n];

	srand(time(NULL));

	int i;

	for (i = 0; i < n; i++){
	a[i] = (rand() % 100);
	b[i] = (rand() % 100);
	c[i] = (rand() % 100);
	}

	// for (i = 0; i < n; i++){
	// printf("a%d: %f\n", i, a[i]);
	// printf("b%d: %f\n", i, b[i]);
	// printf("c%d: %f\n", i, c[i]);
	// }

	double matrix[n * 4];

	eval_matrix_loop(n, a, b, c, matrix);

	double mat[4];

	for (i = 0; i < n; i++){

	mat[0] = matrix[i * 4];
	mat[1] = matrix[i * 4 + 1];
	mat[2] = matrix[i * 4 + 2];
	mat[3] = matrix[i * 4 + 3];

	printf("matrix[%d] = [%f %f %f %f]\n", i, mat[0], mat[1], mat[2], mat[3]);
	}

	return 0;
	}
	"""This is a small rewrite of the ufuncify function in sympy so it supports
	array arguments for all values."""

	import importlib
	import subprocess

	import sympy as sy
	from sympy.utilities.lambdify import implemented_function
	from sympy.utilities.autowrap import autowrap

	_c_template = """\
	#include <math.h>
	#include "{file_prefix}_h.h"

	void {routine_name}(double matrix[{matrix_output_size}], {input_args})
	{{
	{eval_code}
	}}
	"""

	_h_template = """\
	void {routine_name}(double matrix[{matrix_output_size}], {input_args});
	"""

	_cython_template = """\
	import numpy as np
	cimport numpy as np
	cimport cython

	cdef extern from "{file_prefix}_h.h":
	void {routine_name}(double matrix[{matrix_output_size}], {input_args})

	@cython.boundscheck(False)
	@cython.wraparound(False)
	def {routine_name}_loop(np.ndarray[np.double_t, ndim=2] matrix, {numpy_typed_input_args}):

	cdef int n = matrix.shape[0]

	cdef int i

	for i in range(n):
	{routine_name}(&matrix[i, 0], {indexed_input_args})

	return matrix.reshape(n, {num_rows}, {num_cols})
	"""

	_setup_template = """\
	import numpy
	from distutils.core import setup
	from distutils.extension import Extension
	from Cython.Distutils import build_ext

	extension = Extension(name="{file_prefix}",
	sources=["{file_prefix}.pyx",
	"{file_prefix}_c.c"],
	extra_compile_args=[],
	include_dirs=[numpy.get_include()])

	setup(name="{routine_name}",
	cmdclass={{'build_ext': build_ext}},
	ext_modules=[extension])
	"""


	def ufuncify_matrix(args, expr, cse=True):

	matrix_size = expr.shape[0] * expr.shape[1]

	file_prefix_base = 'ufuncify_matrix'
	file_prefix = '{}_{}'.format(file_prefix_base, 0)

	taken = False
	i = 1

	while not taken:
	try:
	open(file_prefix + '.pyx', 'r')
	except IOError:
	taken = True
	else:
	file_prefix = '{}_{}'.format(file_prefix_base, i)
	i += 1

	d = {'routine_name': 'eval_matrix',
	'file_prefix': file_prefix,
	'matrix_output_size': matrix_size,
	'num_rows': expr.shape[0],
	'num_cols': expr.shape[1]}

	matrix_sym = sy.MatrixSymbol('matrix', expr.shape[0], expr.shape[1])

	sub_exprs, simple_mat = sy.cse(expr, sy.numbered_symbols('z_'))

	sub_expr_code = '\n'.join(['double ' + sy.ccode(sub_expr[1], sub_expr[0])
	for sub_expr in sub_exprs])

	matrix_code = sy.ccode(simple_mat[0], matrix_sym)

	d['eval_code'] = ' ' + '\n '.join((sub_expr_code + '\n' + matrix_code).split('\n'))

	c_indent = len('void {routine_name}('.format(**d))
	c_arg_spacer = ',\n' + ' ' * c_indent

	d['input_args'] = c_arg_spacer.join(['double {}'.format(a) for a in args])

	cython_indent = len('def {routine_name}_loop('.format(**d))
	cython_arg_spacer = ',\n' + ' ' * cython_indent

	d['numpy_typed_input_args'] = cython_arg_spacer.join(['np.ndarray[np.double_t, ndim=1] {}'.format(a) for a in args])

	d['indexed_input_args'] = ',\n'.join(['{}[i]'.format(a) for a in args])

	files = {}
	files[d['file_prefix'] + '_c.c'] = _c_template.format(**d)
	files[d['file_prefix'] + '_h.h'] = _h_template.format(**d)
	files[d['file_prefix'] + '.pyx'] = _cython_template.format(**d)
	files[d['file_prefix'] + '_setup.py'] = _setup_template.format(**d)

	for filename, code in files.items():
	with open(filename, 'w') as f:
	f.write(code)

	cmd = ['python', d['file_prefix'] + '_setup.py', 'build_ext', '--inplace']
	subprocess.call(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)

	cython_module = importlib.import_module(d['file_prefix'])
	return getattr(cython_module, d['routine_name'] + '_loop')


	def ufuncify(args, expr, **kwargs):
	"""
	Generates a binary ufunc-like lambda function for numpy arrays

	``args``
	Either a Symbol or a tuple of symbols. Specifies the argument sequence
	for the ufunc-like function.

	``expr``
	A SymPy expression that defines the element wise operation

	``kwargs``
	Optional keyword arguments are forwarded to autowrap().

	The returned function can only act on one array at a time, as only the
	first argument accept arrays as input.

	.. Note:: a proper numpy ufunc is required to support broadcasting, type
	casting and more. The function returned here, may not qualify for
	numpy's definition of a ufunc. That why we use the term ufunc-like.

	References
	==========
	[1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html

	Examples
	========

	>>> from sympy.utilities.autowrap import ufuncify
	>>> from sympy.abc import x, y
	>>> import numpy as np
	>>> f = ufuncify([x, y], y + x**2)
	>>> f([1, 2, 3], [2, 2, 2])
	[ 3. 6. 11.]
	>>> a = f(np.arange(5), 3 * np.ones(5))
	>>> isinstance(a, np.ndarray)
	True
	>>> print a
	[ 3. 4. 7. 12. 19.]

	"""
	y = sy.IndexedBase(sy.Dummy('y')) # result array

	i = sy.Dummy('i', integer=True) # index of the array
	m = sy.Dummy('m', integer=True) # length of array (dimension)
	i = sy.Idx(i, m) # index of the array

	l = sy.Lambda(args, expr) # A lambda function that represents the inputs and outputs of the expression
	f = implemented_function('f', l) # maps an UndefinedFunction('f') to the lambda function

	if isinstance(args, sy.Symbol):
	args = [args]
	else:
	args = list(args)

	# For each of the args create an indexed version.
	indexed_args = [sy.IndexedBase(sy.Dummy(str(a))) for a in args]

	# ensure correct order of arguments
	kwargs['args'] = [y] + indexed_args + [m]
	args_with_indices = [a[i] for a in indexed_args]
	return autowrap(sy.Eq(y[i], f(args_with_indices)), *kwargs)


	if __name__ == '__main__':

	result_array = sy.IndexedBase('y')

	arg_1 = sy.IndexedBase('a')
	arg_2 = sy.IndexedBase('b')

	array_length = sy.symbols('n', integer=True)

	index = sy.symbols('i', integer=True)
	index = sy.Idx(index, array_length)

	a, b = sy.symbols('a, b')

	expr = a**2 + b / 2.0

	lam = sy.Lambda((a, b), expr)

	sym_func = implemented_function('f', lam)

	code_args = [result_array, arg_1, arg_2, array_length]

	equation = sy.Eq(result_array[index], sym_func(arg_1[index], arg_2[index]))

	f_for = autowrap(equation, tempdir='multi-fortran', args=code_args)

	f_cyt = autowrap(equation, language='C', backend='cython', tempdir='multi-cython', args=code_args)
	"""This file builds the Cython extensions."""

	import numpy
	from distutils.core import setup
	from distutils.extension import Extension
	from Cython.Distutils import build_ext

	extension = Extension(name="eval_matrix",
	sources=["eval_matrix.pyx",
	"eval_matrix_c.c"],
	include_dirs=[numpy.get_include()])

	setup(name="eval_matrix",
	cmdclass={'build_ext': build_ext},
	ext_modules=[extension])
	function test_matrix_eval()

	n = 10000;

	a_vals = rand(n, 1);
	b_vals = rand(n, 1);
	c_vals = rand(n, 1);

	f_loop = @() eval_matrices_loop(n, a_vals, b_vals, c_vals);
	f_vectorized = @() eval_matrices_vectorized(n, a_vals, b_vals, c_vals);

	% These are taken manually from the Python results.

	python_loop_time= 0.136805589199;
	python_vectorized_time = 0.00317755591869;

	t_loop = timeit(f_loop);
	t_vectorized = timeit(f_vectorized);

	display('------------------------------------')

	display(sprintf('Mean time to evaluate the loop %1.4f s', t_loop))
	display(sprintf('Ratio to the Python loop benchmark time is %1.2f', ...
	python_loop_time / t_loop))
	display(sprintf('Ratio to the Python vectorized time is %1.2f', ...
	python_vectorized_time / t_loop))

	display('------------------------------------')

	display(sprintf('Mean time to evaluate the vectorized loop %1.4f s', ...
	t_vectorized))
	display(sprintf('Ratio to the Python loop benchmark time is %1.2f', ...
	python_loop_time / t_vectorized))
	display(sprintf('Ratio to the Python vectorized time is %1.2f', ...
	python_vectorized_time / t_vectorized))

	display('------------------------------------')

	end

	function result = eval_matrices_loop(n, a_vals, b_vals, c_vals)

	result = zeros(n, 2, 2);

	for i = 1:n
	result(i, 1, 1) = a_vals(i).^2 .* cos(b_vals(i)).^c_vals(i);
	result(i, 1, 2) = tan(b_vals(i)) ./ sin(a_vals(i) + b_vals(i)) + c_vals(i).^4;
	result(i, 2, 1) = a_vals(i).^2 + b_vals(i).^2 - sqrt(c_vals(i));
	result(i, 2, 2) = (((a_vals(i) + b_vals(i) + c_vals(i)) .* (a_vals(i) + b_vals(i))) ./ a_vals(i) .* sin(b_vals(i)));
	end

	end

	function result = eval_matrices_vectorized(n, a_vals, b_vals, c_vals)

	result = zeros(n, 2, 2);

	result(:, 1, 1) = a_vals.^2 .* cos(b_vals).^c_vals;
	result(:, 1, 2) = tan(b_vals) ./ sin(a_vals + b_vals) + c_vals.^4;
	result(:, 2, 1) = a_vals.^2 + b_vals.^2 - sqrt(c_vals);
	result(:, 2, 2) = (((a_vals + b_vals + c_vals) .* (a_vals + b_vals)) ./ a_vals .* sin(b_vals));

	end
	#!/usr/bin/env python

	"""A common need in my dynamics work is the to evaluate a small list of long
	mathematical expressions (typically less than 100 expressions) a large
	number of times (up to millions) with different scalar arguments each time.
	The number of evaluations is always much greater than the number of
	expressions. The expressions are generally all a function of the same
	scalars but not necessarily and sometimes the expressions have scalars that
	are constant with respect to each evaluations.

	This set of Python, Cython, and C programs explore how to do this as fast as
	possible.

	This requires the current master of SymPy.

	You will need to build the Cython extension modules before running this
	module.

	$ python setup.py build_exit --inplace

	"""

	import math
	import timeit

	import numpy as np
	import sympy as sp

	from eval_matrix import eval_matrix_loop_cython as cython_loop
	from eval_matrix import eval_matrix_loop_c as c_loop
	from multiindex import ufuncify, ufuncify_matrix

	"""First I create the symbolic form of some sample expressions (these are
	much shorter than my real problems)."""

	a, b, c = sp.symbols('a, b, c')

	expr_00 = a*2 sp.cos(b)**c
	expr_01 = sp.tan(b) / sp.sin(a + b) + c**4
	expr_10 = a2 + b2 - sp.sqrt(c)
	expr_11 = ((a + b + c) * (a + b)) / a * sp.sin(b)

	sym_mat = sp.Matrix([[expr_00, expr_01],
	[expr_10, expr_11]])

	# These simply set up some large one dimensional arrays that will be used in
	# the expression evaluation.

	n = 10000

	a_vals = np.random.random(n)
	b_vals = np.random.random(n)
	c_vals = np.random.random(n)

	"""The mathematical expressions are generally generated symbolically with
	SymPy. The easiet method available in SymPy is to use the lambdify function
	to generate a NumPy backed numerical function.

	I'll try two methods:

	1. lamdify the matrix and rely on numpy's underlying broadcasting
	2. lambdify each expression individually

	"""

	f = sp.lambdify((a, b, c), sym_mat, modules='numpy', default_array=True)

	f00 = sp.lambdify((a, b, c), expr_00, modules='numpy', default_array=True)
	f01 = sp.lambdify((a, b, c), expr_01, modules='numpy', default_array=True)
	f10 = sp.lambdify((a, b, c), expr_10, modules='numpy', default_array=True)
	f11 = sp.lambdify((a, b, c), expr_11, modules='numpy', default_array=True)


	def eval_matrix_loop_lambdify():
	"""This evaluates the lambdified matrix of expressions all in one
	shot."""
	return f(a_vals, b_vals, c_vals)


	def eval_matrix_loop_lambdify_individual():
	"""This allocates a 3D array inserts the evaluated lambdified
	expressions in the correct entries."""

	result = np.empty((n, 2, 2))

	result[:, 0, 0] = f00(a_vals, b_vals, c_vals)
	result[:, 0, 1] = f01(a_vals, b_vals, c_vals)
	result[:, 1, 0] = f10(a_vals, b_vals, c_vals)
	result[:, 1, 1] = f11(a_vals, b_vals, c_vals)

	return result

	"""These two methods are explicit Python functions that use NumPy to do
	exactly what lambdify does under the hood."""


	def eval_matrix_loop_numpy_broadcast():
	"""This is the same thing as lambdifying the SymPy matrix."""

	result = np.array(
	[[a_vals*2 np.cos(b_vals)**c_vals,
	np.tan(b_vals) / np.sin(a_vals + b_vals) + c_vals**4],
	[a_vals2 + b_vals2 - np.sqrt(c_vals),
	((a_vals + b_vals + c_vals) * (a_vals + b_vals)) / a_vals *
	np.sin(b_vals)]])

	return result


	def eval_matrix_loop_numpy():
	"""Since the number of matrix elements are typically much smaller than
	the number of evaluations, NumPy can be used to compute each of the
	Matrix expressions. This is equivalent to the individual lambdified
	expressions above."""

	result = np.empty((n, 2, 2))

	result[:, 0, 0] = a_vals*2 np.cos(b_vals)**c_vals
	result[:, 0, 1] = np.tan(b_vals) / np.sin(a_vals + b_vals) + c_vals**4
	result[:, 1, 0] = a_vals2 + b_vals2 - np.sqrt(c_vals)
	result[:, 1, 1] = (((a_vals + b_vals + c_vals) * (a_vals + b_vals)) /
	a_vals * np.sin(b_vals))

	return result


	"""The most basic method of building the result array is a simple loop in
	Python. But this will definitely be the slowest due to Python's overhead.
	But this is what we ultimately want to improve with all these methods that
	rely on fast low level code for the loop (vectorizing). This is the speed
	benchmark. All other method will be compared against it."""


	def eval_matrix_loop_python():
	"""This is the standard Python method, i.e. loop through each array and
	compute the four matrix entries."""

	result = np.empty((n, 2, 2))

	for i in range(n):
	result[i, 0, 0] = a_vals[i]*2 math.cos(b_vals[i])**c_vals[i]
	result[i, 0, 1] = (math.tan(b_vals[i]) / math.sin(a_vals[i] +
	b_vals[i]) + c_vals[i]**4)
	result[i, 1, 0] = a_vals[i]2 + b_vals[i]2 - math.sqrt(c_vals[i])
	result[i, 1, 1] = (((a_vals[i] + b_vals[i] + c_vals[i]) * (a_vals[i]
	+ b_vals[i])) / a_vals[i] * math.sin(b_vals[i]))

	return result

	"""The next methods utilized hand written C functions and some Cython
	wrappers. I have two flavors. In the cython one the loop is in Cython and
	the expression eval is in C. In the second one, _c, both the loop and the
	expression evals are in C, with just a light Cython wrapper."""


	def eval_matrix_loop_cython():
	"""This is equivalent to the naive Python loop but is implemented in a
	lower level as a combination of Cython and C. The loop is in Cython and
	the expression eval is in C."""

	result = np.empty((n, 4))

	return cython_loop(a_vals, b_vals, c_vals, result)


	def eval_matrix_loop_c():
	"""This is equivalent to the naive Python loop but is implemented in a
	lower level as a combination of Cython and C. The loop and expression
	evals are all in C."""

	result = np.empty((n * 4))

	return c_loop(a_vals, b_vals, c_vals, result)

	"""sympy.utilities.ufuncify automatically generates the broadcasting loop in
	the low level. The default settings use Fortran and f2py. Currently,
	ufuncify only supports scalar expressions and an array for the first
	argument. But I've included a modified version in multiindex.py that
	requires all of the arguments to the function to be arrays of equal length.
	ufuncify currently doesn't support a list of expressions (or sympy matrices)
	so I ufuncify each expression. If all of the expressions were in the low
	level loop then things will likely be faster especially if cse is used and
	other optimizations."""

	g00 = ufuncify((a, b, c), expr_00, language='F95', backend='f2py',
	tempdir='ufunc-fortran-code')
	g01 = ufuncify((a, b, c), expr_01, language='F95', backend='f2py')
	g10 = ufuncify((a, b, c), expr_10, language='F95', backend='f2py')
	g11 = ufuncify((a, b, c), expr_11, language='F95', backend='f2py')


	def eval_matrix_loop_ufuncify_f2py():
	"""This creates the result using the Fortran backend."""

	result = np.empty((n, 2, 2))

	result[:, 0, 0] = g00(a_vals, b_vals, c_vals)
	result[:, 0, 1] = g01(a_vals, b_vals, c_vals)
	result[:, 1, 0] = g10(a_vals, b_vals, c_vals)
	result[:, 1, 1] = g11(a_vals, b_vals, c_vals)

	return result

	h00 = ufuncify((a, b, c), expr_00, language='C', backend='Cython',
	tempdir='ufunc-cython-code')
	h01 = ufuncify((a, b, c), expr_01, language='C', backend='Cython')
	h10 = ufuncify((a, b, c), expr_10, language='C', backend='Cython')
	h11 = ufuncify((a, b, c), expr_11, language='C', backend='Cython')


	def eval_matrix_loop_ufuncify_cython():
	"""This creates the result using the C/Cython backend."""

	result = np.empty((n, 2, 2))

	result[:, 0, 0] = h00(a_vals, b_vals, c_vals)
	result[:, 0, 1] = h01(a_vals, b_vals, c_vals)
	result[:, 1, 0] = h10(a_vals, b_vals, c_vals)
	result[:, 1, 1] = h11(a_vals, b_vals, c_vals)

	return result

	r = ufuncify_matrix((a, b, c), sym_mat)


	def eval_matrix_loop_ufuncify_matrix_cython():

	result = np.empty((n, 4))

	return r(result, a_vals, b_vals, c_vals)


	def test_results():
	"""This makes sure all of the methods return the same result. I opted to
	put the reshaping here which may or may not be a good idea."""

	print('Testing results.\n')

	gold_standard = eval_matrix_loop_python()

	np.testing.assert_allclose(np.transpose(eval_matrix_loop_lambdify(),
	(2, 0, 1)),
	gold_standard)

	np.testing.assert_allclose(eval_matrix_loop_lambdify_individual(),
	gold_standard)

	np.testing.assert_allclose(np.transpose(eval_matrix_loop_numpy_broadcast(),
	(2, 0, 1)),
	gold_standard)

	np.testing.assert_allclose(eval_matrix_loop_numpy(),
	gold_standard)

	np.testing.assert_allclose(eval_matrix_loop_cython().reshape(n, 2, 2),
	gold_standard)

	np.testing.assert_allclose(eval_matrix_loop_c().reshape(n, 2, 2),
	gold_standard)

	np.testing.assert_allclose(eval_matrix_loop_ufuncify_f2py(),
	gold_standard)

	np.testing.assert_allclose(eval_matrix_loop_ufuncify_cython(),
	gold_standard)

	np.testing.assert_allclose(eval_matrix_loop_ufuncify_matrix_cython(),
	gold_standard)


	def time_functions():

	print('Timing the functions.\n')

	func_suffixes = {'python': 100,
	'lambdify': 1000,
	'lambdify_individual': 3000,
	'numpy_broadcast': 1000,
	'numpy': 2000,
	'cython': 3000,
	'c': 3000,
	'ufuncify_f2py': 3000,
	'ufuncify_cython': 3000,
	'ufuncify_matrix_cython': 3000}

	times = {}
	eval_stmt = 'eval_matrix_loop_{}()'
	import_stmt = 'from __main__ import eval_matrix_loop_{}'

	for suffix, n in func_suffixes.items():
	print('Timing: {}'.format(suffix))
	times[suffix] = (timeit.timeit(eval_stmt.format(suffix),
	setup=import_stmt.format(suffix),
	number=n) / float(n))
	print('{} time: {} s\n'.format(suffix, times[suffix]))

	benchmark_time = times.pop('python')

	print('Benchmark time: {} s\n'.format(benchmark_time))

	print('Ratios of the timings to the benchmark time:')
	print('--------------------------------------------\n')

	ratios = {}
	for k, v in times.items():
	ratios[k] = benchmark_time / v

	for k in sorted(ratios, key=ratios.__getitem__, reverse=True):
	print('{} ratio: {}'.format(k, ratios[k]))

	if __name__ == "__main__":

	test_results()
	time_functions()
	import math
	import random

	n = 10000

	a_vals = random.sample(xrange(1, n + 1), n)
	b_vals = random.sample(xrange(1, n + 1), n)
	c_vals = random.sample(xrange(1, n + 1), n)


	def eval_matrix_loop_pypy():
	"""This is the standard Python method, i.e. loop through each array and
	compute the four matrix entries."""

	result = []

	for i in range(n):
	mat = []
	mat.append(a_vals[i]*2 math.cos(b_vals[i])**c_vals[i])
	mat.append(math.tan(b_vals[i]) / math.sin(a_vals[i] + b_vals[i]) +
	c_vals[i]**4)
	mat.append(a_vals[i]2 + b_vals[i]2 - math.sqrt(c_vals[i]))
	mat.append(((a_vals[i] + b_vals[i] + c_vals[i]) * (a_vals[i] +
	b_vals[i])) /
	a_vals[i] * math.sin(b_vals[i]))
	result.append(mat)

	return result