#line 1 "numpy/core/src/multiarray/einsum.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*
 * This file contains the implementation of the 'einsum' function,
 * which provides an einstein-summation operation.
 *
 * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
 * The University of British Columbia
 *
 * See LICENSE.txt for the license.
 */

#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "structmember.h"

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
#include <numpy/halffloat.h>
#include <npy_pycompat.h>

#include <ctype.h>

#include "convert.h"

/*
 * EINSUM_USE_SSE1 is always defined: 1 when the build defines
 * NPY_HAVE_SSE_INTRINSICS and 0 otherwise, so it can be tested with a
 * plain #if (and combined with other conditions) further down.
 */
#ifdef NPY_HAVE_SSE_INTRINSICS
#define EINSUM_USE_SSE1 1
#else
#define EINSUM_USE_SSE1 0
#endif

/*
 * EINSUM_USE_SSE2 mirrors NPY_HAVE_SSE2_INTRINSICS in the same way
 * (1 when defined, 0 otherwise).
 *
 * TODO: Only some SSE2 for float64 is implemented.
 */
#ifdef NPY_HAVE_SSE2_INTRINSICS
#define EINSUM_USE_SSE2 1
#else
#define EINSUM_USE_SSE2 0
#endif

#if EINSUM_USE_SSE1
#include <xmmintrin.h>
#endif

#if EINSUM_USE_SSE2
#include <emmintrin.h>
#endif

/*
 * Evaluates to nonzero when the address `x` has its low four bits clear,
 * i.e. is a multiple of 16 bytes.
 *
 * The argument is parenthesized before the cast so that an expression such
 * as `ptr + n` is tested as a whole; without the parentheses the cast would
 * bind to `ptr` alone and `n` would be added to the integer value instead of
 * being scaled by the pointed-to element size.
 */
#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)(x)) & 0xf) == 0)

/********** PRINTF DEBUG TRACING **************/
/*
 * Set NPY_EINSUM_DBG_TRACING to 1 to have the NPY_EINSUM_DBG_PRINT* macros
 * emit their message through printf; with 0 they expand to nothing.
 *
 * The digit in a macro's name is the number of arguments that follow the
 * format string `s`.  Each enabled expansion already ends in ';'.
 */
#define NPY_EINSUM_DBG_TRACING 0

#if NPY_EINSUM_DBG_TRACING
#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
/* All three arguments must be forwarded to match the format string. */
#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s, p1, p2, p3);
#else
#define NPY_EINSUM_DBG_PRINT(s)
#define NPY_EINSUM_DBG_PRINT1(s, p1)
#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
#endif
/**********************************************/

/*
 * Selector with four mutually exclusive broadcast modes.  The enumerators
 * take the values 0 through 3 in declaration order.
 *
 * NOTE(review): the code that interprets these modes is not part of this
 * section; confirm their exact meaning against the users of this enum.
 */
typedef enum {
    BROADCAST_NONE   = 0,
    BROADCAST_LEFT   = 1,
    BROADCAST_RIGHT  = 2,
    BROADCAST_MIDDLE = 3
} EINSUM_BROADCAST;

#line 108

#line 113
/*
 * Strided single-operand kernel for npy_byte: adds each input element to
 * the corresponding output element, in place.
 *
 *   nop      not used by this specialization
 *   dataptr  dataptr[0] is the input cursor, dataptr[1] the output cursor;
 *            the caller's array entries are read but not advanced
 *   strides  byte steps applied after every element: strides[0] for the
 *            input, strides[1] for the output
 *   count    number of elements to process; the loop runs until the
 *            counter reaches zero
 *
 * The sum is formed in the promoted integer type and converted back to
 * npy_byte when stored.
 */
static void
byte_sum_of_products_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *src = dataptr[0];
    char *dst = dataptr[1];
    const npy_intp src_step = strides[0];
    const npy_intp dst_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_byte *out = (npy_byte *)dst;

        *out = ((*(npy_byte *)src) + (*out));

        src += src_step;
        dst += dst_step;
    }
}

#if 1 == 1

/*
 * Single-operand kernel for npy_byte where input and output are both
 * indexed as densely packed arrays: data_out[i] += data0[i] for
 * i in [0, count).
 *
 *   nop      not used by this specialization
 *   dataptr  dataptr[0] is the input array, dataptr[1] the output array
 *   strides  ignored; elements are addressed by index
 *   count    number of elements; a count <= 0 is a no-op
 *
 * Elements are visited in groups of eight (ascending within a group) and
 * the remaining 0..7 elements from the highest index down to the lowest.
 * That is the same store order as the earlier fallthrough switch, so
 * results are unchanged even if the two arrays overlap.
 *
 * Both loops are bounded by `count`, so a negative count returns
 * immediately instead of jumping back to the tail handling forever.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data_out = (npy_byte *)dataptr[1];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /* Full groups of eight, lowest index first */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i]) +
                           (data_out[i]));
        }
        data0 += 8;
        data_out += 8;
        count -= 8;
    }

    /* Remaining 0..7 elements, highest index first */
    for (i = count - 1; i >= 0; --i) {
        data_out[i] = ((data0[i]) +
                       (data_out[i]));
    }
}

#elif 1 == 2 && !0

/*
 * Two-operand kernel for npy_byte where both inputs and the output are
 * indexed as densely packed arrays:
 * data_out[i] += data0[i] * data1[i] for i in [0, count).
 *
 *   nop      not used by this specialization
 *   dataptr  dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output
 *   strides  ignored; elements are addressed by index
 *   count    number of elements; a count <= 0 is a no-op
 *
 * Elements are visited in groups of eight (ascending within a group) and
 * the remaining 0..7 elements from the highest index down to the lowest,
 * matching the store order of the earlier fallthrough switch.
 *
 * Both loops are bounded by `count`, so a negative count returns
 * immediately instead of jumping back to the tail handling forever.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte *data_out = (npy_byte *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    /* Full groups of eight, lowest index first */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i]) *
                           (data1[i]) +
                           (data_out[i]));
        }
        data0 += 8;
        data1 += 8;
        data_out += 8;
        count -= 8;
    }

    /* Remaining 0..7 elements, highest index first */
    for (i = count - 1; i >= 0; --i) {
        data_out[i] = ((data0[i]) *
                       (data1[i]) +
                       (data_out[i]));
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel for npy_byte in which the first operand is a single
 * value read once up front, while the second operand and the output are
 * indexed as densely packed arrays:
 * out[i] += scalar * vec[i] for i in [0, count).
 *
 *   nop      not used by this specialization
 *   dataptr  dataptr[0] points at the scalar, dataptr[1] at the array
 *            operand, dataptr[2] at the output array
 *   strides  ignored; elements are addressed by index
 *   count    number of elements; a count <= 0 is a no-op
 *
 * Full groups of eight are processed lowest index first; the remaining
 * 0..7 elements are processed highest index first.
 */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_byte scalar = (*(npy_byte *)dataptr[0]);
    npy_byte *vec = (npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Blocks of eight elements, ascending */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = (scalar * (vec[k]) + (out[k]));
        }
        vec += 8;
        out += 8;
    }

    /* Leftover elements, descending */
    k = count;
    while (k > 0) {
        --k;
        out[k] = (scalar * (vec[k]) + (out[k]));
    }
}

/*
 * Two-operand kernel for npy_byte in which the second operand is a single
 * value read once up front, while the first operand and the output are
 * indexed as densely packed arrays:
 * data_out[i] += data0[i] * value1 for i in [0, count).
 *
 *   nop      not used by this specialization
 *   dataptr  dataptr[0] points at the array operand, dataptr[1] at the
 *            scalar, dataptr[2] at the output array
 *   strides  ignored; elements are addressed by index
 *   count    number of elements; a count <= 0 is a no-op
 *
 * Elements are visited in groups of eight (ascending within a group) and
 * the remaining 0..7 elements from the highest index down to the lowest,
 * matching the store order of the earlier fallthrough switch.
 *
 * Both loops are bounded by `count`, so a negative count returns
 * immediately instead of jumping back to the tail handling forever.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte value1 = (*(npy_byte *)dataptr[1]);
    npy_byte *data_out = (npy_byte *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight, lowest index first */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i]) *
                           value1 +
                           (data_out[i]));
        }
        data0 += 8;
        data_out += 8;
        count -= 8;
    }

    /* Remaining 0..7 elements, highest index first */
    for (i = count - 1; i >= 0; --i) {
        data_out[i] = ((data0[i]) *
                       value1 +
                       (data_out[i]));
    }
}

static void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte accum = 0;

#if EINSUM_USE_SSE1 && 0
    __m128 a, accum_sse = _mm_setzero_ps();
#elif EINSUM_USE_SSE2 && 0
    __m128d a, accum_sse = _mm_setzero_pd();
#endif

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 596
        case 6+1:
            accum += (data0[6]) * (data1[6]);

#line 596
        case 5+1:
            accum += (data0[5]) * (data1[5]);

#line 596
        case 4+1:
            accum += (data0[4]) * (data1[4]);

#line 596
        case 3+1:
            accum += (data0[3]) * (data1[3]);

#line 596
        case 2+1:
            accum += (data0[2]) * (data1[2]);

#line 596
        case 1+1:
            accum += (data0[1]) * (data1[1]);

#line 596
        case 0+1:
            accum += (data0[0]) * (data1[0]);

        case 0:
            *(npy_byte *)dataptr[2] += (accum);
            return;
    }

#if EINSUM_USE_SSE1 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);
            _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 617
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_ps(_mm_load_ps(data0+0), _mm_load_ps(data1+0));
            accum_sse = _mm_add_ps(accum_sse, a);

#line 617
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_ps(_mm_load_ps(data0+4), _mm_load_ps(data1+4));
            accum_sse = _mm_add_ps(accum_sse, a);

            data0 += 8;
            data1 += 8;
        }

        /* Add the four SSE values and put in accum */
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
        accum_sse = _mm_add_ps(a, accum_sse);
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
        accum_sse = _mm_add_ps(a, accum_sse);
        _mm_store_ss(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#elif EINSUM_USE_SSE2 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);
            _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+0), _mm_load_pd(data1+0));
            accum_sse = _mm_add_pd(accum_sse, a);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+2), _mm_load_pd(data1+2));
            accum_sse = _mm_add_pd(accum_sse, a);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+4), _mm_load_pd(data1+4));
            accum_sse = _mm_add_pd(accum_sse, a);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+6), _mm_load_pd(data1+6));
            accum_sse = _mm_add_pd(accum_sse, a);

            data0 += 8;
            data1 += 8;
        }

        /* Add the two SSE2 values and put in accum */
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
        accum_sse = _mm_add_pd(a, accum_sse);
        _mm_store_sd(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
        _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 683
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_ps(_mm_loadu_ps(data0+0), _mm_loadu_ps(data1+0));
        accum_sse = _mm_add_ps(accum_sse, a);

#line 683
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_ps(_mm_loadu_ps(data0+4), _mm_loadu_ps(data1+4));
        accum_sse = _mm_add_ps(accum_sse, a);

#elif EINSUM_USE_SSE2 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
        _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+0), _mm_loadu_pd(data1+0));
        accum_sse = _mm_add_pd(accum_sse, a);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+2), _mm_loadu_pd(data1+2));
        accum_sse = _mm_add_pd(accum_sse, a);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+4), _mm_loadu_pd(data1+4));
        accum_sse = _mm_add_pd(accum_sse, a);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+6), _mm_loadu_pd(data1+6));
        accum_sse = _mm_add_pd(accum_sse, a);

#else
#line 708
        accum += (data0[0]) * (data1[0]);

#line 708
        accum += (data0[1]) * (data1[1]);

#line 708
        accum += (data0[2]) * (data1[2]);

#line 708
        accum += (data0[3]) * (data1[3]);

#line 708
        accum += (data0[4]) * (data1[4]);

#line 708
        accum += (data0[5]) * (data1[5]);

#line 708
        accum += (data0[6]) * (data1[6]);

#line 708
        accum += (data0[7]) * (data1[7]);

#endif
        data0 += 8;
        data1 += 8;
    }

#if EINSUM_USE_SSE1 && 0
    /* Add the four SSE values and put in accum */
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
    accum_sse = _mm_add_ps(a, accum_sse);
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
    accum_sse = _mm_add_ps(a, accum_sse);
    _mm_store_ss(&accum, accum_sse);
#elif EINSUM_USE_SSE2 && 0
    /* Add the two SSE2 values and put in accum */
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
    accum_sse = _mm_add_pd(a, accum_sse);
    _mm_store_sd(&accum, accum_sse);
#endif

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

/*
 * Sum-of-products kernel for two npy_byte operands in which operand 0 is a
 * single value, operand 1 is read as consecutive elements, and the result
 * is accumulated into the single npy_byte that dataptr[2] points at.
 *
 * Effectively:  out += value0 * (data1[0] + ... + data1[count-1])
 *
 * The running sum lives in an npy_byte, so every partial sum is narrowed
 * back to npy_byte.  'nop' and 'strides' are not used.
 *
 * NOTE(review): a negative 'count' never reaches the store/return branch;
 * callers are presumably expected to pass count >= 0 -- confirm.
 */
static void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    /* The scalar operand is read exactly once, before anything is written */
    const npy_byte scalar = *(npy_byte *)dataptr[0];
    npy_byte *src = (npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_byte total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /* Fewer than 8 elements left: fold them in (highest index first) */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                total += src[k - 1];
            }
            *out += (scalar * total);
            return;
        }

        /* Consume the input in groups of 8 elements */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
        }
    }
}

/*
 * Sum-of-products kernel for two npy_byte operands in which operand 0 is
 * read as consecutive elements, operand 1 is a single value, and the
 * result is accumulated into the single npy_byte that dataptr[2] points at.
 *
 * Effectively:  out += (data0[0] + ... + data0[count-1]) * value1
 *
 * The running sum lives in an npy_byte, so every partial sum is narrowed
 * back to npy_byte.  'nop' and 'strides' are not used.
 *
 * NOTE(review): a negative 'count' never reaches the store/return branch;
 * callers are presumably expected to pass count >= 0 -- confirm.
 */
static void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    /* The scalar operand is read exactly once, before anything is written */
    const npy_byte scalar = *(npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_byte total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /* Fewer than 8 elements left: fold them in (highest index first) */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                total += src[k - 1];
            }
            *out += (total * scalar);
            return;
        }

        /* Consume the input in groups of 8 elements */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
        }
    }
}

#elif 1 == 3 && !0

/*
 * Element-wise sum-of-products kernel for three npy_byte inputs that are
 * all indexed as consecutive elements, as is the output (dataptr[3]):
 *
 *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
 *
 * Elements are handled in increasing index order, first in groups of 8 and
 * then one at a time for the remainder.  'nop' and 'strides' are not used.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *in0 = (npy_byte *)dataptr[0];
    npy_byte *in1 = (npy_byte *)dataptr[1];
    npy_byte *in2 = (npy_byte *)dataptr[2];
    npy_byte *out = (npy_byte *)dataptr[3];
    int k;

    /* Full groups of 8 elements */
    while (count >= 8) {
        count -= 8;

        for (k = 0; k < 8; ++k) {
            out[k] = (in0[k] * in1[k] * in2[k] + out[k]);
        }

        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Remainder: the countdown is tested (and decremented) before each
     * element, and the function returns as soon as it was zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = (in0[k] * in1[k] * in2[k] + out[k]);
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic sum-of-products kernel over 'nop' npy_byte inputs plus one
 * output located at dataptr[nop].  For each of 'count' steps it multiplies
 * the current element of every input together, adds the current output
 * element, stores the result, and then advances all nop + 1 pointers in
 * 'dataptr' by sizeof(npy_byte).
 *
 * The entries of 'dataptr' are modified in place; 'strides' is not used.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte product;
    int op;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        /* Multiply the inputs together, narrowing to npy_byte each step */
        product = *(npy_byte *)dataptr[0];
        op = 1;
        while (op < nop) {
            product *= *(npy_byte *)dataptr[op];
            ++op;
        }

        /* 'op' equals 'nop' here whenever nop >= 1, i.e. the output slot */
        *(npy_byte *)dataptr[nop] = (product + *(npy_byte *)dataptr[op]);

        /* Step every operand pointer, inputs and output alike */
        for (op = nop; op >= 0; --op) {
            dataptr[op] += sizeof(npy_byte);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * Reduction kernel for a single npy_byte operand read as consecutive
 * elements: adds data0[0] + ... + data0[count-1] to the single npy_byte
 * that dataptr[1] points at.
 *
 * The running sum lives in an npy_byte, so every partial sum is narrowed
 * back to npy_byte.  'nop' and 'strides' are not used.
 *
 * NOTE(review): a negative 'count' never reaches the store/return branch;
 * callers are presumably expected to pass count >= 0 -- confirm.
 */
static void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *out = (npy_byte *)dataptr[1];
    npy_byte total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /* Fewer than 8 elements left: fold them in (highest index first) */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                total += src[k - 1];
            }
            *out = (total + *out);
            return;
        }

        /* Consume the input in groups of 8 elements */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
        }
    }
}

#endif /* 1 == 1 */

/*
 * Reduction kernel for a single strided npy_byte operand: walks 'count'
 * elements starting at dataptr[0], stepping strides[0] bytes between them,
 * and adds their sum to the single npy_byte that dataptr[1] points at.
 *
 * The running sum lives in an npy_byte, so every partial sum is narrowed
 * back to npy_byte.  'nop' is not used.
 */
static void
byte_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *cursor = dataptr[0];
    const npy_intp step = strides[0];
    npy_byte total = 0;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += *(npy_byte *)cursor;
        cursor += step;
    }

    /* Fold the reduction into the existing output value */
    *((npy_byte *)dataptr[1]) = (total + *((npy_byte *)dataptr[1]));
}


#line 113
/*
 * Element-wise sum-of-products kernel for two strided npy_byte inputs and
 * a strided npy_byte output (dataptr[2]).  For each of 'count' steps:
 *
 *     *out = (*in0) * (*in1) + (*out)
 *
 * after which each pointer advances by its own byte stride (strides[0],
 * strides[1] and strides[2] respectively).  'nop' is not used, and the
 * entries of 'dataptr' themselves are left untouched.
 */
static void
byte_sum_of_products_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    char *dst = dataptr[2];
    const npy_intp lhs_step = strides[0];
    const npy_intp rhs_step = strides[1];
    const npy_intp dst_step = strides[2];
    npy_byte *cell;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        cell = (npy_byte *)dst;
        *cell = ((*(npy_byte *)lhs) * (*(npy_byte *)rhs) + (*cell));

        lhs += lhs_step;
        rhs += rhs_step;
        dst += dst_step;
    }
}

#if 2 == 1

/*
 * One-operand kernel for npy_byte: adds every element of the buffer at
 * dataptr[0] into the element with the same index of the buffer at
 * dataptr[1], for `count` elements.
 *
 * Both buffers are indexed as densely packed npy_byte arrays; `nop` and
 * `strides` are not consulted.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *dst = (npy_byte *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /*
     * NOTE(review): count is expected to be >= 0.  A negative value
     * satisfies neither test below, so this loop would never exit.
     */
    for (;;) {
        /*
         * Fewer than eight elements left.  This is tested before the block
         * loop so that short runs go straight here.  Elements are visited
         * from the highest index down to 0.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                dst[k] = ((src[k]) + (dst[k]));
            }
            return;
        }

        /* Whole blocks of eight, lowest index first within each block */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                dst[k] = ((src[k]) + (dst[k]));
            }

            src += 8;
            dst += 8;
        }
    }
}

#elif 2 == 2 && !0

/*
 * Two-operand kernel for npy_byte: for each of `count` elements, multiplies
 * the element of dataptr[0] by the element of dataptr[1] at the same index
 * and adds the product into the element of dataptr[2].
 *
 * All three buffers are indexed as densely packed npy_byte arrays; `nop`
 * and `strides` are not consulted.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte *dst = (npy_byte *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    /*
     * NOTE(review): count is expected to be >= 0.  A negative value
     * satisfies neither test below, so this loop would never exit.
     */
    for (;;) {
        /*
         * Fewer than eight elements left.  Checked ahead of the block loop
         * so that short runs are handled immediately; elements are visited
         * from the highest index down to 0.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                dst[k] = ((lhs[k]) * (rhs[k]) + (dst[k]));
            }
            return;
        }

        /* Whole blocks of eight, lowest index first within each block */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                dst[k] = ((lhs[k]) * (rhs[k]) + (dst[k]));
            }

            lhs += 8;
            rhs += 8;
            dst += 8;
        }
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel for npy_byte where the first operand is one value:
 * the npy_byte at dataptr[0] is read once, then for each of `count`
 * elements it is multiplied by the element of dataptr[1] and the product
 * is added into the element of dataptr[2] at the same index.
 *
 * dataptr[1] and dataptr[2] are indexed as densely packed npy_byte arrays;
 * `nop` and `strides` are not consulted.
 */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte scalar0 = (*(npy_byte *)dataptr[0]);
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte *dst = (npy_byte *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Whole blocks of eight, lowest index first within each block */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] = (scalar0 * (rhs[k]) + (dst[k]));
        }

        rhs += 8;
        dst += 8;
    }

    /*
     * Remaining 0..7 elements, highest index first.  Nothing is done when
     * count is zero (or negative).
     */
    for (k = count; k-- > 0; ) {
        dst[k] = (scalar0 * (rhs[k]) + (dst[k]));
    }
}

/*
 * Two-operand kernel for npy_byte where the second operand is one value:
 * the npy_byte at dataptr[1] is read once, then for each of `count`
 * elements the element of dataptr[0] is multiplied by it and the product
 * is added into the element of dataptr[2] at the same index.
 *
 * dataptr[0] and dataptr[2] are indexed as densely packed npy_byte arrays;
 * `nop` and `strides` are not consulted.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte scalar1 = (*(npy_byte *)dataptr[1]);
    npy_byte *dst = (npy_byte *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /*
     * NOTE(review): count is expected to be >= 0.  A negative value
     * satisfies neither test below, so this loop would never exit.
     */
    for (;;) {
        /*
         * Fewer than eight elements left.  Checked ahead of the block loop
         * so that short runs are handled immediately; elements are visited
         * from the highest index down to 0.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                dst[k] = ((lhs[k]) * scalar1 + (dst[k]));
            }
            return;
        }

        /* Whole blocks of eight, lowest index first within each block */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                dst[k] = ((lhs[k]) * scalar1 + (dst[k]));
            }

            lhs += 8;
            dst += 8;
        }
    }
}

/*
 * Two-operand reduction kernel for npy_byte: sums the element-wise products
 * of the buffers at dataptr[0] and dataptr[1] over `count` elements into a
 * local npy_byte accumulator, then adds that accumulator to the single
 * npy_byte at dataptr[2].
 *
 * The two inputs are indexed as densely packed npy_byte arrays; `nop` and
 * `strides` are not consulted.  The output is updated exactly once, just
 * before returning (also when count is 0).
 */
static void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * NOTE(review): count is expected to be >= 0.  A negative value
     * satisfies neither test below, so this loop would never exit.
     */
    for (;;) {
        /*
         * Fewer than eight elements left.  Checked ahead of the block loop
         * so that short runs are handled immediately; elements are visited
         * from the highest index down to 0, then the result is flushed.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                total += (lhs[k]) * (rhs[k]);
            }
            *(npy_byte *)dataptr[2] += (total);
            return;
        }

        /* Whole blocks of eight, lowest index first within each block */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                total += (lhs[k]) * (rhs[k]);
            }

            lhs += 8;
            rhs += 8;
        }
    }
}

/*
 * Two-operand reduction kernel for npy_byte where the first operand is one
 * value: the npy_byte at dataptr[0] is read once, the `count` elements of
 * the buffer at dataptr[1] are summed into a local npy_byte accumulator,
 * and finally (value * accumulator) is added to the single npy_byte at
 * dataptr[2].
 *
 * dataptr[1] is indexed as a densely packed npy_byte array; `nop` and
 * `strides` are not consulted.  The output is updated exactly once, just
 * before returning (also when count is 0).
 */
static void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte scalar0 = (*(npy_byte *)dataptr[0]);
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * NOTE(review): count is expected to be >= 0.  A negative value
     * satisfies neither test below, so this loop would never exit.
     */
    for (;;) {
        /*
         * Fewer than eight elements left.  Checked ahead of the block loop
         * so that short runs are handled immediately; elements are visited
         * from the highest index down to 0, then the scaled sum is flushed.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                total += (rhs[k]);
            }
            *(npy_byte *)dataptr[2] += (scalar0 * total);
            return;
        }

        /* Whole blocks of eight, lowest index first within each block */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                total += (rhs[k]);
            }

            rhs += 8;
        }
    }
}

/*
 * Two-operand npy_byte kernel: operand 0 is read as a packed run of
 * npy_byte items, operand 1 is read once as a single value, and the
 * result is folded into the single npy_byte at dataptr[2].
 *
 * Effect: *dataptr[2] += (sum of the first `count` items of dataptr[0])
 *                        * (*dataptr[1])
 * The running total lives in an npy_byte, so it is narrowed back to that
 * type after every addition.  `nop` and `strides` are not consulted.
 */
static void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte scale = (*(npy_byte *)dataptr[1]);
    npy_byte total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than eight items left: this check comes first so that
         * small counts never touch the bulk loop.  The leftovers are
         * added from the highest index down to index 0, then the scaled
         * total is stored and the kernel is done.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                total += (src[k - 1]);
            }
            *(npy_byte *)dataptr[2] += (total * scale);
            return;
        }

        /* Bulk part: consume the input eight items per pass */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += (src[k]);
            }
            src += 8;
        }

        /* Loop back to the tail handling with the reduced count */
    }
}

#elif 2 == 3 && !0

/*
 * Three-operand npy_byte kernel over packed runs of items.
 *
 * For each position k it performs
 *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
 * where in0..in2 are dataptr[0..2] and out is dataptr[3].  `nop` and
 * `strides` are not consulted.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *in0 = (npy_byte *)dataptr[0];
    npy_byte *in1 = (npy_byte *)dataptr[1];
    npy_byte *in2 = (npy_byte *)dataptr[2];
    npy_byte *out = (npy_byte *)dataptr[3];
    int k;

    /* Bulk part: eight positions per pass, in ascending order */
    while (count >= 8) {
        count -= 8;

        for (k = 0; k < 8; ++k) {
            out[k] = ((in0[k]) *
                      (in1[k]) *
                      (in2[k]) +
                      (out[k]));
        }

        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Tail: walk up to eight further positions, leaving as soon as the
     * remaining count is used up.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = ((in0[k]) *
                  (in1[k]) *
                  (in2[k]) +
                  (out[k]));
    }
}

#else /* 2 > 3 || @complex */

/*
 * Generic npy_byte fallback driven by `nop`.
 *
 * Per item: multiplies together the current values of operands
 * 0 .. nop-1, adds that product onto the current output value at
 * dataptr[nop], then moves every pointer in dataptr[0..nop] forward by
 * one npy_byte.  The dataptr array itself is updated in place.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_byte product = (*(npy_byte *)dataptr[0]);
        int op = 1;

        /* Fold in the remaining input operands */
        while (op < nop) {
            product *= (*(npy_byte *)dataptr[op]);
            ++op;
        }

        /* The addend is read through the index the fold stopped at */
        *(npy_byte *)dataptr[nop] = (product +
                                     (*(npy_byte *)dataptr[op]));

        /* Step inputs and output alike to the next item */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_byte);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * One-operand npy_byte reduction: adds up a packed run of items from
 * dataptr[0] and folds the total into the single npy_byte at dataptr[1].
 *
 * Effect: *dataptr[1] = (sum of the first `count` items) + *dataptr[1]
 * The running total lives in an npy_byte, so it is narrowed back to that
 * type after every addition.  `nop` and `strides` are not consulted.
 */
static void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than eight items left: checked first so small counts
         * skip the bulk loop.  Leftovers are added from the highest
         * index down to index 0 before the total is written back.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                total += (src[k - 1]);
            }
            *((npy_byte *)dataptr[1]) = (total +
                                    (*((npy_byte *)dataptr[1])));
            return;
        }

        /* Bulk part: consume the input eight items per pass */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += (src[k]);
            }
            src += 8;
        }

        /* Loop back to the tail handling with the reduced count */
    }
}

#endif /* 2 == 1 */

/*
 * Two-operand npy_byte kernel with arbitrary input strides and a single
 * output value.
 *
 * Walks operands 0 and 1 by strides[0] and strides[1] bytes per item,
 * accumulates the pairwise products in an npy_byte (narrowed after every
 * addition), and finally adds the total onto the value at dataptr[2].
 * `nop` is not consulted.
 */
static void
byte_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    npy_intp lhs_step = strides[0];
    npy_intp rhs_step = strides[1];
    npy_byte total = 0;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_byte *)lhs) *
                 (*(npy_byte *)rhs);
        lhs += lhs_step;
        rhs += rhs_step;
    }

    /* Single write-back once all products have been gathered */
    *((npy_byte *)dataptr[2]) = (total +
                                    (*((npy_byte *)dataptr[2])));
}


#line 113
/*
 * Three-operand npy_byte kernel with arbitrary strides.
 *
 * For each of `count` items it performs
 *     *out = (*in0) * (*in1) * (*in2) + (*out)
 * and then advances in0, in1, in2 and out by strides[0..3] bytes
 * respectively.  in0..in2 start at dataptr[0..2], out at dataptr[3].
 * `nop` is not consulted.
 */
static void
byte_sum_of_products_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *in0 = dataptr[0];
    char *in1 = dataptr[1];
    char *in2 = dataptr[2];
    char *out = dataptr[3];
    npy_intp step0 = strides[0];
    npy_intp step1 = strides[1];
    npy_intp step2 = strides[2];
    npy_intp step_out = strides[3];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_three (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_byte *dst = (npy_byte *)out;

        *dst = ((*(npy_byte *)in0) *
                (*(npy_byte *)in1) *
                (*(npy_byte *)in2) +
                (*dst));

        in0 += step0;
        in1 += step1;
        in2 += step2;
        out += step_out;
    }
}

#if 3 == 1

/*
 * One-operand npy_byte kernel over packed runs of items.
 *
 * For each position k it performs  dst[k] = src[k] + dst[k], with src
 * taken from dataptr[0] and dst from dataptr[1].  `nop` and `strides`
 * are not consulted.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *dst = (npy_byte *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Fewer than eight items left: checked first so small counts
         * skip the bulk loop.  Leftover positions are updated from the
         * highest index down to index 0.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                dst[k - 1] = ((src[k - 1]) +
                              (dst[k - 1]));
            }
            return;
        }

        /* Bulk part: eight positions per pass, in ascending order */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                dst[k] = ((src[k]) +
                          (dst[k]));
            }

            src += 8;
            dst += 8;
        }

        /* Loop back to the tail handling with the reduced count */
    }
}

#elif 3 == 2 && !0

/*
 * Contiguous two-operand inner loop for npy_byte:
 *
 *     out[i] = in0[i] * in1[i] + out[i]      for 0 <= i < count
 *
 * dataptr[0] and dataptr[1] are read as the two input arrays and
 * dataptr[2] as the output array, all indexed with unit element stride.
 * 'nop' and 'strides' are not consulted.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    /*
     * NOTE(review): a negative count satisfies neither branch below and
     * would spin forever; callers presumably always pass count >= 0 --
     * confirm.
     */
    for (;;) {
        /*
         * Fewer than 8 elements remain.  This is tested before the
         * blocked loop so that small counts are handled immediately.
         * Elements are visited from the highest index down to 0.
         */
        if (count >= 0 && count < 8) {
            for (i = count - 1; i >= 0; --i) {
                out[i] = lhs[i] * rhs[i] + out[i];
            }
            return;
        }

        /* Consume whole blocks of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;

            for (i = 0; i < 8; ++i) {
                out[i] = lhs[i] * rhs[i] + out[i];
            }

            lhs += 8;
            rhs += 8;
            out += 8;
        }
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand inner loop for npy_byte in which operand 0 is one value:
 *
 *     out[i] = value0 * in1[i] + out[i]      for 0 <= i < count
 *
 * The scalar is read once from dataptr[0] before any output is written;
 * dataptr[1] (input) and dataptr[2] (output) are indexed with unit
 * element stride.  'nop' and 'strides' are not consulted.
 */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte scale = (*(npy_byte *)dataptr[0]);
    npy_byte *src = (npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than 8 elements remain.  This is tested before the
         * blocked loop so that small counts are handled immediately.
         * Elements are visited from the highest index down to 0.
         */
        if (count >= 0 && count < 8) {
            for (i = count - 1; i >= 0; --i) {
                out[i] = scale * src[i] + out[i];
            }
            return;
        }

        /* Consume whole blocks of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;

            for (i = 0; i < 8; ++i) {
                out[i] = scale * src[i] + out[i];
            }

            src += 8;
            out += 8;
        }

        /* No remainder (or the count was never positive): finished */
        if (count <= 0) {
            return;
        }
    }
}

/*
 * Two-operand inner loop for npy_byte in which operand 1 is one value:
 *
 *     out[i] = in0[i] * value1 + out[i]      for 0 <= i < count
 *
 * The scalar is read once from dataptr[1] before any output is written;
 * dataptr[0] (input) and dataptr[2] (output) are indexed with unit
 * element stride.  'nop' and 'strides' are not consulted.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte scale = (*(npy_byte *)dataptr[1]);
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /*
     * NOTE(review): a negative count satisfies neither branch below and
     * would spin forever; callers presumably always pass count >= 0 --
     * confirm.
     */
    for (;;) {
        /*
         * Fewer than 8 elements remain.  This is tested before the
         * blocked loop so that small counts are handled immediately.
         * Elements are visited from the highest index down to 0.
         */
        if (count >= 0 && count < 8) {
            for (i = count - 1; i >= 0; --i) {
                out[i] = src[i] * scale + out[i];
            }
            return;
        }

        /* Consume whole blocks of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;

            for (i = 0; i < 8; ++i) {
                out[i] = src[i] * scale + out[i];
            }

            src += 8;
            out += 8;
        }
    }
}

/*
 * Two-operand reduction for npy_byte into a single output element:
 *
 *     *out += sum over 0 <= i < count of in0[i] * in1[i]
 *
 * dataptr[0] and dataptr[1] are read with unit element stride.  The
 * running total is kept in an npy_byte local, so it is narrowed back to
 * npy_byte after every addition, and is added to the element at
 * dataptr[2] exactly once, just before returning.  'nop' and 'strides'
 * are not consulted.
 */
static void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte total = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * NOTE(review): a negative count satisfies neither branch below and
     * would spin forever; callers presumably always pass count >= 0 --
     * confirm.
     */
    for (;;) {
        /*
         * Fewer than 8 elements remain.  This is tested before the
         * blocked loop so that small counts are handled immediately.
         * Products are added from the highest index down to 0, then the
         * total is flushed to the output element.
         */
        if (count >= 0 && count < 8) {
            for (i = count - 1; i >= 0; --i) {
                total += lhs[i] * rhs[i];
            }
            *(npy_byte *)dataptr[2] += total;
            return;
        }

        /* Consume whole blocks of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;

            for (i = 0; i < 8; ++i) {
                total += lhs[i] * rhs[i];
            }

            lhs += 8;
            rhs += 8;
        }
    }
}

/*
 * Two-operand reduction for npy_byte where operand 0 is one value and
 * the output is a single element:
 *
 *     *out += value0 * (sum over 0 <= i < count of in1[i])
 *
 * The scalar is read once from dataptr[0]; dataptr[1] is read with unit
 * element stride.  The running sum is kept in an npy_byte local, so it
 * is narrowed back to npy_byte after every addition.  The scaled sum is
 * added to the element at dataptr[2] exactly once, just before
 * returning.  'nop' and 'strides' are not consulted.
 */
static void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte scale = (*(npy_byte *)dataptr[0]);
    npy_byte *src = (npy_byte *)dataptr[1];
    npy_byte total = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * NOTE(review): a negative count satisfies neither branch below and
     * would spin forever; callers presumably always pass count >= 0 --
     * confirm.
     */
    for (;;) {
        /*
         * Fewer than 8 elements remain.  This is tested before the
         * blocked loop so that small counts are handled immediately.
         * Elements are added from the highest index down to 0, then the
         * scaled total is flushed to the output element.
         */
        if (count >= 0 && count < 8) {
            for (i = count - 1; i >= 0; --i) {
                total += src[i];
            }
            *(npy_byte *)dataptr[2] += (scale * total);
            return;
        }

        /* Consume whole blocks of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;

            for (i = 0; i < 8; ++i) {
                total += src[i];
            }

            src += 8;
        }
    }
}

/*
 * Two-operand reduction for npy_byte where operand 1 is one value and
 * the output is a single element:
 *
 *     *out += (sum over 0 <= i < count of in0[i]) * value1
 *
 * The scalar is read once from dataptr[1]; dataptr[0] is read with unit
 * element stride.  The running sum is kept in an npy_byte local, so it
 * is narrowed back to npy_byte after every addition.  The scaled sum is
 * added to the element at dataptr[2] exactly once, just before
 * returning.  'nop' and 'strides' are not consulted.
 */
static void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte scale = (*(npy_byte *)dataptr[1]);
    npy_byte total = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * NOTE(review): a negative count satisfies neither branch below and
     * would spin forever; callers presumably always pass count >= 0 --
     * confirm.
     */
    for (;;) {
        /*
         * Fewer than 8 elements remain.  This is tested before the
         * blocked loop so that small counts are handled immediately.
         * Elements are added from the highest index down to 0, then the
         * scaled total is flushed to the output element.
         */
        if (count >= 0 && count < 8) {
            for (i = count - 1; i >= 0; --i) {
                total += src[i];
            }
            *(npy_byte *)dataptr[2] += (total * scale);
            return;
        }

        /* Consume whole blocks of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;

            for (i = 0; i < 8; ++i) {
                total += src[i];
            }

            src += 8;
        }
    }
}

#elif 3 == 3 && !0

/*
 * Contiguous three-operand inner loop for npy_byte:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 * dataptr[0..2] are read as the three input arrays and dataptr[3] as
 * the output array, all indexed with unit element stride.  'nop' and
 * 'strides' are not consulted.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *in0 = (npy_byte *)dataptr[0];
    npy_byte *in1 = (npy_byte *)dataptr[1];
    npy_byte *in2 = (npy_byte *)dataptr[2];
    npy_byte *out = (npy_byte *)dataptr[3];
    int i;

    /* Consume whole blocks of 8 elements, lowest index first */
    while (count >= 8) {
        count -= 8;

        for (i = 0; i < 8; ++i) {
            out[i] = in0[i] * in1[i] * in2[i] + out[i];
        }

        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Leftover elements, lowest index first.  The counter is tested and
     * then decremented before each element; the function returns as soon
     * as the tested value is zero.
     */
    for (i = 0; i < 8; ++i) {
        if (count-- == 0) {
            return;
        }
        out[i] = in0[i] * in1[i] * in2[i] + out[i];
    }
}

#else /* 3 > 3 || @complex */

/*
 * Contiguous sum-of-products kernel written for a run-time operand count.
 *
 * For each of 'count' elements this multiplies together the npy_byte
 * values found at dataptr[0] .. dataptr[nop-1], adds the value currently
 * stored at the output operand, and stores the result at dataptr[nop].
 * Afterwards every entry of dataptr (all inputs and the output) is
 * advanced in place by sizeof(npy_byte), so the caller's dataptr array
 * is modified by this call.
 *
 * The strides argument is ignored.  The complex-number form of this
 * loop that exists in the template is compiled out for npy_byte and is
 * therefore not reproduced here.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    int op;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        /* Running product, narrowed back to npy_byte after every factor */
        npy_byte product = *(npy_byte *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product = (npy_byte)(product * (*(npy_byte *)dataptr[op]));
        }

        /*
         * The factor loop leaves op == nop (for nop >= 1), so the value
         * read through dataptr[op] is the current output element.
         */
        *(npy_byte *)dataptr[nop] =
                    (npy_byte)(product + (*(npy_byte *)dataptr[op]));

        /* Step every operand, output included, to its next element */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_byte);
        }
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * Adds up 'count' consecutive npy_byte values starting at dataptr[0] and
 * accumulates that total into the single output element at dataptr[1].
 *
 * Groups of eight elements are consumed by the unrolled loop; the
 * remaining 0..7 elements are handled by the fall-through switch, whose
 * 'case 0' performs the output store and returns.  The switch is
 * evaluated first so that a short run (count < 8) never reaches the
 * unrolled loop at all.
 *
 * The running total is narrowed back to npy_byte after every addition.
 * Neither nop nor strides is used.  The SSE1/SSE2 code paths of the
 * template are compiled out for npy_byte and are not reproduced here.
 */
static void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_byte total = 0;
    npy_byte *src = (npy_byte *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Tail handling: enter at the case matching the number of
         * elements left and fall through down to the final store.
         */
        switch (count) {
            case 7:
                total = (npy_byte)(total + src[6]);
                /* fall through */
            case 6:
                total = (npy_byte)(total + src[5]);
                /* fall through */
            case 5:
                total = (npy_byte)(total + src[4]);
                /* fall through */
            case 4:
                total = (npy_byte)(total + src[3]);
                /* fall through */
            case 3:
                total = (npy_byte)(total + src[2]);
                /* fall through */
            case 2:
                total = (npy_byte)(total + src[1]);
                /* fall through */
            case 1:
                total = (npy_byte)(total + src[0]);
                /* fall through */
            case 0:
                /* Fold the accumulated total into the output element */
                *((npy_byte *)dataptr[1]) =
                        (npy_byte)(total + (*((npy_byte *)dataptr[1])));
                return;
        }

        /* At least eight elements remain: consume them eight at a time */
        while (count >= 8) {
            count -= 8;

            total = (npy_byte)(total + src[0]);
            total = (npy_byte)(total + src[1]);
            total = (npy_byte)(total + src[2]);
            total = (npy_byte)(total + src[3]);
            total = (npy_byte)(total + src[4]);
            total = (npy_byte)(total + src[5]);
            total = (npy_byte)(total + src[6]);
            total = (npy_byte)(total + src[7]);

            src += 8;
        }
        /* Loop back to the switch to finish the last 0..7 elements */
    }
}

#endif /* 3 == 1 */

/*
 * Three-operand sum of products reduced into a single output element.
 *
 * Walks the three inputs dataptr[0], dataptr[1] and dataptr[2] for
 * 'count' steps, moving each by its own byte stride (strides[0..2]).
 * At every step the three npy_byte values are multiplied together and
 * added to a running total, which is narrowed back to npy_byte each
 * time.  When the walk is finished the total is added to the element
 * at dataptr[3]; that output pointer is never advanced.
 *
 * The entries of dataptr are not modified and nop is not used.  The
 * other operand-count variants and the complex-number variant of the
 * template are compiled out in this instantiation and are not
 * reproduced here.
 */
static void
byte_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_byte total = 0;

    char *in0 = dataptr[0];
    char *in1 = dataptr[1];
    char *in2 = dataptr[2];
    npy_intp step0 = strides[0];
    npy_intp step1 = strides[1];
    npy_intp step2 = strides[2];
    npy_byte *out = (npy_byte *)dataptr[3];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total = (npy_byte)(total + (*(npy_byte *)in0) *
                                   (*(npy_byte *)in1) *
                                   (*(npy_byte *)in2));
        in0 += step0;
        in1 += step1;
        in2 += step2;
    }

    /* Fold the reduction result into the existing output value */
    *out = (npy_byte)(total + (*out));
}


#line 113
/*
 * General strided sum-of-products kernel for a run-time operand count.
 *
 * For each of 'count' steps this multiplies together the npy_byte values
 * at dataptr[0] .. dataptr[nop-1], adds the value currently stored at
 * the output operand, and stores the result at dataptr[nop].  Every
 * entry dataptr[k] (k = 0 .. nop, output included) is then advanced in
 * place by strides[k] bytes, so the caller's dataptr array is modified
 * by this call.
 *
 * The fixed-operand-count specialisations and the complex-number form
 * present in the template are compiled out in this instantiation and
 * are not reproduced here.
 */
static void
byte_sum_of_products_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    int op;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_any (%d)\n", (int)count);

    for (; count != 0; --count) {
        /* Running product, narrowed back to npy_byte after every factor */
        npy_byte product = *(npy_byte *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product = (npy_byte)(product * (*(npy_byte *)dataptr[op]));
        }

        /*
         * The factor loop leaves op == nop (for nop >= 1), so the value
         * read through dataptr[op] is the current output element.
         */
        *(npy_byte *)dataptr[nop] =
                    (npy_byte)(product + (*(npy_byte *)dataptr[op]));

        /* Step every operand, output included, by its own stride */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += strides[op];
        }
    }
}

#if 1000 == 1

/*
 * Single-operand contiguous kernel: adds each of 'count' consecutive
 * npy_byte values starting at dataptr[0] into the corresponding element
 * of the contiguous output starting at dataptr[1].
 *
 * Groups of eight elements are handled by the unrolled loop (elements
 * visited in ascending order); the remaining 0..7 elements are handled
 * by the fall-through switch (visited in descending order).  The switch
 * is evaluated first so that a short run (count < 8) never reaches the
 * unrolled loop.
 *
 * The entries of dataptr are not modified; nop and strides are unused.
 * The complex-number form present in the template is compiled out for
 * npy_byte and is not reproduced here.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *dst = (npy_byte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Tail handling: enter at the case matching the number of
         * elements left and fall through down to the return.
         */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fall through */
            case 6:
                dst[5] += src[5];
                /* fall through */
            case 5:
                dst[4] += src[4];
                /* fall through */
            case 4:
                dst[3] += src[3];
                /* fall through */
            case 3:
                dst[2] += src[2];
                /* fall through */
            case 2:
                dst[1] += src[1];
                /* fall through */
            case 1:
                dst[0] += src[0];
                /* fall through */
            case 0:
                return;
        }

        /* At least eight elements remain: process them eight at a time */
        while (count >= 8) {
            count -= 8;

            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
        }
        /* Loop back to the switch to finish the last 0..7 elements */
    }
}

#elif 1000 == 2 && !0

/*
 * Two-operand contiguous kernel: for each of 'count' consecutive
 * elements, multiplies the npy_byte values from dataptr[0] and
 * dataptr[1] and adds the product into the corresponding element of the
 * contiguous output starting at dataptr[2].
 *
 * Groups of eight elements are handled by the unrolled loop (elements
 * visited in ascending order); the remaining 0..7 elements are handled
 * by the fall-through switch (visited in descending order).  The switch
 * is evaluated first so that a short run (count < 8) never reaches the
 * unrolled loop.
 *
 * The entries of dataptr are not modified; nop and strides are unused.
 * The SSE1 code paths of the template are compiled out for npy_byte and
 * are not reproduced here.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte *dst = (npy_byte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Tail handling: enter at the case matching the number of
         * elements left and fall through down to the return.
         */
        switch (count) {
            case 7:
                dst[6] += lhs[6] * rhs[6];
                /* fall through */
            case 6:
                dst[5] += lhs[5] * rhs[5];
                /* fall through */
            case 5:
                dst[4] += lhs[4] * rhs[4];
                /* fall through */
            case 4:
                dst[3] += lhs[3] * rhs[3];
                /* fall through */
            case 3:
                dst[2] += lhs[2] * rhs[2];
                /* fall through */
            case 2:
                dst[1] += lhs[1] * rhs[1];
                /* fall through */
            case 1:
                dst[0] += lhs[0] * rhs[0];
                /* fall through */
            case 0:
                return;
        }

        /* At least eight elements remain: process them eight at a time */
        while (count >= 8) {
            count -= 8;

            dst[0] += lhs[0] * rhs[0];
            dst[1] += lhs[1] * rhs[1];
            dst[2] += lhs[2] * rhs[2];
            dst[3] += lhs[3] * rhs[3];
            dst[4] += lhs[4] * rhs[4];
            dst[5] += lhs[5] * rhs[5];
            dst[6] += lhs[6] * rhs[6];
            dst[7] += lhs[7] * rhs[7];

            lhs += 8;
            rhs += 8;
            dst += 8;
        }
        /* Loop back to the switch to finish the last 0..7 elements */
    }
}

/* Some extra specializations for the two operand case */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte value0 = (*(npy_byte *)dataptr[0]);
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte *data_out = (npy_byte *)dataptr[2];

#if EINSUM_USE_SSE1 && 0
    __m128 a, b, value0_sse;
#elif EINSUM_USE_SSE2 && 0
    __m128d a, b, value0_sse;
#endif

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 383
        case 6+1:
            data_out[6] = (value0 *
                                 (data1[6]) +
                                 (data_out[6]));

#line 383
        case 5+1:
            data_out[5] = (value0 *
                                 (data1[5]) +
                                 (data_out[5]));

#line 383
        case 4+1:
            data_out[4] = (value0 *
                                 (data1[4]) +
                                 (data_out[4]));

#line 383
        case 3+1:
            data_out[3] = (value0 *
                                 (data1[3]) +
                                 (data_out[3]));

#line 383
        case 2+1:
            data_out[2] = (value0 *
                                 (data1[2]) +
                                 (data_out[2]));

#line 383
        case 1+1:
            data_out[1] = (value0 *
                                 (data1[1]) +
                                 (data_out[1]));

#line 383
        case 0+1:
            data_out[0] = (value0 *
                                 (data1[0]) +
                                 (data_out[0]));

        case 0:
            return;
    }

#if EINSUM_USE_SSE1 && 0
    value0_sse = _mm_set_ps1(value0);

    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

#line 404
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+0));
            b = _mm_add_ps(a, _mm_load_ps(data_out+0));
            _mm_store_ps(data_out+0, b);

#line 404
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+4));
            b = _mm_add_ps(a, _mm_load_ps(data_out+4));
            _mm_store_ps(data_out+4, b);

            data1 += 8;
            data_out += 8;
        }

        /* Finish off the loop */
        if (count > 0) {
            goto finish_after_unrolled_loop;
        }
        else {
            return;
        }
    }
#elif EINSUM_USE_SSE2 && 0
    value0_sse = _mm_set1_pd(value0);

    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+0));
            b = _mm_add_pd(a, _mm_load_pd(data_out+0));
            _mm_store_pd(data_out+0, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+2));
            b = _mm_add_pd(a, _mm_load_pd(data_out+2));
            _mm_store_pd(data_out+2, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+4));
            b = _mm_add_pd(a, _mm_load_pd(data_out+4));
            _mm_store_pd(data_out+4, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+6));
            b = _mm_add_pd(a, _mm_load_pd(data_out+6));
            _mm_store_pd(data_out+6, b);

            data1 += 8;
            data_out += 8;
        }

        /* Finish off the loop */
        if (count > 0) {
            goto finish_after_unrolled_loop;
        }
        else {
            return;
        }
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
#line 458
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+0));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+0));
        _mm_storeu_ps(data_out+0, b);

#line 458
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+4));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+4));
        _mm_storeu_ps(data_out+4, b);

#elif EINSUM_USE_SSE2 && 0
#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+0));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+0));
        _mm_storeu_pd(data_out+0, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+2));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+2));
        _mm_storeu_pd(data_out+2, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+4));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+4));
        _mm_storeu_pd(data_out+4, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+6));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+6));
        _mm_storeu_pd(data_out+6, b);

#else
#line 474
        data_out[0] = (value0 *
                             (data1[0]) +
                             (data_out[0]));

#line 474
        data_out[1] = (value0 *
                             (data1[1]) +
                             (data_out[1]));

#line 474
        data_out[2] = (value0 *
                             (data1[2]) +
                             (data_out[2]));

#line 474
        data_out[3] = (value0 *
                             (data1[3]) +
                             (data_out[3]));

#line 474
        data_out[4] = (value0 *
                             (data1[4]) +
                             (data_out[4]));

#line 474
        data_out[5] = (value0 *
                             (data1[5]) +
                             (data_out[5]));

#line 474
        data_out[6] = (value0 *
                             (data1[6]) +
                             (data_out[6]));

#line 474
        data_out[7] = (value0 *
                             (data1[7]) +
                             (data_out[7]));

#endif
        data1 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    if (count > 0) {
        goto finish_after_unrolled_loop;
    }
}

/*
 * npy_byte kernel for two operands which performs, for each of `count`
 * elements,
 *
 *     out[i] = in0[i] * in1 + out[i]
 *
 * Operand 0 and the output are indexed as contiguous npy_byte arrays, while
 * operand 1 is read a single time before the loops start.
 *
 * nop      - not used by this kernel.
 * dataptr  - dataptr[0] is the first input, dataptr[1] points at the single
 *            second input value, dataptr[2] is the output.
 * strides  - not used by this kernel.
 * count    - number of elements to process; a value <= 0 leaves the output
 *            untouched and returns immediately.
 *
 * The arithmetic happens after the usual integer promotions and each result
 * is converted back to npy_byte when it is stored.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte value1 = (*(npy_byte *)dataptr[1]);
    npy_byte *data_out = (npy_byte *)dataptr[2];
    npy_intp i;

    (void)nop;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* Whole groups of eight elements, lowest index first */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i]) * value1 + (data_out[i]));
        }
        data0 += 8;
        data_out += 8;
        count -= 8;
    }

    /*
     * Fewer than eight elements are left.  They are walked from the highest
     * index down to index 0; the direction only matters if the input and
     * output buffers overlap.  The loop condition also makes a negative
     * count a no-op instead of something that never terminates.
     */
    for (i = count - 1; i >= 0; --i) {
        data_out[i] = ((data0[i]) * value1 + (data_out[i]));
    }
}

/*
 * npy_byte kernel for two operands which reduces into a single output value:
 *
 *     *out += sum over i of (in0[i] * in1[i])
 *
 * Both inputs are indexed as contiguous npy_byte arrays.  The running sum is
 * kept in an npy_byte, so every partial result is converted back to npy_byte
 * as it is accumulated.
 *
 * nop      - not used by this kernel.
 * dataptr  - dataptr[0] and dataptr[1] are the inputs, dataptr[2] points at
 *            the single output value.
 * strides  - not used by this kernel.
 * count    - number of elements to reduce; for a value <= 0 no input is read
 *            and the output only receives `+= 0`.
 */
static void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte accum = 0;
    npy_intp i;

    (void)nop;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Whole groups of eight elements, lowest index first */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            accum += (data0[i]) * (data1[i]);
        }
        data0 += 8;
        data1 += 8;
        count -= 8;
    }

    /*
     * Fewer than eight elements are left; add them from the highest index
     * down.  A negative count simply skips this loop rather than looping
     * forever.
     */
    for (i = count - 1; i >= 0; --i) {
        accum += (data0[i]) * (data1[i]);
    }

    /* The output is touched exactly once, after all inputs were read */
    *(npy_byte *)dataptr[2] += (accum);
}

/*
 * npy_byte kernel for two operands which reduces into a single output value:
 *
 *     *out += in0 * (sum over i of in1[i])
 *
 * Operand 0 is read a single time before the loops start; operand 1 is
 * indexed as a contiguous npy_byte array.  The running sum is kept in an
 * npy_byte and multiplied by the operand-0 value only once, at the end.
 *
 * nop      - not used by this kernel.
 * dataptr  - dataptr[0] points at the single first input value, dataptr[1]
 *            is the second input, dataptr[2] points at the single output.
 * strides  - not used by this kernel.
 * count    - number of elements to reduce; for a value <= 0 operand 1 is not
 *            read and the output only receives `+= 0`.
 */
static void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte value0 = (*(npy_byte *)dataptr[0]);
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte accum = 0;
    npy_intp i;

    (void)nop;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Whole groups of eight elements, lowest index first */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            accum += (data1[i]);
        }
        data1 += 8;
        count -= 8;
    }

    /*
     * Fewer than eight elements are left; add them from the highest index
     * down.  A negative count simply skips this loop rather than looping
     * forever.
     */
    for (i = count - 1; i >= 0; --i) {
        accum += (data1[i]);
    }

    /* Scale the sum once and fold it into the output */
    *(npy_byte *)dataptr[2] += (value0 * accum);
}

/*
 * npy_byte kernel for two operands which reduces into a single output value:
 *
 *     *out += (sum over i of in0[i]) * in1
 *
 * Operand 0 is indexed as a contiguous npy_byte array; operand 1 is read a
 * single time before the loops start.  The running sum is kept in an
 * npy_byte and multiplied by the operand-1 value only once, at the end.
 *
 * nop      - not used by this kernel.
 * dataptr  - dataptr[0] is the first input, dataptr[1] points at the single
 *            second input value, dataptr[2] points at the single output.
 * strides  - not used by this kernel.
 * count    - number of elements to reduce; for a value <= 0 operand 0 is not
 *            read and the output only receives `+= 0`.
 */
static void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte value1 = (*(npy_byte *)dataptr[1]);
    npy_byte accum = 0;
    npy_intp i;

    (void)nop;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    /* Whole groups of eight elements, lowest index first */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            accum += (data0[i]);
        }
        data0 += 8;
        count -= 8;
    }

    /*
     * Fewer than eight elements are left; add them from the highest index
     * down.  A negative count simply skips this loop rather than looping
     * forever.
     */
    for (i = count - 1; i >= 0; --i) {
        accum += (data0[i]);
    }

    /* Scale the sum once and fold it into the output */
    *(npy_byte *)dataptr[2] += (accum * value1);
}

#elif 1000 == 3 && !0

/*
 * npy_byte kernel for three operands which performs, for each of `count`
 * elements in ascending index order,
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 * All three inputs and the output are indexed as contiguous npy_byte arrays.
 * The arithmetic happens after the usual integer promotions and each result
 * is converted back to npy_byte when it is stored.
 *
 * nop      - not used by this kernel.
 * dataptr  - dataptr[0..2] are the inputs, dataptr[3] is the output.
 * strides  - not used by this kernel.
 * count    - number of elements to process; a value <= 0 leaves the output
 *            untouched.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte *data2 = (npy_byte *)dataptr[2];
    npy_byte *data_out = (npy_byte *)dataptr[3];
    npy_intp i;

    (void)nop;

    /*
     * A single bounded loop: it can neither step past `count` elements nor
     * touch memory when count is negative.
     */
    for (i = 0; i < count; ++i) {
        data_out[i] = ((data0[i]) *
                       (data1[i]) *
                       (data2[i]) +
                       (data_out[i]));
    }
}

#else /* 1000 > 3 || @complex */

/*
 * npy_byte kernel for an arbitrary number of operands.  For each of `count`
 * elements it multiplies the current value of every input together and adds
 * the product to the current output value:
 *
 *     *dataptr[nop] = (*dataptr[0] * ... * *dataptr[nop-1]) + *dataptr[nop]
 *
 * nop      - number of input operands; the output pointer is dataptr[nop].
 * dataptr  - nop input pointers followed by the output pointer.  Every one
 *            of these nop+1 pointers is advanced by sizeof(npy_byte) after
 *            each element, so the array is modified by this call.
 * strides  - not used by this kernel.
 * count    - number of elements to process; a value <= 0 does nothing.
 *
 * The running product is kept in an npy_byte, so it is converted back to
 * npy_byte after every multiplication.
 */
static void
byte_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    for (; count > 0; --count) {
        npy_byte prod = (*(npy_byte *)dataptr[0]);
        int iop;

        for (iop = 1; iop < nop; ++iop) {
            prod *= (*(npy_byte *)dataptr[iop]);
        }

        /*
         * Address the output through `nop` explicitly instead of the loop
         * index: the index only ends up equal to nop when nop >= 1.
         */
        *(npy_byte *)dataptr[nop] = (prod +
                                     (*(npy_byte *)dataptr[nop]));

        /* Step all inputs and the output on to the next element */
        for (iop = 0; iop <= nop; ++iop) {
            dataptr[iop] += sizeof(npy_byte);
        }
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

static void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
#if 0
    npy_byte accum_re = 0, accum_im = 0;
    npy_byte *data0 = (npy_byte *)dataptr[0];
#else
    npy_byte accum = 0;
    npy_byte *data0 = (npy_byte *)dataptr[0];
#endif

#if EINSUM_USE_SSE1 && 0
    __m128 a, accum_sse = _mm_setzero_ps();
#elif EINSUM_USE_SSE2 && 0
    __m128d a, accum_sse = _mm_setzero_pd();
#endif


    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 1058
        case 6+1:
#if !0
            accum += (data0[6]);
#else /* complex */
            accum_re += data0[2*6+0];
            accum_im += data0[2*6+1];
#endif

#line 1058
        case 5+1:
#if !0
            accum += (data0[5]);
#else /* complex */
            accum_re += data0[2*5+0];
            accum_im += data0[2*5+1];
#endif

#line 1058
        case 4+1:
#if !0
            accum += (data0[4]);
#else /* complex */
            accum_re += data0[2*4+0];
            accum_im += data0[2*4+1];
#endif

#line 1058
        case 3+1:
#if !0
            accum += (data0[3]);
#else /* complex */
            accum_re += data0[2*3+0];
            accum_im += data0[2*3+1];
#endif

#line 1058
        case 2+1:
#if !0
            accum += (data0[2]);
#else /* complex */
            accum_re += data0[2*2+0];
            accum_im += data0[2*2+1];
#endif

#line 1058
        case 1+1:
#if !0
            accum += (data0[1]);
#else /* complex */
            accum_re += data0[2*1+0];
            accum_im += data0[2*1+1];
#endif

#line 1058
        case 0+1:
#if !0
            accum += (data0[0]);
#else /* complex */
            accum_re += data0[2*0+0];
            accum_im += data0[2*0+1];
#endif

        case 0:
#if 0
            ((npy_byte *)dataptr[1])[0] += accum_re;
            ((npy_byte *)dataptr[1])[1] += accum_im;
#else
            *((npy_byte *)dataptr[1]) = (accum +
                                    (*((npy_byte *)dataptr[1])));
#endif
            return;
    }

#if EINSUM_USE_SSE1 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1089
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+0));

#line 1089
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+4));

            data0 += 8;
        }

        /* Add the four SSE values and put in accum */
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
        accum_sse = _mm_add_ps(a, accum_sse);
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
        accum_sse = _mm_add_ps(a, accum_sse);
        _mm_store_ss(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#elif EINSUM_USE_SSE2 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+0));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+2));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+4));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+6));

            data0 += 8;
        }

        /* Add the two SSE2 values and put in accum */
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
        accum_sse = _mm_add_pd(a, accum_sse);
        _mm_store_sd(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1149
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+0));

#line 1149
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+4));

#elif EINSUM_USE_SSE2 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+0));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+2));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+4));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+6));

#else
#line 1171
#  if !0
        accum += (data0[0]);
#  else /* complex */
        accum_re += data0[2*0+0];
        accum_im += data0[2*0+1];
#  endif

#line 1171
#  if !0
        accum += (data0[1]);
#  else /* complex */
        accum_re += data0[2*1+0];
        accum_im += data0[2*1+1];
#  endif

#line 1171
#  if !0
        accum += (data0[2]);
#  else /* complex */
        accum_re += data0[2*2+0];
        accum_im += data0[2*2+1];
#  endif

#line 1171
#  if !0
        accum += (data0[3]);
#  else /* complex */
        accum_re += data0[2*3+0];
        accum_im += data0[2*3+1];
#  endif

#line 1171
#  if !0
        accum += (data0[4]);
#  else /* complex */
        accum_re += data0[2*4+0];
        accum_im += data0[2*4+1];
#  endif

#line 1171
#  if !0
        accum += (data0[5]);
#  else /* complex */
        accum_re += data0[2*5+0];
        accum_im += data0[2*5+1];
#  endif

#line 1171
#  if !0
        accum += (data0[6]);
#  else /* complex */
        accum_re += data0[2*6+0];
        accum_im += data0[2*6+1];
#  endif

#line 1171
#  if !0
        accum += (data0[7]);
#  else /* complex */
        accum_re += data0[2*7+0];
        accum_im += data0[2*7+1];
#  endif

#endif

#if !0
        data0 += 8;
#else
        data0 += 8*2;
#endif
    }

#if EINSUM_USE_SSE1 && 0
    /* Add the four SSE values and put in accum */
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
    accum_sse = _mm_add_ps(a, accum_sse);
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
    accum_sse = _mm_add_ps(a, accum_sse);
    _mm_store_ss(&accum, accum_sse);
#elif EINSUM_USE_SSE2 && 0
    /* Add the two SSE2 values and put in accum */
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
    accum_sse = _mm_add_pd(a, accum_sse);
    _mm_store_sd(&accum, accum_sse);
#endif

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#endif /* 1000 == 1 */

/*
 * Sum-of-products kernel for npy_byte operands with a run-time operand
 * count, reducing into a single output element.
 *
 *   nop      number of input operands; dataptr[nop] is the output element
 *   dataptr  operand pointers; entries 0..nop-1 are advanced in place by
 *            their strides once per iteration
 *   strides  per-operand steps, applied as byte offsets
 *   count    number of iterations to perform
 *
 * On every iteration the current elements of all inputs are multiplied
 * together and the product is added to a running npy_byte total; after
 * the last iteration that total is added to the output element.  Both the
 * product and the total are stored as npy_byte after each step.
 */
static void
byte_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_byte total = 0;
    npy_byte *out;
    int iop;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        /* Operand 0 seeds the product; the remaining inputs scale it. */
        npy_byte product = *(npy_byte *)dataptr[0];

        for (iop = 1; iop < nop; ++iop) {
            product *= *(npy_byte *)dataptr[iop];
        }
        total += product;

        /* Step every input to its next element. */
        for (iop = 0; iop < nop; ++iop) {
            dataptr[iop] += strides[iop];
        }
    }

    /* The output pointer is never advanced: fold the total into it once. */
    out = (npy_byte *)dataptr[nop];
    *out += total;
}




#line 108

#line 113
/*
 * Single-operand kernel for npy_short with arbitrary strides.
 *
 *   nop      not read by this specialization
 *   dataptr  dataptr[0] is the input, dataptr[1] the output
 *   strides  strides[0] / strides[1] are the byte steps of input / output
 *   count    number of elements to process
 *
 * Each input element is added to the output element at the same position.
 * Only local copies of the pointers are advanced; dataptr itself is left
 * untouched.
 */
static void
short_sum_of_products_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    const npy_intp in_step = strides[0];
    const npy_intp out_step = strides[1];
    char *in = dataptr[0];
    char *out = dataptr[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_short *dst = (npy_short *)out;

        *dst += *(npy_short *)in;

        in += in_step;
        out += out_step;
    }
}

#if 1 == 1

/*
 * Single-operand kernel for npy_short where input and output are both
 * addressed as contiguous npy_short arrays.
 *
 *   nop      not read by this specialization
 *   dataptr  dataptr[0] is the input array, dataptr[1] the output array
 *   strides  unused
 *   count    number of elements; a negative value never reaches the
 *            return statement below, so it must be non-negative
 *
 * Adds input[i] to output[i] for every i in [0, count).  Whole groups of
 * eight are processed in ascending index order; the final group of fewer
 * than eight elements is processed in descending index order.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short *dst = (npy_short *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /* The remainder is handled first so that short runs exit quickly. */
finish_after_unrolled_loop:
    if (count >= 0 && count < 8) {
        /* Highest remaining index first, finishing at element 0. */
        for (k = count; k-- > 0; ) {
            dst[k] += src[k];
        }
        return;
    }

    /* Consume whole groups of eight elements. */
    while (count >= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] += src[k];
        }
        src += 8;
        dst += 8;
        count -= 8;
    }

    /* Fewer than eight elements are left: hand them to the code above. */
    goto finish_after_unrolled_loop;
}

#elif 1 == 2 && !0

/*
 * Two-operand kernel for npy_short where both inputs and the output are
 * addressed as contiguous npy_short arrays.
 *
 *   nop      not read by this specialization
 *   dataptr  dataptr[0] and dataptr[1] are the inputs, dataptr[2] the
 *            output
 *   strides  unused
 *   count    number of elements; a negative value never reaches the
 *            return statement below, so it must be non-negative
 *
 * Adds in0[i] * in1[i] to out[i] for every i in [0, count).  Whole groups
 * of eight are processed in ascending index order; the final group of
 * fewer than eight elements is processed in descending index order.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *dst = (npy_short *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    /* The remainder is handled first so that short runs exit quickly. */
finish_after_unrolled_loop:
    if (count >= 0 && count < 8) {
        /* Highest remaining index first, finishing at element 0. */
        for (k = count; k-- > 0; ) {
            dst[k] += lhs[k] * rhs[k];
        }
        return;
    }

    /* Consume whole groups of eight elements. */
    while (count >= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] += lhs[k] * rhs[k];
        }
        lhs += 8;
        rhs += 8;
        dst += 8;
        count -= 8;
    }

    /* Fewer than eight elements are left: hand them to the code above. */
    goto finish_after_unrolled_loop;
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel for npy_short where the first input is a single
 * value and the second input and the output are addressed as contiguous
 * npy_short arrays.
 *
 *   nop      not read by this specialization
 *   dataptr  dataptr[0] points at the scalar, dataptr[1] at the input
 *            array, dataptr[2] at the output array
 *   strides  unused
 *   count    number of elements; nothing is done when it is <= 0
 *
 * The scalar is read once on entry.  scalar * in[i] is then added to
 * out[i] for every i in [0, count): whole groups of eight in ascending
 * index order, followed by the remaining (fewer than eight) elements in
 * descending index order.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short scale = *(npy_short *)dataptr[0];
    npy_short *src = (npy_short *)dataptr[1];
    npy_short *dst = (npy_short *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Whole groups of eight; skipped entirely for short runs. */
    while (count >= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] += scale * src[k];
        }
        src += 8;
        dst += 8;
        count -= 8;
    }

    /* Remainder, highest index first; a no-op when nothing is left. */
    for (k = count; k-- > 0; ) {
        dst[k] += scale * src[k];
    }
}

/*
 * Two-operand kernel for npy_short where the second input is a single
 * value and the first input and the output are addressed as contiguous
 * npy_short arrays.
 *
 *   nop      not read by this specialization
 *   dataptr  dataptr[0] points at the input array, dataptr[1] at the
 *            scalar, dataptr[2] at the output array
 *   strides  unused
 *   count    number of elements; a negative value never reaches the
 *            return statement below, so it must be non-negative
 *
 * The scalar is read once on entry.  in[i] * scalar is then added to
 * out[i] for every i in [0, count): whole groups of eight in ascending
 * index order, followed by the remaining (fewer than eight) elements in
 * descending index order.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    const npy_short scale = *(npy_short *)dataptr[1];
    npy_short *dst = (npy_short *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* The remainder is handled first so that short runs exit quickly. */
finish_after_unrolled_loop:
    if (count >= 0 && count < 8) {
        /* Highest remaining index first, finishing at element 0. */
        for (k = count; k-- > 0; ) {
            dst[k] += src[k] * scale;
        }
        return;
    }

    /* Consume whole groups of eight elements. */
    while (count >= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] += src[k] * scale;
        }
        src += 8;
        dst += 8;
        count -= 8;
    }

    /* Fewer than eight elements are left: hand them to the code above. */
    goto finish_after_unrolled_loop;
}

/*
 * Two-operand sum-of-products kernel for npy_short in which both inputs
 * are read as consecutive elements and all products are reduced into a
 * single output element.
 *
 *   dataptr[0], dataptr[1]  input arrays, `count` npy_short elements each
 *   dataptr[2]              npy_short that receives "+= sum(a[i] * b[i])"
 *   nop, strides            not consulted by this kernel
 *
 * The running total has type npy_short, so every partial sum is stored
 * back in that type before the next product is added.
 */
static void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The tail is tested before the bulk loop so that short inputs
         * finish without entering it.  Each case handles one leftover
         * element and deliberately drops into the next; case 0 publishes
         * the result and is the only way out of the function.
         *
         * NOTE: no case matches a negative count and the bulk loop below
         * does not run for one, so the function returns only for
         * count >= 0.
         */
        switch (count) {
            case 7:
                total = (npy_short)(total + lhs[6] * rhs[6]);
                /* fall through */
            case 6:
                total = (npy_short)(total + lhs[5] * rhs[5]);
                /* fall through */
            case 5:
                total = (npy_short)(total + lhs[4] * rhs[4]);
                /* fall through */
            case 4:
                total = (npy_short)(total + lhs[3] * rhs[3]);
                /* fall through */
            case 3:
                total = (npy_short)(total + lhs[2] * rhs[2]);
                /* fall through */
            case 2:
                total = (npy_short)(total + lhs[1] * rhs[1]);
                /* fall through */
            case 1:
                total = (npy_short)(total + lhs[0] * rhs[0]);
                /* fall through */
            case 0:
                *(npy_short *)dataptr[2] += total;
                return;
        }

        /* Eight or more elements remain: consume them in groups of eight. */
        while (count >= 8) {
            count -= 8;

            total = (npy_short)(total + lhs[0] * rhs[0]);
            total = (npy_short)(total + lhs[1] * rhs[1]);
            total = (npy_short)(total + lhs[2] * rhs[2]);
            total = (npy_short)(total + lhs[3] * rhs[3]);
            total = (npy_short)(total + lhs[4] * rhs[4]);
            total = (npy_short)(total + lhs[5] * rhs[5]);
            total = (npy_short)(total + lhs[6] * rhs[6]);
            total = (npy_short)(total + lhs[7] * rhs[7]);

            lhs += 8;
            rhs += 8;
        }
        /* Go round again so the switch can finish the last 0..7 elements. */
    }
}

/*
 * Two-operand sum-of-products kernel for npy_short in which the first
 * operand is a single value, the second is read as consecutive elements,
 * and the result is reduced into a single output element.
 *
 *   dataptr[0]    npy_short scalar, read once on entry
 *   dataptr[1]    input array of `count` npy_short elements
 *   dataptr[2]    npy_short that receives "+= scalar * sum(b[i])"
 *   nop, strides  not consulted by this kernel
 *
 * The elements are summed into an npy_short first; the scalar is applied
 * once to the finished sum rather than to each element.
 */
static void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short scalar = *(npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Tail handling comes first so short inputs skip the bulk loop.
         * Every case adds one leftover element and drops into the next;
         * case 0 stores the scaled sum and is the only exit.
         *
         * NOTE: a negative count matches no case and skips the bulk loop,
         * so the function returns only for count >= 0.
         */
        switch (count) {
            case 7:
                total = (npy_short)(total + rhs[6]);
                /* fall through */
            case 6:
                total = (npy_short)(total + rhs[5]);
                /* fall through */
            case 5:
                total = (npy_short)(total + rhs[4]);
                /* fall through */
            case 4:
                total = (npy_short)(total + rhs[3]);
                /* fall through */
            case 3:
                total = (npy_short)(total + rhs[2]);
                /* fall through */
            case 2:
                total = (npy_short)(total + rhs[1]);
                /* fall through */
            case 1:
                total = (npy_short)(total + rhs[0]);
                /* fall through */
            case 0:
                *(npy_short *)dataptr[2] += scalar * total;
                return;
        }

        /* Eight or more elements remain: consume them in groups of eight. */
        while (count >= 8) {
            count -= 8;

            total = (npy_short)(total + rhs[0]);
            total = (npy_short)(total + rhs[1]);
            total = (npy_short)(total + rhs[2]);
            total = (npy_short)(total + rhs[3]);
            total = (npy_short)(total + rhs[4]);
            total = (npy_short)(total + rhs[5]);
            total = (npy_short)(total + rhs[6]);
            total = (npy_short)(total + rhs[7]);

            rhs += 8;
        }
        /* Go round again so the switch can finish the last 0..7 elements. */
    }
}

/*
 * Two-operand sum-of-products kernel for npy_short in which the first
 * operand is read as consecutive elements, the second is a single value,
 * and the result is reduced into a single output element.
 *
 *   dataptr[0]    input array of `count` npy_short elements
 *   dataptr[1]    npy_short scalar, read once on entry
 *   dataptr[2]    npy_short that receives "+= sum(a[i]) * scalar"
 *   nop, strides  not consulted by this kernel
 *
 * The elements are summed into an npy_short first; the scalar is applied
 * once to the finished sum rather than to each element.
 */
static void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short scalar = *(npy_short *)dataptr[1];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Tail handling comes first so short inputs skip the bulk loop.
         * Every case adds one leftover element and drops into the next;
         * case 0 stores the scaled sum and is the only exit.
         *
         * NOTE: a negative count matches no case and skips the bulk loop,
         * so the function returns only for count >= 0.
         */
        switch (count) {
            case 7:
                total = (npy_short)(total + lhs[6]);
                /* fall through */
            case 6:
                total = (npy_short)(total + lhs[5]);
                /* fall through */
            case 5:
                total = (npy_short)(total + lhs[4]);
                /* fall through */
            case 4:
                total = (npy_short)(total + lhs[3]);
                /* fall through */
            case 3:
                total = (npy_short)(total + lhs[2]);
                /* fall through */
            case 2:
                total = (npy_short)(total + lhs[1]);
                /* fall through */
            case 1:
                total = (npy_short)(total + lhs[0]);
                /* fall through */
            case 0:
                *(npy_short *)dataptr[2] += total * scalar;
                return;
        }

        /* Eight or more elements remain: consume them in groups of eight. */
        while (count >= 8) {
            count -= 8;

            total = (npy_short)(total + lhs[0]);
            total = (npy_short)(total + lhs[1]);
            total = (npy_short)(total + lhs[2]);
            total = (npy_short)(total + lhs[3]);
            total = (npy_short)(total + lhs[4]);
            total = (npy_short)(total + lhs[5]);
            total = (npy_short)(total + lhs[6]);
            total = (npy_short)(total + lhs[7]);

            lhs += 8;
        }
        /* Go round again so the switch can finish the last 0..7 elements. */
    }
}

#elif 1 == 3 && !0

static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short *data2 = (npy_short *)dataptr[2];
    npy_short *data_out = (npy_short *)dataptr[3];

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 949
        data_out[0] = ((data0[0]) *
                             (data1[0]) *
                             (data2[0]) +
                             (data_out[0]));

#line 949
        data_out[1] = ((data0[1]) *
                             (data1[1]) *
                             (data2[1]) +
                             (data_out[1]));

#line 949
        data_out[2] = ((data0[2]) *
                             (data1[2]) *
                             (data2[2]) +
                             (data_out[2]));

#line 949
        data_out[3] = ((data0[3]) *
                             (data1[3]) *
                             (data2[3]) +
                             (data_out[3]));

#line 949
        data_out[4] = ((data0[4]) *
                             (data1[4]) *
                             (data2[4]) +
                             (data_out[4]));

#line 949
        data_out[5] = ((data0[5]) *
                             (data1[5]) *
                             (data2[5]) +
                             (data_out[5]));

#line 949
        data_out[6] = ((data0[6]) *
                             (data1[6]) *
                             (data2[6]) +
                             (data_out[6]));

#line 949
        data_out[7] = ((data0[7]) *
                             (data1[7]) *
                             (data2[7]) +
                             (data_out[7]));

        data0 += 8;
        data1 += 8;
        data2 += 8;
        data_out += 8;
    }

    /* Finish off the loop */

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[0] = ((data0[0]) *
                         (data1[0]) *
                         (data2[0]) +
                         (data_out[0]));

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[1] = ((data0[1]) *
                         (data1[1]) *
                         (data2[1]) +
                         (data_out[1]));

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[2] = ((data0[2]) *
                         (data1[2]) *
                         (data2[2]) +
                         (data_out[2]));

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[3] = ((data0[3]) *
                         (data1[3]) *
                         (data2[3]) +
                         (data_out[3]));

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[4] = ((data0[4]) *
                         (data1[4]) *
                         (data2[4]) +
                         (data_out[4]));

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[5] = ((data0[5]) *
                         (data1[5]) *
                         (data2[5]) +
                         (data_out[5]));

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[6] = ((data0[6]) *
                         (data1[6]) *
                         (data2[6]) +
                         (data_out[6]));

#line 965
    if (count-- == 0) {
        return;
    }
    data_out[7] = ((data0[7]) *
                         (data1[7]) *
                         (data2[7]) +
                         (data_out[7]));

}

#else /* 1 > 3 || @complex */

/*
 * Generic sum-of-products kernel for npy_short operands that are stepped
 * one element at a time.
 *
 * For each of `count` iterations the current element of every one of the
 * `nop` inputs (dataptr[0] .. dataptr[nop-1]) is multiplied together, the
 * product is added to the current output element (dataptr[nop]), and all
 * nop + 1 pointers are then advanced by sizeof(npy_short).  The entries of
 * `dataptr` are therefore modified in place.  `strides` is not consulted.
 *
 * The product is kept in an npy_short, so it is stored back in that type
 * after each multiplication.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    while (count-- != 0) {
        npy_short product = *(npy_short *)dataptr[0];
        int op = 1;

        /* Fold in the remaining inputs, if any. */
        while (op < nop) {
            product = (npy_short)(product * *(npy_short *)dataptr[op]);
            ++op;
        }

        /*
         * The old output value is read through the index left over from
         * the loop above; that index equals nop whenever nop >= 1.
         */
        *(npy_short *)dataptr[nop] =
                    (npy_short)(product + *(npy_short *)dataptr[op]);

        /* Step every input and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * Single-operand reduction kernel for npy_short: sums `count` consecutive
 * input elements into one output element.
 *
 *   dataptr[0]    input array of `count` npy_short elements
 *   dataptr[1]    npy_short that receives "+= sum(a[i])"
 *   nop, strides  not consulted by this kernel
 *
 * The running total has type npy_short, so every partial sum is stored
 * back in that type before the next element is added.
 */
static void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Tail handling comes first so short inputs skip the bulk loop.
         * Every case adds one leftover element and drops into the next;
         * case 0 stores the sum and is the only exit.
         *
         * NOTE: a negative count matches no case and skips the bulk loop,
         * so the function returns only for count >= 0.
         */
        switch (count) {
            case 7:
                total = (npy_short)(total + src[6]);
                /* fall through */
            case 6:
                total = (npy_short)(total + src[5]);
                /* fall through */
            case 5:
                total = (npy_short)(total + src[4]);
                /* fall through */
            case 4:
                total = (npy_short)(total + src[3]);
                /* fall through */
            case 3:
                total = (npy_short)(total + src[2]);
                /* fall through */
            case 2:
                total = (npy_short)(total + src[1]);
                /* fall through */
            case 1:
                total = (npy_short)(total + src[0]);
                /* fall through */
            case 0:
                *(npy_short *)dataptr[1] += total;
                return;
        }

        /* Eight or more elements remain: consume them in groups of eight. */
        while (count >= 8) {
            count -= 8;

            total = (npy_short)(total + src[0]);
            total = (npy_short)(total + src[1]);
            total = (npy_short)(total + src[2]);
            total = (npy_short)(total + src[3]);
            total = (npy_short)(total + src[4]);
            total = (npy_short)(total + src[5]);
            total = (npy_short)(total + src[6]);
            total = (npy_short)(total + src[7]);

            src += 8;
        }
        /* Go round again so the switch can finish the last 0..7 elements. */
    }
}

#endif /* 1 == 1 */

/*
 * Single-operand reduction kernel for npy_short with an arbitrary input
 * stride: sums `count` input elements into one output element.
 *
 *   dataptr[0]  address of the first input element
 *   strides[0]  distance in bytes between successive input elements
 *   dataptr[1]  npy_short that receives "+= sum of the visited elements"
 *   nop         not consulted by this kernel
 *
 * The walk uses a local copy of the input pointer, so the entries of
 * `dataptr` are left as they were.  The running total has type npy_short,
 * so every partial sum is stored back in that type before the next
 * element is added.
 */
static void
short_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *cursor = dataptr[0];
    npy_intp step = strides[0];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    /* Visit one element per iteration, stepping by the byte stride. */
    for (; count--; cursor += step) {
        total = (npy_short)(total + *(npy_short *)cursor);
    }

    *(npy_short *)dataptr[1] += total;
}


#line 113
/*
 * Strided two-operand multiply-accumulate on npy_short data:
 *
 *     out += in0 * in1
 *
 * repeated `count` times.  in0, in1 and out start at dataptr[0],
 * dataptr[1] and dataptr[2] and advance by strides[0], strides[1] and
 * strides[2] bytes after each step.  `nop` is not consulted, and the
 * dataptr and strides arrays themselves are left untouched.
 */
static void
short_sum_of_products_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    char *out = dataptr[2];
    npy_intp lhs_step = strides[0];
    npy_intp rhs_step = strides[1];
    npy_intp out_step = strides[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        *(npy_short *)out += (*(npy_short *)lhs) * (*(npy_short *)rhs);

        lhs += lhs_step;
        rhs += rhs_step;
        out += out_step;
    }
}

#if 2 == 1

/*
 * out[i] += in[i] over `count` npy_short elements, where `in` is
 * dataptr[0] and `out` is dataptr[1], both indexed as plain npy_short
 * arrays.  `nop` and `strides` are not used.
 *
 * Whole groups of eight are handled first, each in ascending index
 * order; the fewer-than-eight elements left over are then handled from
 * the highest index down to index 0.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short *out = (npy_short *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder switch comes before the unrolled loop so that
         * small counts are finished without entering it.  Every case
         * deliberately falls through to the one below.
         */
        switch (count) {
            case 7:
                out[6] += src[6];
                /* fall through */
            case 6:
                out[5] += src[5];
                /* fall through */
            case 5:
                out[4] += src[4];
                /* fall through */
            case 4:
                out[3] += src[3];
                /* fall through */
            case 3:
                out[2] += src[2];
                /* fall through */
            case 2:
                out[1] += src[1];
                /* fall through */
            case 1:
                out[0] += src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Eight elements per pass while at least eight remain */
        while (count >= 8) {
            count -= 8;

            out[0] += src[0];
            out[1] += src[1];
            out[2] += src[2];
            out[3] += src[3];
            out[4] += src[4];
            out[5] += src[5];
            out[6] += src[6];
            out[7] += src[7];

            src += 8;
            out += 8;
        }

        /* Fewer than eight left: go round again so the switch finishes */
    }
}

#elif 2 == 2 && !0

/*
 * out[i] += in0[i] * in1[i] over `count` npy_short elements, where in0,
 * in1 and out are dataptr[0], dataptr[1] and dataptr[2], all indexed as
 * plain npy_short arrays.  `nop` and `strides` are not used.
 *
 * Whole groups of eight are handled first, each in ascending index
 * order; the fewer-than-eight elements left over are then handled from
 * the highest index down to index 0.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder switch comes before the unrolled loop so that
         * small counts are finished without entering it.  Every case
         * deliberately falls through to the one below.
         */
        switch (count) {
            case 7:
                out[6] += lhs[6] * rhs[6];
                /* fall through */
            case 6:
                out[5] += lhs[5] * rhs[5];
                /* fall through */
            case 5:
                out[4] += lhs[4] * rhs[4];
                /* fall through */
            case 4:
                out[3] += lhs[3] * rhs[3];
                /* fall through */
            case 3:
                out[2] += lhs[2] * rhs[2];
                /* fall through */
            case 2:
                out[1] += lhs[1] * rhs[1];
                /* fall through */
            case 1:
                out[0] += lhs[0] * rhs[0];
                /* fall through */
            case 0:
                return;
        }

        /* Eight elements per pass while at least eight remain */
        while (count >= 8) {
            count -= 8;

            out[0] += lhs[0] * rhs[0];
            out[1] += lhs[1] * rhs[1];
            out[2] += lhs[2] * rhs[2];
            out[3] += lhs[3] * rhs[3];
            out[4] += lhs[4] * rhs[4];
            out[5] += lhs[5] * rhs[5];
            out[6] += lhs[6] * rhs[6];
            out[7] += lhs[7] * rhs[7];

            lhs += 8;
            rhs += 8;
            out += 8;
        }

        /* Fewer than eight left: go round again so the switch finishes */
    }
}

/* Some extra specializations for the two operand case */
/*
 * out[i] += scalar * in1[i] over `count` npy_short elements.  The scalar
 * is the single npy_short at dataptr[0], read once on entry; in1 and out
 * are dataptr[1] and dataptr[2], indexed as plain npy_short arrays.
 * `nop` and `strides` are not used.
 *
 * Whole groups of eight are handled first, each in ascending index
 * order; the fewer-than-eight elements left over are then handled from
 * the highest index down to index 0.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short scalar = *(npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder switch comes before the unrolled loop so that
         * small counts are finished without entering it.  Every case
         * deliberately falls through to the one below.
         */
        switch (count) {
            case 7:
                out[6] += scalar * rhs[6];
                /* fall through */
            case 6:
                out[5] += scalar * rhs[5];
                /* fall through */
            case 5:
                out[4] += scalar * rhs[4];
                /* fall through */
            case 4:
                out[3] += scalar * rhs[3];
                /* fall through */
            case 3:
                out[2] += scalar * rhs[2];
                /* fall through */
            case 2:
                out[1] += scalar * rhs[1];
                /* fall through */
            case 1:
                out[0] += scalar * rhs[0];
                /* fall through */
            case 0:
                return;
        }

        /* Eight elements per pass while at least eight remain */
        while (count >= 8) {
            count -= 8;

            out[0] += scalar * rhs[0];
            out[1] += scalar * rhs[1];
            out[2] += scalar * rhs[2];
            out[3] += scalar * rhs[3];
            out[4] += scalar * rhs[4];
            out[5] += scalar * rhs[5];
            out[6] += scalar * rhs[6];
            out[7] += scalar * rhs[7];

            rhs += 8;
            out += 8;
        }

        /* Only revisit the switch when a remainder is actually left */
        if (count <= 0) {
            return;
        }
    }
}

/*
 * out[i] += in0[i] * scalar over `count` npy_short elements.  The scalar
 * is the single npy_short at dataptr[1], read once on entry; in0 and out
 * are dataptr[0] and dataptr[2], indexed as plain npy_short arrays.
 * `nop` and `strides` are not used.
 *
 * Whole groups of eight are handled first, each in ascending index
 * order; the fewer-than-eight elements left over are then handled from
 * the highest index down to index 0.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short scalar = *(npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder switch comes before the unrolled loop so that
         * small counts are finished without entering it.  Every case
         * deliberately falls through to the one below.
         */
        switch (count) {
            case 7:
                out[6] += lhs[6] * scalar;
                /* fall through */
            case 6:
                out[5] += lhs[5] * scalar;
                /* fall through */
            case 5:
                out[4] += lhs[4] * scalar;
                /* fall through */
            case 4:
                out[3] += lhs[3] * scalar;
                /* fall through */
            case 3:
                out[2] += lhs[2] * scalar;
                /* fall through */
            case 2:
                out[1] += lhs[1] * scalar;
                /* fall through */
            case 1:
                out[0] += lhs[0] * scalar;
                /* fall through */
            case 0:
                return;
        }

        /* Eight elements per pass while at least eight remain */
        while (count >= 8) {
            count -= 8;

            out[0] += lhs[0] * scalar;
            out[1] += lhs[1] * scalar;
            out[2] += lhs[2] * scalar;
            out[3] += lhs[3] * scalar;
            out[4] += lhs[4] * scalar;
            out[5] += lhs[5] * scalar;
            out[6] += lhs[6] * scalar;
            out[7] += lhs[7] * scalar;

            lhs += 8;
            out += 8;
        }

        /* Fewer than eight left: go round again so the switch finishes */
    }
}

static void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short accum = 0;

#if EINSUM_USE_SSE1 && 0
    __m128 a, accum_sse = _mm_setzero_ps();
#elif EINSUM_USE_SSE2 && 0
    __m128d a, accum_sse = _mm_setzero_pd();
#endif

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 596
        case 6+1:
            accum += (data0[6]) * (data1[6]);

#line 596
        case 5+1:
            accum += (data0[5]) * (data1[5]);

#line 596
        case 4+1:
            accum += (data0[4]) * (data1[4]);

#line 596
        case 3+1:
            accum += (data0[3]) * (data1[3]);

#line 596
        case 2+1:
            accum += (data0[2]) * (data1[2]);

#line 596
        case 1+1:
            accum += (data0[1]) * (data1[1]);

#line 596
        case 0+1:
            accum += (data0[0]) * (data1[0]);

        case 0:
            *(npy_short *)dataptr[2] += (accum);
            return;
    }

#if EINSUM_USE_SSE1 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);
            _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 617
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_ps(_mm_load_ps(data0+0), _mm_load_ps(data1+0));
            accum_sse = _mm_add_ps(accum_sse, a);

#line 617
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_ps(_mm_load_ps(data0+4), _mm_load_ps(data1+4));
            accum_sse = _mm_add_ps(accum_sse, a);

            data0 += 8;
            data1 += 8;
        }

        /* Add the four SSE values and put in accum */
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
        accum_sse = _mm_add_ps(a, accum_sse);
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
        accum_sse = _mm_add_ps(a, accum_sse);
        _mm_store_ss(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#elif EINSUM_USE_SSE2 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);
            _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+0), _mm_load_pd(data1+0));
            accum_sse = _mm_add_pd(accum_sse, a);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+2), _mm_load_pd(data1+2));
            accum_sse = _mm_add_pd(accum_sse, a);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+4), _mm_load_pd(data1+4));
            accum_sse = _mm_add_pd(accum_sse, a);

#line 651
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            a = _mm_mul_pd(_mm_load_pd(data0+6), _mm_load_pd(data1+6));
            accum_sse = _mm_add_pd(accum_sse, a);

            data0 += 8;
            data1 += 8;
        }

        /* Add the two SSE2 values and put in accum */
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
        accum_sse = _mm_add_pd(a, accum_sse);
        _mm_store_sd(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
        _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 683
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_ps(_mm_loadu_ps(data0+0), _mm_loadu_ps(data1+0));
        accum_sse = _mm_add_ps(accum_sse, a);

#line 683
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_ps(_mm_loadu_ps(data0+4), _mm_loadu_ps(data1+4));
        accum_sse = _mm_add_ps(accum_sse, a);

#elif EINSUM_USE_SSE2 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);
        _mm_prefetch(data1 + 512, _MM_HINT_T0);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+0), _mm_loadu_pd(data1+0));
        accum_sse = _mm_add_pd(accum_sse, a);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+2), _mm_loadu_pd(data1+2));
        accum_sse = _mm_add_pd(accum_sse, a);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+4), _mm_loadu_pd(data1+4));
        accum_sse = _mm_add_pd(accum_sse, a);

#line 697
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        a = _mm_mul_pd(_mm_loadu_pd(data0+6), _mm_loadu_pd(data1+6));
        accum_sse = _mm_add_pd(accum_sse, a);

#else
#line 708
        accum += (data0[0]) * (data1[0]);

#line 708
        accum += (data0[1]) * (data1[1]);

#line 708
        accum += (data0[2]) * (data1[2]);

#line 708
        accum += (data0[3]) * (data1[3]);

#line 708
        accum += (data0[4]) * (data1[4]);

#line 708
        accum += (data0[5]) * (data1[5]);

#line 708
        accum += (data0[6]) * (data1[6]);

#line 708
        accum += (data0[7]) * (data1[7]);

#endif
        data0 += 8;
        data1 += 8;
    }

#if EINSUM_USE_SSE1 && 0
    /* Add the four SSE values and put in accum */
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
    accum_sse = _mm_add_ps(a, accum_sse);
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
    accum_sse = _mm_add_ps(a, accum_sse);
    _mm_store_ss(&accum, accum_sse);
#elif EINSUM_USE_SSE2 && 0
    /* Add the two SSE2 values and put in accum */
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
    accum_sse = _mm_add_pd(a, accum_sse);
    _mm_store_sd(&accum, accum_sse);
#endif

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

/*
 * Two-operand sum of products for npy_short data in which operand 0 is
 * read once as a scalar, operand 1 is read as `count` consecutive
 * elements, and the result is accumulated into one output element:
 *
 *     *out += value0 * (data1[0] + data1[1] + ... + data1[count-1])
 *
 * nop      - not referenced by this specialization.
 * dataptr  - dataptr[0]: the scalar operand,
 *            dataptr[1]: first of the consecutive npy_short inputs,
 *            dataptr[2]: the output element that is updated in place.
 * strides  - unused.
 * count    - number of elements read from operand 1.  When count <= 0
 *            nothing is read from operand 1 and zero is added to the
 *            output.
 *
 * The running sum is an npy_short, so it is narrowed back to npy_short
 * after every addition.
 */
static void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short value0 = (*(npy_short *)dataptr[0]);
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short accum = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * A plain bounded loop: it always terminates (also for count <= 0)
     * and leaves any unrolling to the compiler.
     */
    for (i = 0; i < count; ++i) {
        accum += data1[i];
    }

    /* The scalar factor is applied once, to the finished sum. */
    *(npy_short *)dataptr[2] += (value0 * accum);
}

/*
 * Two-operand sum of products for npy_short data in which operand 0 is
 * read as `count` consecutive elements, operand 1 is read once as a
 * scalar, and the result is accumulated into one output element:
 *
 *     *out += (data0[0] + data0[1] + ... + data0[count-1]) * value1
 *
 * nop      - not referenced by this specialization.
 * dataptr  - dataptr[0]: first of the consecutive npy_short inputs,
 *            dataptr[1]: the scalar operand,
 *            dataptr[2]: the output element that is updated in place.
 * strides  - unused.
 * count    - number of elements read from operand 0.  When count <= 0
 *            nothing is read from operand 0 and zero is added to the
 *            output.
 *
 * The running sum is an npy_short, so it is narrowed back to npy_short
 * after every addition.
 */
static void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short value1 = (*(npy_short *)dataptr[1]);
    npy_short accum = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * A plain bounded loop: it always terminates (also for count <= 0)
     * and leaves any unrolling to the compiler.
     */
    for (i = 0; i < count; ++i) {
        accum += data0[i];
    }

    /* The scalar factor is applied once, to the finished sum. */
    *(npy_short *)dataptr[2] += (accum * value1);
}

#elif 2 == 3 && !0

/*
 * Three-operand sum of products over consecutive npy_short elements:
 *
 *     data_out[i] += data0[i] * data1[i] * data2[i]    for 0 <= i < count
 *
 * nop      - not referenced by this specialization.
 * dataptr  - dataptr[0..2]: the three input arrays,
 *            dataptr[3]: the output array, updated in place.
 * strides  - unused; every operand is indexed as consecutive elements.
 * count    - number of elements to process; count <= 0 does nothing.
 *
 * Elements are handled in increasing index order, each one read and
 * then written before the next is touched.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short *data2 = (npy_short *)dataptr[2];
    npy_short *data_out = (npy_short *)dataptr[3];
    npy_intp i;

    for (i = 0; i < count; ++i) {
        /*
         * The operands would otherwise be promoted to int, and the
         * product of three of them can exceed the range of int, which
         * is undefined for signed arithmetic.  Unsigned arithmetic
         * wraps instead.
         * NOTE(review): assumes npy_short is no wider than int and that
         * narrowing back to npy_short reduces modulo its width (this
         * conversion is implementation-defined) - confirm for any
         * unusual target.
         */
        unsigned int prod = (unsigned int)data0[i] *
                            (unsigned int)data1[i] *
                            (unsigned int)data2[i];

        data_out[i] = (npy_short)(prod + (unsigned int)data_out[i]);
    }
}

#else /* 2 > 3 || @complex */

/*
 * Generic contiguous sum of products for npy_short data.
 *
 * For each of `count` steps the current elements of operands
 * 0 .. nop-1 are multiplied together, the product is added to the
 * current element of the output operand dataptr[nop], and all nop+1
 * pointers are moved forward by one npy_short.
 *
 * nop      - number of input operands.
 * dataptr  - nop input pointers followed by the output pointer; the
 *            entries of this array are advanced in place.
 * strides  - unused; every operand advances by sizeof(npy_short).
 * count    - number of steps to perform.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        int op = 1;
        npy_short product = *(npy_short *)dataptr[0];

        /* Fold in the remaining input operands one by one. */
        while (op < nop) {
            product *= *(npy_short *)dataptr[op];
            ++op;
        }

        /*
         * The old output value is read through the index left behind
         * by the loop above (equal to nop whenever nop >= 1).
         */
        *(npy_short *)dataptr[nop] = (product + *(npy_short *)dataptr[op]);

        /* Step every operand, output included, to its next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * One-operand reduction for npy_short data: adds `count` consecutive
 * input elements into a single output element:
 *
 *     *out += data0[0] + data0[1] + ... + data0[count-1]
 *
 * nop      - not referenced by this specialization.
 * dataptr  - dataptr[0]: first of the consecutive npy_short inputs,
 *            dataptr[1]: the output element that is updated in place.
 * strides  - not referenced; the input is indexed as consecutive
 *            elements.
 * count    - number of elements to add.  When count <= 0 nothing is
 *            read from the input and zero is added to the output.
 *
 * The running sum is an npy_short, so it is narrowed back to npy_short
 * after every addition.
 */
static void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_short accum = 0;
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    /*
     * A plain bounded loop: it always terminates (also for count <= 0)
     * and leaves any unrolling to the compiler.
     */
    for (i = 0; i < count; ++i) {
        accum += data0[i];
    }

    *((npy_short *)dataptr[1]) = (accum +
                                    (*((npy_short *)dataptr[1])));
}

#endif /* 2 == 1 */

/*
 * Two-operand strided sum of products for npy_short data, reduced into
 * a single output element:
 *
 *     *out += sum over `count` steps of (*operand0) * (*operand1)
 *
 * nop      - not referenced by this specialization.
 * dataptr  - dataptr[0], dataptr[1]: the inputs (as byte pointers),
 *            dataptr[2]: the output element that is updated in place.
 * strides  - strides[0], strides[1]: byte step applied to the matching
 *            input after every element.
 * count    - number of element pairs to process.
 *
 * The running sum is an npy_short, so it is narrowed back to npy_short
 * after every step.
 */
static void
short_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    npy_intp lhs_step = strides[0];
    npy_intp rhs_step = strides[1];
    npy_short *out = (npy_short *)dataptr[2];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_short *)lhs) * (*(npy_short *)rhs);
        lhs += lhs_step;
        rhs += rhs_step;
    }

    /* Fold the finished sum into the existing output value. */
    *out = (total + (*out));
}


#line 113
/*
 * Three-operand strided sum of products for npy_short data.
 *
 * For each of `count` steps:
 *
 *     *out += (*operand0) * (*operand1) * (*operand2)
 *
 * after which all four pointers advance by their own byte stride.
 *
 * nop      - not referenced by this specialization.
 * dataptr  - dataptr[0..2]: the inputs (as byte pointers),
 *            dataptr[3]: the output, updated in place.
 * strides  - strides[0..3]: byte step for the matching dataptr entry.
 * count    - number of steps to perform.
 */
static void
short_sum_of_products_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
    char *data_out = dataptr[3];
    npy_intp stride_out = strides[3];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_three (%d)\n", (int)count);

    while (count--) {
        /*
         * The operands would otherwise be promoted to int, and the
         * product of three of them can exceed the range of int, which
         * is undefined for signed arithmetic.  Unsigned arithmetic
         * wraps instead.
         * NOTE(review): assumes npy_short is no wider than int and that
         * narrowing back to npy_short reduces modulo its width (this
         * conversion is implementation-defined) - confirm for any
         * unusual target.
         */
        unsigned int prod = (unsigned int)(*(npy_short *)data0) *
                            (unsigned int)(*(npy_short *)data1) *
                            (unsigned int)(*(npy_short *)data2);

        *(npy_short *)data_out =
                (npy_short)(prod + (unsigned int)(*(npy_short *)data_out));

        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
    }
}

#if 3 == 1

static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data_out = (npy_short *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 228
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_short *)data_out + 2*6)[0] =
                                    ((npy_short *)data0 + 2*6)[0] +
                                    ((npy_short *)data_out + 2*6)[0];
            ((npy_short *)data_out + 2*6)[1] =
                                    ((npy_short *)data0 + 2*6)[1] +
                                    ((npy_short *)data_out + 2*6)[1];
#endif

#line 228
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_short *)data_out + 2*5)[0] =
                                    ((npy_short *)data0 + 2*5)[0] +
                                    ((npy_short *)data_out + 2*5)[0];
            ((npy_short *)data_out + 2*5)[1] =
                                    ((npy_short *)data0 + 2*5)[1] +
                                    ((npy_short *)data_out + 2*5)[1];
#endif

#line 228
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_short *)data_out + 2*4)[0] =
                                    ((npy_short *)data0 + 2*4)[0] +
                                    ((npy_short *)data_out + 2*4)[0];
            ((npy_short *)data_out + 2*4)[1] =
                                    ((npy_short *)data0 + 2*4)[1] +
                                    ((npy_short *)data_out + 2*4)[1];
#endif

#line 228
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_short *)data_out + 2*3)[0] =
                                    ((npy_short *)data0 + 2*3)[0] +
                                    ((npy_short *)data_out + 2*3)[0];
            ((npy_short *)data_out + 2*3)[1] =
                                    ((npy_short *)data0 + 2*3)[1] +
                                    ((npy_short *)data_out + 2*3)[1];
#endif

#line 228
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_short *)data_out + 2*2)[0] =
                                    ((npy_short *)data0 + 2*2)[0] +
                                    ((npy_short *)data_out + 2*2)[0];
            ((npy_short *)data_out + 2*2)[1] =
                                    ((npy_short *)data0 + 2*2)[1] +
                                    ((npy_short *)data_out + 2*2)[1];
#endif

#line 228
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_short *)data_out + 2*1)[0] =
                                    ((npy_short *)data0 + 2*1)[0] +
                                    ((npy_short *)data_out + 2*1)[0];
            ((npy_short *)data_out + 2*1)[1] =
                                    ((npy_short *)data0 + 2*1)[1] +
                                    ((npy_short *)data_out + 2*1)[1];
#endif

#line 228
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_short *)data_out + 2*0)[0] =
                                    ((npy_short *)data0 + 2*0)[0] +
                                    ((npy_short *)data_out + 2*0)[0];
            ((npy_short *)data_out + 2*0)[1] =
                                    ((npy_short *)data0 + 2*0)[1] +
                                    ((npy_short *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 252
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_short *)data_out + 2*0)[0] =
                                ((npy_short *)data0 + 2*0)[0] +
                                ((npy_short *)data_out + 2*0)[0];
        ((npy_short *)data_out + 2*0)[1] =
                                ((npy_short *)data0 + 2*0)[1] +
                                ((npy_short *)data_out + 2*0)[1];
#endif

#line 252
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_short *)data_out + 2*1)[0] =
                                ((npy_short *)data0 + 2*1)[0] +
                                ((npy_short *)data_out + 2*1)[0];
        ((npy_short *)data_out + 2*1)[1] =
                                ((npy_short *)data0 + 2*1)[1] +
                                ((npy_short *)data_out + 2*1)[1];
#endif

#line 252
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_short *)data_out + 2*2)[0] =
                                ((npy_short *)data0 + 2*2)[0] +
                                ((npy_short *)data_out + 2*2)[0];
        ((npy_short *)data_out + 2*2)[1] =
                                ((npy_short *)data0 + 2*2)[1] +
                                ((npy_short *)data_out + 2*2)[1];
#endif

#line 252
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_short *)data_out + 2*3)[0] =
                                ((npy_short *)data0 + 2*3)[0] +
                                ((npy_short *)data_out + 2*3)[0];
        ((npy_short *)data_out + 2*3)[1] =
                                ((npy_short *)data0 + 2*3)[1] +
                                ((npy_short *)data_out + 2*3)[1];
#endif

#line 252
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_short *)data_out + 2*4)[0] =
                                ((npy_short *)data0 + 2*4)[0] +
                                ((npy_short *)data_out + 2*4)[0];
        ((npy_short *)data_out + 2*4)[1] =
                                ((npy_short *)data0 + 2*4)[1] +
                                ((npy_short *)data_out + 2*4)[1];
#endif

#line 252
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_short *)data_out + 2*5)[0] =
                                ((npy_short *)data0 + 2*5)[0] +
                                ((npy_short *)data_out + 2*5)[0];
        ((npy_short *)data_out + 2*5)[1] =
                                ((npy_short *)data0 + 2*5)[1] +
                                ((npy_short *)data_out + 2*5)[1];
#endif

#line 252
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_short *)data_out + 2*6)[0] =
                                ((npy_short *)data0 + 2*6)[0] +
                                ((npy_short *)data_out + 2*6)[0];
        ((npy_short *)data_out + 2*6)[1] =
                                ((npy_short *)data0 + 2*6)[1] +
                                ((npy_short *)data_out + 2*6)[1];
#endif

#line 252
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_short *)data_out + 2*7)[0] =
                                ((npy_short *)data0 + 2*7)[0] +
                                ((npy_short *)data_out + 2*7)[0];
        ((npy_short *)data_out + 2*7)[1] =
                                ((npy_short *)data0 + 2*7)[1] +
                                ((npy_short *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#elif 3 == 2 && !0

/*
 * npy_short kernel for two operands where both inputs and the output are
 * walked as packed npy_short arrays:
 *
 *     out[i] = in0[i] * in1[i] + out[i]    for each i in [0, count)
 *
 * dataptr[0], dataptr[1]: the two inputs.
 * dataptr[2]:             the output, updated in place.
 * 'nop' and the strides are not consulted.
 *
 * Element order (relevant only if the output overlaps an input): whole
 * groups of 8 are handled front to back, then the remaining (< 8)
 * elements from the last one down to the first.
 *
 * A count <= 0 does nothing; in particular a negative count returns
 * immediately instead of looping without end.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short *data_out = (npy_short *)dataptr[2];
    int i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    /* Whole groups of 8 elements, ascending within each group */
    while (count >= 8) {
        count -= 8;

        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i]) *
                           (data1[i]) +
                           (data_out[i]));
        }

        data0 += 8;
        data1 += 8;
        data_out += 8;
    }

    /* Fewer than 8 elements remain: finish them from the back */
    while (count > 0) {
        --count;
        data_out[count] = ((data0[count]) *
                           (data1[count]) +
                           (data_out[count]));
    }
}

/* Some extra specializations for the two operand case */
/*
 * npy_short kernel for a single-value first operand, a packed second
 * operand and a packed output:
 *
 *     out[i] = value0 * in1[i] + out[i]    for each i in [0, count)
 *
 * dataptr[0]: points at the single first-operand value, read once up front.
 * dataptr[1]: the second input.
 * dataptr[2]: the output, updated in place.
 * 'nop' and the strides are not consulted.
 *
 * Whole groups of 8 are handled front to back; the remaining (< 8)
 * elements are then handled from the last one down to the first.
 * A count <= 0 does nothing.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short value0 = (*(npy_short *)dataptr[0]);
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short *data_out = (npy_short *)dataptr[2];
    int i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Whole groups of 8 elements, ascending within each group */
    while (count >= 8) {
        count -= 8;

        for (i = 0; i < 8; ++i) {
            data_out[i] = (value0 *
                           (data1[i]) +
                           (data_out[i]));
        }

        data1 += 8;
        data_out += 8;
    }

    /* Fewer than 8 elements remain: finish them from the back */
    while (count > 0) {
        --count;
        data_out[count] = (value0 *
                           (data1[count]) +
                           (data_out[count]));
    }
}

/*
 * npy_short kernel for a packed first operand, a single-value second
 * operand and a packed output:
 *
 *     out[i] = in0[i] * value1 + out[i]    for each i in [0, count)
 *
 * dataptr[0]: the first input.
 * dataptr[1]: points at the single second-operand value, read once up front.
 * dataptr[2]: the output, updated in place.
 * 'nop' and the strides are not consulted.
 *
 * Element order (relevant only if the output overlaps the input): whole
 * groups of 8 are handled front to back, then the remaining (< 8)
 * elements from the last one down to the first.
 *
 * A count <= 0 does nothing; in particular a negative count returns
 * immediately instead of looping without end.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short value1 = (*(npy_short *)dataptr[1]);
    npy_short *data_out = (npy_short *)dataptr[2];
    int i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* Whole groups of 8 elements, ascending within each group */
    while (count >= 8) {
        count -= 8;

        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i])*
                           value1  +
                           (data_out[i]));
        }

        data0 += 8;
        data_out += 8;
    }

    /* Fewer than 8 elements remain: finish them from the back */
    while (count > 0) {
        --count;
        data_out[count] = ((data0[count])*
                           value1  +
                           (data_out[count]));
    }
}

/*
 * npy_short kernel for two packed inputs reduced into one output element:
 *
 *     *out += sum over i in [0, count) of in0[i] * in1[i]
 *
 * dataptr[0], dataptr[1]: the two inputs.
 * dataptr[2]:             points at the single output element.
 * 'nop' and the strides are not consulted.
 *
 * The running sum is kept in an npy_short local and added to the output
 * exactly once, after all elements have been visited.  Whole groups of 8
 * are visited front to back, then the remaining (< 8) elements from the
 * last one down to the first.
 *
 * A count <= 0 adds an accumulator of zero to the output; a negative
 * count therefore returns instead of looping without end.
 */
static void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short accum = 0;
    int i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Whole groups of 8 elements, ascending within each group */
    while (count >= 8) {
        count -= 8;

        for (i = 0; i < 8; ++i) {
            accum += (data0[i]) * (data1[i]);
        }

        data0 += 8;
        data1 += 8;
    }

    /* Fewer than 8 elements remain: accumulate them from the back */
    while (count > 0) {
        --count;
        accum += (data0[count]) * (data1[count]);
    }

    /* Single read-modify-write of the output element */
    *(npy_short *)dataptr[2] += (accum);
}

/*
 * npy_short kernel for a single-value first operand and a packed second
 * operand, reduced into one output element:
 *
 *     *out += value0 * (sum over i in [0, count) of in1[i])
 *
 * dataptr[0]: points at the single first-operand value, read once up front.
 * dataptr[1]: the second input.
 * dataptr[2]: points at the single output element.
 * 'nop' and the strides are not consulted.
 *
 * The elements of the second input are summed into an npy_short local;
 * the multiplication by value0 happens once, when the sum is added to the
 * output.  Whole groups of 8 are visited front to back, then the
 * remaining (< 8) elements from the last one down to the first.
 *
 * A count <= 0 adds value0 * 0 to the output; a negative count therefore
 * returns instead of looping without end.
 */
static void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short value0 = (*(npy_short *)dataptr[0]);
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short accum = 0;
    int i;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Whole groups of 8 elements, ascending within each group */
    while (count >= 8) {
        count -= 8;

        for (i = 0; i < 8; ++i) {
            accum += (data1[i]);
        }

        data1 += 8;
    }

    /* Fewer than 8 elements remain: accumulate them from the back */
    while (count > 0) {
        --count;
        accum += (data1[count]);
    }

    /* Scale the sum once and fold it into the output element */
    *(npy_short *)dataptr[2] += (value0 * accum);
}

/*
 * Two-operand kernel for npy_short.
 *
 * Operand 0 is read as a contiguous array through dataptr[0], operand 1 is
 * read once as a scalar through dataptr[1], and the single output element
 * at dataptr[2] is updated as
 *
 *     out += (data0[0] + ... + data0[count-1]) * value1
 *
 * 'nop' and 'strides' are not consulted.  The SSE sections of the template
 * are compiled out for this type (their guards end in "&& 0"), so only the
 * scalar code is spelled out here.
 *
 * count is expected to be non-negative: a negative value satisfies neither
 * of the two loop conditions below, so the function would never return.
 */
static void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short scale = (*(npy_short *)dataptr[1]);
    npy_short total = 0;
    int k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The short-count test comes first so that small inputs skip the
         * grouped loop entirely.  The tail is summed highest index first.
         */
        if (count >= 0 && count < 8) {
            while (count > 0) {
                --count;
                total += (src[count]);
            }
            *(npy_short *)dataptr[2] += (total * scale);
            return;
        }

        /* Consume whole groups of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += (src[k]);
            }
            src += 8;
        }
    }
}

#elif 3 == 3 && !0

/*
 * Three-operand contiguous kernel for npy_short:
 *
 *     data_out[i] += data0[i] * data1[i] * data2[i]     for i in [0, count)
 *
 * The three inputs and the output are taken from dataptr[0..3] and walked
 * with unit element stride.  'nop' and 'strides' are not consulted.
 *
 * Overflow handling: a short operand is promoted to int before it is
 * multiplied.  Assuming npy_short is 16 bits and int is 32 bits, a product
 * of three operands can exceed the range of int, and signed overflow is
 * undefined behaviour.  The arithmetic is therefore done in unsigned int,
 * where overflow wraps, and only the final value is narrowed back to
 * npy_short.  That narrowing is implementation-defined, exactly as the
 * int -> npy_short narrowing of a plain signed expression would be.
 *
 * NOTE(review): this file is generated; the same change presumably needs to
 * be mirrored in the .src template for the integer types.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short *data2 = (npy_short *)dataptr[2];
    npy_short *data_out = (npy_short *)dataptr[3];
    npy_intp i;

    /* Groups of 8; the fixed trip count lets the compiler unroll */
    while (count >= 8) {
        count -= 8;

        for (i = 0; i < 8; ++i) {
            data_out[i] = (npy_short)((unsigned int)data0[i] *
                                      (unsigned int)data1[i] *
                                      (unsigned int)data2[i] +
                                      (unsigned int)data_out[i]);
        }

        data0 += 8;
        data1 += 8;
        data2 += 8;
        data_out += 8;
    }

    /*
     * Finish off the remaining 0..7 elements.  A bounded loop (rather than
     * eight unrolled "if (count-- == 0) return" steps) also guarantees that
     * nothing is touched when count is not positive.
     */
    for (i = 0; i < count; ++i) {
        data_out[i] = (npy_short)((unsigned int)data0[i] *
                                  (unsigned int)data1[i] *
                                  (unsigned int)data2[i] +
                                  (unsigned int)data_out[i]);
    }
}

#else /* 3 > 3 || @complex */

/*
 * Generic contiguous kernel for npy_short, driven by 'nop'.
 *
 * For each of 'count' items: multiply together the current elements of
 * operands 0 .. nop-1, add the current output element, store the result
 * through dataptr[nop], then advance all nop+1 entries of dataptr by
 * sizeof(npy_short).  The dataptr array itself is modified in place;
 * 'strides' is not consulted.
 *
 * Only the real-valued path is written out; the template's complex path is
 * preprocessed away for this type.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_short prod = (*(npy_short *)dataptr[0]);
        int op = 1;

        while (op < nop) {
            prod *= (*(npy_short *)dataptr[op]);
            ++op;
        }

        /* op == nop here whenever nop >= 1, so this reads the old output */
        *(npy_short *)dataptr[nop] = (prod +
                                      (*(npy_short *)dataptr[op]));

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * One-operand reduction kernel for npy_short.
 *
 * Sums 'count' contiguous elements read through dataptr[0] and adds the sum
 * to the single output element at dataptr[1]:
 *
 *     out = (data0[0] + ... + data0[count-1]) + out
 *
 * 'nop' and 'strides' are not consulted.  The complex and SSE/SSE2 sections
 * of the template are compiled out for this type, so only the scalar code
 * is spelled out here.
 *
 * count is expected to be non-negative: a negative value satisfies neither
 * of the two loop conditions below, so the function would never return.
 */
static void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_short total = 0;
    npy_short *src = (npy_short *)dataptr[0];
    int k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The short-count test comes first so that small inputs skip the
         * grouped loop entirely.  The tail is summed highest index first.
         */
        if (count >= 0 && count < 8) {
            while (count > 0) {
                --count;
                total += (src[count]);
            }
            *((npy_short *)dataptr[1]) = (total +
                                    (*((npy_short *)dataptr[1])));
            return;
        }

        /* Consume whole groups of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += (src[k]);
            }
            src += 8;
        }
    }
}

#endif /* 3 == 1 */

/*
 * Three-operand strided reduction kernel for npy_short.
 *
 * Walks the three inputs at dataptr[0..2] with byte strides strides[0..2]
 * for 'count' steps, accumulates the element-wise triple product, and adds
 * the total to the single output element at dataptr[3]:
 *
 *     out = sum_i(a[i] * b[i] * c[i]) + out
 *
 * dataptr itself is not modified; 'nop' is not consulted.
 *
 * Overflow handling: a short operand is promoted to int before it is
 * multiplied.  Assuming npy_short is 16 bits and int is 32 bits, a product
 * of three operands can exceed the range of int, and signed overflow is
 * undefined behaviour.  Each step is therefore computed in unsigned int,
 * where overflow wraps, and then narrowed back to npy_short.  That
 * narrowing is implementation-defined, as it was for the signed form.
 *
 * NOTE(review): this file is generated; the same change presumably needs to
 * be mirrored in the .src template for the integer types.
 */
static void
short_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_short accum = 0;

    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    while (count--) {
        accum = (npy_short)((unsigned int)accum +
                            (unsigned int)(*(npy_short *)data0) *
                            (unsigned int)(*(npy_short *)data1) *
                            (unsigned int)(*(npy_short *)data2));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
    }

    /* Two shorts added as int cannot overflow, so this stays as it was */
    *((npy_short *)dataptr[3]) = (accum +
                                    (*((npy_short *)dataptr[3])));
}


#line 113
/*
 * Fully generic strided kernel for npy_short, for any number of operands.
 *
 * For each of 'count' items: multiply together the current elements of
 * operands 0 .. nop-1, add the current output element, store the result
 * through dataptr[nop], then advance each of the nop+1 entries of dataptr
 * by the matching byte stride in 'strides'.  The dataptr array itself is
 * modified in place.
 *
 * The fixed-operand-count and complex sections of the template are
 * preprocessed away for this instantiation, leaving only the nop-driven
 * loop below.
 */
static void
short_sum_of_products_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_any (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_short prod = (*(npy_short *)dataptr[0]);
        int op = 1;

        while (op < nop) {
            prod *= (*(npy_short *)dataptr[op]);
            ++op;
        }

        /* op == nop here whenever nop >= 1, so this reads the old output */
        *(npy_short *)dataptr[nop] = (prod +
                                      (*(npy_short *)dataptr[op]));

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += strides[op];
        }
    }
}

#if 1000 == 1

/*
 * One-operand contiguous kernel for npy_short:
 *
 *     data_out[i] = data0[i] + data_out[i]     for i in [0, count)
 *
 * Input comes from dataptr[0], output from dataptr[1]; both are walked with
 * unit element stride.  'nop' and 'strides' are not consulted.  The complex
 * section of the template is compiled out for this type.
 *
 * Element order is kept as in the unrolled form: whole groups of 8 are
 * processed lowest index first, the final 0..7 elements highest index
 * first.
 *
 * count is expected to be non-negative: a negative value satisfies neither
 * of the two loop conditions below, so the function would never return.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short *dst = (npy_short *)dataptr[1];
    int k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* The short-count test comes first so small inputs skip the grouped loop */
        if (count >= 0 && count < 8) {
            while (count > 0) {
                --count;
                dst[count] = ((src[count]) +
                              (dst[count]));
            }
            return;
        }

        /* Consume whole groups of 8 elements */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                dst[k] = ((src[k]) +
                          (dst[k]));
            }
            src += 8;
            dst += 8;
        }
    }
}

#elif 1000 == 2 && !0

/*
 * out[i] = in0[i] * in1[i] + out[i] over `count` npy_short elements.
 *
 * dataptr[0] and dataptr[1] are read and dataptr[2] is updated in place,
 * all three being indexed as contiguous npy_short arrays.  `nop` and
 * `strides` are not used.  There is no SIMD path for this type; all the
 * arithmetic is scalar.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *dst = (npy_short *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Fewer than 8 elements left.  This is tested before the block
         * loop so that short runs never enter it.  The leftovers are
         * processed from the highest index down to index 0.
         */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                dst[k] = (lhs[k] * rhs[k] + dst[k]);
            }
            return;
        }

        /* Consume whole groups of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                dst[k] = (lhs[k] * rhs[k] + dst[k]);
            }
            lhs += 8;
            rhs += 8;
            dst += 8;
        }

        /*
         * NOTE(review): a negative count satisfies neither test above and
         * would keep cycling here; callers presumably always pass
         * count >= 0 -- confirm.
         */
    }
}

/* Some extra specializations for the two operand case */
/*
 * out[i] = s * in1[i] + out[i] over `count` npy_short elements, where
 * `s` is the single value read once from dataptr[0].
 *
 * dataptr[1] is read and dataptr[2] is updated in place, both indexed as
 * contiguous npy_short arrays.  `nop` and `strides` are not used.  There
 * is no SIMD path for this type; all the arithmetic is scalar.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short scalar = (*(npy_short *)dataptr[0]);
    npy_short *src = (npy_short *)dataptr[1];
    npy_short *dst = (npy_short *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Whole groups of 8 elements, lowest index first */
    while (count >= 8) {
        count -= 8;
        for (k = 0; k < 8; ++k) {
            dst[k] = (scalar * src[k] + dst[k]);
        }
        src += 8;
        dst += 8;
    }

    /*
     * Whatever is left (at most 7 elements) is processed from the highest
     * index down to index 0.  Nothing happens here when count is zero or
     * negative.
     */
    for (k = count - 1; k >= 0; --k) {
        dst[k] = (scalar * src[k] + dst[k]);
    }
}

/*
 * out[i] = in0[i] * s + out[i] over `count` npy_short elements, where
 * `s` is the single value read once from dataptr[1].
 *
 * dataptr[0] is read and dataptr[2] is updated in place, both indexed as
 * contiguous npy_short arrays.  `nop` and `strides` are not used.  There
 * is no SIMD path for this type; all the arithmetic is scalar.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short scalar = (*(npy_short *)dataptr[1]);
    npy_short *dst = (npy_short *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than 8 elements left.  This is tested before the block
         * loop so that short runs never enter it.  The leftovers are
         * processed from the highest index down to index 0.
         */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                dst[k] = (src[k] * scalar + dst[k]);
            }
            return;
        }

        /* Consume whole groups of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                dst[k] = (src[k] * scalar + dst[k]);
            }
            src += 8;
            dst += 8;
        }

        /*
         * NOTE(review): a negative count satisfies neither test above and
         * would keep cycling here; callers presumably always pass
         * count >= 0 -- confirm.
         */
    }
}

/*
 * Adds sum(in0[i] * in1[i]) over `count` npy_short elements to the single
 * output value at dataptr[2].
 *
 * dataptr[0] and dataptr[1] are indexed as contiguous npy_short arrays.
 * The products are gathered in a local npy_short accumulator and the
 * output is written exactly once, just before returning.  `nop` and
 * `strides` are not used.  There is no SIMD path for this type.
 */
static void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than 8 elements left.  This is tested before the block
         * loop so that short runs never enter it.  The leftovers are
         * added from the highest index down to index 0, then the result
         * is folded into the output.
         */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                total += lhs[k] * rhs[k];
            }
            *(npy_short *)dataptr[2] += (total);
            return;
        }

        /* Consume whole groups of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += lhs[k] * rhs[k];
            }
            lhs += 8;
            rhs += 8;
        }

        /*
         * NOTE(review): a negative count satisfies neither test above and
         * would keep cycling here without ever writing the output;
         * callers presumably always pass count >= 0 -- confirm.
         */
    }
}

/*
 * Adds s * sum(in1[i]) over `count` npy_short elements to the single
 * output value at dataptr[2], where `s` is the value read once from
 * dataptr[0].
 *
 * dataptr[1] is indexed as a contiguous npy_short array.  The elements
 * are summed in a local npy_short accumulator; the multiplication by `s`
 * and the write to the output happen once, just before returning.
 * `nop` and `strides` are not used.  There is no SIMD path for this type.
 */
static void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short scalar = (*(npy_short *)dataptr[0]);
    npy_short *src = (npy_short *)dataptr[1];
    npy_short total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than 8 elements left.  This is tested before the block
         * loop so that short runs never enter it.  The leftovers are
         * added from the highest index down to index 0, then the scaled
         * sum is folded into the output.
         */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                total += src[k];
            }
            *(npy_short *)dataptr[2] += (scalar * total);
            return;
        }

        /* Consume whole groups of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
        }

        /*
         * NOTE(review): a negative count satisfies neither test above and
         * would keep cycling here without ever writing the output;
         * callers presumably always pass count >= 0 -- confirm.
         */
    }
}

/*
 * Adds sum(in0[i]) * s over `count` npy_short elements to the single
 * output value at dataptr[2], where `s` is the value read once from
 * dataptr[1].
 *
 * dataptr[0] is indexed as a contiguous npy_short array.  The elements
 * are summed in a local npy_short accumulator; the multiplication by `s`
 * and the write to the output happen once, just before returning.
 * `nop` and `strides` are not used.  There is no SIMD path for this type.
 */
static void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short scalar = (*(npy_short *)dataptr[1]);
    npy_short total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than 8 elements left.  This is tested before the block
         * loop so that short runs never enter it.  The leftovers are
         * added from the highest index down to index 0, then the scaled
         * sum is folded into the output.
         */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                total += src[k];
            }
            *(npy_short *)dataptr[2] += (total * scalar);
            return;
        }

        /* Consume whole groups of 8 elements, lowest index first */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
        }

        /*
         * NOTE(review): a negative count satisfies neither test above and
         * would keep cycling here without ever writing the output;
         * callers presumably always pass count >= 0 -- confirm.
         */
    }
}

#elif 1000 == 3 && !0

/*
 * out[i] = in0[i] * in1[i] * in2[i] + out[i] over `count` npy_short
 * elements.
 *
 * dataptr[0..2] are read and dataptr[3] is updated in place, all four
 * being indexed as contiguous npy_short arrays.  `nop` and `strides` are
 * not used.
 *
 * NOTE(review): if npy_short is narrower than int, the three-way product
 * is evaluated in int and could exceed its range for large operands --
 * confirm that this is acceptable.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *in_a = (npy_short *)dataptr[0];
    npy_short *in_b = (npy_short *)dataptr[1];
    npy_short *in_c = (npy_short *)dataptr[2];
    npy_short *dst = (npy_short *)dataptr[3];
    int k;

    /* Whole groups of 8 elements, lowest index first */
    while (count >= 8) {
        count -= 8;
        for (k = 0; k < 8; ++k) {
            dst[k] = (in_a[k] * in_b[k] * in_c[k] + dst[k]);
        }
        in_a += 8;
        in_b += 8;
        in_c += 8;
        dst += 8;
    }

    /*
     * Remainder: walk up from index 0, testing the remaining count before
     * each element and stopping as soon as it reaches zero.  At most 8
     * elements are visited.
     */
    for (k = 0; k < 8; ++k) {
        if (count == 0) {
            return;
        }
        --count;
        dst[k] = (in_a[k] * in_b[k] * in_c[k] + dst[k]);
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Contiguous npy_short kernel for a run-time operand count.
 *
 * For each of 'count' elements, the current values of the 'nop' input
 * operands (dataptr[0] .. dataptr[nop-1]) are multiplied together and the
 * product is added into the output operand (dataptr[nop]).  All pointers
 * dataptr[0..nop] are advanced in place by sizeof(npy_short) per element;
 * the strides argument is not used.
 */
static void
short_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_short product = (*(npy_short *)dataptr[0]);
        int op = 1;

        /* Fold the remaining input operands into the running product */
        while (op < nop) {
            product *= (*(npy_short *)dataptr[op]);
            ++op;
        }

        /* 'op' equals 'nop' here whenever nop >= 1, i.e. the output slot */
        *(npy_short *)dataptr[nop] = (product +
                                           (*(npy_short *)dataptr[op]));

        /* Step every input and the output to the next element */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * Reduction kernel for one contiguous npy_short input and a single output
 * element: sums 'count' consecutive values starting at dataptr[0] into a
 * local npy_short accumulator, then adds that accumulator to the value
 * stored at dataptr[1].  'nop' and 'strides' are not read.
 *
 * NOTE: count must be non-negative; a negative count satisfies neither
 * branch below, so the outer loop would never exit.
 */
static void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_short total = 0;
    npy_short *src = (npy_short *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /* Checked ahead of the bulk loop so that small counts finish fast */
        if (count >= 0 && count < 8) {
            /* Leftover elements are accumulated from last to first */
            while (count > 0) {
                --count;
                total += (src[count]);
            }
            *((npy_short *)dataptr[1]) = (total +
                                    (*((npy_short *)dataptr[1])));
            return;
        }

        /* Consume the input in groups of eight, first to last */
        while (count >= 8) {
            int k;

            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += (src[k]);
            }
            src += 8;
        }
    }
}

#endif /* 1000 == 1 */

/*
 * Strided npy_short reduction kernel for a run-time operand count with a
 * single output element.
 *
 * For each of 'count' steps, the current values of the 'nop' inputs are
 * multiplied together and the product is added to a local npy_short
 * accumulator; each input pointer dataptr[i] is then advanced in place by
 * strides[i].  After the loop the accumulator is added to the value at
 * dataptr[nop], which is never advanced.
 */
static void
short_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_short product = (*(npy_short *)dataptr[0]);
        int op;

        for (op = 1; op < nop; ++op) {
            product *= (*(npy_short *)dataptr[op]);
        }
        total += product;

        /* Only the inputs move; the output pointer stays where it is */
        for (op = 0; op < nop; ++op) {
            dataptr[op] += strides[op];
        }
    }

    *((npy_short *)dataptr[nop]) = (total +
                                    (*((npy_short *)dataptr[nop])));

}




#line 108

#line 113
/*
 * Strided npy_int kernel for a single input operand: for each of 'count'
 * elements, adds the value at the input (dataptr[0]) into the output
 * (dataptr[1]), stepping the two by strides[0] and strides[1] bytes
 * respectively.  Works on local copies of the pointers, so the dataptr
 * array itself is left unchanged; 'nop' is not read.
 */
static void
int_sum_of_products_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *src = dataptr[0];
    char *dst = dataptr[1];
    npy_intp src_step = strides[0];
    npy_intp dst_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_int *out = (npy_int *)dst;

        *out = ((*(npy_int *)src) +
                (*out));
        src += src_step;
        dst += dst_step;
    }
}

#if 1 == 1

/*
 * Contiguous npy_int kernel for a single input operand: adds 'count'
 * consecutive input values (starting at dataptr[0]) element-wise into the
 * output array (starting at dataptr[1]).  'nop' and strides are not read.
 *
 * Full groups of eight are processed first to last; the final 0..7
 * elements are processed last to first.
 *
 * NOTE: count must be non-negative; a negative count satisfies neither
 * branch below, so the outer loop would never exit.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int *dst = (npy_int *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Checked ahead of the bulk loop so that small counts finish fast */
        if (count >= 0 && count < 8) {
            while (count > 0) {
                --count;
                dst[count] = ((src[count]) +
                              (dst[count]));
            }
            return;
        }

        /* Bulk part: eight elements per pass */
        while (count >= 8) {
            int k;

            count -= 8;
            for (k = 0; k < 8; ++k) {
                dst[k] = ((src[k]) +
                          (dst[k]));
            }
            src += 8;
            dst += 8;
        }
    }
}

#elif 1 == 2 && !0

/*
 * Contiguous npy_int kernel for two input operands: for each of 'count'
 * elements, multiplies the corresponding values of the two inputs
 * (dataptr[0] and dataptr[1]) and adds the product into the output array
 * (dataptr[2]).  'nop' and strides are not read.
 *
 * Full groups of eight are processed first to last; the final 0..7
 * elements are processed last to first.
 *
 * NOTE: count must be non-negative; a negative count satisfies neither
 * branch below, so the outer loop would never exit.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int *dst = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Checked ahead of the bulk loop so that small counts finish fast */
        if (count >= 0 && count < 8) {
            while (count > 0) {
                --count;
                dst[count] = ((lhs[count]) *
                              (rhs[count]) +
                              (dst[count]));
            }
            return;
        }

        /* Bulk part: eight elements per pass */
        while (count >= 8) {
            int k;

            count -= 8;
            for (k = 0; k < 8; ++k) {
                dst[k] = ((lhs[k]) *
                          (rhs[k]) +
                          (dst[k]));
            }
            lhs += 8;
            rhs += 8;
            dst += 8;
        }
    }
}

/* Some extra specializations for the two operand case */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int value0 = (*(npy_int *)dataptr[0]);
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int *data_out = (npy_int *)dataptr[2];

#if EINSUM_USE_SSE1 && 0
    __m128 a, b, value0_sse;
#elif EINSUM_USE_SSE2 && 0
    __m128d a, b, value0_sse;
#endif

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 383
        case 6+1:
            data_out[6] = (value0 *
                                 (data1[6]) +
                                 (data_out[6]));

#line 383
        case 5+1:
            data_out[5] = (value0 *
                                 (data1[5]) +
                                 (data_out[5]));

#line 383
        case 4+1:
            data_out[4] = (value0 *
                                 (data1[4]) +
                                 (data_out[4]));

#line 383
        case 3+1:
            data_out[3] = (value0 *
                                 (data1[3]) +
                                 (data_out[3]));

#line 383
        case 2+1:
            data_out[2] = (value0 *
                                 (data1[2]) +
                                 (data_out[2]));

#line 383
        case 1+1:
            data_out[1] = (value0 *
                                 (data1[1]) +
                                 (data_out[1]));

#line 383
        case 0+1:
            data_out[0] = (value0 *
                                 (data1[0]) +
                                 (data_out[0]));

        case 0:
            return;
    }

#if EINSUM_USE_SSE1 && 0
    value0_sse = _mm_set_ps1(value0);

    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

#line 404
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+0));
            b = _mm_add_ps(a, _mm_load_ps(data_out+0));
            _mm_store_ps(data_out+0, b);

#line 404
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+4));
            b = _mm_add_ps(a, _mm_load_ps(data_out+4));
            _mm_store_ps(data_out+4, b);

            data1 += 8;
            data_out += 8;
        }

        /* Finish off the loop */
        if (count > 0) {
            goto finish_after_unrolled_loop;
        }
        else {
            return;
        }
    }
#elif EINSUM_USE_SSE2 && 0
    value0_sse = _mm_set1_pd(value0);

    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+0));
            b = _mm_add_pd(a, _mm_load_pd(data_out+0));
            _mm_store_pd(data_out+0, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+2));
            b = _mm_add_pd(a, _mm_load_pd(data_out+2));
            _mm_store_pd(data_out+2, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+4));
            b = _mm_add_pd(a, _mm_load_pd(data_out+4));
            _mm_store_pd(data_out+4, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+6));
            b = _mm_add_pd(a, _mm_load_pd(data_out+6));
            _mm_store_pd(data_out+6, b);

            data1 += 8;
            data_out += 8;
        }

        /* Finish off the loop */
        if (count > 0) {
            goto finish_after_unrolled_loop;
        }
        else {
            return;
        }
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
#line 458
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+0));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+0));
        _mm_storeu_ps(data_out+0, b);

#line 458
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+4));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+4));
        _mm_storeu_ps(data_out+4, b);

#elif EINSUM_USE_SSE2 && 0
#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+0));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+0));
        _mm_storeu_pd(data_out+0, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+2));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+2));
        _mm_storeu_pd(data_out+2, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+4));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+4));
        _mm_storeu_pd(data_out+4, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+6));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+6));
        _mm_storeu_pd(data_out+6, b);

#else
#line 474
        data_out[0] = (value0 *
                             (data1[0]) +
                             (data_out[0]));

#line 474
        data_out[1] = (value0 *
                             (data1[1]) +
                             (data_out[1]));

#line 474
        data_out[2] = (value0 *
                             (data1[2]) +
                             (data_out[2]));

#line 474
        data_out[3] = (value0 *
                             (data1[3]) +
                             (data_out[3]));

#line 474
        data_out[4] = (value0 *
                             (data1[4]) +
                             (data_out[4]));

#line 474
        data_out[5] = (value0 *
                             (data1[5]) +
                             (data_out[5]));

#line 474
        data_out[6] = (value0 *
                             (data1[6]) +
                             (data_out[6]));

#line 474
        data_out[7] = (value0 *
                             (data1[7]) +
                             (data_out[7]));

#endif
        data1 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    if (count > 0) {
        goto finish_after_unrolled_loop;
    }
}

/*
 * Two-operand sum-of-products kernel for npy_int:
 *
 *     out[i] = in[i] * scalar + out[i]      for i in [0, count)
 *
 * dataptr[0] is indexed as a packed npy_int array, dataptr[1] supplies a
 * single npy_int that is read once on entry, and dataptr[2] is the packed
 * npy_int output array.  Neither nop nor strides is consulted.
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int scalar = (*(npy_int *)dataptr[1]);
    npy_int *dst = (npy_int *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than eight elements remain: handle them from the highest
         * index down to zero and leave.  This test comes first so that
         * small counts never touch the block loop below.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                dst[k] = src[k] * scalar + dst[k];
            }
            return;
        }

        /* Consume full groups of eight, lowest index first */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                dst[k] = src[k] * scalar + dst[k];
            }
            src += 8;
            dst += 8;
            count -= 8;
        }

        /*
         * NOTE(review): count is presumably never negative; a negative
         * value would keep cycling here without reaching the exit above.
         */
    }
}

/*
 * Two-operand sum-of-products kernel for npy_int that reduces into a
 * single output element:
 *
 *     *out += sum(a[i] * b[i])      for i in [0, count)
 *
 * dataptr[0] and dataptr[1] are indexed as packed npy_int arrays and
 * dataptr[2] points at the one npy_int that receives the total.  The
 * output is written exactly once, after all products have been summed.
 * Neither nop nor strides is consulted.
 */
static void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than eight elements remain: fold them in from the highest
         * index down to zero, publish the total and leave.  This test
         * comes first so that small counts skip the block loop entirely.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                total += lhs[k] * rhs[k];
            }
            *(npy_int *)dataptr[2] += total;
            return;
        }

        /* Consume full groups of eight, lowest index first */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                total += lhs[k] * rhs[k];
            }
            lhs += 8;
            rhs += 8;
            count -= 8;
        }

        /*
         * NOTE(review): count is presumably never negative; a negative
         * value would keep cycling here without reaching the exit above.
         */
    }
}

/*
 * Two-operand sum-of-products kernel for npy_int with a single-valued
 * first operand and a single output element:
 *
 *     *out += scalar * sum(b[i])      for i in [0, count)
 *
 * dataptr[0] supplies one npy_int that is read once on entry, dataptr[1]
 * is indexed as a packed npy_int array and dataptr[2] points at the one
 * npy_int that receives the result.  The multiplication by the scalar is
 * applied once to the finished sum rather than per element.  Neither nop
 * nor strides is consulted.
 */
static void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int scalar = (*(npy_int *)dataptr[0]);
    npy_int *src = (npy_int *)dataptr[1];
    npy_int total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than eight elements remain: add them from the highest
         * index down to zero, scale the total, publish it and leave.
         * This test comes first so that small counts stay cheap.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                total += src[k];
            }
            *(npy_int *)dataptr[2] += (scalar * total);
            return;
        }

        /* Consume full groups of eight, lowest index first */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
            count -= 8;
        }

        /*
         * NOTE(review): count is presumably never negative; a negative
         * value would keep cycling here without reaching the exit above.
         */
    }
}

/*
 * Two-operand sum-of-products kernel for npy_int with a single-valued
 * second operand and a single output element:
 *
 *     *out += sum(a[i]) * scalar      for i in [0, count)
 *
 * dataptr[0] is indexed as a packed npy_int array, dataptr[1] supplies one
 * npy_int that is read once on entry and dataptr[2] points at the one
 * npy_int that receives the result.  The multiplication by the scalar is
 * applied once to the finished sum rather than per element.  Neither nop
 * nor strides is consulted.
 */
static void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int scalar = (*(npy_int *)dataptr[1]);
    npy_int total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than eight elements remain: add them from the highest
         * index down to zero, scale the total, publish it and leave.
         * This test comes first so that small counts stay cheap.
         */
        if (count >= 0 && count < 8) {
            for (k = count; k-- > 0; ) {
                total += src[k];
            }
            *(npy_int *)dataptr[2] += (total * scalar);
            return;
        }

        /* Consume full groups of eight, lowest index first */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
            count -= 8;
        }

        /*
         * NOTE(review): count is presumably never negative; a negative
         * value would keep cycling here without reaching the exit above.
         */
    }
}

#elif 1 == 3 && !0

/*
 * Three-operand sum-of-products kernel for npy_int:
 *
 *     out[i] = a[i] * b[i] * c[i] + out[i]      for i in [0, count)
 *
 * dataptr[0..2] are indexed as packed npy_int input arrays and dataptr[3]
 * as the packed npy_int output array.  Neither nop nor strides is
 * consulted.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *a = (npy_int *)dataptr[0];
    npy_int *b = (npy_int *)dataptr[1];
    npy_int *c = (npy_int *)dataptr[2];
    npy_int *out = (npy_int *)dataptr[3];
    int k;

    /* Consume full groups of eight, lowest index first */
    while (count >= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = a[k] * b[k] * c[k] + out[k];
        }
        a += 8;
        b += 8;
        c += 8;
        out += 8;
        count -= 8;
    }

    /*
     * Remainder: walk upward through at most eight slots, stopping as
     * soon as the remaining count has been used up.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = a[k] * b[k] * c[k] + out[k];
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic element-by-element sum-of-products kernel for npy_int.
 *
 * For each of the count elements, the current npy_int of every input
 * operand dataptr[0] .. dataptr[nop-1] is multiplied together, the
 * product is added to the current output element and stored in
 * dataptr[nop], and then all nop+1 pointers in dataptr are advanced by
 * sizeof(npy_int).  Note that the entries of dataptr itself are modified.
 * The strides argument is not consulted.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        int op;
        npy_int product = (*(npy_int *)dataptr[0]);

        /* Fold in the remaining input operands */
        for (op = 1; op < nop; ++op) {
            product *= (*(npy_int *)dataptr[op]);
        }

        /*
         * The loop above leaves op equal to nop whenever nop >= 1, so
         * dataptr[op] is the output element being accumulated into.
         */
        *(npy_int *)dataptr[nop] = (product +
                                    (*(npy_int *)dataptr[op]));

        /* Step every operand, inputs and output alike, to the next item */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_int);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

static void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
#if 0
    npy_int accum_re = 0, accum_im = 0;
    npy_int *data0 = (npy_int *)dataptr[0];
#else
    npy_int accum = 0;
    npy_int *data0 = (npy_int *)dataptr[0];
#endif

#if EINSUM_USE_SSE1 && 0
    __m128 a, accum_sse = _mm_setzero_ps();
#elif EINSUM_USE_SSE2 && 0
    __m128d a, accum_sse = _mm_setzero_pd();
#endif


    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 1058
        case 6+1:
#if !0
            accum += (data0[6]);
#else /* complex */
            accum_re += data0[2*6+0];
            accum_im += data0[2*6+1];
#endif

#line 1058
        case 5+1:
#if !0
            accum += (data0[5]);
#else /* complex */
            accum_re += data0[2*5+0];
            accum_im += data0[2*5+1];
#endif

#line 1058
        case 4+1:
#if !0
            accum += (data0[4]);
#else /* complex */
            accum_re += data0[2*4+0];
            accum_im += data0[2*4+1];
#endif

#line 1058
        case 3+1:
#if !0
            accum += (data0[3]);
#else /* complex */
            accum_re += data0[2*3+0];
            accum_im += data0[2*3+1];
#endif

#line 1058
        case 2+1:
#if !0
            accum += (data0[2]);
#else /* complex */
            accum_re += data0[2*2+0];
            accum_im += data0[2*2+1];
#endif

#line 1058
        case 1+1:
#if !0
            accum += (data0[1]);
#else /* complex */
            accum_re += data0[2*1+0];
            accum_im += data0[2*1+1];
#endif

#line 1058
        case 0+1:
#if !0
            accum += (data0[0]);
#else /* complex */
            accum_re += data0[2*0+0];
            accum_im += data0[2*0+1];
#endif

        case 0:
#if 0
            ((npy_int *)dataptr[1])[0] += accum_re;
            ((npy_int *)dataptr[1])[1] += accum_im;
#else
            *((npy_int *)dataptr[1]) = (accum +
                                    (*((npy_int *)dataptr[1])));
#endif
            return;
    }

#if EINSUM_USE_SSE1 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1089
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+0));

#line 1089
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+4));

            data0 += 8;
        }

        /* Add the four SSE values and put in accum */
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
        accum_sse = _mm_add_ps(a, accum_sse);
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
        accum_sse = _mm_add_ps(a, accum_sse);
        _mm_store_ss(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#elif EINSUM_USE_SSE2 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+0));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+2));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+4));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+6));

            data0 += 8;
        }

        /* Add the two SSE2 values and put in accum */
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
        accum_sse = _mm_add_pd(a, accum_sse);
        _mm_store_sd(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1149
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+0));

#line 1149
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+4));

#elif EINSUM_USE_SSE2 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+0));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+2));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+4));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+6));

#else
#line 1171
#  if !0
        accum += (data0[0]);
#  else /* complex */
        accum_re += data0[2*0+0];
        accum_im += data0[2*0+1];
#  endif

#line 1171
#  if !0
        accum += (data0[1]);
#  else /* complex */
        accum_re += data0[2*1+0];
        accum_im += data0[2*1+1];
#  endif

#line 1171
#  if !0
        accum += (data0[2]);
#  else /* complex */
        accum_re += data0[2*2+0];
        accum_im += data0[2*2+1];
#  endif

#line 1171
#  if !0
        accum += (data0[3]);
#  else /* complex */
        accum_re += data0[2*3+0];
        accum_im += data0[2*3+1];
#  endif

#line 1171
#  if !0
        accum += (data0[4]);
#  else /* complex */
        accum_re += data0[2*4+0];
        accum_im += data0[2*4+1];
#  endif

#line 1171
#  if !0
        accum += (data0[5]);
#  else /* complex */
        accum_re += data0[2*5+0];
        accum_im += data0[2*5+1];
#  endif

#line 1171
#  if !0
        accum += (data0[6]);
#  else /* complex */
        accum_re += data0[2*6+0];
        accum_im += data0[2*6+1];
#  endif

#line 1171
#  if !0
        accum += (data0[7]);
#  else /* complex */
        accum_re += data0[2*7+0];
        accum_im += data0[2*7+1];
#  endif

#endif

#if !0
        data0 += 8;
#else
        data0 += 8*2;
#endif
    }

#if EINSUM_USE_SSE1 && 0
    /* Add the four SSE values and put in accum */
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
    accum_sse = _mm_add_ps(a, accum_sse);
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
    accum_sse = _mm_add_ps(a, accum_sse);
    _mm_store_ss(&accum, accum_sse);
#elif EINSUM_USE_SSE2 && 0
    /* Add the two SSE2 values and put in accum */
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
    accum_sse = _mm_add_pd(a, accum_sse);
    _mm_store_sd(&accum, accum_sse);
#endif

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#endif /* 1 == 1 */

/*
 * Single-operand reduction into one output element: adds up `count`
 * npy_int values read from dataptr[0], advancing strides[0] bytes after
 * each read, then adds that sum to the npy_int stored at dataptr[1].
 *
 * `nop` is not consulted by this specialization, and the entries of
 * `dataptr` themselves are left unmodified.
 */
static void
int_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    const npy_intp step = strides[0];
    char *src = dataptr[0];
    npy_int *dst = (npy_int *)dataptr[1];
    npy_int total = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    /* Accumulate in a local so the output element is written only once. */
    for (; count != 0; --count) {
        total += *(npy_int *)src;
        src += step;
    }

    *dst += total;
}


#line 113
/*
 * Two-operand multiply-accumulate with arbitrary strides.  For each of the
 * `count` iterations, the npy_int at operand 0 is multiplied by the npy_int
 * at operand 1 and the product is added to the npy_int at the output
 * (dataptr[2]).  Each pointer then advances by its own byte stride taken
 * from strides[0], strides[1] and strides[2].
 *
 * `nop` is not consulted by this specialization, and the entries of
 * `dataptr` themselves are left unmodified.
 */
static void
int_sum_of_products_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    char *out = dataptr[2];
    const npy_intp lhs_step = strides[0];
    const npy_intp rhs_step = strides[1];
    const npy_intp out_step = strides[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        *(npy_int *)out += (*(npy_int *)lhs) * (*(npy_int *)rhs);

        lhs += lhs_step;
        rhs += rhs_step;
        out += out_step;
    }
}

#if 2 == 1

/*
 * Single-operand contiguous kernel: adds `count` consecutive npy_int values
 * from dataptr[0] element-wise into the npy_int values at dataptr[1].
 * Both buffers are indexed as plain npy_int arrays; `strides` is ignored.
 *
 * NOTE: the enclosing `#if 2 == 1` is false in this instantiation, so this
 * definition is not compiled here.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int *out = (npy_int *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Fewer than eight elements left: handle them here and return.
         * Checked before the unrolled loop so that small counts are fast.
         * Every case deliberately falls through to the next one.
         */
        switch (count) {
            case 7: out[6] += src[6];
            case 6: out[5] += src[5];
            case 5: out[4] += src[4];
            case 4: out[3] += src[3];
            case 3: out[2] += src[2];
            case 2: out[1] += src[1];
            case 1: out[0] += src[0];
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            count -= 8;

            out[0] += src[0];
            out[1] += src[1];
            out[2] += src[2];
            out[3] += src[3];
            out[4] += src[4];
            out[5] += src[5];
            out[6] += src[6];
            out[7] += src[7];

            src += 8;
            out += 8;
        }
        /* Loop back to the switch to finish off the remainder. */
    }
}

#elif 2 == 2 && !0

/*
 * Two-operand contiguous kernel: for `count` consecutive elements computes
 * out[i] += a[i] * b[i], where a = dataptr[0], b = dataptr[1] and
 * out = dataptr[2] are all indexed as plain npy_int arrays.  `strides` is
 * ignored.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int *out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Fewer than eight elements left: handle them here and return.
         * Checked before the unrolled loop so that small counts are fast.
         * Every case deliberately falls through to the next one.
         */
        switch (count) {
            case 7: out[6] += lhs[6] * rhs[6];
            case 6: out[5] += lhs[5] * rhs[5];
            case 5: out[4] += lhs[4] * rhs[4];
            case 4: out[3] += lhs[3] * rhs[3];
            case 3: out[2] += lhs[2] * rhs[2];
            case 2: out[1] += lhs[1] * rhs[1];
            case 1: out[0] += lhs[0] * rhs[0];
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            count -= 8;

            out[0] += lhs[0] * rhs[0];
            out[1] += lhs[1] * rhs[1];
            out[2] += lhs[2] * rhs[2];
            out[3] += lhs[3] * rhs[3];
            out[4] += lhs[4] * rhs[4];
            out[5] += lhs[5] * rhs[5];
            out[6] += lhs[6] * rhs[6];
            out[7] += lhs[7] * rhs[7];

            lhs += 8;
            rhs += 8;
            out += 8;
        }
        /* Loop back to the switch to finish off the remainder. */
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where operand 0 is a single value: the npy_int at
 * dataptr[0] is read once up front, and for `count` consecutive elements
 * out[i] += scalar * b[i] is computed, with b = dataptr[1] and
 * out = dataptr[2] indexed as plain npy_int arrays.  `strides` is ignored.
 */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_int scalar = *(npy_int *)dataptr[0];
    npy_int *vec = (npy_int *)dataptr[1];
    npy_int *out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    do {
        /*
         * Fewer than eight elements left: handle them here and return.
         * Checked before the unrolled loop so that small counts are fast.
         * Every case deliberately falls through to the next one.
         */
        switch (count) {
            case 7: out[6] += scalar * vec[6];
            case 6: out[5] += scalar * vec[5];
            case 5: out[4] += scalar * vec[4];
            case 4: out[3] += scalar * vec[3];
            case 3: out[2] += scalar * vec[2];
            case 2: out[1] += scalar * vec[1];
            case 1: out[0] += scalar * vec[0];
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            count -= 8;

            out[0] += scalar * vec[0];
            out[1] += scalar * vec[1];
            out[2] += scalar * vec[2];
            out[3] += scalar * vec[3];
            out[4] += scalar * vec[4];
            out[5] += scalar * vec[5];
            out[6] += scalar * vec[6];
            out[7] += scalar * vec[7];

            vec += 8;
            out += 8;
        }
        /* Only revisit the switch when a remainder is actually left. */
    } while (count > 0);
}

/*
 * Two-operand kernel where operand 1 is a single value: the npy_int at
 * dataptr[1] is read once up front, and for `count` consecutive elements
 * out[i] += a[i] * scalar is computed, with a = dataptr[0] and
 * out = dataptr[2] indexed as plain npy_int arrays.  `strides` is ignored.
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *vec = (npy_int *)dataptr[0];
    const npy_int scalar = *(npy_int *)dataptr[1];
    npy_int *out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Fewer than eight elements left: handle them here and return.
         * Checked before the unrolled loop so that small counts are fast.
         * Every case deliberately falls through to the next one.
         */
        switch (count) {
            case 7: out[6] += vec[6] * scalar;
            case 6: out[5] += vec[5] * scalar;
            case 5: out[4] += vec[4] * scalar;
            case 4: out[3] += vec[3] * scalar;
            case 3: out[2] += vec[2] * scalar;
            case 2: out[1] += vec[1] * scalar;
            case 1: out[0] += vec[0] * scalar;
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            count -= 8;

            out[0] += vec[0] * scalar;
            out[1] += vec[1] * scalar;
            out[2] += vec[2] * scalar;
            out[3] += vec[3] * scalar;
            out[4] += vec[4] * scalar;
            out[5] += vec[5] * scalar;
            out[6] += vec[6] * scalar;
            out[7] += vec[7] * scalar;

            vec += 8;
            out += 8;
        }
        /* Loop back to the switch to finish off the remainder. */
    }
}

/*
 * Sum-of-products kernel for two contiguous npy_int operands whose result
 * is reduced into a single output element: the sum over i < count of
 * data0[i] * data1[i] is added to the npy_int stored at dataptr[2].
 *
 * Neither nop nor strides is read by this kernel.
 */
static void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int total = 0;
    int k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The tail dispatch is tried before the block loop so that short
         * inputs never touch the loop. Each case falls through to the one
         * below it; case 0 stores the result and is the only exit.
         */
        switch (count) {
            case 7:
                total += lhs[6] * rhs[6];
                /* fall through */
            case 6:
                total += lhs[5] * rhs[5];
                /* fall through */
            case 5:
                total += lhs[4] * rhs[4];
                /* fall through */
            case 4:
                total += lhs[3] * rhs[3];
                /* fall through */
            case 3:
                total += lhs[2] * rhs[2];
                /* fall through */
            case 2:
                total += lhs[1] * rhs[1];
                /* fall through */
            case 1:
                total += lhs[0] * rhs[0];
                /* fall through */
            case 0:
                *(npy_int *)dataptr[2] += total;
                return;
        }

        /* No case matched: consume the input eight elements at a time. */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += lhs[k] * rhs[k];
            }
            lhs += 8;
            rhs += 8;
        }
        /* Loop back so the switch handles the remaining elements. */
    }
}

/*
 * Sum-of-products kernel for a scalar first operand (read once from
 * dataptr[0] on entry) and a contiguous npy_int second operand, reduced
 * into a single output element: value0 * (sum over i < count of data1[i])
 * is added to the npy_int stored at dataptr[2].
 *
 * Neither nop nor strides is read by this kernel.
 */
static void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int scalar = (*(npy_int *)dataptr[0]);
    npy_int *vec = (npy_int *)dataptr[1];
    npy_int total = 0;
    int k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Short inputs are handled here without entering the block loop.
         * Each case falls through to the one below it; case 0 applies the
         * scalar factor, stores the result and is the only exit.
         */
        switch (count) {
            case 7:
                total += vec[6];
                /* fall through */
            case 6:
                total += vec[5];
                /* fall through */
            case 5:
                total += vec[4];
                /* fall through */
            case 4:
                total += vec[3];
                /* fall through */
            case 3:
                total += vec[2];
                /* fall through */
            case 2:
                total += vec[1];
                /* fall through */
            case 1:
                total += vec[0];
                /* fall through */
            case 0:
                *(npy_int *)dataptr[2] += (scalar * total);
                return;
        }

        /* No case matched: consume the input eight elements at a time. */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += vec[k];
            }
            vec += 8;
        }
        /* Loop back so the switch handles the remaining elements. */
    }
}

/*
 * Sum-of-products kernel for a contiguous npy_int first operand and a
 * scalar second operand (read once from dataptr[1] on entry), reduced
 * into a single output element: (sum over i < count of data0[i]) * value1
 * is added to the npy_int stored at dataptr[2].
 *
 * Neither nop nor strides is read by this kernel.
 */
static void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *vec = (npy_int *)dataptr[0];
    npy_int scalar = (*(npy_int *)dataptr[1]);
    npy_int total = 0;
    int k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Short inputs are handled here without entering the block loop.
         * Each case falls through to the one below it; case 0 applies the
         * scalar factor, stores the result and is the only exit.
         */
        switch (count) {
            case 7:
                total += vec[6];
                /* fall through */
            case 6:
                total += vec[5];
                /* fall through */
            case 5:
                total += vec[4];
                /* fall through */
            case 4:
                total += vec[3];
                /* fall through */
            case 3:
                total += vec[2];
                /* fall through */
            case 2:
                total += vec[1];
                /* fall through */
            case 1:
                total += vec[0];
                /* fall through */
            case 0:
                *(npy_int *)dataptr[2] += (total * scalar);
                return;
        }

        /* No case matched: consume the input eight elements at a time. */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += vec[k];
            }
            vec += 8;
        }
        /* Loop back so the switch handles the remaining elements. */
    }
}

#elif 2 == 3 && !0

/*
 * Sum-of-products kernel for three contiguous npy_int operands and a
 * contiguous npy_int output: for each i < count,
 *     out[i] = data0[i] * data1[i] * data2[i] + out[i]
 * where out is the array at dataptr[3].
 *
 * Neither nop nor strides is read by this kernel.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *a = (npy_int *)dataptr[0];
    npy_int *b = (npy_int *)dataptr[1];
    npy_int *c = (npy_int *)dataptr[2];
    npy_int *out = (npy_int *)dataptr[3];
    int k;

    /* Full blocks of eight elements. */
    while (count >= 8) {
        count -= 8;

        for (k = 0; k < 8; ++k) {
            out[k] = a[k] * b[k] * c[k] + out[k];
        }

        a += 8;
        b += 8;
        c += 8;
        out += 8;
    }

    /*
     * Remainder: at most eight guarded steps, stopping as soon as the
     * element counter reaches zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = a[k] * b[k] * c[k] + out[k];
    }
}

#else /* 2 > 3 || @complex */

/*
 * Generic contiguous sum-of-products kernel over npy_int. For each of the
 * count elements it multiplies together the current values of operands
 * 0 .. nop-1, adds that product to the output operand, and then advances
 * every pointer in dataptr[0 .. nop] by sizeof(npy_int).
 *
 * Note that dataptr itself is modified in place; strides is not read.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    while (count--) {
        npy_int product = (*(npy_int *)dataptr[0]);
        int op = 1;

        while (op < nop) {
            product *= (*(npy_int *)dataptr[op]);
            ++op;
        }

        /*
         * The previous output value is read through the index left over
         * from the product loop, which equals nop whenever nop >= 1.
         */
        *(npy_int *)dataptr[nop] = (product + (*(npy_int *)dataptr[op]));

        /* Step all inputs and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_int);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * Sum kernel for a single contiguous npy_int operand reduced into a
 * single output element: the sum over i < count of data0[i] is added to
 * the npy_int stored at dataptr[1].
 *
 * Neither nop nor strides is read by this kernel.
 */
static void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_int total = 0;
    npy_int *vec = (npy_int *)dataptr[0];
    int k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Short inputs are handled here without entering the block loop.
         * Each case falls through to the one below it; case 0 stores the
         * result and is the only exit.
         */
        switch (count) {
            case 7:
                total += vec[6];
                /* fall through */
            case 6:
                total += vec[5];
                /* fall through */
            case 5:
                total += vec[4];
                /* fall through */
            case 4:
                total += vec[3];
                /* fall through */
            case 3:
                total += vec[2];
                /* fall through */
            case 2:
                total += vec[1];
                /* fall through */
            case 1:
                total += vec[0];
                /* fall through */
            case 0:
                *((npy_int *)dataptr[1]) = (total +
                                        (*((npy_int *)dataptr[1])));
                return;
        }

        /* No case matched: consume the input eight elements at a time. */
        while (count >= 8) {
            count -= 8;
            for (k = 0; k < 8; ++k) {
                total += vec[k];
            }
            vec += 8;
        }
        /* Loop back so the switch handles the remaining elements. */
    }
}

#endif /* 2 == 1 */

/*
 * Sum-of-products kernel for two strided npy_int operands reduced into a
 * single output element. The operands are walked through byte pointers
 * that advance by strides[0] and strides[1] per element; the accumulated
 * sum of products is then added to the npy_int stored at dataptr[2].
 *
 * nop is not read by this kernel, and dataptr is left unmodified.
 */
static void
int_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_int total = 0;

    /* Local cursors and per-element byte steps for both inputs. */
    char *lhs = dataptr[0];
    npy_intp lhs_step = strides[0];
    char *rhs = dataptr[1];
    npy_intp rhs_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_int *)lhs) *
                 (*(npy_int *)rhs);
        lhs += lhs_step;
        rhs += rhs_step;
    }

    /* Fold the accumulated value into the single output element. */
    *((npy_int *)dataptr[2]) = (total +
                                    (*((npy_int *)dataptr[2])));
}


#line 113
/*
 * Strided sum-of-products kernel for exactly three npy_int inputs.
 *
 * For each of `count` steps it performs
 *     out = in0 * in1 * in2 + out
 * where dataptr[0..2] are the inputs, dataptr[3] is the output and
 * strides[0..3] are the matching per-step byte offsets.
 *
 * `nop` is not consulted.  Only local cursors are advanced; the caller's
 * dataptr[] and strides[] arrays are left untouched.
 */
static void
int_sum_of_products_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *in_a = dataptr[0];
    char *in_b = dataptr[1];
    char *in_c = dataptr[2];
    char *out = dataptr[3];
    const npy_intp step_a = strides[0];
    const npy_intp step_b = strides[1];
    const npy_intp step_c = strides[2];
    const npy_intp step_out = strides[3];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_three (%d)\n", (int)count);

    while (count--) {
        npy_int *dst = (npy_int *)out;

        /* Accumulate the triple product into the current output element */
        *dst += (*(npy_int *)in_a) *
                (*(npy_int *)in_b) *
                (*(npy_int *)in_c);

        /* Step every cursor by its own byte stride */
        in_a += step_a;
        in_b += step_b;
        in_c += step_c;
        out += step_out;
    }
}

#if 3 == 1

/*
 * Contiguous single-input kernel: out[i] = in[i] + out[i] for `count`
 * npy_int elements, with dataptr[0] the input and dataptr[1] the output.
 *
 * Counts of 0..7 are handled directly by the switch (highest index first,
 * falling through to index 0); larger counts are consumed eight elements
 * at a time and the remainder is then fed back through the switch.
 * `nop` and `strides` are not used.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int *dst = (npy_int *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Checked before the unrolled loop so small counts finish fast */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fall through */
            case 6:
                dst[5] += src[5];
                /* fall through */
            case 5:
                dst[4] += src[4];
                /* fall through */
            case 4:
                dst[3] += src[3];
                /* fall through */
            case 3:
                dst[2] += src[2];
                /* fall through */
            case 2:
                dst[1] += src[1];
                /* fall through */
            case 1:
                dst[0] += src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
            count -= 8;
        }
        /* Loop back so the switch finishes off what is left */
    }
}

#elif 3 == 2 && !0

/*
 * Contiguous two-input kernel: out[i] = a[i] * b[i] + out[i] for `count`
 * npy_int elements, with dataptr[0] and dataptr[1] the inputs and
 * dataptr[2] the output.
 *
 * Scalar code only.  Counts of 0..7 are resolved by the fall-through
 * switch (highest index first); larger counts go through the 8-way
 * unrolled loop and the remainder returns to the switch.
 * `nop` and `strides` are not used.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int *dst = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Checked before the unrolled loop so small counts finish fast */
        switch (count) {
            case 7:
                dst[6] += lhs[6] * rhs[6];
                /* fall through */
            case 6:
                dst[5] += lhs[5] * rhs[5];
                /* fall through */
            case 5:
                dst[4] += lhs[4] * rhs[4];
                /* fall through */
            case 4:
                dst[3] += lhs[3] * rhs[3];
                /* fall through */
            case 3:
                dst[2] += lhs[2] * rhs[2];
                /* fall through */
            case 2:
                dst[1] += lhs[1] * rhs[1];
                /* fall through */
            case 1:
                dst[0] += lhs[0] * rhs[0];
                /* fall through */
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            dst[0] += lhs[0] * rhs[0];
            dst[1] += lhs[1] * rhs[1];
            dst[2] += lhs[2] * rhs[2];
            dst[3] += lhs[3] * rhs[3];
            dst[4] += lhs[4] * rhs[4];
            dst[5] += lhs[5] * rhs[5];
            dst[6] += lhs[6] * rhs[6];
            dst[7] += lhs[7] * rhs[7];

            lhs += 8;
            rhs += 8;
            dst += 8;
            count -= 8;
        }
        /* Loop back so the switch finishes off what is left */
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-input kernel where the first operand is a single value:
 * out[i] = s * b[i] + out[i] for `count` npy_int elements.
 *
 * `s` is read once, on entry, from dataptr[0]; dataptr[1] is the
 * contiguous input and dataptr[2] the contiguous output.  Scalar code
 * only.  `nop` and `strides` are not used.
 *
 * The switch handles counts 0..7 (highest index first, falling through);
 * the unrolled loop handles groups of eight.  The switch is revisited
 * only while elements remain.
 */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_int scalar = (*(npy_int *)dataptr[0]);
    npy_int *src = (npy_int *)dataptr[1];
    npy_int *dst = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    do {
        /* Checked before the unrolled loop so small counts finish fast */
        switch (count) {
            case 7:
                dst[6] += scalar * src[6];
                /* fall through */
            case 6:
                dst[5] += scalar * src[5];
                /* fall through */
            case 5:
                dst[4] += scalar * src[4];
                /* fall through */
            case 4:
                dst[3] += scalar * src[3];
                /* fall through */
            case 3:
                dst[2] += scalar * src[2];
                /* fall through */
            case 2:
                dst[1] += scalar * src[1];
                /* fall through */
            case 1:
                dst[0] += scalar * src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            dst[0] += scalar * src[0];
            dst[1] += scalar * src[1];
            dst[2] += scalar * src[2];
            dst[3] += scalar * src[3];
            dst[4] += scalar * src[4];
            dst[5] += scalar * src[5];
            dst[6] += scalar * src[6];
            dst[7] += scalar * src[7];

            src += 8;
            dst += 8;
            count -= 8;
        }
    } while (count > 0);    /* finish off any remainder via the switch */
}

/*
 * Two-input kernel where the second operand is a single value:
 * out[i] = a[i] * s + out[i] for `count` npy_int elements.
 *
 * dataptr[0] is the contiguous input, `s` is read once on entry from
 * dataptr[1], and dataptr[2] is the contiguous output.  Scalar code only.
 * `nop` and `strides` are not used.
 *
 * Counts of 0..7 are resolved by the fall-through switch (highest index
 * first); larger counts go through the 8-way unrolled loop and the
 * remainder returns to the switch.
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    const npy_int scalar = (*(npy_int *)dataptr[1]);
    npy_int *dst = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /* Checked before the unrolled loop so small counts finish fast */
        switch (count) {
            case 7:
                dst[6] += src[6] * scalar;
                /* fall through */
            case 6:
                dst[5] += src[5] * scalar;
                /* fall through */
            case 5:
                dst[4] += src[4] * scalar;
                /* fall through */
            case 4:
                dst[3] += src[3] * scalar;
                /* fall through */
            case 3:
                dst[2] += src[2] * scalar;
                /* fall through */
            case 2:
                dst[1] += src[1] * scalar;
                /* fall through */
            case 1:
                dst[0] += src[0] * scalar;
                /* fall through */
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            dst[0] += src[0] * scalar;
            dst[1] += src[1] * scalar;
            dst[2] += src[2] * scalar;
            dst[3] += src[3] * scalar;
            dst[4] += src[4] * scalar;
            dst[5] += src[5] * scalar;
            dst[6] += src[6] * scalar;
            dst[7] += src[7] * scalar;

            src += 8;
            dst += 8;
            count -= 8;
        }
        /* Loop back so the switch finishes off what is left */
    }
}

/*
 * Two contiguous inputs reduced into a single output element:
 * *out += sum over i of a[i] * b[i], for `count` npy_int elements.
 *
 * dataptr[0] and dataptr[1] are the inputs; the running total is kept in
 * a local and added to the npy_int at dataptr[2] exactly once, when the
 * switch reaches its `case 0` exit.  Scalar code only.
 * `nop` and `strides` are not used.
 *
 * Counts of 0..7 are resolved by the fall-through switch (highest index
 * first); larger counts go through the 8-way unrolled loop and the
 * remainder returns to the switch.
 */
static void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int total = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /* Checked before the unrolled loop so small counts finish fast */
        switch (count) {
            case 7:
                total += lhs[6] * rhs[6];
                /* fall through */
            case 6:
                total += lhs[5] * rhs[5];
                /* fall through */
            case 5:
                total += lhs[4] * rhs[4];
                /* fall through */
            case 4:
                total += lhs[3] * rhs[3];
                /* fall through */
            case 3:
                total += lhs[2] * rhs[2];
                /* fall through */
            case 2:
                total += lhs[1] * rhs[1];
                /* fall through */
            case 1:
                total += lhs[0] * rhs[0];
                /* fall through */
            case 0:
                /* Fold the local total into the output element */
                *(npy_int *)dataptr[2] += total;
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            total += lhs[0] * rhs[0];
            total += lhs[1] * rhs[1];
            total += lhs[2] * rhs[2];
            total += lhs[3] * rhs[3];
            total += lhs[4] * rhs[4];
            total += lhs[5] * rhs[5];
            total += lhs[6] * rhs[6];
            total += lhs[7] * rhs[7];

            lhs += 8;
            rhs += 8;
            count -= 8;
        }
        /* Loop back so the switch finishes off what is left */
    }
}

/*
 * Adds  value * (elem[0] + elem[1] + ... + elem[count-1])  to the npy_int
 * stored at dataptr[2].  'value' is the single npy_int read from dataptr[0]
 * and 'elem' is dataptr[1] indexed as consecutive npy_int values.
 *
 * The 'nop' argument is not read.  'count' must be non-negative: the only
 * exit is the tail branch, which is taken once 0..7 elements remain.
 */
static void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int scalar = (*(npy_int *)dataptr[0]);
    npy_int *src = (npy_int *)dataptr[1];
    npy_int total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /* The tail is tested first so that short runs skip the chunk loop */
        if (count >= 0 && count < 8) {
            /* Remaining elements are added from the highest index down */
            for (k = count; k > 0; --k) {
                total += src[k - 1];
            }
            *(npy_int *)dataptr[2] += (scalar * total);
            return;
        }

        /* Consume the input eight elements at a time */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
            count -= 8;
        }
    }
}

/*
 * Adds  (elem[0] + elem[1] + ... + elem[count-1]) * value  to the npy_int
 * stored at dataptr[2].  'elem' is dataptr[0] indexed as consecutive
 * npy_int values and 'value' is the single npy_int read from dataptr[1].
 *
 * The 'nop' argument is not read.  'count' must be non-negative: the only
 * exit is the tail branch, which is taken once 0..7 elements remain.
 */
static void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int scalar = (*(npy_int *)dataptr[1]);
    npy_int total = 0;
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /* The tail is tested first so that short runs skip the chunk loop */
        if (count >= 0 && count < 8) {
            /* Remaining elements are added from the highest index down */
            for (k = count; k > 0; --k) {
                total += src[k - 1];
            }
            *(npy_int *)dataptr[2] += (total * scalar);
            return;
        }

        /* Consume the input eight elements at a time */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
            count -= 8;
        }
    }
}

#elif 3 == 3 && !0

/*
 * Element-wise three-operand kernel.  For each index i in [0, count) it
 * performs
 *     out[i] = op0[i] * op1[i] * op2[i] + out[i]
 * with dataptr[0..2] as the operands and dataptr[3] as the output, all
 * indexed as consecutive npy_int values.  The 'nop' argument is not read.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *mid = (npy_int *)dataptr[1];
    npy_int *rhs = (npy_int *)dataptr[2];
    npy_int *out = (npy_int *)dataptr[3];
    int k;

    /* Whole chunks of eight elements */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = lhs[k] * mid[k] * rhs[k] + out[k];
        }
        lhs += 8;
        mid += 8;
        rhs += 8;
        out += 8;
    }

    /* Leftover elements: stop as soon as the remaining count reaches zero */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = lhs[k] * mid[k] * rhs[k] + out[k];
    }
}

#else /* 3 > 3 || @complex */

/*
 * Generic fallback variant.  For each of 'count' elements it multiplies
 * together the current npy_int of dataptr[0] .. dataptr[nop-1], adds that
 * product into the current npy_int of dataptr[nop], and then advances each
 * of dataptr[0] .. dataptr[nop] by sizeof(npy_int).  The entries of
 * dataptr[] are therefore modified in place.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        int op = 1;
        npy_int product = (*(npy_int *)dataptr[0]);

        while (op < nop) {
            product *= (*(npy_int *)dataptr[op]);
            ++op;
        }

        /* op stopped at nop (for nop >= 1), i.e. at the output operand */
        *(npy_int *)dataptr[nop] = (product +
                                           (*(npy_int *)dataptr[op]));

        /* Step the inputs and the output alike to the next element */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_int);
        }
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * Adds  elem[0] + elem[1] + ... + elem[count-1]  to the npy_int stored at
 * dataptr[1], where 'elem' is dataptr[0] indexed as consecutive npy_int
 * values.
 *
 * Neither 'nop' nor 'strides' is read.  'count' must be non-negative: the
 * only exit is the tail branch, which is taken once 0..7 elements remain.
 */
static void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_int total = 0;
    npy_int *src = (npy_int *)dataptr[0];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /* The tail is tested first so that short runs skip the chunk loop */
        if (count >= 0 && count < 8) {
            /* Remaining elements are added from the highest index down */
            for (k = count; k > 0; --k) {
                total += src[k - 1];
            }
            *((npy_int *)dataptr[1]) = (total +
                                    (*((npy_int *)dataptr[1])));
            return;
        }

        /* Consume the input eight elements at a time */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                total += src[k];
            }
            src += 8;
            count -= 8;
        }
    }
}

#endif /* 3 == 1 */

/*
 * Three-operand reduction into a single output value.  Walks 'count'
 * elements, moving operand j forward by strides[j] bytes after each one,
 * sums the products op0 * op1 * op2, and finally adds that sum to the
 * npy_int stored at dataptr[3].
 *
 * The 'nop' argument is not read, and the entries of dataptr[] are left
 * untouched (local copies of the three input pointers are advanced).
 */
static void
int_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_int total = 0;
    char *p0 = dataptr[0], *p1 = dataptr[1], *p2 = dataptr[2];
    npy_intp step0 = strides[0], step1 = strides[1], step2 = strides[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_int *)p0) * (*(npy_int *)p1) * (*(npy_int *)p2);
        p0 += step0;
        p1 += step1;
        p2 += step2;
    }

    /* Fold the partial sum into the existing output value */
    *((npy_int *)dataptr[3]) = (total +
                                    (*((npy_int *)dataptr[3])));
}


#line 113
/*
 * Generic kernel for an arbitrary operand count.  For each of 'count'
 * elements it multiplies together the current npy_int of dataptr[0] ..
 * dataptr[nop-1], adds that product into the current npy_int of
 * dataptr[nop], and then advances dataptr[i] by strides[i] bytes for every
 * i in 0..nop.  The entries of dataptr[] are therefore modified in place.
 */
static void
int_sum_of_products_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_any (%d)\n", (int)count);

    for (; count != 0; --count) {
        int op = 1;
        npy_int product = (*(npy_int *)dataptr[0]);

        while (op < nop) {
            product *= (*(npy_int *)dataptr[op]);
            ++op;
        }

        /* op stopped at nop (for nop >= 1), i.e. at the output operand */
        *(npy_int *)dataptr[nop] = (product +
                                           (*(npy_int *)dataptr[op]));

        /* Step the inputs and the output alike by their own strides */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += strides[op];
        }
    }
}

#if 1000 == 1

static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data_out = (npy_int *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 228
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_int *)data_out + 2*6)[0] =
                                    ((npy_int *)data0 + 2*6)[0] +
                                    ((npy_int *)data_out + 2*6)[0];
            ((npy_int *)data_out + 2*6)[1] =
                                    ((npy_int *)data0 + 2*6)[1] +
                                    ((npy_int *)data_out + 2*6)[1];
#endif

#line 228
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_int *)data_out + 2*5)[0] =
                                    ((npy_int *)data0 + 2*5)[0] +
                                    ((npy_int *)data_out + 2*5)[0];
            ((npy_int *)data_out + 2*5)[1] =
                                    ((npy_int *)data0 + 2*5)[1] +
                                    ((npy_int *)data_out + 2*5)[1];
#endif

#line 228
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_int *)data_out + 2*4)[0] =
                                    ((npy_int *)data0 + 2*4)[0] +
                                    ((npy_int *)data_out + 2*4)[0];
            ((npy_int *)data_out + 2*4)[1] =
                                    ((npy_int *)data0 + 2*4)[1] +
                                    ((npy_int *)data_out + 2*4)[1];
#endif

#line 228
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_int *)data_out + 2*3)[0] =
                                    ((npy_int *)data0 + 2*3)[0] +
                                    ((npy_int *)data_out + 2*3)[0];
            ((npy_int *)data_out + 2*3)[1] =
                                    ((npy_int *)data0 + 2*3)[1] +
                                    ((npy_int *)data_out + 2*3)[1];
#endif

#line 228
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_int *)data_out + 2*2)[0] =
                                    ((npy_int *)data0 + 2*2)[0] +
                                    ((npy_int *)data_out + 2*2)[0];
            ((npy_int *)data_out + 2*2)[1] =
                                    ((npy_int *)data0 + 2*2)[1] +
                                    ((npy_int *)data_out + 2*2)[1];
#endif

#line 228
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_int *)data_out + 2*1)[0] =
                                    ((npy_int *)data0 + 2*1)[0] +
                                    ((npy_int *)data_out + 2*1)[0];
            ((npy_int *)data_out + 2*1)[1] =
                                    ((npy_int *)data0 + 2*1)[1] +
                                    ((npy_int *)data_out + 2*1)[1];
#endif

#line 228
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_int *)data_out + 2*0)[0] =
                                    ((npy_int *)data0 + 2*0)[0] +
                                    ((npy_int *)data_out + 2*0)[0];
            ((npy_int *)data_out + 2*0)[1] =
                                    ((npy_int *)data0 + 2*0)[1] +
                                    ((npy_int *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 252
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_int *)data_out + 2*0)[0] =
                                ((npy_int *)data0 + 2*0)[0] +
                                ((npy_int *)data_out + 2*0)[0];
        ((npy_int *)data_out + 2*0)[1] =
                                ((npy_int *)data0 + 2*0)[1] +
                                ((npy_int *)data_out + 2*0)[1];
#endif

#line 252
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_int *)data_out + 2*1)[0] =
                                ((npy_int *)data0 + 2*1)[0] +
                                ((npy_int *)data_out + 2*1)[0];
        ((npy_int *)data_out + 2*1)[1] =
                                ((npy_int *)data0 + 2*1)[1] +
                                ((npy_int *)data_out + 2*1)[1];
#endif

#line 252
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_int *)data_out + 2*2)[0] =
                                ((npy_int *)data0 + 2*2)[0] +
                                ((npy_int *)data_out + 2*2)[0];
        ((npy_int *)data_out + 2*2)[1] =
                                ((npy_int *)data0 + 2*2)[1] +
                                ((npy_int *)data_out + 2*2)[1];
#endif

#line 252
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_int *)data_out + 2*3)[0] =
                                ((npy_int *)data0 + 2*3)[0] +
                                ((npy_int *)data_out + 2*3)[0];
        ((npy_int *)data_out + 2*3)[1] =
                                ((npy_int *)data0 + 2*3)[1] +
                                ((npy_int *)data_out + 2*3)[1];
#endif

#line 252
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_int *)data_out + 2*4)[0] =
                                ((npy_int *)data0 + 2*4)[0] +
                                ((npy_int *)data_out + 2*4)[0];
        ((npy_int *)data_out + 2*4)[1] =
                                ((npy_int *)data0 + 2*4)[1] +
                                ((npy_int *)data_out + 2*4)[1];
#endif

#line 252
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_int *)data_out + 2*5)[0] =
                                ((npy_int *)data0 + 2*5)[0] +
                                ((npy_int *)data_out + 2*5)[0];
        ((npy_int *)data_out + 2*5)[1] =
                                ((npy_int *)data0 + 2*5)[1] +
                                ((npy_int *)data_out + 2*5)[1];
#endif

#line 252
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_int *)data_out + 2*6)[0] =
                                ((npy_int *)data0 + 2*6)[0] +
                                ((npy_int *)data_out + 2*6)[0];
        ((npy_int *)data_out + 2*6)[1] =
                                ((npy_int *)data0 + 2*6)[1] +
                                ((npy_int *)data_out + 2*6)[1];
#endif

#line 252
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_int *)data_out + 2*7)[0] =
                                ((npy_int *)data0 + 2*7)[0] +
                                ((npy_int *)data_out + 2*7)[0];
        ((npy_int *)data_out + 2*7)[1] =
                                ((npy_int *)data0 + 2*7)[1] +
                                ((npy_int *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#elif 1000 == 2 && !0

/*
 * Inner loop for a two-operand sum of products over contiguous npy_int
 * data.  For every index i in [0, count):
 *
 *     out[i] = in0[i] * in1[i] + out[i]
 *
 * dataptr[0] and dataptr[1] are read as the two inputs and dataptr[2] is
 * read and written as the output; all three are reinterpreted as npy_int
 * arrays.  `nop` and `strides` are accepted but not used.
 *
 * Full groups of eight elements are processed first, lowest index first.
 * The remaining (count % 8) elements are then processed highest index
 * first; the order is only observable if the buffers overlap in memory.
 *
 * A count of zero or less does no element work and returns.  (A negative
 * count must not be allowed to loop: there is nothing to consume and
 * count would never change.)
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *in0 = (npy_int *)dataptr[0];
    npy_int *in1 = (npy_int *)dataptr[1];
    npy_int *out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    /* Main loop, manually unrolled by 8 */
    while (count >= 8) {
        out[0] = in0[0] * in1[0] + out[0];
        out[1] = in0[1] * in1[1] + out[1];
        out[2] = in0[2] * in1[2] + out[2];
        out[3] = in0[3] * in1[3] + out[3];
        out[4] = in0[4] * in1[4] + out[4];
        out[5] = in0[5] * in1[5] + out[5];
        out[6] = in0[6] * in1[6] + out[6];
        out[7] = in0[7] * in1[7] + out[7];

        in0 += 8;
        in1 += 8;
        out += 8;
        count -= 8;
    }

    /* Tail: the last 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        out[count] = in0[count] * in1[count] + out[count];
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand inner loop in which the first operand is a single npy_int
 * and the second operand and the output are contiguous npy_int arrays.
 * For every index i in [0, count):
 *
 *     out[i] = scalar * in1[i] + out[i]
 *
 * The scalar is read once from dataptr[0] on entry; dataptr[1] is the
 * array input and dataptr[2] is read and written as the output.  `nop`
 * and `strides` are accepted but not used.
 *
 * Full groups of eight elements are processed first, lowest index first,
 * followed by the remaining (count % 8) elements, highest index first.
 * A count of zero or less does no element work.
 */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int scalar = (*(npy_int *)dataptr[0]);
    npy_int *in1 = (npy_int *)dataptr[1];
    npy_int *out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Main loop, manually unrolled by 8 */
    while (count >= 8) {
        out[0] = scalar * in1[0] + out[0];
        out[1] = scalar * in1[1] + out[1];
        out[2] = scalar * in1[2] + out[2];
        out[3] = scalar * in1[3] + out[3];
        out[4] = scalar * in1[4] + out[4];
        out[5] = scalar * in1[5] + out[5];
        out[6] = scalar * in1[6] + out[6];
        out[7] = scalar * in1[7] + out[7];

        in1 += 8;
        out += 8;
        count -= 8;
    }

    /* Tail: the last 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        out[count] = scalar * in1[count] + out[count];
    }
}

/*
 * Two-operand inner loop in which the first operand and the output are
 * contiguous npy_int arrays and the second operand is a single npy_int.
 * For every index i in [0, count):
 *
 *     out[i] = in0[i] * scalar + out[i]
 *
 * dataptr[0] is the array input, the scalar is read once from dataptr[1]
 * on entry, and dataptr[2] is read and written as the output.  `nop` and
 * `strides` are accepted but not used.
 *
 * Full groups of eight elements are processed first, lowest index first,
 * followed by the remaining (count % 8) elements, highest index first;
 * the order is only observable if the buffers overlap in memory.
 *
 * A count of zero or less does no element work and returns.  (A negative
 * count must not be allowed to loop: there is nothing to consume and
 * count would never change.)
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *in0 = (npy_int *)dataptr[0];
    npy_int scalar = (*(npy_int *)dataptr[1]);
    npy_int *out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* Main loop, manually unrolled by 8 */
    while (count >= 8) {
        out[0] = in0[0] * scalar + out[0];
        out[1] = in0[1] * scalar + out[1];
        out[2] = in0[2] * scalar + out[2];
        out[3] = in0[3] * scalar + out[3];
        out[4] = in0[4] * scalar + out[4];
        out[5] = in0[5] * scalar + out[5];
        out[6] = in0[6] * scalar + out[6];
        out[7] = in0[7] * scalar + out[7];

        in0 += 8;
        out += 8;
        count -= 8;
    }

    /* Tail: the last 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        out[count] = in0[count] * scalar + out[count];
    }
}

/*
 * Two-operand inner loop that reduces two contiguous npy_int arrays into
 * a single npy_int output: the products in0[i] * in1[i] for i in
 * [0, count) are summed into a local accumulator, which is then added to
 * the npy_int at dataptr[2] with one read-modify-write at the end.
 *
 * dataptr[0] and dataptr[1] are the array inputs.  `nop` and `strides`
 * are accepted but not used.
 *
 * Full groups of eight elements are accumulated first, lowest index
 * first, followed by the remaining (count % 8) elements, highest index
 * first.
 *
 * A count of zero or less accumulates nothing, so zero is added to the
 * output and the function returns.  (A negative count must not be
 * allowed to loop: there is nothing to consume and count would never
 * change.)
 */
static void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *in0 = (npy_int *)dataptr[0];
    npy_int *in1 = (npy_int *)dataptr[1];
    npy_int accum = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Main loop, manually unrolled by 8 */
    while (count >= 8) {
        accum += in0[0] * in1[0];
        accum += in0[1] * in1[1];
        accum += in0[2] * in1[2];
        accum += in0[3] * in1[3];
        accum += in0[4] * in1[4];
        accum += in0[5] * in1[5];
        accum += in0[6] * in1[6];
        accum += in0[7] * in1[7];

        in0 += 8;
        in1 += 8;
        count -= 8;
    }

    /* Tail: the last 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        accum += in0[count] * in1[count];
    }

    /* Single store of the reduced value into the output element */
    *(npy_int *)dataptr[2] += accum;
}

/*
 * Two-operand inner loop that reduces into a single npy_int output when
 * the first operand is a single npy_int and the second is a contiguous
 * npy_int array.  The elements in1[i] for i in [0, count) are summed
 * into a local accumulator; the scalar is multiplied in once at the end
 * and the result is added to the npy_int at dataptr[2]:
 *
 *     *out += scalar * (in1[0] + ... + in1[count-1])
 *
 * The scalar is read once from dataptr[0] on entry and dataptr[1] is the
 * array input.  `nop` and `strides` are accepted but not used.
 *
 * Full groups of eight elements are accumulated first, lowest index
 * first, followed by the remaining (count % 8) elements, highest index
 * first.
 *
 * A count of zero or less accumulates nothing, so zero is added to the
 * output and the function returns.  (A negative count must not be
 * allowed to loop: there is nothing to consume and count would never
 * change.)
 */
static void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int scalar = (*(npy_int *)dataptr[0]);
    npy_int *in1 = (npy_int *)dataptr[1];
    npy_int accum = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Main loop, manually unrolled by 8 */
    while (count >= 8) {
        accum += in1[0];
        accum += in1[1];
        accum += in1[2];
        accum += in1[3];
        accum += in1[4];
        accum += in1[5];
        accum += in1[6];
        accum += in1[7];

        in1 += 8;
        count -= 8;
    }

    /* Tail: the last 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        accum += in1[count];
    }

    /* Apply the scalar factor once and fold into the output element */
    *(npy_int *)dataptr[2] += (scalar * accum);
}

/*
 * Two-operand inner loop: operand 0 is walked as a packed npy_int array,
 * operand 1 is a single npy_int read once up front, and the result is
 * folded into the single npy_int at dataptr[2]:
 *
 *     *out += (data0[0] + ... + data0[count-1]) * (*operand1)
 *
 * nop and strides are not read.  Assumes count >= 0.
 */
static void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int factor = (*(npy_int *)dataptr[1]);
    npy_int total = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Checked before the blocked loop so short inputs skip it entirely.
         * Each case falls through to the next: the last 0..7 elements are
         * added from the highest index down, then the result is stored.
         */
        switch (count) {
            case 7:
                total += src[6];
                /* fall through */
            case 6:
                total += src[5];
                /* fall through */
            case 5:
                total += src[4];
                /* fall through */
            case 4:
                total += src[3];
                /* fall through */
            case 3:
                total += src[2];
                /* fall through */
            case 2:
                total += src[1];
                /* fall through */
            case 1:
                total += src[0];
                /* fall through */
            case 0:
                *(npy_int *)dataptr[2] += (total * factor);
                return;
        }

        /* Consume full blocks of 8, then go back to the switch for the rest */
        while (count >= 8) {
            total += src[0];
            total += src[1];
            total += src[2];
            total += src[3];
            total += src[4];
            total += src[5];
            total += src[6];
            total += src[7];

            src += 8;
            count -= 8;
        }
    }
}

#elif 1000 == 3 && !0

/*
 * Three-operand inner loop over packed npy_int arrays:
 *
 *     out[k] += data0[k] * data1[k] * data2[k]    for k = 0 .. count-1
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output.  nop and
 * strides are not read.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *mid = (npy_int *)dataptr[1];
    npy_int *rhs = (npy_int *)dataptr[2];
    npy_int *out = (npy_int *)dataptr[3];
    int k;

    /* Full blocks of 8 elements */
    for (; count >= 8; count -= 8) {
        out[0] += lhs[0] * mid[0] * rhs[0];
        out[1] += lhs[1] * mid[1] * rhs[1];
        out[2] += lhs[2] * mid[2] * rhs[2];
        out[3] += lhs[3] * mid[3] * rhs[3];
        out[4] += lhs[4] * mid[4] * rhs[4];
        out[5] += lhs[5] * mid[5] * rhs[5];
        out[6] += lhs[6] * mid[6] * rhs[6];
        out[7] += lhs[7] * mid[7] * rhs[7];

        lhs += 8;
        mid += 8;
        rhs += 8;
        out += 8;
    }

    /* Remainder: one element per pass, stopping as soon as count runs out */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] += lhs[k] * mid[k] * rhs[k];
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Generic inner loop for any number of packed npy_int operands.
 *
 * For each of the count elements, multiplies together the current values of
 * operands 0 .. nop-1 and adds the product to the current output value
 * (dataptr[nop]).  Every pointer in dataptr[0..nop] is advanced in place by
 * sizeof(npy_int) per element, so the caller's dataptr array is modified.
 * strides is not read.
 */
static void
int_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_int product = (*(npy_int *)dataptr[0]);
        int iop = 1;

        /* Fold in the remaining input operands */
        while (iop < nop) {
            product *= (*(npy_int *)dataptr[iop]);
            ++iop;
        }

        /* iop has stopped at nop (for nop >= 1), i.e. at the output slot */
        *(npy_int *)dataptr[nop] = (product +
                                    (*(npy_int *)dataptr[iop]));

        /* Step all inputs and the output on to the next element */
        for (iop = 0; iop <= nop; ++iop) {
            dataptr[iop] += sizeof(npy_int);
        }
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * One-operand reduction: sums count packed npy_int values starting at
 * dataptr[0] and adds that sum to the single npy_int at dataptr[1].
 *
 * nop and strides are not read.  Assumes count >= 0.
 */
static void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_int total = 0;
    npy_int *src = (npy_int *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * Checked before the blocked loop so short inputs skip it entirely.
         * Each case falls through to the next: the last 0..7 elements are
         * added from the highest index down, then the result is stored.
         */
        switch (count) {
            case 7:
                total += src[6];
                /* fall through */
            case 6:
                total += src[5];
                /* fall through */
            case 5:
                total += src[4];
                /* fall through */
            case 4:
                total += src[3];
                /* fall through */
            case 3:
                total += src[2];
                /* fall through */
            case 2:
                total += src[1];
                /* fall through */
            case 1:
                total += src[0];
                /* fall through */
            case 0:
                *((npy_int *)dataptr[1]) += total;
                return;
        }

        /* Consume full blocks of 8, then go back to the switch for the rest */
        while (count >= 8) {
            total += src[0];
            total += src[1];
            total += src[2];
            total += src[3];
            total += src[4];
            total += src[5];
            total += src[6];
            total += src[7];

            src += 8;
            count -= 8;
        }
    }
}

#endif /* 1000 == 1 */

/*
 * Generic strided reduction into a single output value.
 *
 * For each of the count elements, multiplies the current npy_int values of
 * operands 0 .. nop-1 and accumulates the product; each input pointer
 * dataptr[i] is then advanced in place by strides[i] bytes.  Once the loop
 * is done the accumulated sum is added to the npy_int at dataptr[nop],
 * which is never advanced.
 */
static void
int_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_int total = 0;
    npy_int *out;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_int product = (*(npy_int *)dataptr[0]);
        int iop;

        /* Fold in the remaining input operands */
        for (iop = 1; iop < nop; ++iop) {
            product *= (*(npy_int *)dataptr[iop]);
        }
        total += product;

        /* Only the inputs move; the output slot stays where it is */
        for (iop = 0; iop < nop; ++iop) {
            dataptr[iop] += strides[iop];
        }
    }

    out = (npy_int *)dataptr[nop];
    *out += total;
}




#line 108

#line 113
/*
 * One-operand strided inner loop for npy_long:
 *
 *     out[k] += in[k]    for each of the count elements
 *
 * The input is walked from dataptr[0] in steps of strides[0] bytes and the
 * output from dataptr[1] in steps of strides[1] bytes.  The dataptr and
 * strides arrays themselves are left untouched; nop is not read.
 */
static void
long_sum_of_products_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *src = dataptr[0];
    char *dst = dataptr[1];
    npy_intp src_step = strides[0];
    npy_intp dst_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
        *(npy_long *)dst += (*(npy_long *)src);
        src += src_step;
        dst += dst_step;
    }
}

#if 1 == 1

/*
 * One-operand inner loop over packed npy_long arrays:
 *
 *     out[k] += in[k]    for k = 0 .. count-1
 *
 * dataptr[0] is the input and dataptr[1] the output.  nop and strides are
 * not read.  Assumes count >= 0.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Checked before the blocked loop so short inputs skip it entirely.
         * Each case falls through to the next: the last 0..7 elements are
         * updated from the highest index down.
         */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fall through */
            case 6:
                dst[5] += src[5];
                /* fall through */
            case 5:
                dst[4] += src[4];
                /* fall through */
            case 4:
                dst[3] += src[3];
                /* fall through */
            case 3:
                dst[2] += src[2];
                /* fall through */
            case 2:
                dst[1] += src[1];
                /* fall through */
            case 1:
                dst[0] += src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Consume full blocks of 8, then go back to the switch for the rest */
        while (count >= 8) {
            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
            count -= 8;
        }
    }
}

#elif 1 == 2 && !0

static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long *data_out = (npy_long *)dataptr[2];

#if EINSUM_USE_SSE1 && 0
    __m128 a, b;
#endif

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 295
        case 6+1:
            data_out[6] = ((data0[6]) *
                                 (data1[6]) +
                                 (data_out[6]));

#line 295
        case 5+1:
            data_out[5] = ((data0[5]) *
                                 (data1[5]) +
                                 (data_out[5]));

#line 295
        case 4+1:
            data_out[4] = ((data0[4]) *
                                 (data1[4]) +
                                 (data_out[4]));

#line 295
        case 3+1:
            data_out[3] = ((data0[3]) *
                                 (data1[3]) +
                                 (data_out[3]));

#line 295
        case 2+1:
            data_out[2] = ((data0[2]) *
                                 (data1[2]) +
                                 (data_out[2]));

#line 295
        case 1+1:
            data_out[1] = ((data0[1]) *
                                 (data1[1]) +
                                 (data_out[1]));

#line 295
        case 0+1:
            data_out[0] = ((data0[0]) *
                                 (data1[0]) +
                                 (data_out[0]));

        case 0:
            return;
    }

#if EINSUM_USE_SSE1 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
        EINSUM_IS_SSE_ALIGNED(data_out)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

#line 315
            a = _mm_mul_ps(_mm_load_ps(data0+0), _mm_load_ps(data1+0));
            b = _mm_add_ps(a, _mm_load_ps(data_out+0));
            _mm_store_ps(data_out+0, b);

#line 315
            a = _mm_mul_ps(_mm_load_ps(data0+4), _mm_load_ps(data1+4));
            b = _mm_add_ps(a, _mm_load_ps(data_out+4));
            _mm_store_ps(data_out+4, b);

            data0 += 8;
            data1 += 8;
            data_out += 8;
        }

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
#line 337
        a = _mm_mul_ps(_mm_loadu_ps(data0+0), _mm_loadu_ps(data1+0));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+0));
        _mm_storeu_ps(data_out+0, b);

#line 337
        a = _mm_mul_ps(_mm_loadu_ps(data0+4), _mm_loadu_ps(data1+4));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+4));
        _mm_storeu_ps(data_out+4, b);

#else
#line 345
        data_out[0] = ((data0[0]) *
                             (data1[0]) +
                             (data_out[0]));

#line 345
        data_out[1] = ((data0[1]) *
                             (data1[1]) +
                             (data_out[1]));

#line 345
        data_out[2] = ((data0[2]) *
                             (data1[2]) +
                             (data_out[2]));

#line 345
        data_out[3] = ((data0[3]) *
                             (data1[3]) +
                             (data_out[3]));

#line 345
        data_out[4] = ((data0[4]) *
                             (data1[4]) +
                             (data_out[4]));

#line 345
        data_out[5] = ((data0[5]) *
                             (data1[5]) +
                             (data_out[5]));

#line 345
        data_out[6] = ((data0[6]) *
                             (data1[6]) +
                             (data_out[6]));

#line 345
        data_out[7] = ((data0[7]) *
                             (data1[7]) +
                             (data_out[7]));

#endif
        data0 += 8;
        data1 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel: out[k] += value0 * data1[k] for k in [0, count).
 *
 * dataptr[0] is dereferenced once and used as a scalar; dataptr[1] and
 * dataptr[2] are indexed as plain npy_long arrays.  nop and strides are
 * not consulted.
 *
 * Whole groups of eight elements are processed front to back; the 0..7
 * leftover elements are processed back to front.  The order only matters
 * if the input and output buffers overlap.  A count <= 0 does nothing.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar = (*(npy_long *)dataptr[0]);
    npy_long *src = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];
    int k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count is trivially unrollable */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] = scalar * src[k] + dst[k];
        }
        src += 8;
        dst += 8;
    }

    /* Leftover elements, highest index first */
    while (count > 0) {
        --count;
        dst[count] = scalar * src[count] + dst[count];
    }
}

/*
 * Two-operand kernel: out[k] += data0[k] * value1 for k in [0, count).
 *
 * dataptr[1] is dereferenced once and used as a scalar; dataptr[0] and
 * dataptr[2] are indexed as plain npy_long arrays.  nop and strides are
 * not consulted.
 *
 * Whole groups of eight elements are processed front to back; the 0..7
 * leftover elements are processed back to front.  The order only matters
 * if the input and output buffers overlap.
 *
 * The function terminates for every value of count: a count <= 0 is a
 * no-op.  (An unconditional jump back into a switch on count would spin
 * forever for a count that matches none of its cases.)
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    const npy_long value1 = (*(npy_long *)dataptr[1]);
    npy_long *data_out = (npy_long *)dataptr[2];
    int k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count is trivially unrollable */
    while (count >= 8) {
        count -= 8;
        for (k = 0; k < 8; ++k) {
            data_out[k] = data0[k] * value1 + data_out[k];
        }
        data0 += 8;
        data_out += 8;
    }

    /* Leftover elements, highest index first */
    while (count > 0) {
        --count;
        data_out[count] = data0[count] * value1 + data_out[count];
    }
}

/*
 * Two-operand reduction: *out += sum over k in [0, count) of
 * data0[k] * data1[k].
 *
 * dataptr[0] and dataptr[1] are indexed as plain npy_long arrays;
 * dataptr[2] points at the single npy_long that receives the sum.
 * nop and strides are not consulted.
 *
 * The products are gathered in a local accumulator and added to the
 * output exactly once, at the end.  The function terminates for every
 * value of count: with count <= 0 the accumulator stays zero, so the
 * output value is unchanged.  (An unconditional jump back into a switch
 * on count would spin forever for a count matching none of its cases.)
 */
static void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long accum = 0;
    int k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count is trivially unrollable */
    while (count >= 8) {
        count -= 8;
        for (k = 0; k < 8; ++k) {
            accum += data0[k] * data1[k];
        }
        data0 += 8;
        data1 += 8;
    }

    /* Leftover 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        accum += data0[count] * data1[count];
    }

    *(npy_long *)dataptr[2] += (accum);
}

/*
 * Two-operand reduction with a scalar first operand:
 * *out += value0 * (sum over k in [0, count) of data1[k]).
 *
 * dataptr[0] is dereferenced once and used as a scalar; dataptr[1] is
 * indexed as a plain npy_long array; dataptr[2] points at the single
 * npy_long that receives the result.  nop and strides are not consulted.
 *
 * The scalar is factored out: the elements are summed first and the
 * multiplication happens once, when the output is updated.  The function
 * terminates for every value of count: with count <= 0 the sum is zero,
 * so the output value is unchanged.  (An unconditional jump back into a
 * switch on count would spin forever for a count matching none of its
 * cases.)
 */
static void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long value0 = (*(npy_long *)dataptr[0]);
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long accum = 0;
    int k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count is trivially unrollable */
    while (count >= 8) {
        count -= 8;
        for (k = 0; k < 8; ++k) {
            accum += data1[k];
        }
        data1 += 8;
    }

    /* Leftover 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        accum += data1[count];
    }

    *(npy_long *)dataptr[2] += (value0 * accum);
}

/*
 * Two-operand reduction with a scalar second operand:
 * *out += (sum over k in [0, count) of data0[k]) * value1.
 *
 * dataptr[1] is dereferenced once and used as a scalar; dataptr[0] is
 * indexed as a plain npy_long array; dataptr[2] points at the single
 * npy_long that receives the result.  nop and strides are not consulted.
 *
 * The scalar is factored out: the elements are summed first and the
 * multiplication happens once, when the output is updated.  The function
 * terminates for every value of count: with count <= 0 the sum is zero,
 * so the output value is unchanged.  (An unconditional jump back into a
 * switch on count would spin forever for a count matching none of its
 * cases.)
 */
static void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    const npy_long value1 = (*(npy_long *)dataptr[1]);
    npy_long accum = 0;
    int k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count is trivially unrollable */
    while (count >= 8) {
        count -= 8;
        for (k = 0; k < 8; ++k) {
            accum += data0[k];
        }
        data0 += 8;
    }

    /* Leftover 0..7 elements, highest index first */
    while (count > 0) {
        --count;
        accum += data0[count];
    }

    *(npy_long *)dataptr[2] += (accum * value1);
}

#elif 1 == 3 && !0

/*
 * Three-operand kernel:
 * out[k] += data0[k] * data1[k] * data2[k] for k in [0, count).
 *
 * dataptr[0..2] are the inputs and dataptr[3] the output, all indexed as
 * plain npy_long arrays.  nop and strides are not consulted.
 *
 * Elements are processed front to back, first in whole groups of eight
 * and then one at a time until the remaining count reaches zero.
 * NOTE(review): a negative count is not guarded against; the tail loop
 * only stops early when the running count hits exactly zero.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *in0 = (npy_long *)dataptr[0];
    npy_long *in1 = (npy_long *)dataptr[1];
    npy_long *in2 = (npy_long *)dataptr[2];
    npy_long *out = (npy_long *)dataptr[3];
    int k;

    /* Full groups of eight; the fixed trip count is trivially unrollable */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = in0[k] * in1[k] * in2[k] + out[k];
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /* Finish off the loop: at most eight more elements */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = in0[k] * in1[k] * in2[k] + out[k];
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic kernel for any number of operands.
 *
 * For each of count elements: multiply together the npy_long values
 * currently pointed to by dataptr[0] .. dataptr[nop-1], add that product
 * to the npy_long pointed to by dataptr[nop], and then advance all
 * nop + 1 pointers by sizeof(npy_long).
 *
 * The entries of the dataptr array itself are modified (advanced) by
 * this function.  strides is not consulted.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_long product = (*(npy_long *)dataptr[0]);
        int op;

        for (op = 1; op < nop; ++op) {
            product *= (*(npy_long *)dataptr[op]);
        }

        /* op is left at the output slot by the loop above */
        *(npy_long *)dataptr[nop] = (product + (*(npy_long *)dataptr[op]));

        /* Step every operand, output included, to its next element */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_long);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * Reduction kernel for a single contiguous npy_long operand.
 *
 * Sums 'count' consecutive npy_long values starting at dataptr[0] and adds
 * the total to the single npy_long stored at dataptr[1].  The nop and
 * strides arguments are not used, and the dataptr array is not modified.
 */
static void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_long total = 0;
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder handling is placed before the main loop to make
         * small counts faster.  A count of 0..7 falls through the leftover
         * elements, stores the result and returns; any other count skips
         * the switch and proceeds to the unrolled loop.
         */
        switch (count) {
            case 7:
                total += src[6];
                /* fall through */
            case 6:
                total += src[5];
                /* fall through */
            case 5:
                total += src[4];
                /* fall through */
            case 4:
                total += src[3];
                /* fall through */
            case 3:
                total += src[2];
                /* fall through */
            case 2:
                total += src[1];
                /* fall through */
            case 1:
                total += src[0];
                /* fall through */
            case 0:
                *dst = total + *dst;
                return;
            default:
                break;
        }

        /* Main loop, unrolled to consume eight elements per pass. */
        while (count >= 8) {
            count -= 8;

            total += src[0];
            total += src[1];
            total += src[2];
            total += src[3];
            total += src[4];
            total += src[5];
            total += src[6];
            total += src[7];

            src += 8;
        }

        /* Loop back to the switch to finish the remaining 0..7 elements. */
    }
}

#endif /* 1 == 1 */

/*
 * Reduction kernel for a single strided npy_long operand.
 *
 * Reads 'count' npy_long values starting at dataptr[0], stepping by
 * strides[0] bytes between reads, and adds their sum to the single
 * npy_long stored at dataptr[1].  Only a local copy of the input pointer
 * is advanced; the dataptr array is not modified.  The nop argument is
 * not used.
 */
static void
long_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_long total = 0;
    char *src = dataptr[0];
    const npy_intp src_step = strides[0];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += *(npy_long *)src;
        src += src_step;
    }

    /* Fold the accumulated sum into the existing output value. */
    *((npy_long *)dataptr[1]) = total + *((npy_long *)dataptr[1]);
}


#line 113
/*
 * Strided kernel for two npy_long input operands.
 *
 * For each of the 'count' elements, computes
 *     out = in0 * in1 + out
 * where in0, in1 and out are accessed through dataptr[0], dataptr[1] and
 * dataptr[2], and each pointer is stepped by the matching byte stride in
 * strides[0], strides[1] and strides[2].  Only local copies of the
 * pointers are advanced; the dataptr array is not modified.  The nop
 * argument is not used.
 */
static void
long_sum_of_products_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *in0 = dataptr[0];
    char *in1 = dataptr[1];
    char *out = dataptr[2];
    const npy_intp step0 = strides[0];
    const npy_intp step1 = strides[1];
    const npy_intp step_out = strides[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_long *cell = (npy_long *)out;

        *cell = (*(npy_long *)in0) * (*(npy_long *)in1) + *cell;

        in0 += step0;
        in1 += step1;
        out += step_out;
    }
}

#if 2 == 1

/*
 * Contiguous kernel for a single npy_long input operand.
 *
 * For i in [0, count), computes out[i] = in[i] + out[i], where 'in' is the
 * npy_long array at dataptr[0] and 'out' is the npy_long array at
 * dataptr[1].  The nop and strides arguments are not used, and the
 * dataptr array is not modified.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder handling is placed before the main loop to make
         * small counts faster.  A count of 0..7 processes the leftover
         * elements from the highest index down and returns; any other
         * count skips the switch.
         */
        switch (count) {
            case 7:
                dst[6] = src[6] + dst[6];
                /* fall through */
            case 6:
                dst[5] = src[5] + dst[5];
                /* fall through */
            case 5:
                dst[4] = src[4] + dst[4];
                /* fall through */
            case 4:
                dst[3] = src[3] + dst[3];
                /* fall through */
            case 3:
                dst[2] = src[2] + dst[2];
                /* fall through */
            case 2:
                dst[1] = src[1] + dst[1];
                /* fall through */
            case 1:
                dst[0] = src[0] + dst[0];
                /* fall through */
            case 0:
                return;
            default:
                break;
        }

        /* Main loop, unrolled to process eight elements per pass. */
        while (count >= 8) {
            count -= 8;

            dst[0] = src[0] + dst[0];
            dst[1] = src[1] + dst[1];
            dst[2] = src[2] + dst[2];
            dst[3] = src[3] + dst[3];
            dst[4] = src[4] + dst[4];
            dst[5] = src[5] + dst[5];
            dst[6] = src[6] + dst[6];
            dst[7] = src[7] + dst[7];

            src += 8;
            dst += 8;
        }

        /* Loop back to the switch to finish the remaining 0..7 elements. */
    }
}

#elif 2 == 2 && !0

/*
 * Contiguous kernel for two npy_long input operands.
 *
 * For i in [0, count), computes out[i] = in0[i] * in1[i] + out[i], where
 * in0, in1 and out are the npy_long arrays at dataptr[0], dataptr[1] and
 * dataptr[2].  The nop and strides arguments are not used, and the
 * dataptr array is not modified.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder handling is placed before the main loop to make
         * small counts faster.  A count of 0..7 processes the leftover
         * elements from the highest index down and returns; any other
         * count skips the switch.
         */
        switch (count) {
            case 7:
                dst[6] = lhs[6] * rhs[6] + dst[6];
                /* fall through */
            case 6:
                dst[5] = lhs[5] * rhs[5] + dst[5];
                /* fall through */
            case 5:
                dst[4] = lhs[4] * rhs[4] + dst[4];
                /* fall through */
            case 4:
                dst[3] = lhs[3] * rhs[3] + dst[3];
                /* fall through */
            case 3:
                dst[2] = lhs[2] * rhs[2] + dst[2];
                /* fall through */
            case 2:
                dst[1] = lhs[1] * rhs[1] + dst[1];
                /* fall through */
            case 1:
                dst[0] = lhs[0] * rhs[0] + dst[0];
                /* fall through */
            case 0:
                return;
            default:
                break;
        }

        /* Main loop, unrolled to process eight elements per pass. */
        while (count >= 8) {
            count -= 8;

            dst[0] = lhs[0] * rhs[0] + dst[0];
            dst[1] = lhs[1] * rhs[1] + dst[1];
            dst[2] = lhs[2] * rhs[2] + dst[2];
            dst[3] = lhs[3] * rhs[3] + dst[3];
            dst[4] = lhs[4] * rhs[4] + dst[4];
            dst[5] = lhs[5] * rhs[5] + dst[5];
            dst[6] = lhs[6] * rhs[6] + dst[6];
            dst[7] = lhs[7] * rhs[7] + dst[7];

            lhs += 8;
            rhs += 8;
            dst += 8;
        }

        /* Loop back to the switch to finish the remaining 0..7 elements. */
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where the first operand is a single value and the
 * second operand and the output are contiguous npy_long arrays.
 *
 * The npy_long at dataptr[0] is read once on entry.  Then, for i in
 * [0, count), computes out[i] = value * in1[i] + out[i], where in1 and out
 * are the npy_long arrays at dataptr[1] and dataptr[2].  The nop and
 * strides arguments are not used, and the dataptr array is not modified.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar = *(npy_long *)dataptr[0];
    npy_long *vec = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    do {
        /*
         * The remainder handling is placed before the main loop to make
         * small counts faster.  A count of 0..7 processes the leftover
         * elements from the highest index down and returns; any other
         * count skips the switch.
         */
        switch (count) {
            case 7:
                dst[6] = scalar * vec[6] + dst[6];
                /* fall through */
            case 6:
                dst[5] = scalar * vec[5] + dst[5];
                /* fall through */
            case 5:
                dst[4] = scalar * vec[4] + dst[4];
                /* fall through */
            case 4:
                dst[3] = scalar * vec[3] + dst[3];
                /* fall through */
            case 3:
                dst[2] = scalar * vec[2] + dst[2];
                /* fall through */
            case 2:
                dst[1] = scalar * vec[1] + dst[1];
                /* fall through */
            case 1:
                dst[0] = scalar * vec[0] + dst[0];
                /* fall through */
            case 0:
                return;
            default:
                break;
        }

        /* Main loop, unrolled to process eight elements per pass. */
        while (count >= 8) {
            count -= 8;

            dst[0] = scalar * vec[0] + dst[0];
            dst[1] = scalar * vec[1] + dst[1];
            dst[2] = scalar * vec[2] + dst[2];
            dst[3] = scalar * vec[3] + dst[3];
            dst[4] = scalar * vec[4] + dst[4];
            dst[5] = scalar * vec[5] + dst[5];
            dst[6] = scalar * vec[6] + dst[6];
            dst[7] = scalar * vec[7] + dst[7];

            vec += 8;
            dst += 8;
        }

        /* Revisit the switch only if elements are still left over. */
    } while (count > 0);
}

/*
 * Two-operand sum-of-products kernel for npy_long:
 *
 *     out[i] = in0[i] * scalar1 + out[i]      for the `count` leading items
 *
 * dataptr[0] is indexed as an npy_long array, dataptr[1] supplies a single
 * npy_long that is read once on entry, and dataptr[2] is the npy_long array
 * that is updated in place.  `nop` and `strides` are not consulted.
 *
 * Scalar code only; items are consumed in groups of 8 and the remainder
 * (fewer than 8 items) is handled separately.
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *in0 = (npy_long *)dataptr[0];
    const npy_long scalar1 = *(npy_long *)dataptr[1];
    npy_long *out = (npy_long *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /*
     * The short-count test comes first so that small counts return without
     * touching the group loop.
     * NOTE(review): a negative count satisfies neither test below, so the
     * function would never return; callers presumably pass count >= 0.
     */
    for (;;) {
        if (count >= 0 && count < 8) {
            /* Remainder: walk from the highest remaining index down to 0 */
            for (i = count - 1; i >= 0; --i) {
                out[i] = in0[i] * scalar1 + out[i];
            }
            return;
        }

        /* Whole groups of 8, lowest index first */
        while (count >= 8) {
            for (i = 0; i < 8; ++i) {
                out[i] = in0[i] * scalar1 + out[i];
            }
            in0 += 8;
            out += 8;
            count -= 8;
        }
    }
}

/*
 * Two-operand sum-of-products kernel for npy_long that reduces into a single
 * output value:
 *
 *     *out += sum over i of lhs[i] * rhs[i]   for the `count` leading items
 *
 * dataptr[0] and dataptr[1] are indexed as npy_long arrays; dataptr[2]
 * points at the npy_long that receives the sum.  The output is written
 * exactly once, after all products have been accumulated locally.
 * `nop` and `strides` are not consulted.
 *
 * Scalar code only; items are consumed in groups of 8 and the remainder
 * (fewer than 8 items) is handled separately.
 */
static void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long total = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * The short-count test comes first so that small counts return without
     * touching the group loop.
     * NOTE(review): a negative count satisfies neither test below, so the
     * function would never return; callers presumably pass count >= 0.
     */
    for (;;) {
        if (count >= 0 && count < 8) {
            /* Remainder: walk from the highest remaining index down to 0 */
            for (i = count - 1; i >= 0; --i) {
                total += lhs[i] * rhs[i];
            }
            /* Fold the locally accumulated sum into the output value */
            *(npy_long *)dataptr[2] += total;
            return;
        }

        /* Whole groups of 8, lowest index first */
        while (count >= 8) {
            for (i = 0; i < 8; ++i) {
                total += lhs[i] * rhs[i];
            }
            lhs += 8;
            rhs += 8;
            count -= 8;
        }
    }
}

/*
 * Two-operand sum-of-products kernel for npy_long that reduces into a single
 * output value, with the first operand being one fixed value:
 *
 *     *out += scalar0 * (sum over i of in1[i])   for the `count` leading items
 *
 * dataptr[0] supplies a single npy_long that is read once on entry,
 * dataptr[1] is indexed as an npy_long array, and dataptr[2] points at the
 * npy_long that receives the result.  The multiplication by the scalar is
 * done once, on the finished sum.  `nop` and `strides` are not consulted.
 *
 * Scalar code only; items are consumed in groups of 8 and the remainder
 * (fewer than 8 items) is handled separately.
 */
static void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar0 = *(npy_long *)dataptr[0];
    npy_long *in1 = (npy_long *)dataptr[1];
    npy_long total = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * The short-count test comes first so that small counts return without
     * touching the group loop.
     * NOTE(review): a negative count satisfies neither test below, so the
     * function would never return; callers presumably pass count >= 0.
     */
    for (;;) {
        if (count >= 0 && count < 8) {
            /* Remainder: walk from the highest remaining index down to 0 */
            for (i = count - 1; i >= 0; --i) {
                total += in1[i];
            }
            /* Scale the finished sum once and fold it into the output */
            *(npy_long *)dataptr[2] += scalar0 * total;
            return;
        }

        /* Whole groups of 8, lowest index first */
        while (count >= 8) {
            for (i = 0; i < 8; ++i) {
                total += in1[i];
            }
            in1 += 8;
            count -= 8;
        }
    }
}

/*
 * Two-operand sum-of-products kernel for npy_long that reduces into a single
 * output value, with the second operand being one fixed value:
 *
 *     *out += (sum over i of in0[i]) * scalar1   for the `count` leading items
 *
 * dataptr[0] is indexed as an npy_long array, dataptr[1] supplies a single
 * npy_long that is read once on entry, and dataptr[2] points at the
 * npy_long that receives the result.  The multiplication by the scalar is
 * done once, on the finished sum.  `nop` and `strides` are not consulted.
 *
 * Scalar code only; items are consumed in groups of 8 and the remainder
 * (fewer than 8 items) is handled separately.
 */
static void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *in0 = (npy_long *)dataptr[0];
    const npy_long scalar1 = *(npy_long *)dataptr[1];
    npy_long total = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    /*
     * The short-count test comes first so that small counts return without
     * touching the group loop.
     * NOTE(review): a negative count satisfies neither test below, so the
     * function would never return; callers presumably pass count >= 0.
     */
    for (;;) {
        if (count >= 0 && count < 8) {
            /* Remainder: walk from the highest remaining index down to 0 */
            for (i = count - 1; i >= 0; --i) {
                total += in0[i];
            }
            /* Scale the finished sum once and fold it into the output */
            *(npy_long *)dataptr[2] += total * scalar1;
            return;
        }

        /* Whole groups of 8, lowest index first */
        while (count >= 8) {
            for (i = 0; i < 8; ++i) {
                total += in0[i];
            }
            in0 += 8;
            count -= 8;
        }
    }
}

#elif 2 == 3 && !0

/*
 * Three-operand sum-of-products kernel for npy_long:
 *
 *     out[i] = a[i] * b[i] * c[i] + out[i]
 *
 * dataptr[0..2] are indexed as the npy_long input arrays and dataptr[3] as
 * the npy_long array updated in place.  `nop` and `strides` are not
 * consulted.
 *
 * Items are consumed in groups of 8 first; whatever is left is then handled
 * one item at a time.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *a = (npy_long *)dataptr[0];
    npy_long *b = (npy_long *)dataptr[1];
    npy_long *c = (npy_long *)dataptr[2];
    npy_long *out = (npy_long *)dataptr[3];
    int k;

    /* Whole groups of 8, lowest index first */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = a[k] * b[k] * c[k] + out[k];
        }
        a += 8;
        b += 8;
        c += 8;
        out += 8;
    }

    /*
     * Remainder: at most 8 further steps, leaving as soon as the remaining
     * count is found to be zero (the count is decremented on every check).
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = a[k] * b[k] * c[k] + out[k];
    }
}

#else /* 2 > 3 || @complex */

/*
 * Generic sum-of-products kernel for npy_long driven by `nop`.
 *
 * For each of `count` items: multiply together the current npy_long values
 * of operands dataptr[0] .. dataptr[nop-1], add the current value of the
 * output, store the result through dataptr[nop], and then advance every
 * pointer dataptr[0] .. dataptr[nop] by sizeof(npy_long).
 *
 * The entries of `dataptr` themselves are modified (advanced) by this
 * function.  `strides` is not consulted.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    int iop;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_long product = *(npy_long *)dataptr[0];

        /* Fold in the remaining input operands */
        iop = 1;
        while (iop < nop) {
            product *= *(npy_long *)dataptr[iop];
            ++iop;
        }

        /*
         * iop is left at the first index past the inputs, which equals nop
         * whenever nop >= 1, so this reads the current output value.
         */
        *(npy_long *)dataptr[nop] = product + *(npy_long *)dataptr[iop];

        /* Step every operand and the output to the next item */
        for (iop = 0; iop <= nop; ++iop) {
            dataptr[iop] += sizeof(npy_long);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

static void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
#if 0
    npy_long accum_re = 0, accum_im = 0;
    npy_long *data0 = (npy_long *)dataptr[0];
#else
    npy_long accum = 0;
    npy_long *data0 = (npy_long *)dataptr[0];
#endif

#if EINSUM_USE_SSE1 && 0
    __m128 a, accum_sse = _mm_setzero_ps();
#elif EINSUM_USE_SSE2 && 0
    __m128d a, accum_sse = _mm_setzero_pd();
#endif


    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 1058
        case 6+1:
#if !0
            accum += (data0[6]);
#else /* complex */
            accum_re += data0[2*6+0];
            accum_im += data0[2*6+1];
#endif

#line 1058
        case 5+1:
#if !0
            accum += (data0[5]);
#else /* complex */
            accum_re += data0[2*5+0];
            accum_im += data0[2*5+1];
#endif

#line 1058
        case 4+1:
#if !0
            accum += (data0[4]);
#else /* complex */
            accum_re += data0[2*4+0];
            accum_im += data0[2*4+1];
#endif

#line 1058
        case 3+1:
#if !0
            accum += (data0[3]);
#else /* complex */
            accum_re += data0[2*3+0];
            accum_im += data0[2*3+1];
#endif

#line 1058
        case 2+1:
#if !0
            accum += (data0[2]);
#else /* complex */
            accum_re += data0[2*2+0];
            accum_im += data0[2*2+1];
#endif

#line 1058
        case 1+1:
#if !0
            accum += (data0[1]);
#else /* complex */
            accum_re += data0[2*1+0];
            accum_im += data0[2*1+1];
#endif

#line 1058
        case 0+1:
#if !0
            accum += (data0[0]);
#else /* complex */
            accum_re += data0[2*0+0];
            accum_im += data0[2*0+1];
#endif

        case 0:
#if 0
            ((npy_long *)dataptr[1])[0] += accum_re;
            ((npy_long *)dataptr[1])[1] += accum_im;
#else
            *((npy_long *)dataptr[1]) = (accum +
                                    (*((npy_long *)dataptr[1])));
#endif
            return;
    }

#if EINSUM_USE_SSE1 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1089
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+0));

#line 1089
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+4));

            data0 += 8;
        }

        /* Add the four SSE values and put in accum */
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
        accum_sse = _mm_add_ps(a, accum_sse);
        a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
        accum_sse = _mm_add_ps(a, accum_sse);
        _mm_store_ss(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#elif EINSUM_USE_SSE2 && 0
    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data0)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

            _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+0));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+2));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+4));

#line 1120
            /*
             * NOTE: This accumulation changes the order, so will likely
             *       produce slightly different results.
             */
            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+6));

            data0 += 8;
        }

        /* Add the two SSE2 values and put in accum */
        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
        accum_sse = _mm_add_pd(a, accum_sse);
        _mm_store_sd(&accum, accum_sse);

        /* Finish off the loop */
        goto finish_after_unrolled_loop;
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1149
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+0));

#line 1149
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+4));

#elif EINSUM_USE_SSE2 && 0
        _mm_prefetch(data0 + 512, _MM_HINT_T0);

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+0));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+2));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+4));

#line 1161
        /*
         * NOTE: This accumulation changes the order, so will likely
         *       produce slightly different results.
         */
        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+6));

#else
#line 1171
#  if !0
        accum += (data0[0]);
#  else /* complex */
        accum_re += data0[2*0+0];
        accum_im += data0[2*0+1];
#  endif

#line 1171
#  if !0
        accum += (data0[1]);
#  else /* complex */
        accum_re += data0[2*1+0];
        accum_im += data0[2*1+1];
#  endif

#line 1171
#  if !0
        accum += (data0[2]);
#  else /* complex */
        accum_re += data0[2*2+0];
        accum_im += data0[2*2+1];
#  endif

#line 1171
#  if !0
        accum += (data0[3]);
#  else /* complex */
        accum_re += data0[2*3+0];
        accum_im += data0[2*3+1];
#  endif

#line 1171
#  if !0
        accum += (data0[4]);
#  else /* complex */
        accum_re += data0[2*4+0];
        accum_im += data0[2*4+1];
#  endif

#line 1171
#  if !0
        accum += (data0[5]);
#  else /* complex */
        accum_re += data0[2*5+0];
        accum_im += data0[2*5+1];
#  endif

#line 1171
#  if !0
        accum += (data0[6]);
#  else /* complex */
        accum_re += data0[2*6+0];
        accum_im += data0[2*6+1];
#  endif

#line 1171
#  if !0
        accum += (data0[7]);
#  else /* complex */
        accum_re += data0[2*7+0];
        accum_im += data0[2*7+1];
#  endif

#endif

#if !0
        data0 += 8;
#else
        data0 += 8*2;
#endif
    }

#if EINSUM_USE_SSE1 && 0
    /* Add the four SSE values and put in accum */
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
    accum_sse = _mm_add_ps(a, accum_sse);
    a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
    accum_sse = _mm_add_ps(a, accum_sse);
    _mm_store_ss(&accum, accum_sse);
#elif EINSUM_USE_SSE2 && 0
    /* Add the two SSE2 values and put in accum */
    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
    accum_sse = _mm_add_pd(a, accum_sse);
    _mm_store_sd(&accum, accum_sse);
#endif

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#endif /* 2 == 1 */

/*
 * Two-operand sum of products for npy_long, reducing into one output
 * element:  *out += sum(a[k] * b[k])  over `count` steps.
 *
 * dataptr[0] and dataptr[1] are the inputs, advanced by strides[0] and
 * strides[1] bytes per step; dataptr[2] is the single output element.
 * `nop` is not consulted by this fixed-arity specialization.
 */
static void
long_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    const char *lhs = dataptr[0];
    const char *rhs = dataptr[1];
    const npy_intp lhs_step = strides[0];
    const npy_intp rhs_step = strides[1];
    npy_long *result = (npy_long *)dataptr[2];
    npy_long total = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    /* Accumulate locally so the output is only touched once */
    for (; count != 0; --count) {
        total += (*(const npy_long *)lhs) * (*(const npy_long *)rhs);
        lhs += lhs_step;
        rhs += rhs_step;
    }

    /* Fold the local sum into whatever the output already holds */
    *result = total + *result;
}


#line 113
/*
 * Three-operand strided sum of products for npy_long:
 *
 *     out[k] = a[k] * b[k] * c[k] + out[k]      for `count` steps.
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output; each pointer
 * is advanced by the matching entry of `strides` (in bytes) after every
 * step.  `nop` is not consulted by this fixed-arity specialization.
 */
static void
long_sum_of_products_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    const char *src0 = dataptr[0];
    const char *src1 = dataptr[1];
    const char *src2 = dataptr[2];
    char *dst = dataptr[3];
    const npy_intp step0 = strides[0];
    const npy_intp step1 = strides[1];
    const npy_intp step2 = strides[2];
    const npy_intp dst_step = strides[3];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_three (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_long *cell = (npy_long *)dst;
        /* All operands are read before the output element is written */
        const npy_long product = (*(const npy_long *)src0) *
                                 (*(const npy_long *)src1) *
                                 (*(const npy_long *)src2);

        *cell = product + *cell;

        src0 += step0;
        src1 += step1;
        src2 += step2;
        dst += dst_step;
    }
}

#if 3 == 1

/*
 * One-operand contiguous kernel for npy_long:
 *
 *     data_out[k] = data0[k] + data_out[k]      for k in [0, count).
 *
 * dataptr[0] is the input and dataptr[1] the output, both indexed as
 * packed npy_long arrays; the strides argument is ignored.
 *
 * Full groups of 8 elements are handled in ascending index order; the
 * remaining 0..7 elements are handled from the highest index down.
 *
 * NOTE(review): a negative count satisfies neither the tail test nor the
 * group loop, so the function would never return -- callers are presumably
 * expected to pass count >= 0; confirm against the call sites.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* The tail is tested first so that small counts exit quickly */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                dst[k - 1] = src[k - 1] + dst[k - 1];
            }
            return;
        }

        /* Consume every full group of 8 elements */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                dst[k] = src[k] + dst[k];
            }
            src += 8;
            dst += 8;
            count -= 8;
        }
    }
}

#elif 3 == 2 && !0

/*
 * Two-operand contiguous kernel for npy_long:
 *
 *     data_out[k] = data0[k] * data1[k] + data_out[k]   for k in [0, count).
 *
 * dataptr[0] and dataptr[1] are the inputs and dataptr[2] the output, all
 * indexed as packed npy_long arrays; the strides argument is ignored.
 *
 * Full groups of 8 elements are handled in ascending index order; the
 * remaining 0..7 elements are handled from the highest index down.
 *
 * NOTE(review): a negative count satisfies neither the tail test nor the
 * group loop, so the function would never return -- callers are presumably
 * expected to pass count >= 0; confirm against the call sites.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /* The tail is tested first so that small counts exit quickly */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                dst[k - 1] = lhs[k - 1] * rhs[k - 1] + dst[k - 1];
            }
            return;
        }

        /* Consume every full group of 8 elements */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                dst[k] = lhs[k] * rhs[k] + dst[k];
            }
            lhs += 8;
            rhs += 8;
            dst += 8;
            count -= 8;
        }
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel for npy_long where the first operand is a single
 * value and the second operand and the output are contiguous:
 *
 *     data_out[k] = value0 * data1[k] + data_out[k]     for k in [0, count).
 *
 * value0 is read once from dataptr[0] on entry; dataptr[1] is the input
 * array and dataptr[2] the output array.  The strides argument is ignored.
 *
 * Full groups of 8 elements are handled in ascending index order; the
 * remaining 0..7 elements are handled from the highest index down.  A
 * count that is zero or negative performs no work.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar = (*(npy_long *)dataptr[0]);
    npy_long *src = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Consume every full group of 8 elements */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] = scalar * src[k] + dst[k];
        }
        src += 8;
        dst += 8;
    }

    /* Whatever is left (fewer than 8 elements), highest index first */
    for (k = count; k > 0; --k) {
        dst[k - 1] = scalar * src[k - 1] + dst[k - 1];
    }
}

/*
 * Two-operand kernel for npy_long where the first operand and the output
 * are contiguous and the second operand is a single value:
 *
 *     data_out[k] = data0[k] * value1 + data_out[k]     for k in [0, count).
 *
 * dataptr[0] is the input array, value1 is read once from dataptr[1] on
 * entry, and dataptr[2] is the output array.  The strides argument is
 * ignored.
 *
 * Full groups of 8 elements are handled in ascending index order; the
 * remaining 0..7 elements are handled from the highest index down.
 *
 * NOTE(review): a negative count satisfies neither the tail test nor the
 * group loop, so the function would never return -- callers are presumably
 * expected to pass count >= 0; confirm against the call sites.
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    const npy_long scalar = (*(npy_long *)dataptr[1]);
    npy_long *dst = (npy_long *)dataptr[2];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /* The tail is tested first so that small counts exit quickly */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                dst[k - 1] = src[k - 1] * scalar + dst[k - 1];
            }
            return;
        }

        /* Consume every full group of 8 elements */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                dst[k] = src[k] * scalar + dst[k];
            }
            src += 8;
            dst += 8;
            count -= 8;
        }
    }
}

/*
 * Two-operand sum-of-products kernel that reduces into a single output
 * element.
 *
 * Both inputs are indexed as densely packed npy_long arrays (the strides
 * argument is ignored).  The products data0[i] * data1[i] for
 * i in [0, count) are accumulated and the total is added to the npy_long
 * stored at dataptr[2].
 *
 *   nop      not used by this kernel
 *   dataptr  dataptr[0], dataptr[1]: input arrays; dataptr[2]: output scalar
 *   strides  ignored
 *   count    number of element pairs to process; a non-positive count
 *            simply adds zero to the output
 *
 * There is no SIMD path: the SSE/SSE2 intrinsics only operate on
 * float/double lanes and do not apply to npy_long data.
 */
static void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long accum = 0;
    int i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Main loop: consume the input in groups of eight elements */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            accum += data0[i] * data1[i];
        }
        data0 += 8;
        data1 += 8;
        count -= 8;
    }

    /*
     * Tail: the remaining 0..7 elements, visited from the highest index
     * down to index 0.  Small counts skip the main loop after a single
     * comparison, and the loop always terminates.
     */
    while (count > 0) {
        --count;
        accum += data0[count] * data1[count];
    }

    *(npy_long *)dataptr[2] += accum;
}

/*
 * Two-operand sum-of-products kernel where the first operand is a single
 * value and the result is reduced into a single output element.
 *
 * The first operand is read once from dataptr[0].  The second operand is
 * indexed as a densely packed npy_long array (the strides argument is
 * ignored).  The elements data1[i] for i in [0, count) are summed, the sum
 * is multiplied by the first operand, and the product is added to the
 * npy_long stored at dataptr[2].
 *
 *   nop      not used by this kernel
 *   dataptr  dataptr[0]: scalar input; dataptr[1]: input array;
 *            dataptr[2]: output scalar
 *   strides  ignored
 *   count    number of elements to process; a non-positive count adds
 *            value0 * 0 to the output
 *
 * There is no SIMD path: the SSE intrinsics only operate on float lanes
 * and do not apply to npy_long data.
 */
static void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long value0 = (*(npy_long *)dataptr[0]);
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long accum = 0;
    int i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Main loop: consume the input in groups of eight elements */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            accum += data1[i];
        }
        data1 += 8;
        count -= 8;
    }

    /*
     * Tail: the remaining 0..7 elements, visited from the highest index
     * down to index 0.  The loop always terminates.
     */
    while (count > 0) {
        --count;
        accum += data1[count];
    }

    /* Apply the scalar factor once, after the whole sum is known */
    *(npy_long *)dataptr[2] += (value0 * accum);
}

/*
 * Two-operand sum-of-products kernel where the second operand is a single
 * value and the result is reduced into a single output element.
 *
 * The first operand is indexed as a densely packed npy_long array (the
 * strides argument is ignored).  The second operand is read once from
 * dataptr[1].  The elements data0[i] for i in [0, count) are summed, the
 * sum is multiplied by the second operand, and the product is added to
 * the npy_long stored at dataptr[2].
 *
 *   nop      not used by this kernel
 *   dataptr  dataptr[0]: input array; dataptr[1]: scalar input;
 *            dataptr[2]: output scalar
 *   strides  ignored
 *   count    number of elements to process; a non-positive count adds
 *            0 * value1 to the output
 *
 * There is no SIMD path: the SSE intrinsics only operate on float lanes
 * and do not apply to npy_long data.
 */
static void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long value1 = (*(npy_long *)dataptr[1]);
    npy_long accum = 0;
    int i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    /* Main loop: consume the input in groups of eight elements */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            accum += data0[i];
        }
        data0 += 8;
        count -= 8;
    }

    /*
     * Tail: the remaining 0..7 elements, visited from the highest index
     * down to index 0.  The loop always terminates.
     */
    while (count > 0) {
        --count;
        accum += data0[count];
    }

    /* Apply the scalar factor once, after the whole sum is known */
    *(npy_long *)dataptr[2] += (accum * value1);
}

#elif 3 == 3 && !0

/*
 * Three-operand element-wise kernel: for each processed index i,
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 * All four buffers are indexed as densely packed npy_long arrays (the
 * strides argument is ignored).  dataptr[0..2] are the inputs and
 * dataptr[3] is the output.  nop is not used.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *in0 = (npy_long *)dataptr[0];
    npy_long *in1 = (npy_long *)dataptr[1];
    npy_long *in2 = (npy_long *)dataptr[2];
    npy_long *out = (npy_long *)dataptr[3];
    int k;

    /* Whole groups of eight elements */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = in0[k] * in1[k] * in2[k] + out[k];
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Leftover elements: at most eight steps, stopping as soon as the
     * remaining count reaches zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = in0[k] * in1[k] * in2[k] + out[k];
    }
}

#else /* more than three operands, or a complex type */

/*
 * Generic variant of the contiguous kernel, driven by the runtime operand
 * count.  For each of `count` steps it multiplies the current element of
 * operands 0..nop-1, adds the product to the current output element at
 * dataptr[nop], and then advances every pointer in dataptr[0..nop] by one
 * element size.  The pointers stored in dataptr are modified in place;
 * the strides argument is ignored.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        int iop = 1;
        npy_long product = *(npy_long *)dataptr[0];

        /* Fold in the remaining input operands */
        while (iop < nop) {
            product *= *(npy_long *)dataptr[iop];
            ++iop;
        }
        /*
         * The old output value is read through the index left by the loop
         * above, which equals nop whenever nop >= 1.
         */
        *(npy_long *)dataptr[nop] = product + *(npy_long *)dataptr[iop];

        /* Step every operand, output included, to its next element */
        for (iop = 0; iop <= nop; ++iop) {
            dataptr[iop] += sizeof(npy_long);
        }
#else /* complex */
#  if 3 <= 3
#    define _SUMPROD_NOP 3
#  else
#    define _SUMPROD_NOP nop
#  endif
        int iop;
        npy_long acc_re = ((npy_long *)dataptr[0])[0];
        npy_long acc_im = ((npy_long *)dataptr[0])[1];

        /* Complex multiply-accumulate across the input operands */
        for (iop = 1; iop < _SUMPROD_NOP; ++iop) {
            npy_long next_re = acc_re * ((npy_long *)dataptr[iop])[0] -
                               acc_im * ((npy_long *)dataptr[iop])[1];

            acc_im = acc_re * ((npy_long *)dataptr[iop])[1] +
                     acc_im * ((npy_long *)dataptr[iop])[0];
            acc_re = next_re;
        }
        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = acc_re +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = acc_im +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];

        for (iop = 0; iop <= _SUMPROD_NOP; ++iop) {
            dataptr[iop] += sizeof(npy_long);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for the various operand counts */

#if 3 == 1

/*
 * Single-operand reduction kernel: sums a densely packed npy_long array
 * into a single output element.
 *
 * The elements data0[i] for i in [0, count) are accumulated and the total
 * is added to the npy_long stored at dataptr[1].
 *
 *   nop      not used by this kernel
 *   dataptr  dataptr[0]: input array; dataptr[1]: output scalar
 *   strides  not used by this kernel
 *   count    number of elements to process; a non-positive count simply
 *            adds zero to the output
 *
 * There is no SIMD path and no complex variant: the SSE/SSE2 intrinsics
 * only operate on float/double lanes, and npy_long is a real type.
 */
static void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_long accum = 0;
    npy_long *data0 = (npy_long *)dataptr[0];
    int i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    /* Main loop: consume the input in groups of eight elements */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            accum += data0[i];
        }
        data0 += 8;
        count -= 8;
    }

    /*
     * Tail: the remaining 0..7 elements, visited from the highest index
     * down to index 0.  The loop always terminates.
     */
    while (count > 0) {
        --count;
        accum += data0[count];
    }

    *((npy_long *)dataptr[1]) = (accum +
                            (*((npy_long *)dataptr[1])));
}

#endif /* 3 == 1 */

/*
 * Three-operand strided reduction kernel.
 *
 * Walks the three inputs dataptr[0..2] with byte strides strides[0..2],
 * accumulates the product of the three current npy_long elements at each
 * of `count` steps, and finally adds the total to the npy_long stored at
 * dataptr[3].  The pointers in dataptr are not modified; nop is not used.
 */
static void
long_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_long total = 0;
    char *cursor0 = dataptr[0];
    char *cursor1 = dataptr[1];
    char *cursor2 = dataptr[2];
    npy_intp step0 = strides[0];
    npy_intp step1 = strides[1];
    npy_intp step2 = strides[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_long *)cursor0) *
                 (*(npy_long *)cursor1) *
                 (*(npy_long *)cursor2);

        /* Strides are in bytes, hence the char * cursors */
        cursor0 += step0;
        cursor1 += step1;
        cursor2 += step2;
    }

    *((npy_long *)dataptr[3]) = (total +
                                    (*((npy_long *)dataptr[3])));
}


#line 113
/*
 * Generic strided inner loop for npy_long with a run-time operand count.
 *
 * On every one of the `count` steps the current elements of operands
 * dataptr[0] .. dataptr[nop-1] are multiplied together, the product is
 * added to the current element of the output operand dataptr[nop], and
 * all nop+1 pointers stored in dataptr are advanced by the corresponding
 * entries of strides.  The pointers in dataptr are modified in place.
 */
static void
long_sum_of_products_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_any (%d)\n", (int)count);

    while (count-- != 0) {
        npy_long product = *(npy_long *)dataptr[0];
        int iop = 1;

        /* Fold the remaining input operands into the running product */
        while (iop < nop) {
            product *= *(npy_long *)dataptr[iop];
            ++iop;
        }

        /*
         * The previous output value is fetched through the index left
         * behind by the loop above (iop == nop whenever nop >= 1).
         */
        *(npy_long *)dataptr[nop] = product + *(npy_long *)dataptr[iop];

        /* Step every operand, output included, to its next element */
        for (iop = 0; iop <= nop; ++iop) {
            dataptr[iop] += strides[iop];
        }
    }
}

#if 1000 == 1

/*
 * Single-operand inner loop for npy_long: adds each element of operand 0
 * (dataptr[0]) into the matching element of the output (dataptr[1]).
 *
 * nop is not used and strides is ignored; both buffers are indexed
 * directly as arrays of npy_long.  count is the number of elements to
 * process.  The pointers stored in dataptr are left untouched.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder (fewer than eight elements) is dispatched first so
         * that small counts never reach the block loop below.  Elements
         * are handled from the highest index down to index 0.
         */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fall through */
            case 6:
                dst[5] += src[5];
                /* fall through */
            case 5:
                dst[4] += src[4];
                /* fall through */
            case 4:
                dst[3] += src[3];
                /* fall through */
            case 3:
                dst[2] += src[2];
                /* fall through */
            case 2:
                dst[1] += src[1];
                /* fall through */
            case 1:
                dst[0] += src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Consume whole blocks of eight elements */
        while (count >= 8) {
            count -= 8;

            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
        }

        /* Loop back to the switch to finish off what is left */
    }
}

#elif 1000 == 2 && !0

/*
 * Two-operand inner loop for npy_long: for every element index k,
 * out[k] += in0[k] * in1[k], where in0, in1 and out are dataptr[0],
 * dataptr[1] and dataptr[2].
 *
 * nop is not used and strides is ignored; all three buffers are indexed
 * directly as arrays of npy_long.  count is the number of elements to
 * process.  The pointers stored in dataptr are left untouched.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder (fewer than eight elements) is dispatched first so
         * that small counts never reach the block loop below.
         */
        switch (count) {
            case 7:
                dst[6] += lhs[6] * rhs[6];
                /* fall through */
            case 6:
                dst[5] += lhs[5] * rhs[5];
                /* fall through */
            case 5:
                dst[4] += lhs[4] * rhs[4];
                /* fall through */
            case 4:
                dst[3] += lhs[3] * rhs[3];
                /* fall through */
            case 3:
                dst[2] += lhs[2] * rhs[2];
                /* fall through */
            case 2:
                dst[1] += lhs[1] * rhs[1];
                /* fall through */
            case 1:
                dst[0] += lhs[0] * rhs[0];
                /* fall through */
            case 0:
                return;
        }

        /* Consume whole blocks of eight elements */
        while (count >= 8) {
            count -= 8;

            dst[0] += lhs[0] * rhs[0];
            dst[1] += lhs[1] * rhs[1];
            dst[2] += lhs[2] * rhs[2];
            dst[3] += lhs[3] * rhs[3];
            dst[4] += lhs[4] * rhs[4];
            dst[5] += lhs[5] * rhs[5];
            dst[6] += lhs[6] * rhs[6];
            dst[7] += lhs[7] * rhs[7];

            lhs += 8;
            rhs += 8;
            dst += 8;
        }

        /* Loop back to the switch to finish off what is left */
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand inner loop for npy_long where operand 0 is a single value:
 * for every element index k, out[k] += scalar * in1[k].
 *
 * The scalar is read once from dataptr[0] on entry; in1 and out are
 * dataptr[1] and dataptr[2], indexed directly as arrays of npy_long.
 * nop is not used and strides is ignored.  count is the number of
 * elements to process.  The pointers stored in dataptr are left untouched.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long scalar = *(npy_long *)dataptr[0];
    npy_long *src = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    do {
        /*
         * The remainder (fewer than eight elements) is dispatched first so
         * that small counts never reach the block loop below.
         */
        switch (count) {
            case 7:
                dst[6] += scalar * src[6];
                /* fall through */
            case 6:
                dst[5] += scalar * src[5];
                /* fall through */
            case 5:
                dst[4] += scalar * src[4];
                /* fall through */
            case 4:
                dst[3] += scalar * src[3];
                /* fall through */
            case 3:
                dst[2] += scalar * src[2];
                /* fall through */
            case 2:
                dst[1] += scalar * src[1];
                /* fall through */
            case 1:
                dst[0] += scalar * src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Consume whole blocks of eight elements */
        while (count >= 8) {
            count -= 8;

            dst[0] += scalar * src[0];
            dst[1] += scalar * src[1];
            dst[2] += scalar * src[2];
            dst[3] += scalar * src[3];
            dst[4] += scalar * src[4];
            dst[5] += scalar * src[5];
            dst[6] += scalar * src[6];
            dst[7] += scalar * src[7];

            src += 8;
            dst += 8;
        }

        /* Only revisit the switch when elements are still outstanding */
    } while (count > 0);
}

/*
 * Two-operand inner loop for npy_long where operand 1 is a single value:
 * for every element index k, out[k] += in0[k] * scalar.
 *
 * The scalar is read once from dataptr[1] on entry; in0 and out are
 * dataptr[0] and dataptr[2], indexed directly as arrays of npy_long.
 * nop is not used and strides is ignored.  count is the number of
 * elements to process.  The pointers stored in dataptr are left untouched.
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long scalar = *(npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder (fewer than eight elements) is dispatched first so
         * that small counts never reach the block loop below.
         */
        switch (count) {
            case 7:
                dst[6] += src[6] * scalar;
                /* fall through */
            case 6:
                dst[5] += src[5] * scalar;
                /* fall through */
            case 5:
                dst[4] += src[4] * scalar;
                /* fall through */
            case 4:
                dst[3] += src[3] * scalar;
                /* fall through */
            case 3:
                dst[2] += src[2] * scalar;
                /* fall through */
            case 2:
                dst[1] += src[1] * scalar;
                /* fall through */
            case 1:
                dst[0] += src[0] * scalar;
                /* fall through */
            case 0:
                return;
        }

        /* Consume whole blocks of eight elements */
        while (count >= 8) {
            count -= 8;

            dst[0] += src[0] * scalar;
            dst[1] += src[1] * scalar;
            dst[2] += src[2] * scalar;
            dst[3] += src[3] * scalar;
            dst[4] += src[4] * scalar;
            dst[5] += src[5] * scalar;
            dst[6] += src[6] * scalar;
            dst[7] += src[7] * scalar;

            src += 8;
            dst += 8;
        }

        /* Loop back to the switch to finish off what is left */
    }
}

/*
 * Two-operand reduction for npy_long: accumulates the sum of
 * in0[k] * in1[k] over `count` elements in a local variable and, once
 * everything has been consumed, adds that total to the single output
 * value at dataptr[2].
 *
 * in0 and in1 are dataptr[0] and dataptr[1], indexed directly as arrays
 * of npy_long.  nop is not used and strides is ignored.  The pointers
 * stored in dataptr are left untouched.
 */
static void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long total = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder (fewer than eight elements) is dispatched first so
         * that small counts never reach the block loop below.  Case 0 is
         * the single exit point and is where the output gets written.
         */
        switch (count) {
            case 7:
                total += lhs[6] * rhs[6];
                /* fall through */
            case 6:
                total += lhs[5] * rhs[5];
                /* fall through */
            case 5:
                total += lhs[4] * rhs[4];
                /* fall through */
            case 4:
                total += lhs[3] * rhs[3];
                /* fall through */
            case 3:
                total += lhs[2] * rhs[2];
                /* fall through */
            case 2:
                total += lhs[1] * rhs[1];
                /* fall through */
            case 1:
                total += lhs[0] * rhs[0];
                /* fall through */
            case 0:
                *(npy_long *)dataptr[2] += total;
                return;
        }

        /* Consume whole blocks of eight elements */
        while (count >= 8) {
            count -= 8;

            total += lhs[0] * rhs[0];
            total += lhs[1] * rhs[1];
            total += lhs[2] * rhs[2];
            total += lhs[3] * rhs[3];
            total += lhs[4] * rhs[4];
            total += lhs[5] * rhs[5];
            total += lhs[6] * rhs[6];
            total += lhs[7] * rhs[7];

            lhs += 8;
            rhs += 8;
        }

        /* Loop back to the switch to finish off what is left */
    }
}

/*
 * Two-operand sum-of-products kernel for the case where operand 0 is a
 * single npy_long (read once from dataptr[0]), operand 1 is walked as
 * consecutive npy_long elements starting at dataptr[1], and the result
 * is folded into the single npy_long at dataptr[2]:
 *
 *     *out += value0 * (data1[0] + data1[1] + ... + data1[count-1])
 *
 * Neither nop nor strides is consulted.
 *
 * NOTE: presumably count is never negative; such a value matches no
 *       case below and fails the unrolled-loop test, so the outer loop
 *       would never terminate.
 */
static void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long scalar = (*(npy_long *)dataptr[0]);
    npy_long *src = (npy_long *)dataptr[1];
    npy_long total = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder (fewer than eight elements) is tested ahead of
         * the unrolled loop so that small counts finish without ever
         * reaching it.  Every case drops into the next one.
         */
        switch (count) {
            case 7:
                total += src[6];
                /* fall through */
            case 6:
                total += src[5];
                /* fall through */
            case 5:
                total += src[4];
                /* fall through */
            case 4:
                total += src[3];
                /* fall through */
            case 3:
                total += src[2];
                /* fall through */
            case 2:
                total += src[1];
                /* fall through */
            case 1:
                total += src[0];
                /* fall through */
            case 0:
                *(npy_long *)dataptr[2] += (scalar * total);
                return;
        }

        /* Eight elements per pass for as long as eight remain */
        while (count >= 8) {
            count -= 8;

            total += src[0];
            total += src[1];
            total += src[2];
            total += src[3];
            total += src[4];
            total += src[5];
            total += src[6];
            total += src[7];

            src += 8;
        }

        /* Back to the switch to consume whatever is left */
    }
}

/*
 * Two-operand sum-of-products kernel for the case where operand 0 is
 * walked as consecutive npy_long elements starting at dataptr[0],
 * operand 1 is a single npy_long (read once from dataptr[1]), and the
 * result is folded into the single npy_long at dataptr[2]:
 *
 *     *out += (data0[0] + data0[1] + ... + data0[count-1]) * value1
 *
 * Neither nop nor strides is consulted.
 *
 * NOTE: presumably count is never negative; such a value matches no
 *       case below and fails the unrolled-loop test, so the outer loop
 *       would never terminate.
 */
static void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long scalar = (*(npy_long *)dataptr[1]);
    npy_long total = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder (fewer than eight elements) is tested ahead of
         * the unrolled loop so that small counts finish without ever
         * reaching it.  Every case drops into the next one.
         */
        switch (count) {
            case 7:
                total += src[6];
                /* fall through */
            case 6:
                total += src[5];
                /* fall through */
            case 5:
                total += src[4];
                /* fall through */
            case 4:
                total += src[3];
                /* fall through */
            case 3:
                total += src[2];
                /* fall through */
            case 2:
                total += src[1];
                /* fall through */
            case 1:
                total += src[0];
                /* fall through */
            case 0:
                *(npy_long *)dataptr[2] += (total * scalar);
                return;
        }

        /* Eight elements per pass for as long as eight remain */
        while (count >= 8) {
            count -= 8;

            total += src[0];
            total += src[1];
            total += src[2];
            total += src[3];
            total += src[4];
            total += src[5];
            total += src[6];
            total += src[7];

            src += 8;
        }

        /* Back to the switch to consume whatever is left */
    }
}

#elif 1000 == 3 && !0

/*
 * Three-operand sum-of-products kernel over consecutive npy_long
 * elements.  dataptr[0], dataptr[1] and dataptr[2] are the inputs and
 * dataptr[3] is the output; each output element is updated as
 *
 *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
 *
 * Elements are handled in increasing order, eight at a time while at
 * least eight remain, then one at a time.  nop and strides are not
 * consulted.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *in0 = (npy_long *)dataptr[0];
    npy_long *in1 = (npy_long *)dataptr[1];
    npy_long *in2 = (npy_long *)dataptr[2];
    npy_long *out = (npy_long *)dataptr[3];
    int k;

    /* Blocks of eight, written out explicitly */
    while (count >= 8) {
        count -= 8;

        out[0] = in0[0] * in1[0] * in2[0] + out[0];
        out[1] = in0[1] * in1[1] * in2[1] + out[1];
        out[2] = in0[2] * in1[2] * in2[2] + out[2];
        out[3] = in0[3] * in1[3] * in2[3] + out[3];
        out[4] = in0[4] * in1[4] * in2[4] + out[4];
        out[5] = in0[5] * in1[5] * in2[5] + out[5];
        out[6] = in0[6] * in1[6] * in2[6] + out[6];
        out[7] = in0[7] * in1[7] * in2[7] + out[7];

        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Remainder: at most eight single steps, stopping as soon as the
     * count has been used up.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = in0[k] * in1[k] * in2[k] + out[k];
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Sum-of-products kernel for an arbitrary operand count, treating every
 * operand as consecutive npy_long elements.
 *
 * For each of the count elements: the current values of operands
 * 0 .. nop-1 are multiplied together, the product is added to the
 * current value of operand nop (the output), and then all nop+1
 * pointers stored in dataptr are moved forward by sizeof(npy_long).
 * The entries of dataptr are therefore modified in place; strides is
 * not consulted.
 */
static void
long_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    int op;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_long product = (*(npy_long *)dataptr[0]);

        for (op = 1; op < nop; ++op) {
            product *= (*(npy_long *)dataptr[op]);
        }

        /*
         * The previous output value is read through the index the loop
         * above stopped at, which equals nop whenever nop >= 1.
         */
        *(npy_long *)dataptr[nop] = (product +
                                     (*(npy_long *)dataptr[op]));

        /* Step the inputs and the output to the next element */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_long);
        }
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * One-operand reduction kernel: the count consecutive npy_long elements
 * starting at dataptr[0] are summed and the sum is added to the single
 * npy_long at dataptr[1].  nop and strides are not consulted.
 *
 * NOTE: presumably count is never negative; such a value matches no
 *       case below and fails the unrolled-loop test, so the outer loop
 *       would never terminate.
 */
static void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_long total = 0;
    npy_long *src = (npy_long *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder (fewer than eight elements) is tested ahead of
         * the unrolled loop so that small counts finish without ever
         * reaching it.  Every case drops into the next one.
         */
        switch (count) {
            case 7:
                total += src[6];
                /* fall through */
            case 6:
                total += src[5];
                /* fall through */
            case 5:
                total += src[4];
                /* fall through */
            case 4:
                total += src[3];
                /* fall through */
            case 3:
                total += src[2];
                /* fall through */
            case 2:
                total += src[1];
                /* fall through */
            case 1:
                total += src[0];
                /* fall through */
            case 0:
                *((npy_long *)dataptr[1]) = (total +
                                        (*((npy_long *)dataptr[1])));
                return;
        }

        /* Eight elements per pass for as long as eight remain */
        while (count >= 8) {
            count -= 8;

            total += src[0];
            total += src[1];
            total += src[2];
            total += src[3];
            total += src[4];
            total += src[5];
            total += src[6];
            total += src[7];

            src += 8;
        }

        /* Back to the switch to consume whatever is left */
    }
}

#endif /* 1000 == 1 */

/*
 * Sum-of-products kernel for an arbitrary operand count that reduces
 * into a single npy_long output.
 *
 * For each of the count steps the current npy_long values of operands
 * 0 .. nop-1 are multiplied together and the product is added to a
 * running total; dataptr[k] is then advanced by strides[k] bytes for
 * every input k.  The output pointer dataptr[nop] is left where it is,
 * and once the loop is done the total is added to the npy_long it
 * points at.  The input entries of dataptr are modified in place.
 */
static void
long_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_long total = 0;
    int op;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_long product = (*(npy_long *)dataptr[0]);

        for (op = 1; op < nop; ++op) {
            product *= (*(npy_long *)dataptr[op]);
        }
        total += product;

        /* Only the inputs move; the output stays put */
        for (op = 0; op < nop; ++op) {
            dataptr[op] += strides[op];
        }
    }

    *((npy_long *)dataptr[nop]) = (total +
                                   (*((npy_long *)dataptr[nop])));
}




#line 108

#line 113
/*
 * One-operand strided kernel: for each of the count steps the
 * npy_longlong read through the input pointer is added to the
 * npy_longlong at the output pointer.  The input starts at dataptr[0]
 * and moves by strides[0] bytes per step; the output starts at
 * dataptr[1] and moves by strides[1] bytes per step.
 *
 * The function works on local copies of the two pointers, so the
 * entries of dataptr are not modified.  nop is not consulted.
 */
static void
longlong_sum_of_products_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *src = dataptr[0];
    char *dst = dataptr[1];
    npy_intp src_step = strides[0];
    npy_intp dst_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
        *(npy_longlong *)dst = ((*(npy_longlong *)src) +
                                (*(npy_longlong *)dst));
        src += src_step;
        dst += dst_step;
    }
}

#if 1 == 1

static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *data0 = (npy_longlong *)dataptr[0];
    npy_longlong *data_out = (npy_longlong *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 228
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_longlong *)data_out + 2*6)[0] =
                                    ((npy_longlong *)data0 + 2*6)[0] +
                                    ((npy_longlong *)data_out + 2*6)[0];
            ((npy_longlong *)data_out + 2*6)[1] =
                                    ((npy_longlong *)data0 + 2*6)[1] +
                                    ((npy_longlong *)data_out + 2*6)[1];
#endif

#line 228
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_longlong *)data_out + 2*5)[0] =
                                    ((npy_longlong *)data0 + 2*5)[0] +
                                    ((npy_longlong *)data_out + 2*5)[0];
            ((npy_longlong *)data_out + 2*5)[1] =
                                    ((npy_longlong *)data0 + 2*5)[1] +
                                    ((npy_longlong *)data_out + 2*5)[1];
#endif

#line 228
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_longlong *)data_out + 2*4)[0] =
                                    ((npy_longlong *)data0 + 2*4)[0] +
                                    ((npy_longlong *)data_out + 2*4)[0];
            ((npy_longlong *)data_out + 2*4)[1] =
                                    ((npy_longlong *)data0 + 2*4)[1] +
                                    ((npy_longlong *)data_out + 2*4)[1];
#endif

#line 228
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_longlong *)data_out + 2*3)[0] =
                                    ((npy_longlong *)data0 + 2*3)[0] +
                                    ((npy_longlong *)data_out + 2*3)[0];
            ((npy_longlong *)data_out + 2*3)[1] =
                                    ((npy_longlong *)data0 + 2*3)[1] +
                                    ((npy_longlong *)data_out + 2*3)[1];
#endif

#line 228
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_longlong *)data_out + 2*2)[0] =
                                    ((npy_longlong *)data0 + 2*2)[0] +
                                    ((npy_longlong *)data_out + 2*2)[0];
            ((npy_longlong *)data_out + 2*2)[1] =
                                    ((npy_longlong *)data0 + 2*2)[1] +
                                    ((npy_longlong *)data_out + 2*2)[1];
#endif

#line 228
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_longlong *)data_out + 2*1)[0] =
                                    ((npy_longlong *)data0 + 2*1)[0] +
                                    ((npy_longlong *)data_out + 2*1)[0];
            ((npy_longlong *)data_out + 2*1)[1] =
                                    ((npy_longlong *)data0 + 2*1)[1] +
                                    ((npy_longlong *)data_out + 2*1)[1];
#endif

#line 228
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_longlong *)data_out + 2*0)[0] =
                                    ((npy_longlong *)data0 + 2*0)[0] +
                                    ((npy_longlong *)data_out + 2*0)[0];
            ((npy_longlong *)data_out + 2*0)[1] =
                                    ((npy_longlong *)data0 + 2*0)[1] +
                                    ((npy_longlong *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 252
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_longlong *)data_out + 2*0)[0] =
                                ((npy_longlong *)data0 + 2*0)[0] +
                                ((npy_longlong *)data_out + 2*0)[0];
        ((npy_longlong *)data_out + 2*0)[1] =
                                ((npy_longlong *)data0 + 2*0)[1] +
                                ((npy_longlong *)data_out + 2*0)[1];
#endif

#line 252
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_longlong *)data_out + 2*1)[0] =
                                ((npy_longlong *)data0 + 2*1)[0] +
                                ((npy_longlong *)data_out + 2*1)[0];
        ((npy_longlong *)data_out + 2*1)[1] =
                                ((npy_longlong *)data0 + 2*1)[1] +
                                ((npy_longlong *)data_out + 2*1)[1];
#endif

#line 252
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_longlong *)data_out + 2*2)[0] =
                                ((npy_longlong *)data0 + 2*2)[0] +
                                ((npy_longlong *)data_out + 2*2)[0];
        ((npy_longlong *)data_out + 2*2)[1] =
                                ((npy_longlong *)data0 + 2*2)[1] +
                                ((npy_longlong *)data_out + 2*2)[1];
#endif

#line 252
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_longlong *)data_out + 2*3)[0] =
                                ((npy_longlong *)data0 + 2*3)[0] +
                                ((npy_longlong *)data_out + 2*3)[0];
        ((npy_longlong *)data_out + 2*3)[1] =
                                ((npy_longlong *)data0 + 2*3)[1] +
                                ((npy_longlong *)data_out + 2*3)[1];
#endif

#line 252
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_longlong *)data_out + 2*4)[0] =
                                ((npy_longlong *)data0 + 2*4)[0] +
                                ((npy_longlong *)data_out + 2*4)[0];
        ((npy_longlong *)data_out + 2*4)[1] =
                                ((npy_longlong *)data0 + 2*4)[1] +
                                ((npy_longlong *)data_out + 2*4)[1];
#endif

#line 252
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_longlong *)data_out + 2*5)[0] =
                                ((npy_longlong *)data0 + 2*5)[0] +
                                ((npy_longlong *)data_out + 2*5)[0];
        ((npy_longlong *)data_out + 2*5)[1] =
                                ((npy_longlong *)data0 + 2*5)[1] +
                                ((npy_longlong *)data_out + 2*5)[1];
#endif

#line 252
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_longlong *)data_out + 2*6)[0] =
                                ((npy_longlong *)data0 + 2*6)[0] +
                                ((npy_longlong *)data_out + 2*6)[0];
        ((npy_longlong *)data_out + 2*6)[1] =
                                ((npy_longlong *)data0 + 2*6)[1] +
                                ((npy_longlong *)data_out + 2*6)[1];
#endif

#line 252
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_longlong *)data_out + 2*7)[0] =
                                ((npy_longlong *)data0 + 2*7)[0] +
                                ((npy_longlong *)data_out + 2*7)[0];
        ((npy_longlong *)data_out + 2*7)[1] =
                                ((npy_longlong *)data0 + 2*7)[1] +
                                ((npy_longlong *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#elif 1 == 2 && !0

/*
 * Two-operand multiply-accumulate over npy_longlong data that is indexed
 * as plain arrays:
 *
 *     out[i] += in0[i] * in1[i]        for 0 <= i < count
 *
 * dataptr[0] and dataptr[1] are the inputs, dataptr[2] is the output.
 * 'strides' is ignored and 'nop' is not read.
 *
 * Elements are processed in whole groups of eight in ascending order,
 * then the remaining (fewer than eight) elements from the highest index
 * down.  That order is kept on purpose: it is only observable when the
 * output overlaps an input at a different offset, and it matches what the
 * unrolled switch/goto form of this kernel did.
 *
 * A count of zero or less is a no-op.  There is no SSE path here; the
 * float/double intrinsics of the template do not apply to this integer
 * type.
 */
static void
longlong_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *data0 = (npy_longlong *)dataptr[0];
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong *data_out = (npy_longlong *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    /* Full groups of eight; the fixed trip count lets the compiler unroll */
    for (; count >= 8; count -= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] += data0[i] * data1[i];
        }
        data0 += 8;
        data1 += 8;
        data_out += 8;
    }

    /* Leftover elements, highest index first; skipped when count <= 0 */
    for (i = count; i > 0; --i) {
        data_out[i - 1] += data0[i - 1] * data1[i - 1];
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand multiply-accumulate where the first operand is a single
 * value and the other two are indexed as plain npy_longlong arrays:
 *
 *     out[i] += value0 * in1[i]        for 0 <= i < count
 *
 * value0 is read once from dataptr[0] before any output is written.
 * dataptr[1] is the array input and dataptr[2] the output.  'strides' is
 * ignored and 'nop' is not read.
 *
 * Elements are processed in whole groups of eight in ascending order,
 * then the remaining (fewer than eight) elements from the highest index
 * down; this order only matters if the output overlaps the input at a
 * different offset.
 *
 * A count of zero or less is a no-op.  There is no SSE path here; the
 * float/double intrinsics of the template do not apply to this integer
 * type.
 */
static void
longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong value0 = (*(npy_longlong *)dataptr[0]);
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong *data_out = (npy_longlong *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count lets the compiler unroll */
    for (; count >= 8; count -= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] += value0 * data1[i];
        }
        data1 += 8;
        data_out += 8;
    }

    /* Leftover elements, highest index first; skipped when count <= 0 */
    for (i = count; i > 0; --i) {
        data_out[i - 1] += value0 * data1[i - 1];
    }
}

/*
 * Two-operand multiply-accumulate where the second operand is a single
 * value and the other two are indexed as plain npy_longlong arrays:
 *
 *     out[i] += in0[i] * value1        for 0 <= i < count
 *
 * value1 is read once from dataptr[1] before any output is written.
 * dataptr[0] is the array input and dataptr[2] the output.  'strides' is
 * ignored and 'nop' is not read.
 *
 * Elements are processed in whole groups of eight in ascending order,
 * then the remaining (fewer than eight) elements from the highest index
 * down; this order only matters if the output overlaps the input at a
 * different offset.
 *
 * A count of zero or less is a no-op.  There is no SSE path here; the
 * float intrinsics of the template do not apply to this integer type.
 */
static void
longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *data0 = (npy_longlong *)dataptr[0];
    npy_longlong value1 = (*(npy_longlong *)dataptr[1]);
    npy_longlong *data_out = (npy_longlong *)dataptr[2];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count lets the compiler unroll */
    for (; count >= 8; count -= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] += data0[i] * value1;
        }
        data0 += 8;
        data_out += 8;
    }

    /* Leftover elements, highest index first; skipped when count <= 0 */
    for (i = count; i > 0; --i) {
        data_out[i - 1] += data0[i - 1] * value1;
    }
}

/*
 * Two-operand dot-product style reduction over npy_longlong data that is
 * indexed as plain arrays, accumulated into a single output element:
 *
 *     *out += sum(in0[i] * in1[i])     for 0 <= i < count
 *
 * dataptr[0] and dataptr[1] are the inputs; dataptr[2] points at the one
 * output value, which is read and written exactly once, after all inputs
 * have been consumed.  'strides' is ignored and 'nop' is not read.
 *
 * The products are summed into a local accumulator first: whole groups
 * of eight in ascending order, then the remaining (fewer than eight)
 * elements from the highest index down.
 *
 * A count of zero or less adds zero to the output.  There is no SSE path
 * here; the float/double intrinsics of the template do not apply to this
 * integer type.
 */
static void
longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *data0 = (npy_longlong *)dataptr[0];
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong accum = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count lets the compiler unroll */
    for (; count >= 8; count -= 8) {
        for (i = 0; i < 8; ++i) {
            accum += data0[i] * data1[i];
        }
        data0 += 8;
        data1 += 8;
    }

    /* Leftover elements, highest index first; skipped when count <= 0 */
    for (i = count; i > 0; --i) {
        accum += data0[i - 1] * data1[i - 1];
    }

    /* Single read-modify-write of the output element */
    *(npy_longlong *)dataptr[2] += (accum);
}

/*
 * Two-operand reduction where the first operand is a single value, the
 * second is indexed as a plain npy_longlong array, and the result is
 * accumulated into a single output element:
 *
 *     *out += value0 * sum(in1[i])     for 0 <= i < count
 *
 * value0 is read once from dataptr[0] on entry.  dataptr[1] is the array
 * input; dataptr[2] points at the one output value, which is read and
 * written exactly once, after all inputs have been consumed.  'strides'
 * is ignored and 'nop' is not read.
 *
 * The inputs are summed into a local accumulator first (whole groups of
 * eight in ascending order, then the remaining elements from the highest
 * index down) and the multiplication by value0 is applied once to the
 * total.
 *
 * A count of zero or less adds zero to the output.  There is no SSE path
 * here; the float intrinsics of the template do not apply to this integer
 * type.
 */
static void
longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong value0 = (*(npy_longlong *)dataptr[0]);
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong accum = 0;
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count lets the compiler unroll */
    for (; count >= 8; count -= 8) {
        for (i = 0; i < 8; ++i) {
            accum += data1[i];
        }
        data1 += 8;
    }

    /* Leftover elements, highest index first; skipped when count <= 0 */
    for (i = count; i > 0; --i) {
        accum += data1[i - 1];
    }

    /* Scale the total once and fold it into the output element */
    *(npy_longlong *)dataptr[2] += (value0 * accum);
}

/*
 * Two-operand npy_longlong kernel.
 *
 * dataptr[0] is read as a contiguous run of npy_longlong values,
 * dataptr[1] is read exactly once as a single npy_longlong, and the
 * single npy_longlong at dataptr[2] is updated in place:
 *
 *     out += (in0[0] + in0[1] + ... ) * in1
 *
 * 'nop' is not referenced by the body and 'strides' is marked unused.
 * The SSE paths of the template are compiled out for this type, so only
 * the scalar code is kept here.
 */
static void
longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    npy_longlong scale = (*(npy_longlong *)dataptr[1]);
    npy_longlong total = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder is handled ahead of the block loop so that a
         * small count finishes without entering the block loop at all.
         * Each case deliberately falls through to the next one.
         */
        switch (count) {
            case 7:
                total += (in0[6]);
                /* fall through */
            case 6:
                total += (in0[5]);
                /* fall through */
            case 5:
                total += (in0[4]);
                /* fall through */
            case 4:
                total += (in0[3]);
                /* fall through */
            case 3:
                total += (in0[2]);
                /* fall through */
            case 2:
                total += (in0[1]);
                /* fall through */
            case 1:
                total += (in0[0]);
                /* fall through */
            case 0:
                /* Apply the broadcast factor once, to the whole sum */
                *(npy_longlong *)dataptr[2] += (total * scale);
                return;
        }

        /* Consume every full block of eight elements that is available */
        while (count >= 8) {
            total += (in0[0]);
            total += (in0[1]);
            total += (in0[2]);
            total += (in0[3]);
            total += (in0[4]);
            total += (in0[5]);
            total += (in0[6]);
            total += (in0[7]);

            in0 += 8;
            count -= 8;
        }

        /* Loop back to the switch to finish off what is left */
    }
}

#elif 1 == 3 && !0

/*
 * Three-operand npy_longlong kernel over contiguous data.
 *
 * For each element index k this performs
 *
 *     out[k] += in0[k] * in1[k] * in2[k]
 *
 * where in0/in1/in2 are dataptr[0..2] and out is dataptr[3], all viewed
 * as npy_longlong arrays.  'nop' is not referenced by the body and
 * 'strides' is marked unused.
 */
static void
longlong_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    npy_longlong *in1 = (npy_longlong *)dataptr[1];
    npy_longlong *in2 = (npy_longlong *)dataptr[2];
    npy_longlong *out = (npy_longlong *)dataptr[3];
    int k;

    /* Full blocks of eight elements, written out by hand */
    for (; count >= 8; count -= 8) {
        out[0] += in0[0] * in1[0] * in2[0];
        out[1] += in0[1] * in1[1] * in2[1];
        out[2] += in0[2] * in1[2] * in2[2];
        out[3] += in0[3] * in1[3] * in2[3];
        out[4] += in0[4] * in1[4] * in2[4];
        out[5] += in0[5] * in1[5] * in2[5];
        out[6] += in0[6] * in1[6] * in2[6];
        out[7] += in0[7] * in1[7] * in2[7];

        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Finish off the loop: handle up to eight further elements, stopping
     * as soon as the remaining count reaches zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] += in0[k] * in1[k] * in2[k];
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic contiguous npy_longlong kernel driven by 'nop'.
 *
 * For each of 'count' steps: multiply together the current element of
 * dataptr[0] .. dataptr[nop-1], add the current output value, store the
 * result through dataptr[nop], and then advance every entry of
 * dataptr[0..nop] by sizeof(npy_longlong).  Note that the pointers stored
 * in the caller's 'dataptr' array are modified.  'strides' is marked
 * unused.
 */
static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_longlong product = (*(npy_longlong *)dataptr[0]);
        int iop = 1;

        /* Fold in the remaining input operands */
        while (iop < nop) {
            product *= (*(npy_longlong *)dataptr[iop]);
            ++iop;
        }

        /*
         * The existing output value is read through the index the
         * operand loop stopped at, and the sum is stored at dataptr[nop].
         */
        *(npy_longlong *)dataptr[nop] = (product +
                                           (*(npy_longlong *)dataptr[iop]));

        /* Step all inputs and the output to the next element */
        for (iop = 0; iop <= nop; ++iop) {
            dataptr[iop] += sizeof(npy_longlong);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * One-operand npy_longlong reduction over contiguous input.
 *
 * Adds up the npy_longlong values starting at dataptr[0] and adds that
 * sum to the single npy_longlong at dataptr[1].  Neither 'nop' nor
 * 'strides' is referenced by the body.
 *
 * The SSE1/SSE2 and complex paths of the template are compiled out for
 * this type, so only the scalar real-valued code is kept here.
 */
static void
longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    npy_longlong *src = (npy_longlong *)dataptr[0];
    npy_longlong total = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n",
                                                    (int)count);

    for (;;) {
        /*
         * The remainder is handled ahead of the block loop so that a
         * small count finishes without entering the block loop at all.
         * Each case deliberately falls through to the next one.
         */
        switch (count) {
            case 7:
                total += (src[6]);
                /* fall through */
            case 6:
                total += (src[5]);
                /* fall through */
            case 5:
                total += (src[4]);
                /* fall through */
            case 4:
                total += (src[3]);
                /* fall through */
            case 3:
                total += (src[2]);
                /* fall through */
            case 2:
                total += (src[1]);
                /* fall through */
            case 1:
                total += (src[0]);
                /* fall through */
            case 0:
                /* Fold the local sum into the output value */
                *((npy_longlong *)dataptr[1]) += total;
                return;
        }

        /* Consume every full block of eight elements that is available */
        while (count >= 8) {
            total += (src[0]);
            total += (src[1]);
            total += (src[2]);
            total += (src[3]);
            total += (src[4]);
            total += (src[5]);
            total += (src[6]);
            total += (src[7]);

            src += 8;
            count -= 8;
        }

        /* Loop back to the switch to finish off what is left */
    }
}

#endif /* 1 == 1 */

/*
 * One-operand npy_longlong reduction over strided input.
 *
 * Reads 'count' npy_longlong values starting at dataptr[0], stepping by
 * strides[0] bytes between reads, and adds their sum to the single
 * npy_longlong at dataptr[1].  'nop' is not referenced by the body.
 *
 * Only the single-operand, real-valued branch of the template is active
 * for this instantiation; the other branches are compiled out and are
 * not reproduced here.
 */
static void
longlong_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *src = dataptr[0];
    npy_intp step = strides[0];
    npy_longlong total = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_longlong *)src);
        src += step;
    }

    /* Fold the local sum into the output value */
    *((npy_longlong *)dataptr[1]) += total;
}


#line 113
/*
 * Two-operand npy_longlong kernel with arbitrary strides.
 *
 * For each of 'count' steps this performs
 *
 *     *out += (*in0) * (*in1)
 *
 * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2], and then
 * advances each of the three pointers by its entry in strides[0..2]
 * (byte offsets applied to char pointers).  'nop' is not referenced by
 * the body.
 *
 * Only the two-operand, real-valued branch of the template is active for
 * this instantiation; the other branches are compiled out and are not
 * reproduced here.
 */
static void
longlong_sum_of_products_two(int nop, char **dataptr,
                                npy_intp *strides, npy_intp count)
{
    char *in0 = dataptr[0];
    char *in1 = dataptr[1];
    char *out = dataptr[2];
    npy_intp step0 = strides[0];
    npy_intp step1 = strides[1];
    npy_intp step_out = strides[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        *(npy_longlong *)out += (*(npy_longlong *)in0) *
                                (*(npy_longlong *)in1);

        in0 += step0;
        in1 += step1;
        out += step_out;
    }
}

#if 2 == 1

/*
 * One-operand npy_longlong kernel over contiguous data.
 *
 * For each element index k this performs
 *
 *     out[k] += in[k]
 *
 * where in is dataptr[0] and out is dataptr[1], both viewed as
 * npy_longlong arrays.  'nop' is not referenced by the body and
 * 'strides' is marked unused.
 *
 * The complex path of the template is compiled out for this type, so
 * only the real-valued code is kept here.
 */
static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *src = (npy_longlong *)dataptr[0];
    npy_longlong *dst = (npy_longlong *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder is handled ahead of the block loop so that a
         * small count finishes without entering the block loop at all.
         * Each case deliberately falls through to the next one.
         */
        switch (count) {
            case 7:
                dst[6] += (src[6]);
                /* fall through */
            case 6:
                dst[5] += (src[5]);
                /* fall through */
            case 5:
                dst[4] += (src[4]);
                /* fall through */
            case 4:
                dst[3] += (src[3]);
                /* fall through */
            case 3:
                dst[2] += (src[2]);
                /* fall through */
            case 2:
                dst[1] += (src[1]);
                /* fall through */
            case 1:
                dst[0] += (src[0]);
                /* fall through */
            case 0:
                return;
        }

        /* Consume every full block of eight elements that is available */
        while (count >= 8) {
            dst[0] += (src[0]);
            dst[1] += (src[1]);
            dst[2] += (src[2]);
            dst[3] += (src[3]);
            dst[4] += (src[4]);
            dst[5] += (src[5]);
            dst[6] += (src[6]);
            dst[7] += (src[7]);

            src += 8;
            dst += 8;
            count -= 8;
        }

        /* Loop back to the switch to finish off what is left */
    }
}

#elif 2 == 2 && !0

/* One multiply-accumulate step on element i of the current 8-wide window. */
#define LONGLONG_CONTIG_TWO_STEP(i) \
    (out[(i)] = ((in0[(i)]) * (in1[(i)]) + (out[(i)])))

/*
 * Two-operand sum-of-products kernel for npy_longlong in which both inputs
 * and the output are indexed as plain contiguous arrays:
 *
 *     out[i] = in0[i] * in1[i] + out[i]      for i in [0, count)
 *
 * dataptr[0] and dataptr[1] are read as the inputs and dataptr[2] is the
 * output that is accumulated into.  The strides argument is ignored and
 * nop is not read.  The SSE float branches of the template are guarded by
 * "&& 0" for this type, so only the scalar form is kept here.
 *
 * Work is done in windows of 8 elements; a remainder of fewer than 8 is
 * handled by the switch, which is also the only exit from the function.
 */
static void
longlong_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    npy_longlong *in1 = (npy_longlong *)dataptr[1];
    npy_longlong *out = (npy_longlong *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder check comes first so that small counts return
         * without ever touching the unrolled loop below.  Counts of 8 or
         * more match no case and drop through to that loop.
         */
        switch (count) {
            case 7:
                LONGLONG_CONTIG_TWO_STEP(6);
                /* fall through */
            case 6:
                LONGLONG_CONTIG_TWO_STEP(5);
                /* fall through */
            case 5:
                LONGLONG_CONTIG_TWO_STEP(4);
                /* fall through */
            case 4:
                LONGLONG_CONTIG_TWO_STEP(3);
                /* fall through */
            case 3:
                LONGLONG_CONTIG_TWO_STEP(2);
                /* fall through */
            case 2:
                LONGLONG_CONTIG_TWO_STEP(1);
                /* fall through */
            case 1:
                LONGLONG_CONTIG_TWO_STEP(0);
                /* fall through */
            case 0:
                return;
        }

        /* Consume full windows of 8, then loop back for the remainder */
        while (count >= 8) {
            LONGLONG_CONTIG_TWO_STEP(0);
            LONGLONG_CONTIG_TWO_STEP(1);
            LONGLONG_CONTIG_TWO_STEP(2);
            LONGLONG_CONTIG_TWO_STEP(3);
            LONGLONG_CONTIG_TWO_STEP(4);
            LONGLONG_CONTIG_TWO_STEP(5);
            LONGLONG_CONTIG_TWO_STEP(6);
            LONGLONG_CONTIG_TWO_STEP(7);

            in0 += 8;
            in1 += 8;
            out += 8;
            count -= 8;
        }
    }
}

#undef LONGLONG_CONTIG_TWO_STEP

/* Some extra specializations for the two operand case */
static void
longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong value0 = (*(npy_longlong *)dataptr[0]);
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong *data_out = (npy_longlong *)dataptr[2];

#if EINSUM_USE_SSE1 && 0
    __m128 a, b, value0_sse;
#elif EINSUM_USE_SSE2 && 0
    __m128d a, b, value0_sse;
#endif

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 383
        case 6+1:
            data_out[6] = (value0 *
                                 (data1[6]) +
                                 (data_out[6]));

#line 383
        case 5+1:
            data_out[5] = (value0 *
                                 (data1[5]) +
                                 (data_out[5]));

#line 383
        case 4+1:
            data_out[4] = (value0 *
                                 (data1[4]) +
                                 (data_out[4]));

#line 383
        case 3+1:
            data_out[3] = (value0 *
                                 (data1[3]) +
                                 (data_out[3]));

#line 383
        case 2+1:
            data_out[2] = (value0 *
                                 (data1[2]) +
                                 (data_out[2]));

#line 383
        case 1+1:
            data_out[1] = (value0 *
                                 (data1[1]) +
                                 (data_out[1]));

#line 383
        case 0+1:
            data_out[0] = (value0 *
                                 (data1[0]) +
                                 (data_out[0]));

        case 0:
            return;
    }

#if EINSUM_USE_SSE1 && 0
    value0_sse = _mm_set_ps1(value0);

    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

#line 404
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+0));
            b = _mm_add_ps(a, _mm_load_ps(data_out+0));
            _mm_store_ps(data_out+0, b);

#line 404
            a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+4));
            b = _mm_add_ps(a, _mm_load_ps(data_out+4));
            _mm_store_ps(data_out+4, b);

            data1 += 8;
            data_out += 8;
        }

        /* Finish off the loop */
        if (count > 0) {
            goto finish_after_unrolled_loop;
        }
        else {
            return;
        }
    }
#elif EINSUM_USE_SSE2 && 0
    value0_sse = _mm_set1_pd(value0);

    /* Use aligned instructions if possible */
    if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
        /* Unroll the loop by 8 */
        while (count >= 8) {
            count -= 8;

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+0));
            b = _mm_add_pd(a, _mm_load_pd(data_out+0));
            _mm_store_pd(data_out+0, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+2));
            b = _mm_add_pd(a, _mm_load_pd(data_out+2));
            _mm_store_pd(data_out+2, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+4));
            b = _mm_add_pd(a, _mm_load_pd(data_out+4));
            _mm_store_pd(data_out+4, b);

#line 432
            a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+6));
            b = _mm_add_pd(a, _mm_load_pd(data_out+6));
            _mm_store_pd(data_out+6, b);

            data1 += 8;
            data_out += 8;
        }

        /* Finish off the loop */
        if (count > 0) {
            goto finish_after_unrolled_loop;
        }
        else {
            return;
        }
    }
#endif

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#if EINSUM_USE_SSE1 && 0
#line 458
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+0));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+0));
        _mm_storeu_ps(data_out+0, b);

#line 458
        a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+4));
        b = _mm_add_ps(a, _mm_loadu_ps(data_out+4));
        _mm_storeu_ps(data_out+4, b);

#elif EINSUM_USE_SSE2 && 0
#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+0));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+0));
        _mm_storeu_pd(data_out+0, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+2));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+2));
        _mm_storeu_pd(data_out+2, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+4));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+4));
        _mm_storeu_pd(data_out+4, b);

#line 466
        a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+6));
        b = _mm_add_pd(a, _mm_loadu_pd(data_out+6));
        _mm_storeu_pd(data_out+6, b);

#else
#line 474
        data_out[0] = (value0 *
                             (data1[0]) +
                             (data_out[0]));

#line 474
        data_out[1] = (value0 *
     