#line 1 "numpy/core/src/multiarray/lowlevel_strided_loops.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*
 * This file contains low-level loops for copying and byte-swapping
 * strided data.
 *
 * Copyright (c) 2010 by Mark Wiebe (mwwiebe@gmail.com)
 * The University of British Columbia
 *
 * See LICENSE.txt for the license.
 */

#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "structmember.h"

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/arrayobject.h>
#include <numpy/npy_cpu.h>
#include <numpy/halffloat.h>

#include "lowlevel_strided_loops.h"

/*
 * x86 platform works with unaligned access but the compiler is allowed to
 * assume all data is aligned to its size by the C standard. This means it can
 * vectorize instructions peeling only by the size of the type, if the data is
 * not aligned to this size one ends up with data not correctly aligned for SSE
 * instructions (16 byte).
 * So this flag can only be enabled if autovectorization is disabled.
 */
/*
 * Both branches currently define the flag as 0, i.e. the unaligned-access
 * shortcut is switched off on every platform; the x86/AMD64 test is kept as
 * the place where it could be re-enabled.  With the flag at 0, the
 * `!NPY_USE_UNALIGNED_ACCESS || ...` term in the instantiation guards below
 * is always true.
 */
#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
#  define NPY_USE_UNALIGNED_ACCESS 0
#else
#  define NPY_USE_UNALIGNED_ACCESS 0
#endif

/* Identity transforms, used where an element is copied without a byte swap. */
#define _NPY_NOP1(x) (x)
#define _NPY_NOP2(x) (x)
#define _NPY_NOP4(x) (x)
#define _NPY_NOP8(x) (x)

/* Value swaps for 2/4/8 byte elements; these forward to npy_bswap*(). */
#define _NPY_SWAP2(x) npy_bswap2(x)

#define _NPY_SWAP4(x) npy_bswap4(x)

/*
 * Exchange the two bytes inside each 16-bit half of a 32-bit value
 * (byte 0 <-> byte 1, byte 2 <-> byte 3).
 * The argument is parenthesized before the cast so that a compound
 * expression is converted as a whole.  Note that x is expanded several
 * times, so it must not have side effects.
 */
#define _NPY_SWAP_PAIR4(x) (((((npy_uint32)(x))&0xffu) << 8) | \
                       ((((npy_uint32)(x))&0xff00u) >> 8) | \
                       ((((npy_uint32)(x))&0xff0000u) << 8) | \
                       ((((npy_uint32)(x))&0xff000000u) >> 8))

#define _NPY_SWAP8(x) npy_bswap8(x)

/*
 * Reverse the byte order inside each 32-bit half of a 64-bit value while
 * leaving the two halves in place.
 * The argument is parenthesized before the cast; x is expanded several
 * times, so it must not have side effects.
 */
#define _NPY_SWAP_PAIR8(x) (((((npy_uint64)(x))&0xffULL) << 24) | \
                       ((((npy_uint64)(x))&0xff00ULL) << 8) | \
                       ((((npy_uint64)(x))&0xff0000ULL) >> 8) | \
                       ((((npy_uint64)(x))&0xff000000ULL) >> 24) | \
                       ((((npy_uint64)(x))&0xff00000000ULL) << 24) | \
                       ((((npy_uint64)(x))&0xff0000000000ULL) << 8) | \
                       ((((npy_uint64)(x))&0xff000000000000ULL) >> 8) | \
                       ((((npy_uint64)(x))&0xff00000000000000ULL) >> 24))

/* In-place swaps on a byte pointer; these forward to npy_bswap*_unaligned(). */
#define _NPY_SWAP_INPLACE2(x) npy_bswap2_unaligned(x)

#define _NPY_SWAP_INPLACE4(x) npy_bswap4_unaligned(x)

#define _NPY_SWAP_INPLACE8(x) npy_bswap8_unaligned(x)

/*
 * Reverse the 16 bytes starting at x in place by exchanging byte i with
 * byte 15 - i.  Expands to a brace block (not an expression), so it can
 * only be used as a statement.
 */
#define _NPY_SWAP_INPLACE16(x) { \
        char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \
        a = (x)[1]; (x)[1] = (x)[14]; (x)[14] = a; \
        a = (x)[2]; (x)[2] = (x)[13]; (x)[13] = a; \
        a = (x)[3]; (x)[3] = (x)[12]; (x)[12] = a; \
        a = (x)[4]; (x)[4] = (x)[11]; (x)[11] = a; \
        a = (x)[5]; (x)[5] = (x)[10]; (x)[10] = a; \
        a = (x)[6]; (x)[6] = (x)[9]; (x)[9] = a; \
        a = (x)[7]; (x)[7] = (x)[8]; (x)[8] = a; \
        }

/************* STRIDED COPYING/SWAPPING SPECIALIZED FUNCTIONS *************/

#line 86
#line 92
#line 100

#if (1 >= 1) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned, non-swapping copy of N one-byte elements from a strided
 * source to a strided destination.  Each element is moved with memmove
 * and both pointers advance by their byte strides.  src_itemsize and data
 * are unused.  (The constant template conditions have been folded; the
 * enclosing instantiation guard is constant-false, so this variant is not
 * compiled.)
 */
static void
_strided_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* a non-positive N does no work */
    for (; N > 0; --N) {
        memmove(dst, src, 1);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
/*
 * Source-stride-0 variant: the single one-byte source element is read
 * once and then stored N times, advancing dst by dst_stride each time.
 * src_stride, src_itemsize and data are unused.  (Constant template
 * conditions folded; the enclosing guards are constant-false, so this
 * variant is not compiled.)
 */
static void
_strided_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* the source is read before the loop, regardless of N */
    npy_uint8 value = _NPY_NOP1(*((npy_uint8 *)src));

    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 1) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned, non-swapping copy of N one-byte elements from a strided source
 * to a strided destination using a direct npy_uint8 load/store.  Both
 * pointers advance by their byte strides.  src_itemsize and data are
 * unused.  Loop unrolling is requested whenever NPY_SIZEOF_INTP is at
 * least the element size (1).
 */
static void
#if NPY_SIZEOF_INTP >= 1
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* a non-positive N does no work */
    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = _NPY_NOP1(*((npy_uint8 *)src));
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 1
/*
 * Aligned source-stride-0 variant: the single one-byte source element is
 * read once and then stored N times, advancing dst by dst_stride each
 * time.  src_stride, src_itemsize and data are unused.
 */
static void
_aligned_strided_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* the source is read before the loop, regardless of N */
    npy_uint8 value = _NPY_NOP1(*((npy_uint8 *)src));

    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned swap instantiation for element size 1, strided to strided:
 * each byte is moved with memmove and then passed to _NPY_SWAP_INPLACE1.
 * NOTE(review): no _NPY_SWAP_INPLACE1 macro is defined alongside the other
 * swap macros near the top of this file; the enclosing guard (1 >= 2) is
 * constant-false, so this variant is never compiled.
 */
static void
_swap_strided_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE1(dst);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
/*
 * Swap instantiation for element size 1 with source stride 0: the source
 * byte is read once through _NPY_SWAP_INPLACE1 and stored N times,
 * advancing dst by dst_stride.
 * NOTE(review): _NPY_SWAP_INPLACE1 is not defined alongside the other
 * swap macros near the top of this file; the enclosing guards are
 * constant-false, so this variant is never compiled.
 */
static void
_swap_strided_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint8 value = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));

    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned swap instantiation for element size 1, strided to strided:
 * each byte is loaded, passed through _NPY_SWAP1 and stored.
 * NOTE(review): no _NPY_SWAP1 macro is defined alongside the other swap
 * macros near the top of this file; the enclosing guard (1 >= 2) is
 * constant-false, so this variant is never compiled.
 */
static void
_aligned_swap_strided_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = _NPY_SWAP1(*((npy_uint8 *)src));
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 1
/*
 * Aligned swap instantiation for element size 1 with source stride 0:
 * the source byte is read once through _NPY_SWAP1 and stored N times,
 * advancing dst by dst_stride.
 * NOTE(review): _NPY_SWAP1 is not defined alongside the other swap macros
 * near the top of this file; the enclosing guard (1 >= 2) is
 * constant-false, so this variant is never compiled.
 */
static void
_aligned_swap_strided_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint8 value = _NPY_SWAP1(*((npy_uint8 *)src));

    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned pair-swap instantiation for element size 1, strided to
 * strided: each byte is moved with memmove and then the two (zero-width)
 * halves are passed to _NPY_SWAP_INPLACE0.
 * NOTE(review): no _NPY_SWAP_INPLACE0 macro is defined alongside the
 * other swap macros near the top of this file; the enclosing guard
 * (1 >= 4) is constant-false, so this variant is never compiled.
 */
static void
_swap_pair_strided_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE0(dst);
        _NPY_SWAP_INPLACE0(dst + 0);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
/*
 * Pair-swap instantiation for element size 1 with source stride 0: the
 * source byte is read once through _NPY_SWAP_INPLACE1 and stored N times,
 * advancing dst by dst_stride.
 * NOTE(review): _NPY_SWAP_INPLACE1 is not defined alongside the other
 * swap macros near the top of this file; the enclosing guards are
 * constant-false, so this variant is never compiled.
 */
static void
_swap_pair_strided_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint8 value = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));

    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 1 >= 4 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned pair-swap instantiation for element size 1, strided to strided:
 * each byte is loaded, passed through _NPY_SWAP_PAIR1 and stored.
 * NOTE(review): no _NPY_SWAP_PAIR1 macro is defined alongside the other
 * swap macros near the top of this file; the enclosing guard (1 >= 4) is
 * constant-false, so this variant is never compiled.
 */
static void
_aligned_swap_pair_strided_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 1
/*
 * Aligned pair-swap instantiation for element size 1 with source stride
 * 0: the source byte is read once through _NPY_SWAP_PAIR1 and stored N
 * times, advancing dst by dst_stride.
 * NOTE(review): _NPY_SWAP_PAIR1 is not defined alongside the other swap
 * macros near the top of this file; the enclosing guard (1 >= 4) is
 * constant-false, so this variant is never compiled.
 */
static void
_aligned_swap_pair_strided_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint8 value = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));

    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 1 >= 4 */



#line 92
#line 100

#if (1 >= 1) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned, non-swapping copy of N one-byte elements from a strided
 * source into a contiguous destination.  Each element is moved with
 * memmove; dst advances by one byte and src by src_stride.  dst_stride,
 * src_itemsize and data are not used.  (Constant template conditions
 * folded; the enclosing instantiation guard is constant-false, so this
 * variant is not compiled.)
 */
static void
_strided_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* a non-positive N does no work */
    for (; N > 0; --N) {
        memmove(dst, src, 1);
        dst += 1;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
/*
 * Source-stride-0 fill of a contiguous one-byte destination: the single
 * source byte is replicated into the first N bytes of dst with memset.
 * dst_stride, src_stride, src_itemsize and data are not used.
 *
 * N is signed; it is only handed to memset when positive, so that a
 * non-positive count is a no-op (as in the loop-based sibling functions)
 * instead of being converted to a huge size_t.  (Constant template
 * conditions folded; the enclosing guards are constant-false, so this
 * variant is not compiled.)
 */
static void
_strided_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    if (N > 0) {
        memset(dst, *src, (size_t)N);
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 1) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned, non-swapping copy of N one-byte elements from a strided source
 * into a contiguous destination using a direct npy_uint8 load/store.
 * dst advances by one byte and src by src_stride.  dst_stride,
 * src_itemsize and data are not used.  Loop unrolling is requested
 * whenever NPY_SIZEOF_INTP is at least the element size (1).
 */
static void
#if NPY_SIZEOF_INTP >= 1
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* a non-positive N does no work */
    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = _NPY_NOP1(*((npy_uint8 *)src));
        dst += 1;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 1
/*
 * Source-stride-0 fill of a contiguous one-byte destination: the single
 * source byte is replicated into the first N bytes of dst with memset.
 * dst_stride, src_stride, src_itemsize and data are not used.
 *
 * N is signed; it is only handed to memset when positive, so that a
 * non-positive count is a no-op (as in the loop-based sibling functions)
 * instead of being converted to a huge size_t.  This also means src is
 * not dereferenced when there is nothing to write.
 */
static void
_aligned_strided_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    if (N > 0) {
        memset(dst, *src, (size_t)N);
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned swap instantiation for element size 1, strided to contiguous:
 * each byte is moved with memmove and then passed to _NPY_SWAP_INPLACE1;
 * dst advances by one byte and src by src_stride.
 * NOTE(review): no _NPY_SWAP_INPLACE1 macro is defined alongside the other
 * swap macros near the top of this file; the enclosing guard (1 >= 2) is
 * constant-false, so this variant is never compiled.
 */
static void
_swap_strided_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE1(dst);
        dst += 1;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
/*
 * Swap instantiation for element size 1, source stride 0, contiguous
 * destination: the single source byte is replicated into the first N
 * bytes of dst with memset.  dst_stride, src_stride, src_itemsize and
 * data are not used.
 *
 * N is signed; it is only handed to memset when positive, so that a
 * non-positive count is a no-op (as in the loop-based sibling functions)
 * instead of being converted to a huge size_t.  (Constant template
 * conditions folded; the enclosing guards are constant-false, so this
 * variant is not compiled.)
 */
static void
_swap_strided_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    if (N > 0) {
        memset(dst, *src, (size_t)N);
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned swap instantiation for element size 1, strided to contiguous:
 * each byte is loaded, passed through _NPY_SWAP1 and stored; dst advances
 * by one byte and src by src_stride.
 * NOTE(review): no _NPY_SWAP1 macro is defined alongside the other swap
 * macros near the top of this file; the enclosing guard (1 >= 2) is
 * constant-false, so this variant is never compiled.
 */
static void
_aligned_swap_strided_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        *((npy_uint8 *)dst) = _NPY_SWAP1(*((npy_uint8 *)src));
        dst += 1;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Source-stride-0 variant: replicates the single byte at src into the first
 * N bytes of dst with one memset call.
 *
 * dst_stride is not read.
 *
 * NOTE: the enclosing `#if (1 >= 2) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_aligned_swap_strided_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* memset takes the fill value as an int; *src is promoted as before */
    const int fill_byte = *src;

    memset(dst, fill_byte, N);
}
#endif/* (0 == 0) && 1 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned transfer of N one-byte elements from a strided source to a
 * packed destination.  Each byte is copied with memmove and then
 * _NPY_SWAP_INPLACE0 is applied twice to the destination (the second time
 * at offset 0, which is what the template produces for half of a one-byte
 * element).  _NPY_SWAP_INPLACE0 is not defined in the visible part of this
 * file.
 *
 * dst_stride is not read: the destination always advances by one byte.
 *
 * NOTE: the enclosing `#if (1 >= 4) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_swap_pair_strided_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* unaligned copy, then swap each half in place */
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE0(dst);
        _NPY_SWAP_INPLACE0(dst + 0);

        dst += 1;           /* packed destination */
        src += src_stride;  /* caller-supplied source step */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Source-stride-0 variant: replicates the single byte at src into the first
 * N bytes of dst with one memset call.
 *
 * dst_stride is not read.
 *
 * NOTE: both the enclosing `#if (1 >= 4) && ...` guard and the
 * `#if (0 == 0) && 0` guard directly above are false, so this function is
 * compiled out.
 */
static void
_swap_pair_strided_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* memset takes the fill value as an int; *src is promoted as before */
    const int fill_byte = *src;

    memset(dst, fill_byte, N);
}
#endif/* (0 == 0) && 0 */

#endif/* 1 >= 4 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned transfer of N one-byte elements from a strided source to a packed
 * destination.  Each source byte is passed through _NPY_SWAP_PAIR1 (not
 * defined in the visible part of this file) before being stored.
 *
 * dst_stride is not read: the destination always advances by one byte.
 *
 * NOTE: the enclosing `#if (1 >= 4) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_aligned_swap_pair_strided_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* aligned copy and swap of a single byte */
        *(npy_uint8 *)dst = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));

        dst += 1;           /* packed destination */
        src += src_stride;  /* caller-supplied source step */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Source-stride-0 variant: replicates the single byte at src into the first
 * N bytes of dst with one memset call.
 *
 * dst_stride is not read.
 *
 * NOTE: the enclosing `#if (1 >= 4) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_aligned_swap_pair_strided_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* memset takes the fill value as an int; *src is promoted as before */
    const int fill_byte = *src;

    memset(dst, fill_byte, N);
}
#endif/* (0 == 0) && 1 */

#endif/* 1 >= 4 */



#line 92
#line 100

#if (1 >= 1) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned transfer of N one-byte elements from a packed source to a
 * strided destination.  Each byte is copied with memmove; no swap step is
 * generated for this variant.
 *
 * src_stride is not read: the source always advances by one byte.
 *
 * NOTE: the enclosing guard contains `(1 > 1 || 0)`, which is false, so this
 * size-1 unaligned instantiation is compiled out.
 */
static void
_contig_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* unaligned copy of a single byte */
        memmove(dst, src, 1);

        dst += dst_stride;  /* caller-supplied destination step */
        src += 1;           /* packed source */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Source-stride-0 variant for a strided destination: reads the single byte
 * at src once and stores it N times, stepping dst by dst_stride after each
 * store.
 *
 * NOTE: the `#if (1 == 0) && 0` guard directly above is false, so this
 * function is compiled out.
 */
static void
_contig_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* load the source byte a single time; it is the same for every element */
    const npy_uint8 value = _NPY_NOP1(*((npy_uint8 *)src));
    npy_intp i;

    for (i = 0; i < N; ++i) {
        *(npy_uint8 *)dst = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 1) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned transfer of N one-byte elements from a packed source to a strided
 * destination.  _NPY_NOP1 is the identity, so each byte is copied unchanged.
 *
 * src_stride is not read: the source always advances by one byte.
 * NPY_GCC_UNROLL_LOOPS is attached whenever a one-byte element is no larger
 * than NPY_SIZEOF_INTP.
 */
static void
#if 1 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* aligned copy of a single byte */
        *(npy_uint8 *)dst = _NPY_NOP1(*((npy_uint8 *)src));

        dst += dst_stride;  /* caller-supplied destination step */
        src += 1;           /* packed source */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Source-stride-0 variant for a strided destination: reads the single byte
 * at src once and stores it N times, stepping dst by dst_stride after each
 * store.
 *
 * NOTE: the `#if (1 == 0) && 1` guard directly above is false, so this
 * function is compiled out.
 */
static void
_aligned_contig_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* load the source byte a single time; it is the same for every element */
    const npy_uint8 value = _NPY_NOP1(*((npy_uint8 *)src));
    npy_intp i;

    for (i = 0; i < N; ++i) {
        *(npy_uint8 *)dst = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned transfer of N one-byte elements from a packed source to a
 * strided destination.  Each byte is copied with memmove and then
 * _NPY_SWAP_INPLACE1 (not defined in the visible part of this file) is
 * applied to the destination.
 *
 * src_stride is not read: the source always advances by one byte.
 *
 * NOTE: the enclosing `#if (1 >= 2) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_swap_contig_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* unaligned copy, then swap in place */
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE1(dst);

        dst += dst_stride;  /* caller-supplied destination step */
        src += 1;           /* packed source */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Source-stride-0 variant for a strided destination: evaluates
 * _NPY_SWAP_INPLACE1 on the single source byte once (the macro is not
 * defined in the visible part of this file) and stores the result N times,
 * stepping dst by dst_stride after each store.
 *
 * NOTE: the enclosing `#if (1 >= 2) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_swap_contig_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* compute the value to replicate a single time */
    const npy_uint8 value = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
    npy_intp i;

    for (i = 0; i < N; ++i) {
        *(npy_uint8 *)dst = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned transfer of N one-byte elements from a packed source to a strided
 * destination.  Each source byte is passed through _NPY_SWAP1 (not defined
 * in the visible part of this file) before being stored.
 *
 * src_stride is not read: the source always advances by one byte.
 *
 * NOTE: the enclosing `#if (1 >= 2) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_aligned_swap_contig_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* aligned copy and swap of a single byte */
        *(npy_uint8 *)dst = _NPY_SWAP1(*((npy_uint8 *)src));

        dst += dst_stride;  /* caller-supplied destination step */
        src += 1;           /* packed source */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Source-stride-0 variant for a strided destination: evaluates _NPY_SWAP1
 * on the single source byte once (the macro is not defined in the visible
 * part of this file) and stores the result N times, stepping dst by
 * dst_stride after each store.
 *
 * NOTE: the enclosing `#if (1 >= 2) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_aligned_swap_contig_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* compute the value to replicate a single time */
    const npy_uint8 value = _NPY_SWAP1(*((npy_uint8 *)src));
    npy_intp i;

    for (i = 0; i < N; ++i) {
        *(npy_uint8 *)dst = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned transfer of N one-byte elements from a packed source to a
 * strided destination.  Each byte is copied with memmove and then
 * _NPY_SWAP_INPLACE0 is applied twice to the destination (the second time
 * at offset 0, which is what the template produces for half of a one-byte
 * element).  _NPY_SWAP_INPLACE0 is not defined in the visible part of this
 * file.
 *
 * src_stride is not read: the source always advances by one byte.
 *
 * NOTE: the enclosing `#if (1 >= 4) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_swap_pair_contig_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* unaligned copy, then swap each half in place */
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE0(dst);
        _NPY_SWAP_INPLACE0(dst + 0);

        dst += dst_stride;  /* caller-supplied destination step */
        src += 1;           /* packed source */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Source-stride-0 variant for a strided destination: evaluates
 * _NPY_SWAP_INPLACE1 on the single source byte once (the macro is not
 * defined in the visible part of this file) and stores the result N times,
 * stepping dst by dst_stride after each store.
 *
 * NOTE: the enclosing `#if (1 >= 4) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_swap_pair_contig_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* compute the value to replicate a single time */
    const npy_uint8 value = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
    npy_intp i;

    for (i = 0; i < N; ++i) {
        *(npy_uint8 *)dst = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 1 >= 4 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned transfer of N one-byte elements from a packed source to a strided
 * destination.  Each source byte is passed through _NPY_SWAP_PAIR1 (not
 * defined in the visible part of this file) before being stored.
 *
 * src_stride is not read: the source always advances by one byte.
 *
 * NOTE: the enclosing `#if (1 >= 4) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_aligned_swap_pair_contig_to_strided_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* aligned copy and swap of a single byte */
        *(npy_uint8 *)dst = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));

        dst += dst_stride;  /* caller-supplied destination step */
        src += 1;           /* packed source */
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Source-stride-0 variant for a strided destination: evaluates
 * _NPY_SWAP_PAIR1 on the single source byte once (the macro is not defined
 * in the visible part of this file) and stores the result N times, stepping
 * dst by dst_stride after each store.
 *
 * NOTE: the enclosing `#if (1 >= 4) && ...` guard is false, so this size-1
 * instantiation is compiled out.
 */
static void
_aligned_swap_pair_contig_to_strided_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* compute the value to replicate a single time */
    const npy_uint8 value = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
    npy_intp i;

    for (i = 0; i < N; ++i) {
        *(npy_uint8 *)dst = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 1 >= 4 */



#line 92
#line 100

#if (1 >= 1) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned transfer of N one-byte elements between two packed buffers.
 * Each byte is copied with memmove; no swap step is generated for this
 * variant.
 *
 * Neither dst_stride nor src_stride is read: both pointers advance by one
 * byte per element.
 *
 * NOTE: the `#if 0 || 1 == 0 || 1 == 0` guard directly above is false, so
 * this function is compiled out.
 */
static void
_contig_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* unaligned copy of a single byte */
        memmove(dst, src, 1);

        dst += 1;
        src += 1;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Source-stride-0 variant: replicates the single byte at src into the first
 * N bytes of dst with one memset call.
 *
 * dst_stride is not read.
 *
 * NOTE: the `#if (1 == 0) && 0` guard directly above is false, so this
 * function is compiled out.
 */
static void
_contig_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* memset takes the fill value as an int; *src is promoted as before */
    const int fill_byte = *src;

    memset(dst, fill_byte, N);
}
#endif/* (1 == 0) && 0 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 1) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned transfer of N one-byte elements between two packed buffers.
 * _NPY_NOP1 is the identity, so each byte is copied unchanged.
 *
 * Neither dst_stride nor src_stride is read: both pointers advance by one
 * byte per element.  NPY_GCC_UNROLL_LOOPS is attached whenever a one-byte
 * element is no larger than NPY_SIZEOF_INTP.
 *
 * NOTE: the `#if 0 || 1 == 0 || 1 == 0` guard directly above is false, so
 * this function is compiled out.
 */
static void
#if 1 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp i;

    for (i = 0; i < N; ++i) {
        /* aligned copy of a single byte */
        *(npy_uint8 *)dst = _NPY_NOP1(*((npy_uint8 *)src));

        dst += 1;
        src += 1;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Source-stride-0 variant: replicates the single byte at src into the first
 * N bytes of dst with one memset call.
 *
 * dst_stride is not read.
 *
 * NOTE: the `#if (1 == 0) && 1` guard directly above is false, so this
 * function is compiled out.
 */
static void
_aligned_contig_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* memset takes the fill value as an int; *src is promoted as before */
    const int fill_byte = *src;

    memset(dst, fill_byte, N);
}
#endif/* (1 == 0) && 1 */

#endif/* 1 >= 1 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_swap_contig_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned variant for one-byte elements in contiguous buffers:
     * each element is moved with memmove and then handed to
     * _NPY_SWAP_INPLACE1.  The stride arguments are not consulted.
     * NOTE(review): the enclosing "#if (1 >= 2) ..." condition is false,
     * so this instantiation is never compiled.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE1(dst);
        ++dst;
        ++src;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 0
static void
_swap_contig_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the single byte at src over N contiguous bytes at dst.
     * A one-byte element has nothing to swap, so the fill is one memset;
     * dst_stride is not consulted.
     */
    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    memset(dst, *src, (size_t)N);
}
#endif/* (1 == 0) && 0 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 2) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_aligned_swap_contig_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned variant for one-byte elements in contiguous buffers: each
     * element is read through an npy_uint8 pointer, passed through
     * _NPY_SWAP1 and stored.  The stride arguments are not consulted.
     * NOTE(review): the enclosing "#if (1 >= 2) ..." condition is false,
     * so this instantiation is never compiled.
     */
    for (; N > 0; --N) {
        *(npy_uint8 *)dst = _NPY_SWAP1(*(npy_uint8 *)src);
        ++dst;
        ++src;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 1
static void
_aligned_swap_contig_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the single byte at src over N contiguous bytes at dst.
     * A one-byte element has nothing to swap, so the fill is one memset;
     * dst_stride is not consulted.
     */
    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    memset(dst, *src, (size_t)N);
}
#endif/* (1 == 0) && 1 */

#endif/* 1 >= 2 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_swap_pair_contig_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned pair-swap variant for one-byte elements in contiguous
     * buffers: each element is moved with memmove, then both "halves"
     * (dst and dst + 0) are handed to _NPY_SWAP_INPLACE0.  The stride
     * arguments are not consulted.
     * NOTE(review): the enclosing "#if (1 >= 4) ..." condition is false,
     * so this instantiation is never compiled.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 1);
        _NPY_SWAP_INPLACE0(dst);
        _NPY_SWAP_INPLACE0(dst + 0);
        ++dst;
        ++src;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the single byte at src over N contiguous bytes at dst.
     * A one-byte element has nothing to swap, so the fill is one memset;
     * dst_stride is not consulted.
     */
    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    memset(dst, *src, (size_t)N);
}
#endif/* (1 == 0) && 0 */

#endif/* 1 >= 4 */


#line 100

#if (1 >= 4) && \
    (1 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_aligned_swap_pair_contig_to_contig_size1(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned pair-swap variant for one-byte elements in contiguous
     * buffers: each element is read through an npy_uint8 pointer, passed
     * through _NPY_SWAP_PAIR1 and stored.  The stride arguments are not
     * consulted.
     * NOTE(review): the enclosing "#if (1 >= 4) ..." condition is false,
     * so this instantiation is never compiled.
     */
    for (; N > 0; --N) {
        *(npy_uint8 *)dst = _NPY_SWAP_PAIR1(*(npy_uint8 *)src);
        ++dst;
        ++src;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_contig_size1_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the single byte at src over N contiguous bytes at dst.
     * A one-byte element has nothing to swap, so the fill is one memset;
     * dst_stride is not consulted.
     */
    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    memset(dst, *src, (size_t)N);
}
#endif/* (1 == 0) && 1 */

#endif/* 1 >= 4 */




#line 86
#line 92
#line 100

#if (2 >= 1) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_strided_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned plain copy of N two-byte elements.  Each element is
     * moved with memmove, so no typed access is made; afterwards dst and
     * src advance by their respective byte strides.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 2);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
static void
_strided_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the two-byte value at src to N destination elements that
     * are dst_stride bytes apart.  The value is loaded once, through an
     * npy_uint16 pointer, and stored the same way on every iteration.
     */
    npy_uint16 temp;

    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    temp = _NPY_NOP2(*((npy_uint16 *)src));

    while (N > 0) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 1) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
#if 2 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned plain copy of N two-byte elements.  Each element is loaded
     * and stored through an npy_uint16 pointer (_NPY_NOP2 is the
     * identity), then dst and src advance by their byte strides.
     */
    for (; N > 0; --N) {
        *(npy_uint16 *)dst = _NPY_NOP2(*(npy_uint16 *)src);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 1
static void
_aligned_strided_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the two-byte value at src to N destination elements that
     * are dst_stride bytes apart.  The value is loaded once, through an
     * npy_uint16 pointer (_NPY_NOP2 is the identity), and stored the
     * same way on every iteration.
     */
    npy_uint16 temp;

    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    temp = _NPY_NOP2(*((npy_uint16 *)src));

    while (N > 0) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_swap_strided_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned copy-and-swap of N two-byte elements.  Each element is
     * moved with memmove and the destination copy is then handed to
     * _NPY_SWAP_INPLACE2; afterwards dst and src advance by their byte
     * strides.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE2(dst);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
static void
_swap_strided_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the two-byte value at src, passed once through
     * _NPY_SWAP_INPLACE2, to N destination elements that are dst_stride
     * bytes apart.
     */
    npy_uint16 temp;

    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));

    while (N > 0) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_aligned_swap_strided_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned copy-and-swap of N two-byte elements.  Each element is
     * loaded through an npy_uint16 pointer, byte-swapped with _NPY_SWAP2
     * (npy_bswap2) and stored; afterwards dst and src advance by their
     * byte strides.
     */
    for (; N > 0; --N) {
        *(npy_uint16 *)dst = _NPY_SWAP2(*(npy_uint16 *)src);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 1
static void
_aligned_swap_strided_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the two-byte value at src, byte-swapped once with
     * _NPY_SWAP2 (npy_bswap2), to N destination elements that are
     * dst_stride bytes apart.  Loads and stores go through npy_uint16
     * pointers.
     */
    npy_uint16 temp;

    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    temp = _NPY_SWAP2(*((npy_uint16 *)src));

    while (N > 0) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_swap_pair_strided_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned pair-swap of N two-byte elements.  Each element is moved
     * with memmove, then each one-byte half of the destination copy is
     * handed to _NPY_SWAP_INPLACE1; afterwards dst and src advance by
     * their byte strides.
     * NOTE(review): the enclosing "#if (2 >= 4) ..." condition is false,
     * so this instantiation is never compiled.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE1(dst);
        _NPY_SWAP_INPLACE1(dst + 1);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 0
static void
_swap_pair_strided_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the two-byte value at src, passed once through
     * _NPY_SWAP_INPLACE2, to N destination elements that are dst_stride
     * bytes apart.
     */
    npy_uint16 temp;

    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));

    while (N > 0) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 2 >= 4 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_aligned_swap_pair_strided_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned pair-swap of N two-byte elements.  Each element is loaded
     * through an npy_uint16 pointer, passed through _NPY_SWAP_PAIR2 and
     * stored; afterwards dst and src advance by their byte strides.
     * NOTE(review): the enclosing "#if (2 >= 4) ..." condition is false,
     * so this instantiation is never compiled.
     */
    for (; N > 0; --N) {
        *(npy_uint16 *)dst = _NPY_SWAP_PAIR2(*(npy_uint16 *)src);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86
 */
#if (0 == 0) && 1
static void
_aligned_swap_pair_strided_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast the two-byte value at src, passed once through
     * _NPY_SWAP_PAIR2, to N destination elements that are dst_stride
     * bytes apart.  Loads and stores go through npy_uint16 pointers.
     */
    npy_uint16 temp;

    if (N <= 0) {
        /* nothing to write; in particular, do not dereference src */
        return;
    }
    temp = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));

    while (N > 0) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 2 >= 4 */



#line 92
#line 100

#if (2 >= 1) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N 2-byte elements from a strided source into a packed destination.
 * No alignment is assumed: every element is transferred with memmove().
 * dst advances by the element size (2 bytes) and src by src_stride bytes
 * per element; dst_stride is not consulted.
 */
static void
_strided_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, no byte swapping */
        memmove(dst, src, 2);
        dst += 2;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Fill N packed 2-byte destination slots with the single element at src.
 * The source value is loaded once, before the loop, through an npy_uint16
 * pointer; dst then advances by 2 bytes per element and dst_stride is not
 * consulted.
 * Note: the enclosing `#if (0 == 0) && 0` guard keeps this variant out of
 * the build.
 */
static void
_strided_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = *((npy_uint16 *)src);

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += 2;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 1) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N 2-byte elements from a strided source into a packed destination.
 * Each element is loaded and stored through an npy_uint16 pointer, so dst
 * and src are expected to be aligned for that type.
 * dst advances by 2 bytes and src by src_stride bytes per element;
 * dst_stride is not consulted.
 */
static void
#if 2 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned copy, no byte swapping */
        *((npy_uint16 *)dst) = *((npy_uint16 *)src);
        dst += 2;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Fill N packed 2-byte destination slots with the single element at src.
 * The source value is loaded once, before the loop; loads and stores go
 * through npy_uint16 pointers, so dst and src are expected to be aligned
 * for that type. dst advances by 2 bytes per element and dst_stride is not
 * consulted.
 */
static void
_aligned_strided_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = *((npy_uint16 *)src);

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += 2;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned copy-and-swap of N 2-byte elements from a strided source into a
 * packed destination. Each element is first copied with memmove() and the
 * destination address is then handed to _NPY_SWAP_INPLACE2().
 * dst advances by 2 bytes and src by src_stride bytes per element;
 * dst_stride is not consulted.
 */
static void
_swap_strided_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy and swap */
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE2(dst);
        dst += 2;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Fill N packed 2-byte destination slots with one value derived from the
 * single element at src. The value is computed once, before the loop, by
 * applying _NPY_SWAP_INPLACE2() to the npy_uint16 read from src; dst then
 * advances by 2 bytes per element and dst_stride is not consulted.
 * Note: the enclosing `#if (0 == 0) && 0` guard keeps this variant out of
 * the build.
 * NOTE(review): elsewhere in this file _NPY_SWAP_INPLACE2 is applied to a
 * pointer, not a value -- confirm before ever enabling this variant.
 */
static void
_swap_strided_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += 2;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy-and-swap N 2-byte elements from a strided source into a packed
 * destination. Each element is loaded through an npy_uint16 pointer, passed
 * through _NPY_SWAP2() and stored through an npy_uint16 pointer, so dst and
 * src are expected to be aligned for that type.
 * dst advances by 2 bytes and src by src_stride bytes per element;
 * dst_stride is not consulted.
 */
static void
_aligned_swap_strided_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned copy and swap */
        *((npy_uint16 *)dst) = _NPY_SWAP2(*((npy_uint16 *)src));
        dst += 2;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Fill N packed 2-byte destination slots with the byte-swapped single
 * element at src. The source is read and passed through _NPY_SWAP2() once,
 * before the loop; loads and stores go through npy_uint16 pointers, so dst
 * and src are expected to be aligned for that type. dst advances by 2 bytes
 * per element and dst_stride is not consulted.
 */
static void
_aligned_swap_strided_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = _NPY_SWAP2(*((npy_uint16 *)src));

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += 2;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned copy and pair-swap of N 2-byte elements from a strided source
 * into a packed destination. Each element is copied with memmove() and then
 * _NPY_SWAP_INPLACE1() is applied to each of its two 1-byte halves.
 * dst advances by 2 bytes and src by src_stride bytes per element;
 * dst_stride is not consulted.
 * Note: the enclosing `#if (2 >= 4) && ...` guard keeps this variant out of
 * the build.
 * NOTE(review): _NPY_SWAP_INPLACE1 has no definition in the visible part of
 * this file.
 */
static void
_swap_pair_strided_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy and swap */
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE1(dst);
        _NPY_SWAP_INPLACE1(dst + 1);
        dst += 2;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Fill N packed 2-byte destination slots with one value derived from the
 * single element at src. The value is computed once, before the loop, by
 * applying _NPY_SWAP_INPLACE2() to the npy_uint16 read from src; dst then
 * advances by 2 bytes per element and dst_stride is not consulted.
 * Note: both the enclosing `#if (0 == 0) && 0` guard and the outer
 * `#if (2 >= 4) && ...` guard keep this variant out of the build.
 */
static void
_swap_pair_strided_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += 2;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 2 >= 4 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy and pair-swap N 2-byte elements from a strided source into a packed
 * destination. Each element is loaded through an npy_uint16 pointer, passed
 * through _NPY_SWAP_PAIR2() and stored through an npy_uint16 pointer.
 * dst advances by 2 bytes and src by src_stride bytes per element;
 * dst_stride is not consulted.
 * Note: the enclosing `#if (2 >= 4) && ...` guard keeps this variant out of
 * the build.
 * NOTE(review): _NPY_SWAP_PAIR2 has no definition in the visible part of
 * this file.
 */
static void
_aligned_swap_pair_strided_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned copy and swap */
        *((npy_uint16 *)dst) = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
        dst += 2;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Fill N packed 2-byte destination slots with one value derived from the
 * single element at src. The value is computed once, before the loop, by
 * passing the npy_uint16 read from src through _NPY_SWAP_PAIR2(); dst then
 * advances by 2 bytes per element and dst_stride is not consulted.
 * Note: the outer `#if (2 >= 4) && ...` guard keeps this variant out of the
 * build.
 * NOTE(review): _NPY_SWAP_PAIR2 has no definition in the visible part of
 * this file.
 */
static void
_aligned_swap_pair_strided_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += 2;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 2 >= 4 */



#line 92
#line 100

#if (2 >= 1) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N 2-byte elements from a packed source into a strided destination.
 * No alignment is assumed: every element is transferred with memmove().
 * dst advances by dst_stride bytes and src by the element size (2 bytes)
 * per element; src_stride is not consulted.
 */
static void
_contig_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, no byte swapping */
        memmove(dst, src, 2);
        dst += dst_stride;
        src += 2;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Broadcast the single 2-byte element at src into N destination slots that
 * are dst_stride bytes apart. The source value is loaded once, before the
 * loop, through an npy_uint16 pointer.
 * Note: the enclosing `#if (1 == 0) && 0` guard keeps this variant out of
 * the build.
 */
static void
_contig_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = *((npy_uint16 *)src);

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 1) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N 2-byte elements from a packed source into a strided destination.
 * Each element is loaded and stored through an npy_uint16 pointer, so dst
 * and src are expected to be aligned for that type.
 * dst advances by dst_stride bytes and src by 2 bytes per element;
 * src_stride is not consulted.
 */
static void
#if 2 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned copy, no byte swapping */
        *((npy_uint16 *)dst) = *((npy_uint16 *)src);
        dst += dst_stride;
        src += 2;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Broadcast the single 2-byte element at src into N destination slots that
 * are dst_stride bytes apart. The source value is loaded once, before the
 * loop; loads and stores go through npy_uint16 pointers.
 * Note: the enclosing `#if (1 == 0) && 1` guard keeps this variant out of
 * the build.
 */
static void
_aligned_contig_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = *((npy_uint16 *)src);

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned copy-and-swap of N 2-byte elements from a packed source into a
 * strided destination. Each element is first copied with memmove() and the
 * destination address is then handed to _NPY_SWAP_INPLACE2().
 * dst advances by dst_stride bytes and src by 2 bytes per element;
 * src_stride is not consulted.
 */
static void
_swap_contig_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy and swap */
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE2(dst);
        dst += dst_stride;
        src += 2;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Broadcast one value derived from the single 2-byte element at src into N
 * destination slots that are dst_stride bytes apart. The value is computed
 * once, before the loop, by applying _NPY_SWAP_INPLACE2() to the npy_uint16
 * read from src.
 * Note: the enclosing `#if (1 == 0) && 0` guard keeps this variant out of
 * the build.
 * NOTE(review): elsewhere in this file _NPY_SWAP_INPLACE2 is applied to a
 * pointer, not a value -- confirm before ever enabling this variant.
 */
static void
_swap_contig_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy-and-swap N 2-byte elements from a packed source into a strided
 * destination. Each element is loaded through an npy_uint16 pointer, passed
 * through _NPY_SWAP2() and stored through an npy_uint16 pointer, so dst and
 * src are expected to be aligned for that type.
 * dst advances by dst_stride bytes and src by 2 bytes per element;
 * src_stride is not consulted.
 */
static void
_aligned_swap_contig_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned copy and swap */
        *((npy_uint16 *)dst) = _NPY_SWAP2(*((npy_uint16 *)src));
        dst += dst_stride;
        src += 2;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here, as is done above, is only marginally
 * profitable for small types and detrimental for >= 8-byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Broadcast the byte-swapped single 2-byte element at src into N
 * destination slots that are dst_stride bytes apart. The source is read and
 * passed through _NPY_SWAP2() once, before the loop; loads and stores go
 * through npy_uint16 pointers.
 * Note: the enclosing `#if (1 == 0) && 1` guard keeps this variant out of
 * the build.
 */
static void
_aligned_swap_contig_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint16 fill = _NPY_SWAP2(*((npy_uint16 *)src));

    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = fill;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned copy and pair-swap of N 2-byte elements from a packed source
 * into a strided destination. Each element is copied with memmove() and
 * then _NPY_SWAP_INPLACE1() is applied to each of its two 1-byte halves.
 * dst advances by dst_stride bytes and src by 2 bytes per element;
 * src_stride is not consulted.
 * Note: the enclosing `#if (2 >= 4) && ...` guard keeps this variant out of
 * the build.
 * NOTE(review): _NPY_SWAP_INPLACE1 has no definition in the visible part of
 * this file.
 */
static void
_swap_pair_contig_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy and swap */
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE1(dst);
        _NPY_SWAP_INPLACE1(dst + 1);
        dst += dst_stride;
        src += 2;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * transformed once with _NPY_SWAP_INPLACE2 and the result is stored N times,
 * advancing dst by dst_stride bytes after each store.  src_stride,
 * src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 2 >= 4 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 0 == 0
/*
 * `_aligned_swap_pair` instantiation for 2-byte elements, contiguous source
 * to strided destination.
 *
 * Each element is loaded through an npy_uint16 pointer, passed through
 * _NPY_SWAP_PAIR2 (macro not visible here -- semantics to be confirmed) and
 * stored through an npy_uint16 pointer, so both buffers are accessed as
 * npy_uint16 objects.  The source advances by a fixed 2 bytes (src_stride is
 * never read); the destination advances by dst_stride bytes.  src_itemsize
 * and data are unused.
 *
 * NOTE: the enclosing `#if (2 >= 4) && ...` guard is constant-false, so this
 * instantiation is never compiled.  NPY_GCC_UNROLL_LOOPS is not requested
 * for it.
 */
static void
_aligned_swap_pair_contig_to_strided_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned load, swap, aligned store */
        npy_uint16 value = *((npy_uint16 *)src);

        *((npy_uint16 *)dst) = _NPY_SWAP_PAIR2(value);

        src += 2;
        dst += dst_stride;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * loaded through an npy_uint16 pointer, passed through _NPY_SWAP_PAIR2 once,
 * and the result is stored N times, advancing dst by dst_stride bytes after
 * each store.  src_stride, src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_strided_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 2 >= 4 */



#line 92
#line 100

#if (2 >= 1) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 1 == 0
/*
 * Plain copy of N 2-byte elements from a contiguous source to a contiguous
 * destination, with no alignment assumption.
 *
 * Each element is moved with a 2-byte memmove and no swap is applied.  Both
 * pointers advance by a fixed 2 bytes, so dst_stride and src_stride are
 * never read.  src_itemsize and data are unused.
 *
 * NOTE: the enclosing `#if 0 || 1 == 0 || 1 == 0` guard is constant-false,
 * so this instantiation is never compiled.  NPY_GCC_UNROLL_LOOPS is not
 * requested for it.
 */
static void
_contig_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, no swap */
        memmove(dst, src, 2);

        src += 2;
        dst += 2;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * loaded once through an npy_uint16 pointer (_NPY_NOP2 is the identity) and
 * stored N times into consecutive 2-byte destination slots.  dst_stride is
 * never read; src_stride, src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 0
static void
_contig_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_NOP2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += 2;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 1) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 1 == 0
/*
 * Plain copy of N 2-byte elements from a contiguous source to a contiguous
 * destination, accessing both buffers through npy_uint16 pointers.
 *
 * _NPY_NOP2 is the identity, so each element is loaded and stored unchanged.
 * Both pointers advance by a fixed 2 bytes, so dst_stride and src_stride are
 * never read.  src_itemsize and data are unused.
 *
 * NPY_GCC_UNROLL_LOOPS is requested when the element is no wider than
 * npy_intp: unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (measured on Intel
 * Xeon 5x/7x, Core2Duo and AMD Phenom X4).
 *
 * NOTE: the enclosing `#if 0 || 1 == 0 || 1 == 0` guard is constant-false,
 * so this instantiation is never compiled.
 */
static void
#if 2 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned load and store, no swap */
        npy_uint16 value = *((npy_uint16 *)src);

        *((npy_uint16 *)dst) = _NPY_NOP2(value);

        src += 2;
        dst += 2;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * loaded once through an npy_uint16 pointer (_NPY_NOP2 is the identity) and
 * stored N times into consecutive 2-byte destination slots.  dst_stride is
 * never read; src_stride, src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 1
static void
_aligned_contig_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_NOP2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += 2;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 2 >= 1 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 1 == 0
/*
 * Swapping copy of N 2-byte elements from a contiguous source to a
 * contiguous destination, with no alignment assumption.
 *
 * Each element is moved with a 2-byte memmove and the destination element
 * is then handed by address to _NPY_SWAP_INPLACE2 (macro not visible here --
 * presumably an in-place byte swap; confirm).  Both pointers advance by a
 * fixed 2 bytes, so dst_stride and src_stride are never read.  src_itemsize
 * and data are unused.
 *
 * NPY_GCC_UNROLL_LOOPS is not requested for this instantiation.
 */
static void
_swap_contig_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, then swap the destination element in place */
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE2(dst);

        src += 2;
        dst += 2;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * transformed once with _NPY_SWAP_INPLACE2 and the result is stored N times
 * into consecutive 2-byte destination slots.  dst_stride is never read;
 * src_stride, src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 0
static void
_swap_contig_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += 2;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 2) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 1 == 0
/*
 * Swapping copy of N 2-byte elements from a contiguous source to a
 * contiguous destination, accessing both buffers through npy_uint16
 * pointers.
 *
 * Each element is loaded, passed through _NPY_SWAP2 (npy_bswap2) and stored.
 * Both pointers advance by a fixed 2 bytes, so dst_stride and src_stride are
 * never read.  src_itemsize and data are unused.
 *
 * NPY_GCC_UNROLL_LOOPS is not requested for this instantiation.
 */
static void
_aligned_swap_contig_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned load, swap, aligned store */
        npy_uint16 value = *((npy_uint16 *)src);

        *((npy_uint16 *)dst) = _NPY_SWAP2(value);

        src += 2;
        dst += 2;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * loaded through an npy_uint16 pointer, passed through _NPY_SWAP2
 * (npy_bswap2) once, and the result is stored N times into consecutive
 * 2-byte destination slots.  dst_stride is never read; src_stride,
 * src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_contig_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += 2;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 2 >= 2 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 1 == 0
/*
 * `_swap_pair` instantiation for 2-byte elements, contiguous source to
 * contiguous destination, with no alignment assumption.
 *
 * Each element is moved with a 2-byte memmove, after which
 * _NPY_SWAP_INPLACE1 is applied to each 1-byte half of the destination
 * element (macro not visible here -- presumably an in-place swap; confirm).
 * Both pointers advance by a fixed 2 bytes, so dst_stride and src_stride are
 * never read.  src_itemsize and data are unused.
 *
 * NOTE: the enclosing `#if (2 >= 4) && ...` guard is constant-false, so this
 * instantiation is never compiled.  NPY_GCC_UNROLL_LOOPS is not requested
 * for it.
 */
static void
_swap_pair_contig_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, then swap each half in place */
        memmove(dst, src, 2);
        _NPY_SWAP_INPLACE1(dst);
        _NPY_SWAP_INPLACE1(dst + 1);

        src += 2;
        dst += 2;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * transformed once with _NPY_SWAP_INPLACE2 and the result is stored N times
 * into consecutive 2-byte destination slots.  dst_stride is never read;
 * src_stride, src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += 2;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 2 >= 4 */


#line 100

#if (2 >= 4) && \
    (2 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 1 == 0
/*
 * `_aligned_swap_pair` instantiation for 2-byte elements, contiguous source
 * to contiguous destination.
 *
 * Each element is loaded through an npy_uint16 pointer, passed through
 * _NPY_SWAP_PAIR2 (macro not visible here -- semantics to be confirmed) and
 * stored through an npy_uint16 pointer.  Both pointers advance by a fixed
 * 2 bytes, so dst_stride and src_stride are never read.  src_itemsize and
 * data are unused.
 *
 * NOTE: the enclosing `#if (2 >= 4) && ...` guard is constant-false, so this
 * instantiation is never compiled.  NPY_GCC_UNROLL_LOOPS is not requested
 * for it.
 */
static void
_aligned_swap_pair_contig_to_contig_size2(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned load, swap, aligned store */
        npy_uint16 value = *((npy_uint16 *)src);

        *((npy_uint16 *)dst) = _NPY_SWAP_PAIR2(value);

        src += 2;
        dst += 2;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 2-byte source element is
 * loaded through an npy_uint16 pointer, passed through _NPY_SWAP_PAIR2 once,
 * and the result is stored N times into consecutive 2-byte destination
 * slots.  dst_stride is never read; src_stride, src_itemsize and data are
 * unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_contig_size2_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint16 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
    for (; N > 0; --N) {
        *((npy_uint16 *)dst) = temp;
        dst += 2;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 2 >= 4 */




#line 86
#line 92
#line 100

#if (4 >= 1) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 0 == 0
/*
 * Plain copy of N 4-byte elements between two strided buffers, with no
 * alignment assumption.
 *
 * Each element is moved with a 4-byte memmove and no swap is applied.  After
 * every element dst advances by dst_stride bytes and src by src_stride
 * bytes.  src_itemsize and data are unused.
 *
 * NPY_GCC_UNROLL_LOOPS is not requested for this instantiation.
 */
static void
_strided_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, no swap */
        memmove(dst, src, 4);

        src += src_stride;
        dst += dst_stride;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 4-byte source element is
 * loaded once through an npy_uint32 pointer (_NPY_NOP4 is the identity) and
 * stored N times, advancing dst by dst_stride bytes after each store.
 * src_stride, src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (0 == 0) && 0
static void
_strided_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_NOP4(*((npy_uint32 *)src));
    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 1) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 0 == 0
/*
 * Plain copy of N 4-byte elements between two strided buffers, accessing
 * both through npy_uint32 pointers.
 *
 * _NPY_NOP4 is the identity, so each element is loaded and stored unchanged.
 * After every element dst advances by dst_stride bytes and src by src_stride
 * bytes.  src_itemsize and data are unused.
 *
 * NPY_GCC_UNROLL_LOOPS is requested when the element is no wider than
 * npy_intp: unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (measured on Intel
 * Xeon 5x/7x, Core2Duo and AMD Phenom X4).
 */
static void
#if 4 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned load and store, no swap */
        npy_uint32 value = *((npy_uint32 *)src);

        *((npy_uint32 *)dst) = _NPY_NOP4(value);

        src += src_stride;
        dst += dst_stride;
    }
}
#endif


/*
 * Source-stride-0 specialization of the aligned 4-byte strided copy: the
 * single source element is loaded once through an npy_uint32 pointer
 * (_NPY_NOP4 is the identity) and the same value is stored N times,
 * advancing dst by dst_stride bytes after each store.
 *
 * Both buffers are accessed as npy_uint32 objects, so src and dst are
 * expected to be suitably aligned for that type (TODO confirm that callers
 * guarantee it).  src_stride, src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 */
#if (0 == 0) && 1
static void
_aligned_strided_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_NOP4(*((npy_uint32 *)src));
    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 0 == 0
/*
 * Swapping copy of N 4-byte elements between two strided buffers, with no
 * alignment assumption.
 *
 * Each element is moved with a 4-byte memmove and the destination element
 * is then handed by address to _NPY_SWAP_INPLACE4 (macro not visible here --
 * presumably an in-place byte swap; confirm).  After every element dst
 * advances by dst_stride bytes and src by src_stride bytes.  src_itemsize
 * and data are unused.
 *
 * NPY_GCC_UNROLL_LOOPS is not requested for this instantiation.
 */
static void
_swap_strided_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, then swap the destination element in place */
        memmove(dst, src, 4);
        _NPY_SWAP_INPLACE4(dst);

        src += src_stride;
        dst += dst_stride;
    }
}
#endif


/*
 * Source-stride-0 specialization: the single 4-byte source element is
 * transformed once with _NPY_SWAP_INPLACE4 and the result is stored N times,
 * advancing dst by dst_stride bytes after each store.  src_stride,
 * src_itemsize and data are unused.
 *
 * When N <= 0 the function returns before src is dereferenced, so an empty
 * transfer never reads through a source pointer that may not be valid.
 *
 * Unrolling this loop is only marginally profitable for small types and
 * detrimental for >= 8-byte moves on x86, so no unroll attribute is used.
 *
 * NOTE: the guard below is constant-false; this instantiation is never
 * compiled.
 */
#if (0 == 0) && 0
static void
_swap_strided_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /* nothing to write: leave before touching src */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned byte-swapping copy of N four-byte items between two strided
 * buffers.  Each item is loaded and stored through an npy_uint32 pointer, so
 * src and dst must be suitably aligned for that type at every step; the
 * value is byte-swapped with _NPY_SWAP4 (npy_bswap4) on the way through.
 * Both strides are byte offsets.
 */
static void
_aligned_swap_strided_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *(npy_uint32 *)src;

        *(npy_uint32 *)dst = _NPY_SWAP4(word);

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0: the single source element
 * is read and byte-swapped once, then stored N times.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * dst        - destination; written through an npy_uint32 pointer, so every
 *              address visited must be suitably aligned for that type
 * dst_stride - byte offset added to dst after each store
 * src        - address of the one 4-byte source element (aligned likewise)
 * N          - number of elements to store; when N <= 0 nothing is read
 *              or written
 */
#if (0 == 0) && 1
static void
_aligned_swap_strided_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /*
     * With no elements to transfer, src need not point at readable memory,
     * so leave before the one-time load below.
     */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP4(*((npy_uint32 *)src));

    while (N > 0) {
        *((npy_uint32 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned pair-swapping copy of N four-byte items between two strided
 * buffers.  Each item is moved with memmove (no alignment requirement), and
 * _NPY_SWAP_INPLACE2 is then applied separately to the first and to the
 * second two-byte half of the copy at dst.
 * Both strides are byte offsets.
 */
static void
_swap_pair_strided_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);
        /* treat the item as two independent 2-byte halves */
        _NPY_SWAP_INPLACE2(dst);
        _NPY_SWAP_INPLACE2(dst + 2);

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0: the single source element
 * is loaded (and swapped) once, then stored N times, with dst advancing by
 * dst_stride bytes after each store.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * NOTE: the guard below is the constant expression `(0 == 0) && 0`, which is
 * false, so this unaligned variant is never compiled.
 * NOTE(review): _NPY_SWAP_INPLACE4 is invoked on a pointer elsewhere in this
 * file (`_NPY_SWAP_INPLACE4(dst)`) but on a loaded value here; confirm the
 * expansion is valid before ever enabling this branch.
 */
#if (0 == 0) && 0
static void
_swap_pair_strided_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
/* memset shortcut: only for 1-byte elements, never selected for size 4 */
#if 4 == 1 && 0
    memset(dst, *src, N);
#else

/* one-time load of the source element; the #else arm is the 16-byte path */
#  if 4 != 16
    npy_uint32 temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 2 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 2 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 2 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* broadcast the cached value into every destination slot */
    while (N > 0) {
#  if 4 != 16
        *((npy_uint32 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 0
        dst += 4;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 0 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 4 >= 4 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned pair-swapping copy of N four-byte items between two strided
 * buffers.  Each item is accessed through an npy_uint32 pointer (so src and
 * dst must be suitably aligned) and run through _NPY_SWAP_PAIR4, which
 * exchanges the two bytes inside each 16-bit half of the word.
 * Both strides are byte offsets.
 */
static void
_aligned_swap_pair_strided_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *(npy_uint32 *)src;

        *(npy_uint32 *)dst = _NPY_SWAP_PAIR4(word);

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0: the single source element
 * is read and pair-swapped once (bytes exchanged inside each 16-bit half),
 * then stored N times.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * dst        - destination; written through an npy_uint32 pointer, so every
 *              address visited must be suitably aligned for that type
 * dst_stride - byte offset added to dst after each store
 * src        - address of the one 4-byte source element (aligned likewise)
 * N          - number of elements to store; when N <= 0 nothing is read
 *              or written
 */
#if (0 == 0) && 1
static void
_aligned_swap_pair_strided_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /*
     * With no elements to transfer, src need not point at readable memory,
     * so leave before the one-time load below.
     */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));

    while (N > 0) {
        *((npy_uint32 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 4 >= 4 */



#line 92
#line 100

#if (4 >= 1) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned plain copy of N four-byte items from a strided source into a
 * contiguous destination.  Items are moved with memmove, so no alignment is
 * assumed.  src advances by src_stride bytes per item; dst always advances
 * by the item size (4 bytes) and dst_stride is not consulted.
 */
static void
_strided_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);

        dst += 4;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy for source stride 0 into a contiguous destination: the
 * single source element is loaded once, then stored N times, with dst
 * advancing by 4 bytes after each store.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * NOTE: the guard below is the constant expression `(0 == 0) && 0`, which is
 * false, so this unaligned variant is never compiled.
 */
#if (0 == 0) && 0
static void
_strided_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
/* memset shortcut: only for 1-byte elements, never selected for size 4 */
#if 4 == 1 && 1
    memset(dst, *src, N);
#else

/* one-time load of the source element; the #else arm is the 16-byte path */
#  if 4 != 16
    npy_uint32 temp = _NPY_NOP4(*((npy_uint32 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 0 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 0 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 0 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* broadcast the cached value into every destination slot */
    while (N > 0) {
#  if 4 != 16
        *((npy_uint32 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 1
        dst += 4;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 1) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned plain copy of N four-byte items from a strided source into a
 * contiguous destination.  Each item is loaded and stored through an
 * npy_uint32 pointer, so src and dst must be suitably aligned for that type.
 * src advances by src_stride bytes per item; dst always advances by the item
 * size (4 bytes) and dst_stride is not consulted.
 * The loop-unrolling attribute is requested whenever a 4-byte item is no
 * wider than npy_intp.
 */
static void
#if 4 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        *(npy_uint32 *)dst = *(npy_uint32 *)src;

        dst += 4;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy for source stride 0 into a contiguous destination: the
 * single source element is read once, then stored N times.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * dst        - destination; written through an npy_uint32 pointer, so it
 *              must be suitably aligned for that type
 * dst_stride - not consulted; dst advances by the item size (4 bytes)
 * src        - address of the one 4-byte source element (aligned likewise)
 * N          - number of elements to store; when N <= 0 nothing is read
 *              or written
 */
#if (0 == 0) && 1
static void
_aligned_strided_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /*
     * With no elements to transfer, src need not point at readable memory,
     * so leave before the one-time load below.
     */
    if (N <= 0) {
        return;
    }

    temp = *((npy_uint32 *)src);

    while (N > 0) {
        *((npy_uint32 *)dst) = temp;
        dst += 4;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned swapping copy of N four-byte items from a strided source into a
 * contiguous destination.  Each item is moved with memmove (no alignment
 * requirement) and _NPY_SWAP_INPLACE4 is then applied to the copy at dst
 * (presumably reversing its bytes in place).
 * src advances by src_stride bytes per item; dst always advances by 4 bytes
 * and dst_stride is not consulted.
 */
static void
_swap_strided_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);
        _NPY_SWAP_INPLACE4(dst);

        dst += 4;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0 into a contiguous
 * destination: the single source element is loaded (and swapped) once, then
 * stored N times, with dst advancing by 4 bytes after each store.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * NOTE: the guard below is the constant expression `(0 == 0) && 0`, which is
 * false, so this unaligned variant is never compiled.
 * NOTE(review): _NPY_SWAP_INPLACE4 is invoked on a pointer elsewhere in this
 * file (`_NPY_SWAP_INPLACE4(dst)`) but on a loaded value here; confirm the
 * expansion is valid before ever enabling this branch.
 */
#if (0 == 0) && 0
static void
_swap_strided_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
/* memset shortcut: only for 1-byte elements, never selected for size 4 */
#if 4 == 1 && 1
    memset(dst, *src, N);
#else

/* one-time load of the source element; the #else arm is the 16-byte path */
#  if 4 != 16
    npy_uint32 temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 1 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 1 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 1 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* broadcast the cached value into every destination slot */
    while (N > 0) {
#  if 4 != 16
        *((npy_uint32 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 1
        dst += 4;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned byte-swapping copy of N four-byte items from a strided source
 * into a contiguous destination.  Each item is accessed through an
 * npy_uint32 pointer (so src and dst must be suitably aligned) and
 * byte-swapped with _NPY_SWAP4 (npy_bswap4).
 * src advances by src_stride bytes per item; dst always advances by 4 bytes
 * and dst_stride is not consulted.
 */
static void
_aligned_swap_strided_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *(npy_uint32 *)src;

        *(npy_uint32 *)dst = _NPY_SWAP4(word);

        dst += 4;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0 into a contiguous
 * destination: the single source element is read and byte-swapped once,
 * then stored N times.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * dst        - destination; written through an npy_uint32 pointer, so it
 *              must be suitably aligned for that type
 * dst_stride - not consulted; dst advances by the item size (4 bytes)
 * src        - address of the one 4-byte source element (aligned likewise)
 * N          - number of elements to store; when N <= 0 nothing is read
 *              or written
 */
#if (0 == 0) && 1
static void
_aligned_swap_strided_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /*
     * With no elements to transfer, src need not point at readable memory,
     * so leave before the one-time load below.
     */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP4(*((npy_uint32 *)src));

    while (N > 0) {
        *((npy_uint32 *)dst) = temp;
        dst += 4;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned pair-swapping copy of N four-byte items from a strided source
 * into a contiguous destination.  Each item is moved with memmove (no
 * alignment requirement), and _NPY_SWAP_INPLACE2 is then applied separately
 * to the first and to the second two-byte half of the copy at dst.
 * src advances by src_stride bytes per item; dst always advances by 4 bytes
 * and dst_stride is not consulted.
 */
static void
_swap_pair_strided_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);
        /* treat the item as two independent 2-byte halves */
        _NPY_SWAP_INPLACE2(dst);
        _NPY_SWAP_INPLACE2(dst + 2);

        dst += 4;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0 into a contiguous
 * destination: the single source element is loaded (and swapped) once, then
 * stored N times, with dst advancing by 4 bytes after each store.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * NOTE: the guard below is the constant expression `(0 == 0) && 0`, which is
 * false, so this unaligned variant is never compiled.
 * NOTE(review): _NPY_SWAP_INPLACE4 is invoked on a pointer elsewhere in this
 * file (`_NPY_SWAP_INPLACE4(dst)`) but on a loaded value here; confirm the
 * expansion is valid before ever enabling this branch.
 */
#if (0 == 0) && 0
static void
_swap_pair_strided_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
/* memset shortcut: only for 1-byte elements, never selected for size 4 */
#if 4 == 1 && 1
    memset(dst, *src, N);
#else

/* one-time load of the source element; the #else arm is the 16-byte path */
#  if 4 != 16
    npy_uint32 temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 2 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 2 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 2 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* broadcast the cached value into every destination slot */
    while (N > 0) {
#  if 4 != 16
        *((npy_uint32 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 1
        dst += 4;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 4 >= 4 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned pair-swapping copy of N four-byte items from a strided source
 * into a contiguous destination.  Each item is accessed through an
 * npy_uint32 pointer (so src and dst must be suitably aligned) and run
 * through _NPY_SWAP_PAIR4, which exchanges the two bytes inside each 16-bit
 * half of the word.
 * src advances by src_stride bytes per item; dst always advances by 4 bytes
 * and dst_stride is not consulted.
 */
static void
_aligned_swap_pair_strided_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *(npy_uint32 *)src;

        *(npy_uint32 *)dst = _NPY_SWAP_PAIR4(word);

        dst += 4;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0 into a contiguous
 * destination: the single source element is read and pair-swapped once
 * (bytes exchanged inside each 16-bit half), then stored N times.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * dst        - destination; written through an npy_uint32 pointer, so it
 *              must be suitably aligned for that type
 * dst_stride - not consulted; dst advances by the item size (4 bytes)
 * src        - address of the one 4-byte source element (aligned likewise)
 * N          - number of elements to store; when N <= 0 nothing is read
 *              or written
 */
#if (0 == 0) && 1
static void
_aligned_swap_pair_strided_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint32 temp;

    /*
     * With no elements to transfer, src need not point at readable memory,
     * so leave before the one-time load below.
     */
    if (N <= 0) {
        return;
    }

    temp = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));

    while (N > 0) {
        *((npy_uint32 *)dst) = temp;
        dst += 4;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 4 >= 4 */



#line 92
#line 100

#if (4 >= 1) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned plain copy of N four-byte items from a contiguous source into a
 * strided destination.  Items are moved with memmove, so no alignment is
 * assumed.  dst advances by dst_stride bytes per item; src always advances
 * by the item size (4 bytes) and src_stride is not consulted.
 */
static void
_contig_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);

        dst += dst_stride;
        src += 4;
    }
}
#endif


/*
 * Specialized copy for source stride 0: the single source element is loaded
 * once, then stored N times, with dst advancing by dst_stride bytes after
 * each store.
 * Interestingly, unrolling here, like above, is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86.
 *
 * NOTE: the guard below is the constant expression `(1 == 0) && 0`, which is
 * false, so this variant is never compiled.
 */
#if (1 == 0) && 0
static void
_contig_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
/* memset shortcut: only for 1-byte elements, never selected for size 4 */
#if 4 == 1 && 0
    memset(dst, *src, N);
#else

/* one-time load of the source element; the #else arm is the 16-byte path */
#  if 4 != 16
    npy_uint32 temp = _NPY_NOP4(*((npy_uint32 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 0 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 0 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 0 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* broadcast the cached value into every destination slot */
    while (N > 0) {
#  if 4 != 16
        *((npy_uint32 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 0
        dst += 4;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 0 -- else */
}
#endif/* (1 == 0) && 0 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 1) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 0 == 0
/*
 * Aligned plain copy of 4-byte items, contiguous source -> strided
 * destination.
 *
 *   dst         destination; one npy_uint32 is stored per item
 *   dst_stride  byte step between consecutive destination items
 *   src         source; read as npy_uint32, advanced 4 bytes per item
 *   src_stride  not consulted (the source step is the fixed item size)
 *   N           item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.
 *
 * Unrolling is requested when 4 <= NPY_SIZEOF_INTP: it gains about 20-50%
 * if the copy can be done in one mov instruction and can decrease
 * performance if not (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
 */
static void
#if 4 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* one 32-bit load followed by one 32-bit store */
        const npy_uint32 word = *((npy_uint32 *)src);

        *((npy_uint32 *)dst) = word;
        dst += dst_stride;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (aligned, no swap, strided
 * destination, 4-byte items): the single source item is read once as an
 * npy_uint32 and then stored N times, stepping dst by dst_stride bytes.
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_contig_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* load the one source item before entering the loop */
    const npy_uint32 value = _NPY_NOP4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 0 == 0
/*
 * Unaligned copy-and-swap of 4-byte items, contiguous source -> strided
 * destination.
 *
 * Each item is moved with memmove() and then _NPY_SWAP_INPLACE4 is applied
 * to the destination bytes (macro defined elsewhere in this file; presumably
 * it reverses the 4 bytes in place).
 *
 *   dst         destination byte pointer
 *   dst_stride  byte step between consecutive destination items
 *   src         source byte pointer, advanced 4 bytes per item
 *   src_stride  not consulted (the source step is the fixed item size)
 *   N           item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_swap_contig_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* byte-wise move first, then swap what landed in dst */
        memmove(dst, src, 4);
        _NPY_SWAP_INPLACE4(dst);

        dst += dst_stride;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (unaligned, full swap, strided
 * destination, 4-byte items): one value is derived from the source item
 * before the loop and then stored N times, stepping dst by dst_stride bytes.
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 * NOTE(review): _NPY_SWAP_INPLACE4 is used here as a value expression on a
 * dereferenced npy_uint32, whereas the compiled loops apply it as a statement
 * to a char pointer -- confirm before ever enabling this variant.
 */
#if (1 == 0) && 0
static void
_swap_contig_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint32 value = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 0 == 0
/*
 * Aligned copy-and-swap of 4-byte items, contiguous source -> strided
 * destination.
 *
 * Each item is read as an npy_uint32, passed through _NPY_SWAP4
 * (npy_bswap4) and stored as an npy_uint32.
 *
 *   dst         destination; one npy_uint32 is stored per item
 *   dst_stride  byte step between consecutive destination items
 *   src         source; read as npy_uint32, advanced 4 bytes per item
 *   src_stride  not consulted (the source step is the fixed item size)
 *   N           item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_aligned_swap_contig_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *((npy_uint32 *)src);

        *((npy_uint32 *)dst) = _NPY_SWAP4(word);
        dst += dst_stride;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (aligned, full swap, strided
 * destination, 4-byte items): the single source item is read once, passed
 * through _NPY_SWAP4, and the result is stored N times, stepping dst by
 * dst_stride bytes.
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_contig_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* swap once up front; the loop only stores */
    const npy_uint32 value = _NPY_SWAP4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 0 == 0
/*
 * Unaligned copy-and-pair-swap of 4-byte items, contiguous source ->
 * strided destination.
 *
 * Each item is moved with memmove() and then _NPY_SWAP_INPLACE2 is applied
 * separately to the two 2-byte halves of the destination item (dst and
 * dst + 2).  The macro is defined elsewhere in this file; presumably it
 * exchanges the two bytes it is pointed at.
 *
 *   dst         destination byte pointer
 *   dst_stride  byte step between consecutive destination items
 *   src         source byte pointer, advanced 4 bytes per item
 *   src_stride  not consulted (the source step is the fixed item size)
 *   N           item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_swap_pair_contig_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);
        /* low half, then high half */
        _NPY_SWAP_INPLACE2(dst);
        _NPY_SWAP_INPLACE2(dst + 2);

        dst += dst_stride;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (unaligned, pair swap, strided
 * destination, 4-byte items): one value is derived from the source item
 * before the loop and then stored N times, stepping dst by dst_stride bytes.
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 * NOTE(review): _NPY_SWAP_INPLACE4 is used here as a value expression on a
 * dereferenced npy_uint32, whereas the compiled loops apply the in-place
 * macros as statements to a char pointer -- confirm before ever enabling
 * this variant.
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint32 value = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 4 >= 4 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 0 == 0
/*
 * Aligned copy-and-pair-swap of 4-byte items, contiguous source -> strided
 * destination.
 *
 * Each item is read as an npy_uint32, transformed with _NPY_SWAP_PAIR4
 * (which exchanges the two bytes inside each 16-bit half) and stored as an
 * npy_uint32.
 *
 *   dst         destination; one npy_uint32 is stored per item
 *   dst_stride  byte step between consecutive destination items
 *   src         source; read as npy_uint32, advanced 4 bytes per item
 *   src_stride  not consulted (the source step is the fixed item size)
 *   N           item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_aligned_swap_pair_contig_to_strided_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *((npy_uint32 *)src);

        *((npy_uint32 *)dst) = _NPY_SWAP_PAIR4(word);
        dst += dst_stride;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (aligned, pair swap, strided
 * destination, 4-byte items): the single source item is read once,
 * transformed with _NPY_SWAP_PAIR4, and the result is stored N times,
 * stepping dst by dst_stride bytes.
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_strided_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* swap once up front; the loop only stores */
    const npy_uint32 value = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 4 >= 4 */



#line 92
#line 100

#if (4 >= 1) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 1 == 0
/*
 * Unaligned plain copy of 4-byte items, contiguous source -> contiguous
 * destination.  Each item is moved with memmove() and both pointers advance
 * by 4 bytes; neither dst_stride nor src_stride is consulted.
 * src_itemsize and data are unused.
 *
 * The guard above is constant-false, so this function is never compiled.
 * (Presumably plain contiguous copies are served by another routine --
 * TODO confirm.)
 */
static void
_contig_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);
        dst += 4;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (unaligned, no swap, contiguous
 * destination, 4-byte items): the single source item is read once and then
 * stored N times, stepping dst by 4 bytes (dst_stride is not consulted).
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 */
#if (1 == 0) && 0
static void
_contig_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* load the one source item before entering the loop */
    const npy_uint32 value = _NPY_NOP4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += 4;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 1) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 1 == 0
/*
 * Aligned plain copy of 4-byte items, contiguous source -> contiguous
 * destination.  Each item is read and stored as an npy_uint32 and both
 * pointers advance by 4 bytes; neither dst_stride nor src_stride is
 * consulted.  src_itemsize and data are unused.
 *
 * Unrolling would be requested when 4 <= NPY_SIZEOF_INTP: it gains about
 * 20-50% if the copy can be done in one mov instruction and can decrease
 * performance if not (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
 *
 * The guard above is constant-false, so this function is never compiled.
 * (Presumably plain contiguous copies are served by another routine --
 * TODO confirm.)
 */
static void
#if 4 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *((npy_uint32 *)src);

        *((npy_uint32 *)dst) = word;
        dst += 4;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (aligned, no swap, contiguous
 * destination, 4-byte items): the single source item is read once as an
 * npy_uint32 and then stored N times, stepping dst by 4 bytes (dst_stride
 * is not consulted).
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_contig_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* load the one source item before entering the loop */
    const npy_uint32 value = _NPY_NOP4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += 4;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 4 >= 1 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 1 == 0
/*
 * Unaligned copy-and-swap of 4-byte items, contiguous source -> contiguous
 * destination.
 *
 * Each item is moved with memmove() and then _NPY_SWAP_INPLACE4 is applied
 * to the destination bytes (macro defined elsewhere in this file; presumably
 * it reverses the 4 bytes in place).  Both pointers advance by 4 bytes per
 * item; neither dst_stride nor src_stride is consulted.
 *
 *   dst  destination byte pointer
 *   src  source byte pointer
 *   N    item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_swap_contig_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* byte-wise move first, then swap what landed in dst */
        memmove(dst, src, 4);
        _NPY_SWAP_INPLACE4(dst);

        dst += 4;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (unaligned, full swap, contiguous
 * destination, 4-byte items): one value is derived from the source item
 * before the loop and then stored N times, stepping dst by 4 bytes
 * (dst_stride is not consulted).
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 * NOTE(review): _NPY_SWAP_INPLACE4 is used here as a value expression on a
 * dereferenced npy_uint32, whereas the compiled loops apply it as a statement
 * to a char pointer -- confirm before ever enabling this variant.
 */
#if (1 == 0) && 0
static void
_swap_contig_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint32 value = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += 4;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 2) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 1 == 0
/*
 * Aligned copy-and-swap of 4-byte items, contiguous source -> contiguous
 * destination.
 *
 * Each item is read as an npy_uint32, passed through _NPY_SWAP4
 * (npy_bswap4) and stored as an npy_uint32.  Both pointers advance by
 * 4 bytes per item; neither dst_stride nor src_stride is consulted.
 *
 *   dst  destination; one npy_uint32 is stored per item
 *   src  source; one npy_uint32 is loaded per item
 *   N    item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_aligned_swap_contig_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *((npy_uint32 *)src);

        *((npy_uint32 *)dst) = _NPY_SWAP4(word);
        dst += 4;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (aligned, full swap, contiguous
 * destination, 4-byte items): the single source item is read once, passed
 * through _NPY_SWAP4, and the result is stored N times, stepping dst by
 * 4 bytes (dst_stride is not consulted).
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_contig_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* swap once up front; the loop only stores */
    const npy_uint32 value = _NPY_SWAP4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += 4;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 4 >= 2 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 1 == 0
/*
 * Unaligned copy-and-pair-swap of 4-byte items, contiguous source ->
 * contiguous destination.
 *
 * Each item is moved with memmove() and then _NPY_SWAP_INPLACE2 is applied
 * separately to the two 2-byte halves of the destination item (dst and
 * dst + 2).  The macro is defined elsewhere in this file; presumably it
 * exchanges the two bytes it is pointed at.  Both pointers advance by
 * 4 bytes per item; neither dst_stride nor src_stride is consulted.
 *
 *   dst  destination byte pointer
 *   src  source byte pointer
 *   N    item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_swap_pair_contig_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 4);
        /* low half, then high half */
        _NPY_SWAP_INPLACE2(dst);
        _NPY_SWAP_INPLACE2(dst + 2);

        dst += 4;
        src += 4;
    }
}
#endif


/*
 * Specialization for a source stride of 0 (unaligned, pair swap, contiguous
 * destination, 4-byte items): one value is derived from the source item
 * before the loop and then stored N times, stepping dst by 4 bytes
 * (dst_stride is not consulted).
 *
 * Unlike the loops above, unrolling here is only marginally profitable for
 * small types and detrimental for >= 8-byte moves on x86.
 *
 * The guard below is constant-false, so this function is never compiled.
 * NOTE(review): _NPY_SWAP_INPLACE4 is used here as a value expression on a
 * dereferenced npy_uint32, whereas the compiled loops apply the in-place
 * macros as statements to a char pointer -- confirm before ever enabling
 * this variant.
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_uint32 value = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));

    for (; N > 0; --N) {
        *((npy_uint32 *)dst) = value;
        dst += 4;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 4 >= 4 */


#line 100

#if (4 >= 4) && \
    (4 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 1 == 0
/*
 * Aligned copy-and-pair-swap of 4-byte items, contiguous source ->
 * contiguous destination.
 *
 * Each item is read as an npy_uint32, transformed with _NPY_SWAP_PAIR4
 * (which exchanges the two bytes inside each 16-bit half) and stored as an
 * npy_uint32.  Both pointers advance by 4 bytes per item; neither
 * dst_stride nor src_stride is consulted.
 *
 *   dst  destination; one npy_uint32 is stored per item
 *   src  source; one npy_uint32 is loaded per item
 *   N    item count; the loop body does not run when N <= 0
 *
 * src_itemsize and data are unused.  NPY_GCC_UNROLL_LOOPS is not requested
 * for this variant.
 */
static void
_aligned_swap_pair_contig_to_contig_size4(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint32 word = *((npy_uint32 *)src);

        *((npy_uint32 *)dst) = _NPY_SWAP_PAIR4(word);
        dst += 4;
        src += 4;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Broadcast variant: one 4-byte element is read from src, passed through
 * _NPY_SWAP_PAIR4 once, and the result is stored into N consecutive 4-byte
 * slots of dst (dst advances by a fixed 4 bytes; dst_stride is not read).
 *
 * NOTE: the enclosing "#if (1 == 0) && 1" evaluates to false, so this
 * definition is never compiled in this instantiation of the template.
 */
static void
_aligned_swap_pair_contig_to_contig_size4_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
#if 4 == 1 && 1
    memset(dst, *src, N);
#else

    /* Load (and swap) the single source element before the loop. */
#  if 4 != 16
    npy_uint32 temp = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 2 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 2 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 2 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* Replicate the cached value into every destination slot. */
    while (N > 0) {
#  if 4 != 16
        *((npy_uint32 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 1
        dst += 4;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (1 == 0) && 1 */

#endif/* 4 >= 4 */




#line 86
#line 92
#line 100

#if (8 >= 1) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned strided -> strided copy of N 8-byte elements, no byte swap.
 *
 * Each element is moved with memmove(), so neither pointer is dereferenced
 * through a wider integer type.  After every element dst advances by
 * dst_stride bytes and src by src_stride bytes.
 */
static void
_strided_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 8);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast variant: one 8-byte element is read from src (no swap,
 * _NPY_NOP8) and stored into N destination slots spaced dst_stride bytes
 * apart.
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" evaluates to false, so this
 * definition is never compiled in this instantiation of the template.
 */
static void
_strided_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
#if 8 == 1 && 0
    memset(dst, *src, N);
#else

    /* Load the single source element before the loop. */
#  if 8 != 16
    npy_uint64 temp = _NPY_NOP8(*((npy_uint64 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 0 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 0 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 0 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* Replicate the cached value into every destination slot. */
    while (N > 0) {
#  if 8 != 16
        *((npy_uint64 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 0
        dst += 8;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 0 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 1) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned strided -> strided copy of N 8-byte elements, no byte swap.
 *
 * Every element is transferred with a single npy_uint64 load and store.
 * After each element dst advances by dst_stride bytes and src by
 * src_stride bytes.  Loop unrolling is requested only when an 8-byte value
 * is no wider than npy_intp.
 */
static void
#if 8 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        *out = _NPY_NOP8(*in);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Aligned broadcast of a single 8-byte element: the value at src is loaded
 * once (no byte swap) and stored into N destination slots spaced
 * dst_stride bytes apart.  Both pointers are accessed as npy_uint64.
 *
 * When N <= 0 the function returns before touching src, so an empty copy
 * never dereferences the source pointer.
 */
static void
_aligned_strided_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp;

    /* Nothing to copy: do not read *src, it may not be valid memory. */
    if (N <= 0) {
        return;
    }

    /* Cache the source element; it is identical for every output slot. */
    temp = _NPY_NOP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned strided -> strided copy of N 8-byte elements with a byte swap.
 *
 * Every element is first moved with memmove() and then _NPY_SWAP_INPLACE8
 * is applied to the destination bytes (the macro is defined outside this
 * excerpt; presumably an in-place reversal of all 8 bytes -- confirm at
 * its definition).  After each element dst advances by dst_stride bytes
 * and src by src_stride bytes.
 */
static void
_swap_strided_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE8(dst);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast variant: one 8-byte element is read from src, passed through
 * _NPY_SWAP_INPLACE8, and the result is stored into N destination slots
 * spaced dst_stride bytes apart.
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" evaluates to false, so this
 * definition is never compiled in this instantiation of the template.
 * NOTE(review): _NPY_SWAP_INPLACE8 is used here as a value-returning
 * expression; its definition is outside this excerpt -- confirm before
 * ever enabling this branch.
 */
static void
_swap_strided_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
#if 8 == 1 && 0
    memset(dst, *src, N);
#else

    /* Load (and swap) the single source element before the loop. */
#  if 8 != 16
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 1 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 1 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 1 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* Replicate the cached value into every destination slot. */
    while (N > 0) {
#  if 8 != 16
        *((npy_uint64 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 0
        dst += 8;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 0 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned strided -> strided copy of N 8-byte elements with a byte swap.
 *
 * Each element is loaded as an npy_uint64, run through _NPY_SWAP8
 * (npy_bswap8) and stored as an npy_uint64.  After each element dst
 * advances by dst_stride bytes and src by src_stride bytes.
 */
static void
_aligned_swap_strided_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        *out = _NPY_SWAP8(*in);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Aligned broadcast of a single byte-swapped 8-byte element: the value at
 * src is loaded once, run through _NPY_SWAP8 (npy_bswap8), and stored into
 * N destination slots spaced dst_stride bytes apart.  Both pointers are
 * accessed as npy_uint64.
 *
 * When N <= 0 the function returns before touching src, so an empty copy
 * never dereferences the source pointer.
 */
static void
_aligned_swap_strided_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp;

    /* Nothing to copy: do not read *src, it may not be valid memory. */
    if (N <= 0) {
        return;
    }

    /* Swap once up front; every output slot receives the same value. */
    temp = _NPY_SWAP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned strided -> strided copy of N 8-byte elements in which each
 * 4-byte half is swapped on its own.
 *
 * Every element is first moved with memmove(); _NPY_SWAP_INPLACE4 is then
 * applied to the first and to the second 4 destination bytes (the macro is
 * defined outside this excerpt; presumably an in-place 4-byte reversal --
 * confirm at its definition).  After each element dst advances by
 * dst_stride bytes and src by src_stride bytes.
 */
static void
_swap_pair_strided_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE4(dst);
        _NPY_SWAP_INPLACE4(dst + 4);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast variant: one 8-byte element is read from src, passed through
 * _NPY_SWAP_INPLACE8, and the result is stored into N destination slots
 * spaced dst_stride bytes apart.
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" evaluates to false, so this
 * definition is never compiled in this instantiation of the template.
 * NOTE(review): _NPY_SWAP_INPLACE8 is used here as a value-returning
 * expression; its definition is outside this excerpt -- confirm before
 * ever enabling this branch.
 */
static void
_swap_pair_strided_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
#if 8 == 1 && 0
    memset(dst, *src, N);
#else

    /* Load (and swap) the single source element before the loop. */
#  if 8 != 16
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 2 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 2 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 2 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* Replicate the cached value into every destination slot. */
    while (N > 0) {
#  if 8 != 16
        *((npy_uint64 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 0
        dst += 8;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 0 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 8 >= 4 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned strided -> strided copy of N 8-byte elements in which each
 * 4-byte half is byte-swapped on its own, via _NPY_SWAP_PAIR8.
 *
 * Each element is loaded and stored as an npy_uint64.  After each element
 * dst advances by dst_stride bytes and src by src_stride bytes.
 */
static void
_aligned_swap_pair_strided_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        *out = _NPY_SWAP_PAIR8(*in);
        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Aligned broadcast of a single pair-swapped 8-byte element: the value at
 * src is loaded once, run through _NPY_SWAP_PAIR8, and stored into N
 * destination slots spaced dst_stride bytes apart.  Both pointers are
 * accessed as npy_uint64.
 *
 * When N <= 0 the function returns before touching src, so an empty copy
 * never dereferences the source pointer.
 */
static void
_aligned_swap_pair_strided_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp;

    /* Nothing to copy: do not read *src, it may not be valid memory. */
    if (N <= 0) {
        return;
    }

    /* Swap once up front; every output slot receives the same value. */
    temp = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 8 >= 4 */



#line 92
#line 100

#if (8 >= 1) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned strided -> contiguous copy of N 8-byte elements, no byte swap.
 *
 * Each element is moved with memmove().  The destination is packed: dst
 * advances by a fixed 8 bytes (dst_stride is not read), while src advances
 * by src_stride bytes.
 */
static void
_strided_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 8);
        dst += 8;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast variant: one 8-byte element is read from src (no swap,
 * _NPY_NOP8) and stored into N consecutive 8-byte slots of dst (dst
 * advances by a fixed 8 bytes; dst_stride is not read).
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" evaluates to false, so this
 * definition is never compiled in this instantiation of the template.
 */
static void
_strided_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
#if 8 == 1 && 1
    memset(dst, *src, N);
#else

    /* Load the single source element before the loop. */
#  if 8 != 16
    npy_uint64 temp = _NPY_NOP8(*((npy_uint64 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 0 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 0 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 0 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* Replicate the cached value into every destination slot. */
    while (N > 0) {
#  if 8 != 16
        *((npy_uint64 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 1
        dst += 8;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 1) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned strided -> contiguous copy of N 8-byte elements, no byte swap.
 *
 * Every element is transferred with a single npy_uint64 load and store.
 * The destination is packed: dst advances by a fixed 8 bytes (dst_stride
 * is not read), while src advances by src_stride bytes.  Loop unrolling is
 * requested only when an 8-byte value is no wider than npy_intp.
 */
static void
#if 8 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        *out = _NPY_NOP8(*in);
        dst += 8;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Aligned broadcast of a single 8-byte element into a packed destination:
 * the value at src is loaded once (no byte swap) and stored into N
 * consecutive 8-byte slots of dst.  dst advances by a fixed 8 bytes;
 * dst_stride is not read.  Both pointers are accessed as npy_uint64.
 *
 * When N <= 0 the function returns before touching src, so an empty copy
 * never dereferences the source pointer.
 */
static void
_aligned_strided_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp;

    /* Nothing to copy: do not read *src, it may not be valid memory. */
    if (N <= 0) {
        return;
    }

    /* Cache the source element; it is identical for every output slot. */
    temp = _NPY_NOP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp;
        dst += 8;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Unaligned strided -> contiguous copy of N 8-byte elements with a byte
 * swap.
 *
 * Every element is first moved with memmove() and then _NPY_SWAP_INPLACE8
 * is applied to the destination bytes (the macro is defined outside this
 * excerpt; presumably an in-place reversal of all 8 bytes -- confirm at
 * its definition).  The destination is packed: dst advances by a fixed
 * 8 bytes (dst_stride is not read), while src advances by src_stride bytes.
 */
static void
_swap_strided_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE8(dst);
        dst += 8;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast variant: one 8-byte element is read from src, passed through
 * _NPY_SWAP_INPLACE8, and the result is stored into N consecutive 8-byte
 * slots of dst (dst advances by a fixed 8 bytes; dst_stride is not read).
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" evaluates to false, so this
 * definition is never compiled in this instantiation of the template.
 * NOTE(review): _NPY_SWAP_INPLACE8 is used here as a value-returning
 * expression; its definition is outside this excerpt -- confirm before
 * ever enabling this branch.
 */
static void
_swap_strided_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
#if 8 == 1 && 1
    memset(dst, *src, N);
#else

    /* Load (and swap) the single source element before the loop. */
#  if 8 != 16
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
#  else
    npy_uint64 temp0, temp1;
#    if 1 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 1 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 1 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* Replicate the cached value into every destination slot. */
    while (N > 0) {
#  if 8 != 16
        *((npy_uint64 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
#  if 1
        dst += 8;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (0 == 0) && 0 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Aligned strided -> contiguous copy of N 8-byte elements with a byte swap.
 *
 * Each element is loaded as an npy_uint64, run through _NPY_SWAP8
 * (npy_bswap8) and stored as an npy_uint64.  The destination is packed:
 * dst advances by a fixed 8 bytes (dst_stride is not read), while src
 * advances by src_stride bytes.
 */
static void
_aligned_swap_strided_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        *out = _NPY_SWAP8(*in);
        dst += 8;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Aligned broadcast of a single byte-swapped 8-byte element into a packed
 * destination: the value at src is loaded once, run through _NPY_SWAP8
 * (npy_bswap8), and stored into N consecutive 8-byte slots of dst.  dst
 * advances by a fixed 8 bytes; dst_stride is not read.  Both pointers are
 * accessed as npy_uint64.
 *
 * When N <= 0 the function returns before touching src, so an empty copy
 * never dereferences the source pointer.
 */
static void
_aligned_swap_strided_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp;

    /* Nothing to copy: do not read *src, it may not be valid memory. */
    if (N <= 0) {
        return;
    }

    /* Swap once up front; every output slot receives the same value. */
    temp = _NPY_SWAP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp;
        dst += 8;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 1 == 0
/*
 * Unaligned pair-swapping copy of N 8-byte items from a strided source into
 * a packed destination.  Each item is copied bytewise with memmove and then
 * each 4-byte half of the copy is handed to _NPY_SWAP_INPLACE4 (defined
 * outside this section; by its name an in-place byte swap).  dst_stride is
 * not consulted: the destination always advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_swap_pair_strided_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* bytewise copy: no alignment is assumed for either pointer */
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE4(dst);
        _NPY_SWAP_INPLACE4(dst + 4);

        src += src_stride;
        dst += 8;
    }
}
#endif


/*
 * Specialized copy and swap for a source stride of 0: one value derived
 * from the single source item is stored into each of the N packed
 * destination slots.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this unaligned variant is never
 * compiled.
 */
#if (0 == 0) && 0
static void
_swap_pair_strided_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += 8;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 8 >= 4 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 1 == 0
/*
 * Aligned pair-swapping copy of N 8-byte items from a strided source into a
 * packed destination.  Each item is loaded as npy_uint64, run through
 * _NPY_SWAP_PAIR8 and stored as npy_uint64, so both addresses are expected
 * to be 8-byte aligned.  dst_stride is not consulted: the destination
 * always advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_aligned_swap_pair_strided_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));
        src += src_stride;
        dst += 8;
    }
}
#endif


/*
 * Specialized aligned copy and pair-swap for a source stride of 0: the
 * single 8-byte source item is run through _NPY_SWAP_PAIR8 once and the
 * result is stored into each of the N packed destination slots.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86, so it is not requested.
 *
 * dst        - destination, written as npy_uint64 (expected 8-byte aligned)
 * dst_stride - not consulted; the destination advances by 8 bytes per item
 * src        - the one source item, read as npy_uint64
 * N          - number of items to write; nothing is read or written when
 *              N <= 0
 */
#if (0 == 0) && 1
static void
_aligned_swap_pair_strided_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp;

    /*
     * With no items requested the source must not be dereferenced: the
     * caller is not obliged to hand over a readable pointer for an empty
     * transfer.
     */
    if (N <= 0) {
        return;
    }

    /* swap once up front; the loop below only stores */
    temp = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += 8;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 8 >= 4 */



#line 92
#line 100

#if (8 >= 1) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 0 == 0
/*
 * Unaligned plain copy of N 8-byte items from a packed source into a
 * strided destination.  Each item is moved bytewise with memmove and no
 * swapping is performed.  src_stride is not consulted: the source always
 * advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_contig_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* bytewise copy: no alignment is assumed for either pointer */
        memmove(dst, src, 8);

        src += 8;
        dst += dst_stride;
    }
}
#endif


/*
 * Specialized copy for a source stride of 0: the single 8-byte source item
 * is read once and stored into each of the N destination slots, which are
 * dst_stride bytes apart.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 0
static void
_contig_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_NOP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 1) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 0 == 0
/*
 * Aligned plain copy of N 8-byte items from a packed source into a strided
 * destination.  Each item is loaded and stored as npy_uint64 (the
 * _NPY_NOP8 wrapper passes the value through unchanged), so both addresses
 * are expected to be 8-byte aligned.  src_stride is not consulted: the
 * source always advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  NPY_GCC_UNROLL_LOOPS is therefore
 * attached only when an 8-byte item fits in npy_intp.
 */
static void
#if 8 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
        src += 8;
        dst += dst_stride;
    }
}
#endif


/*
 * Specialized aligned copy for a source stride of 0: the single 8-byte
 * source item is read once and stored into each of the N destination
 * slots, which are dst_stride bytes apart.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_contig_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_NOP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 0 == 0
/*
 * Unaligned byte-swapping copy of N 8-byte items from a packed source into
 * a strided destination.  Each item is copied bytewise with memmove and
 * the copy is then handed to _NPY_SWAP_INPLACE8 (defined outside this
 * section; by its name an in-place byte swap).  src_stride is not
 * consulted: the source always advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_swap_contig_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* bytewise copy: no alignment is assumed for either pointer */
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE8(dst);

        src += 8;
        dst += dst_stride;
    }
}
#endif


/*
 * Specialized copy and swap for a source stride of 0: one value derived
 * from the single source item is stored into each of the N destination
 * slots, which are dst_stride bytes apart.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 0
static void
_swap_contig_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 0 == 0
/*
 * Aligned byte-swapping copy of N 8-byte items from a packed source into a
 * strided destination.  Each item is loaded as npy_uint64, byte-swapped
 * with _NPY_SWAP8 and stored as npy_uint64, so both addresses are expected
 * to be 8-byte aligned.  src_stride is not consulted: the source always
 * advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_aligned_swap_contig_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
        src += 8;
        dst += dst_stride;
    }
}
#endif


/*
 * Specialized aligned copy and swap for a source stride of 0: the single
 * 8-byte source item is byte-swapped once and the result is stored into
 * each of the N destination slots, which are dst_stride bytes apart.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_contig_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_SWAP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 0 == 0
/*
 * Unaligned pair-swapping copy of N 8-byte items from a packed source into
 * a strided destination.  Each item is copied bytewise with memmove and
 * then each 4-byte half of the copy is handed to _NPY_SWAP_INPLACE4
 * (defined outside this section; by its name an in-place byte swap).
 * src_stride is not consulted: the source always advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_swap_pair_contig_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* bytewise copy: no alignment is assumed for either pointer */
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE4(dst);
        _NPY_SWAP_INPLACE4(dst + 4);

        src += 8;
        dst += dst_stride;
    }
}
#endif


/*
 * Specialized copy and swap for a source stride of 0: one value derived
 * from the single source item is stored into each of the N destination
 * slots, which are dst_stride bytes apart.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 8 >= 4 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 0 == 0
/*
 * Aligned pair-swapping copy of N 8-byte items from a packed source into a
 * strided destination.  Each item is loaded as npy_uint64, run through
 * _NPY_SWAP_PAIR8 and stored as npy_uint64, so both addresses are expected
 * to be 8-byte aligned.  src_stride is not consulted: the source always
 * advances by 8 bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_aligned_swap_pair_contig_to_strided_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));
        src += 8;
        dst += dst_stride;
    }
}
#endif


/*
 * Specialized aligned copy and pair-swap for a source stride of 0: the
 * single 8-byte source item is run through _NPY_SWAP_PAIR8 once and the
 * result is stored into each of the N destination slots, which are
 * dst_stride bytes apart.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_strided_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 8 >= 4 */



#line 92
#line 100

#if (8 >= 1) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 1 == 0
/*
 * Unaligned plain copy of N 8-byte items from a packed source into a packed
 * destination.  Each item is moved bytewise with memmove and no swapping is
 * performed.  Neither stride argument is consulted: both pointers advance
 * by 8 bytes per item.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 *
 * The guard above is constant false, so this variant is never compiled.
 */
static void
_contig_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* bytewise copy: no alignment is assumed for either pointer */
        memmove(dst, src, 8);

        src += 8;
        dst += 8;
    }
}
#endif


/*
 * Specialized copy for a source stride of 0: the single 8-byte source item
 * is read once and stored into each of the N packed destination slots.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 0
static void
_contig_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_NOP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += 8;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 1) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 1 == 0
/*
 * Aligned plain copy of N 8-byte items from a packed source into a packed
 * destination.  Each item is loaded and stored as npy_uint64 (the
 * _NPY_NOP8 wrapper passes the value through unchanged), so both addresses
 * are expected to be 8-byte aligned.  Neither stride argument is
 * consulted: both pointers advance by 8 bytes per item.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  NPY_GCC_UNROLL_LOOPS is therefore
 * attached only when an 8-byte item fits in npy_intp.
 *
 * The guard above is constant false, so this variant is never compiled.
 */
static void
#if 8 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
        src += 8;
        dst += 8;
    }
}
#endif


/*
 * Specialized aligned copy for a source stride of 0: the single 8-byte
 * source item is read once and stored into each of the N packed
 * destination slots.
 *
 * Unrolling here, as done for the general loop above, is only marginally
 * profitable for small types and detrimental for moves of 8 bytes or more
 * on x86.
 *
 * The guard below is constant false, so this variant is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_contig_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp = _NPY_NOP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = temp;
        dst += 8;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 8 >= 1 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 1 == 0
/*
 * Unaligned byte-swapping copy of N 8-byte items from a packed source into
 * a packed destination.  Each item is copied bytewise with memmove and the
 * copy is then handed to _NPY_SWAP_INPLACE8 (defined outside this section;
 * by its name an in-place byte swap).  Neither stride argument is
 * consulted: both pointers advance by 8 bytes per item.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction and can decrease performance otherwise (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4); it is not requested for this variant.
 */
static void
_swap_contig_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* bytewise copy: no alignment is assumed for either pointer */
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE8(dst);

        src += 8;
        dst += 8;
    }
}
#endif


/*
 * Specialized copy and swap for a source stride of 0: the single source
 * element is loaded (and swapped) once, then stored into N consecutive
 * 8-byte destination slots.
 *
 * Unrolling here, as is done for the loops above, is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 *
 * NOTE: the guard below is constant-false for this instantiation, so this
 * function is never compiled.
 * NOTE(review): the body uses _NPY_SWAP_INPLACE8 in expression position;
 * the macro is defined outside this view, so whether that would compile if
 * the guard were ever enabled needs to be confirmed.
 */
#if (1 == 0) && 0
static void
_swap_contig_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
/* single-byte contiguous fast path; constant-false here (element size 8) */
#if 8 == 1 && 1
    memset(dst, *src, N);
#else

/* load the source element once, before the loop */
#  if 8 != 16
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
#  else
    /* 16-byte elements are handled as two 8-byte words (not taken here) */
    npy_uint64 temp0, temp1;
#    if 1 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 1 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 1 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* replicate the cached value into every destination slot */
    while (N > 0) {
#  if 8 != 16
        *((npy_uint64 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
/* contiguous destination: fixed 8-byte step, dst_stride is not used */
#  if 1
        dst += 8;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (1 == 0) && 0 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 2) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 1 == 0
/*
 * Copies N 8-byte elements from src to dst, walking both buffers
 * contiguously (8 bytes per element; dst_stride and src_stride are not
 * consulted).  Each element is read as one npy_uint64, passed through
 * _NPY_SWAP8 (npy_bswap8) and written as one npy_uint64, so both buffers
 * must be suitably aligned for npy_uint64 access.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_aligned_swap_contig_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned load, swap, aligned store */
        const npy_uint64 word = *((npy_uint64 *)src);

        *((npy_uint64 *)dst) = _NPY_SWAP8(word);

        dst += 8;
        src += 8;
    }
}
#endif


/*
 * Specialized copy and swap for a source stride of 0: the single 8-byte
 * source element is loaded and passed through _NPY_SWAP8 (npy_bswap8) once,
 * then stored into N consecutive 8-byte destination slots.  dst_stride is
 * not consulted; the destination always advances by 8 bytes.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 *
 * NOTE: the guard below is constant-false for this instantiation, so the
 * function is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_contig_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* swap once up front; the loop only stores */
    const npy_uint64 swapped = _NPY_SWAP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = swapped;
        dst += 8;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 8 >= 2 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 1 == 0
/*
 * Copies N 8-byte elements from src to dst, walking both buffers
 * contiguously (8 bytes per element; dst_stride and src_stride are not
 * consulted).  Each element is moved with memmove, so no alignment is
 * assumed, and _NPY_SWAP_INPLACE4 is then applied separately to each
 * 4-byte half of the destination copy (presumably reversing the byte order
 * of each half; the macro is defined elsewhere in this file).
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_swap_pair_contig_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, then swap the two halves independently */
        memmove(dst, src, 8);
        _NPY_SWAP_INPLACE4(dst);
        _NPY_SWAP_INPLACE4(dst + 4);

        dst += 8;
        src += 8;
    }
}
#endif


/*
 * Specialized copy and pair-swap for a source stride of 0: the single
 * source element is loaded (and swapped) once, then stored into N
 * consecutive 8-byte destination slots.
 *
 * Unrolling here, as is done for the loops above, is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 *
 * NOTE: the guard below is constant-false for this instantiation, so this
 * function is never compiled.
 * NOTE(review): the body uses _NPY_SWAP_INPLACE8 in expression position;
 * the macro is defined outside this view, so whether that would compile if
 * the guard were ever enabled needs to be confirmed.
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
/* single-byte contiguous fast path; constant-false here (element size 8) */
#if 8 == 1 && 1
    memset(dst, *src, N);
#else

/* load the source element once, before the loop */
#  if 8 != 16
    npy_uint64 temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
#  else
    /* 16-byte elements are handled as two 8-byte words (not taken here) */
    npy_uint64 temp0, temp1;
#    if 2 == 0
        temp0 = (*((npy_uint64 *)src));
        temp1 = (*((npy_uint64 *)src + 1));
#    elif 2 == 1
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
#    elif 2 == 2
        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
#    endif
#  endif

    /* replicate the cached value into every destination slot */
    while (N > 0) {
#  if 8 != 16
        *((npy_uint64 *)dst) = temp;
#  else
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
#  endif
/* contiguous destination: fixed 8-byte step, dst_stride is not used */
#  if 1
        dst += 8;
#  else
        dst += dst_stride;
#  endif
        --N;
    }
#endif/* @elsize == 1 && 1 -- else */
}
#endif/* (1 == 0) && 0 */

#endif/* 8 >= 4 */


#line 100

#if (8 >= 4) && \
    (8 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 1 == 0
/*
 * Copies N 8-byte elements from src to dst, walking both buffers
 * contiguously (8 bytes per element; dst_stride and src_stride are not
 * consulted).  Each element is read as one npy_uint64, transformed with
 * _NPY_SWAP_PAIR8 (which rearranges bytes within each 4-byte half) and
 * written as one npy_uint64, so both buffers must be suitably aligned for
 * npy_uint64 access.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_aligned_swap_pair_contig_to_contig_size8(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* aligned load, pair-swap, aligned store */
        const npy_uint64 word = *((npy_uint64 *)src);

        *((npy_uint64 *)dst) = _NPY_SWAP_PAIR8(word);

        dst += 8;
        src += 8;
    }
}
#endif


/*
 * Specialized copy and pair-swap for a source stride of 0: the single
 * 8-byte source element is loaded and transformed with _NPY_SWAP_PAIR8
 * once, then stored into N consecutive 8-byte destination slots.
 * dst_stride is not consulted; the destination always advances by 8 bytes.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 *
 * NOTE: the guard below is constant-false for this instantiation, so the
 * function is never compiled.
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_contig_size8_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* swap once up front; the loop only stores */
    const npy_uint64 swapped = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        *((npy_uint64 *)dst) = swapped;
        dst += 8;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 8 >= 4 */




#line 86
#line 92
#line 100

#if (16 >= 1) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 0 == 0
/*
 * Copies N 16-byte elements from src to dst without any byte swapping.
 * After each element dst advances by dst_stride bytes and src by
 * src_stride bytes.  Elements are moved with memmove, so no alignment is
 * assumed.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_strided_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned plain copy */
        memmove(dst, src, 16);

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy for a source stride of 0: the single 16-byte source
 * element is loaded once as two npy_uint64 words, then stored into N
 * destination slots spaced dst_stride bytes apart.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 *
 * NOTE: the guard below is constant-false for this instantiation, so the
 * function is never compiled.
 */
#if (0 == 0) && 0
static void
_strided_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* both words are read once, before the loop (even if N <= 0) */
    const npy_uint64 first = *((npy_uint64 *)src);
    const npy_uint64 second = *((npy_uint64 *)src + 1);

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first;
        out[1] = second;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 1) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 0 == 0
/*
 * Copies N 16-byte elements from src to dst without any byte swapping.
 * Every element is transferred as two npy_uint64 words (first word, then
 * second word), so both buffers must be suitably aligned for npy_uint64
 * access.  After each element dst advances by dst_stride bytes and src by
 * src_stride bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is therefore only
 * requested when a 16-byte element is no wider than npy_intp.
 */
static void
#if 16 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;
        const npy_uint64 *in = (const npy_uint64 *)src;

        /* store order (word 0, then word 1) is kept as in the template */
        out[0] = in[0];
        out[1] = in[1];

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy for a source stride of 0: the single 16-byte source
 * element is loaded once as two npy_uint64 words, then stored into N
 * destination slots spaced dst_stride bytes apart.  Both buffers are
 * accessed through npy_uint64 pointers and must be aligned accordingly.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 */
#if (0 == 0) && 1
static void
_aligned_strided_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* both words are read once, before the loop (even if N <= 0) */
    const npy_uint64 first = *((npy_uint64 *)src);
    const npy_uint64 second = *((npy_uint64 *)src + 1);

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first;
        out[1] = second;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 0 == 0
/*
 * Copies N 16-byte elements from src to dst.  Each element is moved with
 * memmove, so no alignment is assumed, and _NPY_SWAP_INPLACE16 is then
 * applied to the destination copy (presumably a full 16-byte byte-order
 * reversal; the macro is defined elsewhere in this file).  After each
 * element dst advances by dst_stride bytes and src by src_stride bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_swap_strided_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, then swap the destination bytes in place */
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE16(dst);

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for a source stride of 0: the single 16-byte
 * source element is loaded once as two npy_uint64 words, each passed
 * through _NPY_SWAP8 (npy_bswap8) with the two words exchanged, then
 * stored into N destination slots spaced dst_stride bytes apart.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 *
 * NOTE: the guard below is constant-false for this instantiation, so the
 * function is never compiled.
 */
#if (0 == 0) && 0
static void
_swap_strided_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* full 16-byte swap: second source word becomes the first output word */
    const npy_uint64 head = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    const npy_uint64 tail = _NPY_SWAP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = head;
        out[1] = tail;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 0 == 0
/*
 * Copies N 16-byte elements from src to dst while reversing the byte order
 * of each element.  An element is handled as two npy_uint64 words: the
 * words are exchanged and each one is passed through _NPY_SWAP8
 * (npy_bswap8).  Both buffers are accessed through npy_uint64 pointers and
 * must be aligned accordingly.  After each element dst advances by
 * dst_stride bytes and src by src_stride bytes.
 *
 * Both source words are loaded before either destination word is written.
 * Storing the first destination word before reading the first source word
 * would corrupt the element whenever dst and src refer to the same memory
 * (an in-place swap); loading first keeps that case correct, matching the
 * unaligned variant, which copies with memmove and then swaps in place.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_aligned_swap_strided_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N > 0) {
        /* read the whole element first so that dst == src is safe */
        const npy_uint64 first = *((npy_uint64 *)src);
        const npy_uint64 second = *((npy_uint64 *)src + 1);

        /* exchange the two words and byte-swap each of them */
        *((npy_uint64 *)dst) = _NPY_SWAP8(second);
        *((npy_uint64 *)dst + 1) = _NPY_SWAP8(first);

        dst += dst_stride;
        src += src_stride;
        --N;
    }
}
#endif


/*
 * Specialized copy and swap for a source stride of 0: the single 16-byte
 * source element is loaded once as two npy_uint64 words, each passed
 * through _NPY_SWAP8 (npy_bswap8) with the two words exchanged, then
 * stored into N destination slots spaced dst_stride bytes apart.  Both
 * buffers are accessed through npy_uint64 pointers and must be aligned
 * accordingly.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 */
#if (0 == 0) && 1
static void
_aligned_swap_strided_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* full 16-byte swap: second source word becomes the first output word */
    const npy_uint64 head = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    const npy_uint64 tail = _NPY_SWAP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = head;
        out[1] = tail;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 0 == 0
/*
 * Copies N 16-byte elements from src to dst.  Each element is moved with
 * memmove, so no alignment is assumed, and _NPY_SWAP_INPLACE8 is then
 * applied separately to each 8-byte half of the destination copy
 * (presumably reversing the byte order of each half; the macro is defined
 * elsewhere in this file).  After each element dst advances by dst_stride
 * bytes and src by src_stride bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_swap_pair_strided_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned copy, then swap the two halves independently */
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE8(dst);
        _NPY_SWAP_INPLACE8(dst + 8);

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and pair-swap for a source stride of 0: the single
 * 16-byte source element is loaded once as two npy_uint64 words, each
 * passed through _NPY_SWAP8 (npy_bswap8) while keeping its position, then
 * stored into N destination slots spaced dst_stride bytes apart.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 *
 * NOTE: the guard below is constant-false for this instantiation, so the
 * function is never compiled.
 */
#if (0 == 0) && 0
static void
_swap_pair_strided_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* pair swap: each 8-byte half is swapped on its own, order kept */
    const npy_uint64 head = _NPY_SWAP8(*((npy_uint64 *)src));
    const npy_uint64 tail = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = head;
        out[1] = tail;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 16 >= 4 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 0 == 0
/*
 * Copies N 16-byte elements from src to dst, byte-swapping each 8-byte
 * half on its own.  An element is handled as two npy_uint64 words, each
 * passed through _NPY_SWAP8 (npy_bswap8) and written back to the same
 * position (first word, then second word).  Both buffers are accessed
 * through npy_uint64 pointers and must be aligned accordingly.  After each
 * element dst advances by dst_stride bytes and src by src_stride bytes.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_aligned_swap_pair_strided_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;
        const npy_uint64 *in = (const npy_uint64 *)src;

        /* store order (word 0, then word 1) is kept as in the template */
        out[0] = _NPY_SWAP8(in[0]);
        out[1] = _NPY_SWAP8(in[1]);

        dst += dst_stride;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and pair-swap for a source stride of 0: the single
 * 16-byte source element is loaded once as two npy_uint64 words, each
 * passed through _NPY_SWAP8 (npy_bswap8) while keeping its position, then
 * stored into N destination slots spaced dst_stride bytes apart.  Both
 * buffers are accessed through npy_uint64 pointers and must be aligned
 * accordingly.
 *
 * Unrolling this loop the way the general copy loops are unrolled is only
 * marginally profitable for small types and detrimental for moves of
 * 8 bytes or more on x86.
 */
#if (0 == 0) && 1
static void
_aligned_swap_pair_strided_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /* pair swap: each 8-byte half is swapped on its own, order kept */
    const npy_uint64 head = _NPY_SWAP8(*((npy_uint64 *)src));
    const npy_uint64 tail = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = head;
        out[1] = tail;
        dst += dst_stride;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 16 >= 4 */



#line 92
#line 100

#if (16 >= 1) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 0 == 0 || 1 == 0
/*
 * Copies N 16-byte elements from a strided source into a contiguous
 * destination without any byte swapping.  After each element dst advances
 * by a fixed 16 bytes (dst_stride is not consulted) and src by src_stride
 * bytes.  Elements are moved with memmove, so no alignment is assumed.
 *
 * Unrolling gains about 20-50% if the copy can be done in one mov
 * instruction; if not, it can decrease performance (tested on intel xeon
 * 5x/7x, core2duo, amd phenom x4).  The unroll attribute is not requested
 * for this instantiation.
 */
static void
_strided_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        /* unaligned plain copy */
        memmove(dst, src, 16);

        dst += 16;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast the single 16-byte element at src into N consecutive 16-byte
 * slots starting at dst (no byte reordering).
 *
 * The element is read once, as two npy_uint64 words, and then stored N
 * times. dst_stride is not consulted; the destination advances by 16.
 * src is not dereferenced at all when N <= 0.
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" currently compiles this variant
 * out; only the _aligned_ counterpart is built.
 */
static void
_strided_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = (*((npy_uint64 *)src));
    temp1 = (*((npy_uint64 *)src + 1));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += 16;
        --N;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 1) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a strided source into a densely
 * packed destination, without any byte reordering.
 *
 * Each element is moved as two npy_uint64 words (first word, then second),
 * so src, dst and src_stride must be suitable for npy_uint64 access.
 * dst_stride is not consulted: the destination always advances by 16 bytes.
 */
static void
#if 16 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_strided_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = in[0];
        out[1] = in[1];

        dst += 16;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Broadcast the single 16-byte element at src into N consecutive 16-byte
 * slots starting at dst (no byte reordering).
 *
 * The element is read once, as two npy_uint64 words, and then stored N
 * times, so src and dst must be suitable for npy_uint64 access.
 * dst_stride is not consulted; the destination advances by 16.
 * src is not dereferenced at all when N <= 0.
 */
static void
_aligned_strided_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = (*((npy_uint64 *)src));
    temp1 = (*((npy_uint64 *)src + 1));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += 16;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a strided source into a densely
 * packed destination, applying _NPY_SWAP_INPLACE16 to every copied element.
 *
 * The bytes are first moved with memmove (no alignment required) and the
 * swap is then performed in place on the destination copy.
 * dst_stride is not consulted: the destination always advances by 16 bytes.
 */
static void
_swap_strided_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE16(dst);

        dst += 16;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast the byte-swapped form of the single 16-byte element at src into
 * N consecutive 16-byte slots starting at dst.
 *
 * The swap is done once up front: the two npy_uint64 words trade places and
 * each goes through _NPY_SWAP8. dst_stride is not consulted; the
 * destination advances by 16. src is not dereferenced when N <= 0.
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" currently compiles this variant
 * out; only the _aligned_ counterpart is built.
 */
static void
_swap_strided_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    temp1 = _NPY_SWAP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += 16;
        --N;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a strided source into a densely
 * packed destination, byte-swapping every element on the way.
 *
 * A 16-byte swap is built from two 64-bit swaps: the two npy_uint64 words
 * trade places and each one goes through _NPY_SWAP8. Memory is accessed
 * through npy_uint64 pointers, so src, dst and src_stride must be suitable
 * for that type. dst_stride is not consulted: the destination always
 * advances by 16 bytes.
 *
 * Both source words are loaded before anything is stored, so an element
 * whose source and destination coincide (dst == src) is still swapped
 * correctly, matching the memmove-based unaligned variant.
 */
static void
_aligned_swap_strided_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N > 0) {
        /* Read the whole element first; storing may overwrite the source. */
        const npy_uint64 first = *((npy_uint64 *)src);
        const npy_uint64 second = *((npy_uint64 *)src + 1);

        *((npy_uint64 *)dst) = _NPY_SWAP8(second);
        *((npy_uint64 *)dst + 1) = _NPY_SWAP8(first);

        dst += 16;
        src += src_stride;
        --N;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Broadcast the byte-swapped form of the single 16-byte element at src into
 * N consecutive 16-byte slots starting at dst.
 *
 * The swap is done once up front: the two npy_uint64 words trade places and
 * each goes through _NPY_SWAP8. Memory is accessed through npy_uint64
 * pointers, so src and dst must be suitable for that type.
 * dst_stride is not consulted; the destination advances by 16.
 * src is not dereferenced at all when N <= 0.
 */
static void
_aligned_swap_strided_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    temp1 = _NPY_SWAP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += 16;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a strided source into a densely
 * packed destination, treating each element as a pair of 8-byte halves and
 * applying _NPY_SWAP_INPLACE8 to each half separately.
 *
 * The bytes are first moved with memmove (no alignment required); the two
 * halves of the destination copy are then swapped in place.
 * dst_stride is not consulted: the destination always advances by 16 bytes.
 */
static void
_swap_pair_strided_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE8(dst);
        _NPY_SWAP_INPLACE8(dst + 8);

        dst += 16;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 0
/*
 * Broadcast the pair-swapped form of the single 16-byte element at src into
 * N consecutive 16-byte slots starting at dst.
 *
 * The swap is done once up front: each npy_uint64 word goes through
 * _NPY_SWAP8 and keeps its position. dst_stride is not consulted; the
 * destination advances by 16. src is not dereferenced when N <= 0.
 *
 * NOTE: the enclosing "#if (0 == 0) && 0" currently compiles this variant
 * out; only the _aligned_ counterpart is built.
 */
static void
_swap_pair_strided_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
    temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += 16;
        --N;
    }
}
#endif/* (0 == 0) && 0 */

#endif/* 16 >= 4 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 0 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a strided source into a densely
 * packed destination, treating each element as a pair of 8-byte halves.
 *
 * Each npy_uint64 word goes through _NPY_SWAP8 and keeps its position
 * (first word is processed before the second). Memory is accessed through
 * npy_uint64 pointers, so src, dst and src_stride must be suitable for that
 * type. dst_stride is not consulted: the destination advances by 16 bytes.
 */
static void
_aligned_swap_pair_strided_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = _NPY_SWAP8(in[0]);
        out[1] = _NPY_SWAP8(in[1]);

        dst += 16;
        src += src_stride;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (0 == 0) && 1
/*
 * Broadcast the pair-swapped form of the single 16-byte element at src into
 * N consecutive 16-byte slots starting at dst.
 *
 * The swap is done once up front: each npy_uint64 word goes through
 * _NPY_SWAP8 and keeps its position. Memory is accessed through npy_uint64
 * pointers, so src and dst must be suitable for that type.
 * dst_stride is not consulted; the destination advances by 16.
 * src is not dereferenced at all when N <= 0.
 */
static void
_aligned_swap_pair_strided_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
    temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += 16;
        --N;
    }
}
#endif/* (0 == 0) && 1 */

#endif/* 16 >= 4 */



#line 92
#line 100

#if (16 >= 1) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a densely packed source into a
 * strided destination, without any byte reordering.
 *
 * Every element is transferred with memmove, so neither pointer has to be
 * aligned. src_stride is accepted for signature compatibility but is not
 * consulted: the source always advances by 16 bytes.
 */
static void
_contig_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 16);
        dst += dst_stride;
        src += 16;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Broadcast the single 16-byte element at src into N destination slots
 * spaced dst_stride bytes apart (no byte reordering).
 *
 * The element is read once, as two npy_uint64 words, and then stored N
 * times. src is not dereferenced at all when N <= 0.
 *
 * NOTE: the enclosing "#if (1 == 0) && 0" currently compiles this
 * function out.
 */
static void
_contig_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = (*((npy_uint64 *)src));
    temp1 = (*((npy_uint64 *)src + 1));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += dst_stride;
        --N;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 1) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a densely packed source into a
 * strided destination, without any byte reordering.
 *
 * Each element is moved as two npy_uint64 words (first word, then second),
 * so src, dst and dst_stride must be suitable for npy_uint64 access.
 * src_stride is not consulted: the source always advances by 16 bytes.
 */
static void
#if 16 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = in[0];
        out[1] = in[1];

        dst += dst_stride;
        src += 16;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Broadcast the single 16-byte element at src into N destination slots
 * spaced dst_stride bytes apart (no byte reordering).
 *
 * The element is read once, as two npy_uint64 words, and then stored N
 * times, so src, dst and dst_stride must be suitable for npy_uint64 access.
 * src is not dereferenced at all when N <= 0.
 *
 * NOTE: the enclosing "#if (1 == 0) && 1" currently compiles this
 * function out.
 */
static void
_aligned_contig_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = (*((npy_uint64 *)src));
    temp1 = (*((npy_uint64 *)src + 1));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += dst_stride;
        --N;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a densely packed source into a
 * strided destination, applying _NPY_SWAP_INPLACE16 to every copied element.
 *
 * The bytes are first moved with memmove (no alignment required) and the
 * swap is then performed in place on the destination copy.
 * src_stride is not consulted: the source always advances by 16 bytes.
 */
static void
_swap_contig_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE16(dst);

        dst += dst_stride;
        src += 16;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Broadcast the byte-swapped form of the single 16-byte element at src into
 * N destination slots spaced dst_stride bytes apart.
 *
 * The swap is done once up front: the two npy_uint64 words trade places and
 * each goes through _NPY_SWAP8. src is not dereferenced when N <= 0.
 *
 * NOTE: the enclosing "#if (1 == 0) && 0" currently compiles this
 * function out.
 */
static void
_swap_contig_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    temp1 = _NPY_SWAP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += dst_stride;
        --N;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a densely packed source into a
 * strided destination, byte-swapping every element on the way.
 *
 * A 16-byte swap is built from two 64-bit swaps: the two npy_uint64 words
 * trade places and each one goes through _NPY_SWAP8. Memory is accessed
 * through npy_uint64 pointers, so src, dst and dst_stride must be suitable
 * for that type. src_stride is not consulted: the source always advances
 * by 16 bytes.
 *
 * Both source words are loaded before anything is stored, so an element
 * whose source and destination coincide (dst == src) is still swapped
 * correctly, matching the memmove-based unaligned variant.
 */
static void
_aligned_swap_contig_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N > 0) {
        /* Read the whole element first; storing may overwrite the source. */
        const npy_uint64 first = *((npy_uint64 *)src);
        const npy_uint64 second = *((npy_uint64 *)src + 1);

        *((npy_uint64 *)dst) = _NPY_SWAP8(second);
        *((npy_uint64 *)dst + 1) = _NPY_SWAP8(first);

        dst += dst_stride;
        src += 16;
        --N;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (1 == 0) && 1
/*
 * Broadcast the byte-swapped form of the single 16-byte element at src into
 * N destination slots spaced dst_stride bytes apart.
 *
 * The swap is done once up front: the two npy_uint64 words trade places and
 * each goes through _NPY_SWAP8. Memory is accessed through npy_uint64
 * pointers, so src, dst and dst_stride must be suitable for that type.
 * src is not dereferenced at all when N <= 0.
 *
 * NOTE: the enclosing "#if (1 == 0) && 1" currently compiles this
 * function out.
 */
static void
_aligned_swap_contig_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    temp1 = _NPY_SWAP8(*((npy_uint64 *)src));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += dst_stride;
        --N;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
/*
 * Copy N elements of 16 bytes each from a densely packed source into a
 * strided destination, treating each element as a pair of 8-byte halves and
 * applying _NPY_SWAP_INPLACE8 to each half separately.
 *
 * The bytes are first moved with memmove (no alignment required); the two
 * halves of the destination copy are then swapped in place.
 * src_stride is not consulted: the source always advances by 16 bytes.
 */
static void
_swap_pair_contig_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE8(dst);
        _NPY_SWAP_INPLACE8(dst + 8);

        dst += dst_stride;
        src += 16;
    }
}
#endif


/*
 * Specialized copy and swap for source stride 0.
 * Interestingly, unrolling here (as is done above) is only marginally
 * profitable for small types and detrimental for >= 8 byte moves on x86.
 */
#if (1 == 0) && 0
/*
 * Broadcast the pair-swapped form of the single 16-byte element at src into
 * N destination slots spaced dst_stride bytes apart.
 *
 * The swap is done once up front: each npy_uint64 word goes through
 * _NPY_SWAP8 and keeps its position. src is not dereferenced when N <= 0.
 *
 * NOTE: the enclosing "#if (1 == 0) && 0" currently compiles this
 * function out.
 */
static void
_swap_pair_contig_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint64 temp0, temp1;

    /* Nothing to write: leave before touching src. */
    if (N <= 0) {
        return;
    }

    temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
    temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    while (N > 0) {
        *((npy_uint64 *)dst) = temp0;
        *((npy_uint64 *)dst + 1) = temp1;
        dst += dst_stride;
        --N;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 16 >= 4 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 0 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_aligned_swap_pair_contig_to_strided_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned pair-swap of N 16-byte items from a packed source into a
     * strided destination.  Both pointers are accessed as npy_uint64, and
     * each 8-byte half is passed through _NPY_SWAP8 without exchanging
     * the halves.
     *
     * src_stride is not read: the source always advances by 16 bytes.
     */
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = _NPY_SWAP8(in[0]);
        out[1] = _NPY_SWAP8(in[1]);

        src += 16;
        dst += dst_stride;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_strided_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast one 16-byte source item, pair-swapped, into N strided
     * destination slots.  The source is read once, before the loop, and
     * each 8-byte half goes through _NPY_SWAP8 separately.
     *
     * NOTE: the enclosing "#if (1 == 0) && 1" guard is constant-false, so
     * this definition is currently compiled out.
     */
    const npy_uint64 first_half = _NPY_SWAP8(*((npy_uint64 *)src));
    const npy_uint64 second_half = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first_half;
        out[1] = second_half;
        dst += dst_stride;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 16 >= 4 */



#line 92
#line 100

#if (16 >= 1) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 0 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_contig_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned plain copy of N packed 16-byte items, one memmove per
     * item.  Neither stride argument is read: both pointers advance by
     * 16 bytes per item.
     *
     * NOTE: the enclosing "#if 0 || 1 == 0 || 1 == 0" guard is
     * constant-false, so this definition is currently compiled out.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 16);

        src += 16;
        dst += 16;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 0
static void
_contig_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast one 16-byte source item, unmodified, into N packed
     * destination slots.  The source is read once, before the loop;
     * dst_stride is not read because dst advances by 16 bytes per item.
     *
     * NOTE: the enclosing "#if (1 == 0) && 0" guard is constant-false, so
     * this definition is currently compiled out.
     */
    const npy_uint64 first_half = *((npy_uint64 *)src);
    const npy_uint64 second_half = *((npy_uint64 *)src + 1);

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first_half;
        out[1] = second_half;
        dst += 16;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 1) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 0 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
#if 16 <= NPY_SIZEOF_INTP
    NPY_GCC_UNROLL_LOOPS
#endif
_aligned_contig_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned plain copy of N packed 16-byte items, moved as two
     * npy_uint64 words each.  Neither stride argument is read: both
     * pointers advance by 16 bytes per item.
     *
     * NOTE: the enclosing "#if 0 || 1 == 0 || 1 == 0" guard is
     * constant-false, so this definition is currently compiled out.
     */
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = in[0];
        out[1] = in[1];

        src += 16;
        dst += 16;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 1
static void
_aligned_contig_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast one 16-byte source item, unmodified, into N packed
     * destination slots.  The source is read once, before the loop;
     * dst_stride is not read because dst advances by 16 bytes per item.
     *
     * NOTE: the enclosing "#if (1 == 0) && 1" guard is constant-false, so
     * this definition is currently compiled out.
     */
    const npy_uint64 first_half = *((npy_uint64 *)src);
    const npy_uint64 second_half = *((npy_uint64 *)src + 1);

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first_half;
        out[1] = second_half;
        dst += 16;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 16 >= 1 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 1 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_swap_contig_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned full swap of N packed 16-byte items.  Each item is copied
     * with memmove and then handed to _NPY_SWAP_INPLACE16 (defined
     * elsewhere in this file; presumably reverses all 16 bytes in place).
     *
     * Neither stride argument is read: both pointers advance by 16 bytes
     * per item.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE16(dst);

        src += 16;
        dst += 16;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 0
static void
_swap_contig_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast one 16-byte source item, fully swapped, into N packed
     * destination slots.  The two 8-byte words are exchanged and each is
     * passed through _NPY_SWAP8; the source is read once, before the loop.
     * dst_stride is not read because dst advances by 16 bytes per item.
     *
     * NOTE: the enclosing "#if (1 == 0) && 0" guard is constant-false, so
     * this definition is currently compiled out.
     */
    const npy_uint64 first_out = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    const npy_uint64 second_out = _NPY_SWAP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first_out;
        out[1] = second_out;
        dst += 16;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 2) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 1 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_aligned_swap_contig_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned full swap of N packed 16-byte items: the two 8-byte words of
     * each item are exchanged and each word is passed through _NPY_SWAP8.
     *
     * Neither stride argument is read: both pointers advance by 16 bytes
     * per item.
     */
    while (N > 0) {
        /*
         * Load both source words before storing anything.  Storing
         * dst[0] first would clobber src[0] when dst == src, and the
         * second store would then read the already-swapped word.  The
         * unaligned variant (memmove followed by an in-place swap) is
         * safe for that case, so this keeps the two variants consistent.
         */
        const npy_uint64 first_word = *((npy_uint64 *)src);
        const npy_uint64 second_word = *((npy_uint64 *)src + 1);

        *((npy_uint64 *)dst) = _NPY_SWAP8(second_word);
        *((npy_uint64 *)dst + 1) = _NPY_SWAP8(first_word);

        dst += 16;
        src += 16;
        --N;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 1
static void
_aligned_swap_contig_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast one 16-byte source item, fully swapped, into N packed
     * destination slots.  The two 8-byte words are exchanged and each is
     * passed through _NPY_SWAP8; the source is read once, before the loop.
     * dst_stride is not read because dst advances by 16 bytes per item.
     *
     * NOTE: the enclosing "#if (1 == 0) && 1" guard is constant-false, so
     * this definition is currently compiled out.
     */
    const npy_uint64 first_out = _NPY_SWAP8(*((npy_uint64 *)src + 1));
    const npy_uint64 second_out = _NPY_SWAP8(*((npy_uint64 *)src));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first_out;
        out[1] = second_out;
        dst += 16;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 16 >= 2 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 0) && \
    (!NPY_USE_UNALIGNED_ACCESS || 0)


#if 2 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_swap_pair_contig_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Unaligned pair-swap of N packed 16-byte items.  Each item is copied
     * with memmove and then both 8-byte halves are handed to
     * _NPY_SWAP_INPLACE8 (defined elsewhere in this file; presumably
     * reverses the bytes of each half in place).
     *
     * Neither stride argument is read: both pointers advance by 16 bytes
     * per item.
     */
    for (; N > 0; --N) {
        memmove(dst, src, 16);
        _NPY_SWAP_INPLACE8(dst);
        _NPY_SWAP_INPLACE8(dst + 8);

        src += 16;
        dst += 16;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 0
static void
_swap_pair_contig_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast one 16-byte source item, pair-swapped, into N packed
     * destination slots.  The source is read once, before the loop, and
     * each 8-byte half goes through _NPY_SWAP8 separately.  dst_stride is
     * not read because dst advances by 16 bytes per item.
     *
     * NOTE: the enclosing "#if (1 == 0) && 0" guard is constant-false, so
     * this definition is currently compiled out.
     */
    const npy_uint64 first_half = _NPY_SWAP8(*((npy_uint64 *)src));
    const npy_uint64 second_half = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first_half;
        out[1] = second_half;
        dst += 16;
    }
}
#endif/* (1 == 0) && 0 */

#endif/* 16 >= 4 */


#line 100

#if (16 >= 4) && \
    (16 > 1 || 1) && \
    (!NPY_USE_UNALIGNED_ACCESS || 1)


#if 2 || 1 == 0 || 1 == 0
/*
 * unrolling gains about 20-50% if the copy can be done in one mov instruction
 * if not it can decrease performance
 * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
 */
static void
_aligned_swap_pair_contig_to_contig_size16(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Aligned pair-swap of N packed 16-byte items.  Both pointers are
     * accessed as npy_uint64, and each 8-byte half is passed through
     * _NPY_SWAP8 without exchanging the halves.
     *
     * Neither stride argument is read: both pointers advance by 16 bytes
     * per item.
     */
    for (; N > 0; --N) {
        const npy_uint64 *in = (const npy_uint64 *)src;
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = _NPY_SWAP8(in[0]);
        out[1] = _NPY_SWAP8(in[1]);

        src += 16;
        dst += 16;
    }
}
#endif


/*
 * specialized copy and swap for source stride 0,
 * interestingly, unrolling here as is done above is only marginally profitable
 * for small types and detrimental for >= 8 byte moves on x86
 */
#if (1 == 0) && 1
static void
_aligned_swap_pair_contig_to_contig_size16_srcstride0(char *dst,
                        npy_intp dst_stride,
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    /*
     * Broadcast one 16-byte source item, pair-swapped, into N packed
     * destination slots.  The source is read once, before the loop, and
     * each 8-byte half goes through _NPY_SWAP8 separately.  dst_stride is
     * not read because dst advances by 16 bytes per item.
     *
     * NOTE: the enclosing "#if (1 == 0) && 1" guard is constant-false, so
     * this definition is currently compiled out.
     */
    const npy_uint64 first_half = _NPY_SWAP8(*((npy_uint64 *)src));
    const npy_uint64 second_half = _NPY_SWAP8(*((npy_uint64 *)src + 1));

    for (; N > 0; --N) {
        npy_uint64 *out = (npy_uint64 *)dst;

        out[0] = first_half;
        out[1] = second_half;
        dst += 16;
    }
}
#endif/* (1 == 0) && 1 */

#endif/* 16 >= 4 */





/*
 * Generic copy loop: moves N items of src_itemsize bytes each from a
 * strided source to a strided destination, one memmove per item.
 */
static void
_strided_to_strided(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp src_itemsize,
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp remaining;

    for (remaining = N; remaining > 0; --remaining) {
        memmove(dst, src, src_itemsize);
        src += src_stride;
        dst += dst_stride;
    }
}

/*
 * Generic copy-and-swap loop: copies N items of src_itemsize bytes each
 * from a strided source to a strided destination, then reverses the byte
 * order of every copied item in place.
 */
static void
_swap_strided_to_strided(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp src_itemsize,
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N > 0; --N) {
        npy_intp lo = 0;
        npy_intp hi = src_itemsize - 1;

        memmove(dst, src, src_itemsize);

        /* reverse the item by exchanging bytes from both ends inwards */
        for (; lo < hi; ++lo, --hi) {
            const char held = dst[lo];

            dst[lo] = dst[hi];
            dst[hi] = held;
        }

        src += src_stride;
        dst += dst_stride;
    }
}

/*
 * Generic copy-and-pair-swap loop: copies N items of src_itemsize bytes
 * each from a strided source to a strided destination, then reverses the
 * byte order of each half of the copied item independently.  The half
 * size is src_itemsize / 2 (integer division).
 */
static void
_swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp src_itemsize,
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_intp half = src_itemsize / 2;

    for (; N > 0; --N) {
        npy_intp lo, hi;

        memmove(dst, src, src_itemsize);

        /* reverse the first half: bytes [0, half) */
        for (lo = 0, hi = half - 1; lo < hi; ++lo, --hi) {
            const char held = dst[lo];

            dst[lo] = dst[hi];
            dst[hi] = held;
        }

        /* reverse the second half: bytes [half, 2*half) */
        for (lo = half, hi = 2*half - 1; lo < hi; ++lo, --hi) {
            const char held = dst[lo];

            dst[lo] = dst[hi];
            dst[hi] = held;
        }

        src += src_stride;
        dst += dst_stride;
    }
}

/*
 * Copy loop for the case where both source and destination are packed:
 * all N items of src_itemsize bytes are moved with a single memmove.
 * The stride arguments are ignored.
 */
static void
_contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
                        char *src, npy_intp NPY_UNUSED(src_stride),
                        npy_intp N, npy_intp src_itemsize,
                        NpyAuxData *NPY_UNUSED(data))
{
    const npy_intp total_bytes = src_itemsize*N;

    memmove(dst, src, total_bytes);
}


/*
 * Selects a copy loop specialised for the given alignment, strides and
 * item size.
 *
 * aligned:    non-zero selects the "_aligned_" loops.  Ignored when
 *             NPY_USE_UNALIGNED_ACCESS is non-zero, in which case the
 *             aligned loops are always used.
 * src_stride: source stride in bytes; 0 (constant source) and itemsize
 *             (contiguous source) get dedicated loops where available.
 * dst_stride: destination stride in bytes; itemsize means contiguous.
 * itemsize:   element size; only 1, 2, 4, 8 and 16 have specialised loops.
 *
 * Returns the matching specialised loop, or the generic
 * _strided_to_strided loop when no specialisation applies.
 */
NPY_NO_EXPORT PyArray_StridedUnaryOp *
PyArray_GetStridedCopyFn(int aligned, npy_intp src_stride,
                         npy_intp dst_stride, npy_intp itemsize)
{
    /* a zero itemsize is never treated as a contiguous destination */
    const int dst_is_contig = (itemsize != 0 && dst_stride == itemsize);

/*
 * Skip the "unaligned" versions on CPUs which support unaligned
 * memory accesses.
 */
#if !NPY_USE_UNALIGNED_ACCESS
    if (!aligned) {
        const int src_is_contig = (itemsize != 0 && src_stride == itemsize);

        /*
         * Size 1 reuses the aligned loops in every switch below; the
         * unaligned path has no dedicated constant-source (stride 0) loops.
         */
        if (dst_is_contig) {
            if (src_is_contig) {
                return &_contig_to_contig;
            }
            /* contiguous dst, general src */
            switch (itemsize) {
                case 1:
                    return &_aligned_strided_to_contig_size1;
                case 2:
                    return &_strided_to_contig_size2;
                case 4:
                    return &_strided_to_contig_size4;
                case 8:
                    return &_strided_to_contig_size8;
                case 16:
                    return &_strided_to_contig_size16;
                default:
                    break;
            }
        }
        else if (src_is_contig) {
            /* general dst, contiguous src */
            switch (itemsize) {
                case 1:
                    return &_aligned_contig_to_strided_size1;
                case 2:
                    return &_contig_to_strided_size2;
                case 4:
                    return &_contig_to_strided_size4;
                case 8:
                    return &_contig_to_strided_size8;
                case 16:
                    return &_contig_to_strided_size16;
                default:
                    break;
            }
        }
        else {
            /* general dst, general src */
            switch (itemsize) {
                case 1:
                    return &_aligned_strided_to_strided_size1;
                case 2:
                    return &_strided_to_strided_size2;
                case 4:
                    return &_strided_to_strided_size4;
                case 8:
                    return &_strided_to_strided_size8;
                case 16:
                    return &_strided_to_strided_size16;
                default:
                    break;
            }
        }

        return &_strided_to_strided;
    }
#endif/*!NPY_USE_UNALIGNED_ACCESS*/

    if (dst_is_contig) {
        if (src_stride == 0) {
            /* contiguous dst, constant src */
            switch (itemsize) {
                case 1:
                    return &_aligned_strided_to_contig_size1_srcstride0;
                case 2:
                    return &_aligned_strided_to_contig_size2_srcstride0;
                case 4:
                    return &_aligned_strided_to_contig_size4_srcstride0;
                case 8:
                    return &_aligned_strided_to_contig_size8_srcstride0;
                case 16:
                    return &_aligned_strided_to_contig_size16_srcstride0;
                default:
                    break;
            }
        }
        else if (src_stride == itemsize) {
            /* contiguous dst, contiguous src */
            return &_contig_to_contig;
        }
        else {
            /* contiguous dst, general src */
            switch (itemsize) {
                case 1:
                    return &_aligned_strided_to_contig_size1;
                case 2:
                    return &_aligned_strided_to_contig_size2;
                case 4:
                    return &_aligned_strided_to_contig_size4;
                case 8:
                    return &_aligned_strided_to_contig_size8;
                case 16:
                    return &_aligned_strided_to_contig_size16;
                default:
                    break;
            }
        }
    }
    else if (src_stride == 0) {
        /* general dst, constant src */
        switch (itemsize) {
            case 1:
                return &_aligned_strided_to_strided_size1_srcstride0;
            case 2:
                return &_aligned_strided_to_strided_size2_srcstride0;
            case 4:
                return &_aligned_strided_to_strided_size4_srcstride0;
            case 8:
                return &_aligned_strided_to_strided_size8_srcstride0;
            case 16:
                return &_aligned_strided_to_strided_size16_srcstride0;
            default:
                break;
        }
    }
    else if (src_stride == itemsize) {
        /* general dst, contiguous src */
        switch (itemsize) {
            case 1:
                return &_aligned_contig_to_strided_size1;
            case 2:
                return &_aligned_contig_to_strided_size2;
            case 4:
                return &_aligned_contig_to_strided_size4;
            case 8:
                return &_aligned_contig_to_strided_size8;
            case 16:
                return &_aligned_contig_to_strided_size16;
            default:
                break;
        }
    }
    else {
        /* general dst, general src */
        switch (itemsize) {
            case 1:
                return &_aligned_strided_to_strided_size1;
            case 2:
                return &_aligned_strided_to_strided_size2;
            case 4:
                return &_aligned_strided_to_strided_size4;
            case 8:
                return &_aligned_strided_to_strided_size8;
            case 16:
                return &_aligned_strided_to_strided_size16;
            default:
                break;
        }
    }

    /* no specialised loop for this item size */
    return &_strided_to_strided;
}

/*
 * PyArray_GetStridedCopySwapFn and PyArray_GetStridedCopySwapPairFn are
 * nearly identical, so can do a repeat for them.
 */
#line 467

/*
 * Select a specialised `_swap_*` strided loop for the given layout.
 *
 * aligned    - non-zero selects the `_aligned_swap_*` family; it is only
 *              consulted when NPY_USE_UNALIGNED_ACCESS is 0.
 * src_stride - source stride in bytes (0 selects the `_srcstride0`
 *              variants on the aligned path).
 * dst_stride - destination stride in bytes.
 * itemsize   - element size in bytes; sizes 2, 4, 8 and 16 have
 *              dedicated loops.
 *
 * Returns the matching specialised loop, or the generic
 * `_swap_strided_to_strided` when no specialisation applies.
 */
NPY_NO_EXPORT PyArray_StridedUnaryOp *
PyArray_GetStridedCopySwapFn(int aligned, npy_intp src_stride,
                             npy_intp dst_stride, npy_intp itemsize)
{
    /* A stride only counts as contiguous for a non-zero itemsize. */
    const int dst_contig = (itemsize != 0) && (dst_stride == itemsize);
    const int src_contig = (itemsize != 0) && (src_stride == itemsize);

/*
 * The "unaligned" versions are skipped entirely on CPUs which support
 * unaligned memory accesses.
 */
#if !NPY_USE_UNALIGNED_ACCESS
    if (!aligned) {
        if (dst_contig && src_contig) {
            /* contiguous src -> contiguous dst */
            switch (itemsize) {
            case 2:
                return &_swap_contig_to_contig_size2;
            case 4:
                return &_swap_contig_to_contig_size4;
            case 8:
                return &_swap_contig_to_contig_size8;
            case 16:
                return &_swap_contig_to_contig_size16;
            default:
                break;
            }
        }
        else if (dst_contig) {
            /* general src -> contiguous dst */
            switch (itemsize) {
            case 2:
                return &_swap_strided_to_contig_size2;
            case 4:
                return &_swap_strided_to_contig_size4;
            case 8:
                return &_swap_strided_to_contig_size8;
            case 16:
                return &_swap_strided_to_contig_size16;
            default:
                break;
            }
        }
        else if (src_contig) {
            /* contiguous src -> general dst */
            switch (itemsize) {
            case 2:
                return &_swap_contig_to_strided_size2;
            case 4:
                return &_swap_contig_to_strided_size4;
            case 8:
                return &_swap_contig_to_strided_size8;
            case 16:
                return &_swap_contig_to_strided_size16;
            default:
                break;
            }
        }
        else {
            /* general src -> general dst */
            switch (itemsize) {
            case 2:
                return &_swap_strided_to_strided_size2;
            case 4:
                return &_swap_strided_to_strided_size4;
            case 8:
                return &_swap_strided_to_strided_size8;
            case 16:
                return &_swap_strided_to_strided_size16;
            default:
                break;
            }
        }

        return &_swap_strided_to_strided;
    }
#endif/*!NPY_USE_UNALIGNED_ACCESS*/

    if (dst_contig && src_stride == 0) {
        /* constant src -> contiguous dst */
        switch (itemsize) {
        case 2:
            return &_aligned_swap_strided_to_contig_size2_srcstride0;
        case 4:
            return &_aligned_swap_strided_to_contig_size4_srcstride0;
        case 8:
            return &_aligned_swap_strided_to_contig_size8_srcstride0;
        case 16:
            return &_aligned_swap_strided_to_contig_size16_srcstride0;
        default:
            break;
        }
    }
    else if (dst_contig && src_contig) {
        /* contiguous src -> contiguous dst */
        switch (itemsize) {
        case 2:
            return &_aligned_swap_contig_to_contig_size2;
        case 4:
            return &_aligned_swap_contig_to_contig_size4;
        case 8:
            return &_aligned_swap_contig_to_contig_size8;
        case 16:
            return &_aligned_swap_contig_to_contig_size16;
        default:
            break;
        }
    }
    else if (dst_contig) {
        /* general src -> contiguous dst */
        switch (itemsize) {
        case 2:
            return &_aligned_swap_strided_to_contig_size2;
        case 4:
            return &_aligned_swap_strided_to_contig_size4;
        case 8:
            return &_aligned_swap_strided_to_contig_size8;
        case 16:
            return &_aligned_swap_strided_to_contig_size16;
        default:
            break;
        }
    }
    else if (src_stride == 0) {
        /* constant src -> general dst */
        switch (itemsize) {
        case 2:
            return &_aligned_swap_strided_to_strided_size2_srcstride0;
        case 4:
            return &_aligned_swap_strided_to_strided_size4_srcstride0;
        case 8:
            return &_aligned_swap_strided_to_strided_size8_srcstride0;
        case 16:
            return &_aligned_swap_strided_to_strided_size16_srcstride0;
        default:
            break;
        }
    }
    else if (src_contig) {
        /* contiguous src -> general dst */
        switch (itemsize) {
        case 2:
            return &_aligned_swap_contig_to_strided_size2;
        case 4:
            return &_aligned_swap_contig_to_strided_size4;
        case 8:
            return &_aligned_swap_contig_to_strided_size8;
        case 16:
            return &_aligned_swap_contig_to_strided_size16;
        default:
            break;
        }
    }
    else {
        /* general src -> general dst */
        switch (itemsize) {
        case 2:
            return &_aligned_swap_strided_to_strided_size2;
        case 4:
            return &_aligned_swap_strided_to_strided_size4;
        case 8:
            return &_aligned_swap_strided_to_strided_size8;
        case 16:
            return &_aligned_swap_strided_to_strided_size16;
        default:
            break;
        }
    }

    /* No specialisation for this itemsize: use the generic loop. */
    return &_swap_strided_to_strided;
}


#line 467

/*
 * Select a specialised `_swap_pair_*` strided loop for the given layout.
 *
 * aligned    - non-zero selects the `_aligned_swap_pair_*` family; it is
 *              only consulted when NPY_USE_UNALIGNED_ACCESS is 0.
 * src_stride - source stride in bytes (0 selects the `_srcstride0`
 *              variants on the aligned path).
 * dst_stride - destination stride in bytes.
 * itemsize   - element size in bytes; sizes 4, 8 and 16 have dedicated
 *              loops (no 2-byte pair-swap specialisation is compiled in).
 *
 * Returns the matching specialised loop, or the generic
 * `_swap_pair_strided_to_strided` when no specialisation applies.
 */
NPY_NO_EXPORT PyArray_StridedUnaryOp *
PyArray_GetStridedCopySwapPairFn(int aligned, npy_intp src_stride,
                             npy_intp dst_stride, npy_intp itemsize)
{
    /* A stride only counts as contiguous for a non-zero itemsize. */
    const int dst_contig = (itemsize != 0) && (dst_stride == itemsize);
    const int src_contig = (itemsize != 0) && (src_stride == itemsize);

/*
 * The "unaligned" versions are skipped entirely on CPUs which support
 * unaligned memory accesses.
 */
#if !NPY_USE_UNALIGNED_ACCESS
    if (!aligned) {
        if (dst_contig && src_contig) {
            /* contiguous src -> contiguous dst */
            switch (itemsize) {
            case 4:
                return &_swap_pair_contig_to_contig_size4;
            case 8:
                return &_swap_pair_contig_to_contig_size8;
            case 16:
                return &_swap_pair_contig_to_contig_size16;
            default:
                break;
            }
        }
        else if (dst_contig) {
            /* general src -> contiguous dst */
            switch (itemsize) {
            case 4:
                return &_swap_pair_strided_to_contig_size4;
            case 8:
                return &_swap_pair_strided_to_contig_size8;
            case 16:
                return &_swap_pair_strided_to_contig_size16;
            default:
                break;
            }
        }
        else if (src_contig) {
            /* contiguous src -> general dst */
            switch (itemsize) {
            case 4:
                return &_swap_pair_contig_to_strided_size4;
            case 8:
                return &_swap_pair_contig_to_strided_size8;
            case 16:
                return &_swap_pair_contig_to_strided_size16;
            default:
                break;
            }
        }
        else {
            /* general src -> general dst */
            switch (itemsize) {
            case 4:
                return &_swap_pair_strided_to_strided_size4;
            case 8:
                return &_swap_pair_strided_to_strided_size8;
            case 16:
                return &_swap_pair_strided_to_strided_size16;
            default:
                break;
            }
        }

        return &_swap_pair_strided_to_strided;
    }
#endif/*!NPY_USE_UNALIGNED_ACCESS*/

    if (dst_contig && src_stride == 0) {
        /* constant src -> contiguous dst */
        switch (itemsize) {
        case 4:
            return &_aligned_swap_pair_strided_to_contig_size4_srcstride0;
        case 8:
            return &_aligned_swap_pair_strided_to_contig_size8_srcstride0;
        case 16:
            return &_aligned_swap_pair_strided_to_contig_size16_srcstride0;
        default:
            break;
        }
    }
    else if (dst_contig && src_contig) {
        /* contiguous src -> contiguous dst */
        switch (itemsize) {
        case 4:
            return &_aligned_swap_pair_contig_to_contig_size4;
        case 8:
            return &_aligned_swap_pair_contig_to_contig_size8;
        case 16:
            return &_aligned_swap_pair_contig_to_contig_size16;
        default:
            break;
        }
    }
    else if (dst_contig) {
        /* general src -> contiguous dst */
        switch (itemsize) {
        case 4:
            return &_aligned_swap_pair_strided_to_contig_size4;
        case 8:
            return &_aligned_swap_pair_strided_to_contig_size8;
        case 16:
            return &_aligned_swap_pair_strided_to_contig_size16;
        default:
            break;
        }
    }
    else if (src_stride == 0) {
        /* constant src -> general dst */
        switch (itemsize) {
        case 4:
            return &_aligned_swap_pair_strided_to_strided_size4_srcstride0;
        case 8:
            return &_aligned_swap_pair_strided_to_strided_size8_srcstride0;
        case 16:
            return &_aligned_swap_pair_strided_to_strided_size16_srcstride0;
        default:
            break;
        }
    }
    else if (src_contig) {
        /* contiguous src -> general dst */
        switch (itemsize) {
        case 4:
            return &_aligned_swap_pair_contig_to_strided_size4;
        case 8:
            return &_aligned_swap_pair_contig_to_strided_size8;
        case 16:
            return &_aligned_swap_pair_contig_to_strided_size16;
        default:
            break;
        }
    }
    else {
        /* general src -> general dst */
        switch (itemsize) {
        case 4:
            return &_aligned_swap_pair_strided_to_strided_size4;
        case 8:
            return &_aligned_swap_pair_strided_to_strided_size8;
        case 16:
            return &_aligned_swap_pair_strided_to_strided_size16;
        default:
            break;
        }
    }

    /* No specialisation for this itemsize: use the generic loop. */
    return &_swap_pair_strided_to_strided;
}



/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/

#line 671

#line 700

#line 706

/*
 * Strided npy_bool -> npy_bool cast loop that accesses src and dst
 * directly through typed pointers.
 *
 * For each of the N elements the source byte is read, compared against
 * zero, and the resulting 0/1 value is stored to dst.  dst and src are
 * then advanced by dst_stride and src_stride bytes respectively.
 *
 * The generated guard for this variant is constant-true, so it is
 * compiled regardless of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_cast_bool_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N--) {
        /* Any non-zero source byte becomes 1. */
        *(npy_bool *)dst = (npy_bool)(*(npy_bool *)src != 0);

        dst += dst_stride;
        src += src_stride;
    }
}


#line 706

/* Only built when unaligned access is not assumed to be safe. */
#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Strided npy_bool -> npy_bool cast loop that moves each element
 * through local temporaries with memmove instead of dereferencing
 * typed pointers into src/dst.
 *
 * For each of the N elements the source value is copied in, compared
 * against zero, and the resulting 0/1 value is copied out to dst.  dst
 * and src are then advanced by dst_stride and src_stride bytes.
 */
static void
_cast_bool_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_bool out;

    while (N--) {
        memmove(&in, src, sizeof(in));

        /* Any non-zero source byte becomes 1. */
        out = (npy_bool)(in != 0);

        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/*
 * Contiguous npy_bool -> npy_bool cast loop that accesses src and dst
 * directly through typed pointers.
 *
 * For each of the N elements the source byte is read, compared against
 * zero, and the resulting 0/1 value is stored to dst.  Both pointers
 * advance by sizeof(npy_bool) per element; dst_stride and src_stride
 * are accepted for signature compatibility but are not read.
 *
 * The generated guard for this variant is constant-true, so it is
 * compiled regardless of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_contig_cast_bool_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N--) {
        /* Any non-zero source byte becomes 1. */
        *(npy_bool *)dst = (npy_bool)(*(npy_bool *)src != 0);

        dst += sizeof(npy_bool);
        src += sizeof(npy_bool);
    }
}


#line 706

/* Only built when unaligned access is not assumed to be safe. */
#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Contiguous npy_bool -> npy_bool cast loop that moves each element
 * through local temporaries with memmove instead of dereferencing
 * typed pointers into src/dst.
 *
 * For each of the N elements the source value is copied in, compared
 * against zero, and the resulting 0/1 value is copied out to dst.
 * Both pointers advance by sizeof(npy_bool) per element; dst_stride
 * and src_stride are accepted for signature compatibility but are not
 * read.
 */
static void
_contig_cast_bool_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_bool out;

    while (N--) {
        memmove(&in, src, sizeof(in));

        /* Any non-zero source byte becomes 1. */
        out = (npy_bool)(in != 0);

        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_bool);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

/*
 * Strided npy_bool -> npy_ubyte cast loop that accesses src and dst
 * directly through typed pointers.
 *
 * For each of the N elements the source value is read as npy_bool,
 * converted with a plain (npy_ubyte) cast, and stored to dst.  dst and
 * src are then advanced by dst_stride and src_stride bytes respectively.
 *
 * The generated guard for this variant is constant-true, so it is
 * compiled regardless of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_cast_bool_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N--) {
        *(npy_ubyte *)dst = (npy_ubyte)*(npy_bool *)src;

        dst += dst_stride;
        src += src_stride;
    }
}


#line 706

/* Only built when unaligned access is not assumed to be safe. */
#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Strided npy_bool -> npy_ubyte cast loop that moves each element
 * through local temporaries with memmove instead of dereferencing
 * typed pointers into src/dst.
 *
 * For each of the N elements the source value is copied in, converted
 * with a plain (npy_ubyte) cast, and copied out to dst.  dst and src
 * are then advanced by dst_stride and src_stride bytes respectively.
 */
static void
_cast_bool_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_ubyte out;

    while (N--) {
        memmove(&in, src, sizeof(in));

        out = (npy_ubyte)in;

        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Contiguous cast of N elements from npy_bool to npy_ubyte using direct
 * typed loads and stores on src and dst.
 *
 * Both pointers step by the size of their own element type; the
 * dst_stride and src_stride arguments are never read.  src_itemsize and
 * data are not used.
 *
 * The guard above is always true for this variant (its `!1` term is 0).
 */
static void
_aligned_contig_cast_bool_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* typed load, convert, typed store */
        const npy_bool in = *(npy_bool *)src;
        *(npy_ubyte *)dst = (npy_ubyte)in;

        dst += sizeof(npy_ubyte);
        src += sizeof(npy_bool);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Contiguous cast of N elements from npy_bool to npy_ubyte.
 *
 * Every element is moved out of src and into dst with memmove through
 * local temporaries, so neither buffer is dereferenced through a typed
 * pointer.  Both pointers step by the size of their own element type;
 * the dst_stride and src_stride arguments are never read.  src_itemsize
 * and data are not used.
 *
 * This variant is compiled out when NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_contig_cast_bool_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_ubyte out;

    for (; N != 0; --N) {
        /* fetch one source element without a typed load */
        memmove(&in, src, sizeof(in));
        /* plain C conversion to the destination type */
        out = (npy_ubyte)in;
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_ubyte);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Strided cast of N elements from npy_bool to npy_ushort using direct
 * typed loads and stores on src and dst.
 *
 * After each element dst advances by dst_stride bytes and src by
 * src_stride bytes.  src_itemsize and data are not used.
 *
 * The guard above is always true for this variant (its `!1` term is 0).
 */
static void
_aligned_cast_bool_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* typed load, convert, typed store */
        const npy_bool in = *(npy_bool *)src;
        *(npy_ushort *)dst = (npy_ushort)in;

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast of N elements from npy_bool to npy_ushort.
 *
 * Every element is moved out of src and into dst with memmove through
 * local temporaries, so neither buffer is dereferenced through a typed
 * pointer.  After each element dst advances by dst_stride bytes and src
 * by src_stride bytes.  src_itemsize and data are not used.
 *
 * This variant is compiled out when NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_cast_bool_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_ushort out;

    for (; N != 0; --N) {
        /* fetch one source element without a typed load */
        memmove(&in, src, sizeof(in));
        /* plain C conversion to the destination type */
        out = (npy_ushort)in;
        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Contiguous cast of N elements from npy_bool to npy_ushort using direct
 * typed loads and stores on src and dst.
 *
 * Both pointers step by the size of their own element type; the
 * dst_stride and src_stride arguments are never read.  src_itemsize and
 * data are not used.
 *
 * The guard above is always true for this variant (its `!1` term is 0).
 */
static void
_aligned_contig_cast_bool_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* typed load, convert, typed store */
        const npy_bool in = *(npy_bool *)src;
        *(npy_ushort *)dst = (npy_ushort)in;

        dst += sizeof(npy_ushort);
        src += sizeof(npy_bool);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Contiguous cast of N elements from npy_bool to npy_ushort.
 *
 * Every element is moved out of src and into dst with memmove through
 * local temporaries, so neither buffer is dereferenced through a typed
 * pointer.  Both pointers step by the size of their own element type;
 * the dst_stride and src_stride arguments are never read.  src_itemsize
 * and data are not used.
 *
 * This variant is compiled out when NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_contig_cast_bool_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_ushort out;

    for (; N != 0; --N) {
        /* fetch one source element without a typed load */
        memmove(&in, src, sizeof(in));
        /* plain C conversion to the destination type */
        out = (npy_ushort)in;
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_ushort);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Strided cast of N elements from npy_bool to npy_uint using direct
 * typed loads and stores on src and dst.
 *
 * After each element dst advances by dst_stride bytes and src by
 * src_stride bytes.  src_itemsize and data are not used.
 *
 * The guard above is always true for this variant (its `!1` term is 0).
 */
static void
_aligned_cast_bool_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* typed load, convert, typed store */
        const npy_bool in = *(npy_bool *)src;
        *(npy_uint *)dst = (npy_uint)in;

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast of N elements from npy_bool to npy_uint.
 *
 * Every element is moved out of src and into dst with memmove through
 * local temporaries, so neither buffer is dereferenced through a typed
 * pointer.  After each element dst advances by dst_stride bytes and src
 * by src_stride bytes.  src_itemsize and data are not used.
 *
 * This variant is compiled out when NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_cast_bool_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_uint out;

    for (; N != 0; --N) {
        /* fetch one source element without a typed load */
        memmove(&in, src, sizeof(in));
        /* plain C conversion to the destination type */
        out = (npy_uint)in;
        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Contiguous cast of N elements from npy_bool to npy_uint using direct
 * typed loads and stores on src and dst.
 *
 * Both pointers step by the size of their own element type; the
 * dst_stride and src_stride arguments are never read.  src_itemsize and
 * data are not used.
 *
 * The guard above is always true for this variant (its `!1` term is 0).
 */
static void
_aligned_contig_cast_bool_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* typed load, convert, typed store */
        const npy_bool in = *(npy_bool *)src;
        *(npy_uint *)dst = (npy_uint)in;

        dst += sizeof(npy_uint);
        src += sizeof(npy_bool);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Contiguous cast of N elements from npy_bool to npy_uint.
 *
 * Every element is moved out of src and into dst with memmove through
 * local temporaries, so neither buffer is dereferenced through a typed
 * pointer.  Both pointers step by the size of their own element type;
 * the dst_stride and src_stride arguments are never read.  src_itemsize
 * and data are not used.
 *
 * This variant is compiled out when NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_contig_cast_bool_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_uint out;

    for (; N != 0; --N) {
        /* fetch one source element without a typed load */
        memmove(&in, src, sizeof(in));
        /* plain C conversion to the destination type */
        out = (npy_uint)in;
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_uint);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, strided cast loop: npy_bool -> npy_ulong.
 *
 * Converts N elements.  Every iteration reads one npy_bool by dereferencing
 * `src` directly, stores the value converted with a plain C cast by
 * dereferencing `dst` directly, and then advances `src` by src_stride bytes
 * and `dst` by dst_stride bytes.  Both accesses go through casted pointers,
 * so the buffers have to be suitably aligned for their element types.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_cast_bool_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in_value = *(npy_bool *)src;

        *(npy_ulong *)dst = (npy_ulong)in_value;

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast loop that makes no alignment assumption: npy_bool -> npy_ulong.
 *
 * Converts N elements.  Every iteration copies one source element into a
 * local with memmove, converts it with a plain C cast, and copies the result
 * out to `dst` with memmove, so neither buffer is dereferenced through a
 * typed pointer.  Afterwards `src` moves forward by src_stride bytes and
 * `dst` by dst_stride bytes.
 *
 * The guard above removes this function when NPY_USE_UNALIGNED_ACCESS is
 * nonzero.  The src_itemsize and data arguments are not used.
 */
static void
_cast_bool_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_value;
    npy_ulong out_value;

    for (; N != 0; --N) {
        memmove(&in_value, src, sizeof(in_value));
        out_value = (npy_ulong)in_value;
        memmove(dst, &out_value, sizeof(out_value));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, contiguous cast loop: npy_bool -> npy_ulong.
 *
 * Converts N elements.  Every iteration reads one npy_bool by dereferencing
 * `src` directly and stores the value converted with a plain C cast by
 * dereferencing `dst` directly, so the buffers have to be suitably aligned
 * for their element types.  The pointers then advance by the element sizes
 * (sizeof(npy_bool) and sizeof(npy_ulong)); dst_stride and src_stride are
 * accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_contig_cast_bool_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in_value = *(npy_bool *)src;

        *(npy_ulong *)dst = (npy_ulong)in_value;

        src += sizeof(npy_bool);
        dst += sizeof(npy_ulong);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Contiguous cast loop that makes no alignment assumption:
 * npy_bool -> npy_ulong.
 *
 * Converts N elements.  Every iteration copies one source element into a
 * local with memmove, converts it with a plain C cast, and copies the result
 * out to `dst` with memmove.  The pointers then advance by the element sizes
 * (sizeof(npy_bool) and sizeof(npy_ulong)); dst_stride and src_stride are
 * accepted but never read.
 *
 * The guard above removes this function when NPY_USE_UNALIGNED_ACCESS is
 * nonzero.  The src_itemsize and data arguments are not used.
 */
static void
_contig_cast_bool_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_value;
    npy_ulong out_value;

    for (; N != 0; --N) {
        memmove(&in_value, src, sizeof(in_value));
        out_value = (npy_ulong)in_value;
        memmove(dst, &out_value, sizeof(out_value));

        src += sizeof(npy_bool);
        dst += sizeof(npy_ulong);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, strided cast loop: npy_bool -> npy_ulonglong.
 *
 * Converts N elements.  Every iteration reads one npy_bool by dereferencing
 * `src` directly, stores the value converted with a plain C cast by
 * dereferencing `dst` directly, and then advances `src` by src_stride bytes
 * and `dst` by dst_stride bytes.  Both accesses go through casted pointers,
 * so the buffers have to be suitably aligned for their element types.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_cast_bool_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in_value = *(npy_bool *)src;

        *(npy_ulonglong *)dst = (npy_ulonglong)in_value;

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast loop that makes no alignment assumption:
 * npy_bool -> npy_ulonglong.
 *
 * Converts N elements.  Every iteration copies one source element into a
 * local with memmove, converts it with a plain C cast, and copies the result
 * out to `dst` with memmove, so neither buffer is dereferenced through a
 * typed pointer.  Afterwards `src` moves forward by src_stride bytes and
 * `dst` by dst_stride bytes.
 *
 * The guard above removes this function when NPY_USE_UNALIGNED_ACCESS is
 * nonzero.  The src_itemsize and data arguments are not used.
 */
static void
_cast_bool_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_value;
    npy_ulonglong out_value;

    for (; N != 0; --N) {
        memmove(&in_value, src, sizeof(in_value));
        out_value = (npy_ulonglong)in_value;
        memmove(dst, &out_value, sizeof(out_value));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, contiguous cast loop: npy_bool -> npy_ulonglong.
 *
 * Converts N elements.  Every iteration reads one npy_bool by dereferencing
 * `src` directly and stores the value converted with a plain C cast by
 * dereferencing `dst` directly, so the buffers have to be suitably aligned
 * for their element types.  The pointers then advance by the element sizes
 * (sizeof(npy_bool) and sizeof(npy_ulonglong)); dst_stride and src_stride
 * are accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_contig_cast_bool_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in_value = *(npy_bool *)src;

        *(npy_ulonglong *)dst = (npy_ulonglong)in_value;

        src += sizeof(npy_bool);
        dst += sizeof(npy_ulonglong);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Contiguous cast loop that makes no alignment assumption:
 * npy_bool -> npy_ulonglong.
 *
 * Converts N elements.  Every iteration copies one source element into a
 * local with memmove, converts it with a plain C cast, and copies the result
 * out to `dst` with memmove.  The pointers then advance by the element sizes
 * (sizeof(npy_bool) and sizeof(npy_ulonglong)); dst_stride and src_stride
 * are accepted but never read.
 *
 * The guard above removes this function when NPY_USE_UNALIGNED_ACCESS is
 * nonzero.  The src_itemsize and data arguments are not used.
 */
static void
_contig_cast_bool_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_value;
    npy_ulonglong out_value;

    for (; N != 0; --N) {
        memmove(&in_value, src, sizeof(in_value));
        out_value = (npy_ulonglong)in_value;
        memmove(dst, &out_value, sizeof(out_value));

        src += sizeof(npy_bool);
        dst += sizeof(npy_ulonglong);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, strided cast loop: npy_bool -> npy_byte.
 *
 * Converts N elements.  Every iteration reads one npy_bool by dereferencing
 * `src` directly, stores the value converted with a plain C cast by
 * dereferencing `dst` directly, and then advances `src` by src_stride bytes
 * and `dst` by dst_stride bytes.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_cast_bool_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in_value = *(npy_bool *)src;

        *(npy_byte *)dst = (npy_byte)in_value;

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast loop that makes no alignment assumption: npy_bool -> npy_byte.
 *
 * Converts N elements.  Every iteration copies one source element into a
 * local with memmove, converts it with a plain C cast, and copies the result
 * out to `dst` with memmove, so neither buffer is dereferenced through a
 * typed pointer.  Afterwards `src` moves forward by src_stride bytes and
 * `dst` by dst_stride bytes.
 *
 * The guard above removes this function when NPY_USE_UNALIGNED_ACCESS is
 * nonzero.  The src_itemsize and data arguments are not used.
 */
static void
_cast_bool_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_value;
    npy_byte out_value;

    for (; N != 0; --N) {
        memmove(&in_value, src, sizeof(in_value));
        out_value = (npy_byte)in_value;
        memmove(dst, &out_value, sizeof(out_value));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, contiguous cast loop: npy_bool -> npy_byte.
 *
 * Converts N elements.  Every iteration reads one npy_bool by dereferencing
 * `src` directly and stores the value converted with a plain C cast by
 * dereferencing `dst` directly.  The pointers then advance by the element
 * sizes (sizeof(npy_bool) and sizeof(npy_byte)); dst_stride and src_stride
 * are accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_contig_cast_bool_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in_value = *(npy_bool *)src;

        *(npy_byte *)dst = (npy_byte)in_value;

        src += sizeof(npy_bool);
        dst += sizeof(npy_byte);
    }
}

#endif


#line 706

/*
 * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS
 * evaluates to 0.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * bool -> byte cast loop, contiguous byte-copy variant.
 *
 * For each of the N elements, the source bytes are copied with memmove
 * into a local npy_bool, converted with a plain C cast, and the
 * resulting npy_byte is copied with memmove into dst.  src and dst are
 * never dereferenced as typed pointers.  Both pointers advance by their
 * element size, so dst_stride and src_stride are not consulted;
 * src_itemsize and data are unused.
 */
static void
_contig_cast_bool_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_byte out;

    for (; N != 0; --N) {
        /* Stage the element through locals instead of typed access. */
        memmove(&in, src, sizeof(in));
        out = (npy_byte)in;
        memmove(dst, &out, sizeof(out));

        /* Contiguous layout: step by one element on each side. */
        dst += sizeof(npy_byte);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

/*
 * The guard below evaluates to true for every value of
 * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * bool -> short cast loop, strided typed-access variant.
 *
 * For each of the N elements, reads one npy_bool directly through src,
 * converts it with a plain C cast, and writes the resulting npy_short
 * directly through dst.  After every element dst advances by dst_stride
 * bytes and src by src_stride bytes; src_itemsize and data are unused.
 *
 * NOTE(review): the direct pointer dereferences presumably rely on the
 * caller guaranteeing suitably aligned src/dst -- confirm at the call
 * sites that select the "_aligned" loops.
 */
static void
_aligned_cast_bool_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in = *(npy_bool *)src;

        *(npy_short *)dst = (npy_short)in;

        /* Strided layout: step by the caller-supplied byte strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/*
 * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS
 * evaluates to 0.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * bool -> short cast loop, strided byte-copy variant.
 *
 * For each of the N elements, the source bytes are copied with memmove
 * into a local npy_bool, converted with a plain C cast, and the
 * resulting npy_short is copied with memmove into dst.  src and dst are
 * never dereferenced as typed pointers.  After every element dst
 * advances by dst_stride bytes and src by src_stride bytes;
 * src_itemsize and data are unused.
 */
static void
_cast_bool_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_short out;

    for (; N != 0; --N) {
        /* Stage the element through locals instead of typed access. */
        memmove(&in, src, sizeof(in));
        out = (npy_short)in;
        memmove(dst, &out, sizeof(out));

        /* Strided layout: step by the caller-supplied byte strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/*
 * The guard below evaluates to true for every value of
 * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * bool -> short cast loop, contiguous typed-access variant.
 *
 * For each of the N elements, reads one npy_bool directly through src,
 * converts it with a plain C cast, and writes the resulting npy_short
 * directly through dst.  Both pointers advance by their element size,
 * so dst_stride and src_stride are not consulted; src_itemsize and data
 * are unused.
 *
 * NOTE(review): the direct pointer dereferences presumably rely on the
 * caller guaranteeing suitably aligned src/dst -- confirm at the call
 * sites that select the "_aligned" loops.
 */
static void
_aligned_contig_cast_bool_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in = *(npy_bool *)src;

        *(npy_short *)dst = (npy_short)in;

        /* Contiguous layout: step by one element on each side. */
        dst += sizeof(npy_short);
        src += sizeof(npy_bool);
    }
}

#endif


#line 706

/*
 * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS
 * evaluates to 0.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * bool -> short cast loop, contiguous byte-copy variant.
 *
 * For each of the N elements, the source bytes are copied with memmove
 * into a local npy_bool, converted with a plain C cast, and the
 * resulting npy_short is copied with memmove into dst.  src and dst are
 * never dereferenced as typed pointers.  Both pointers advance by their
 * element size, so dst_stride and src_stride are not consulted;
 * src_itemsize and data are unused.
 */
static void
_contig_cast_bool_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_short out;

    for (; N != 0; --N) {
        /* Stage the element through locals instead of typed access. */
        memmove(&in, src, sizeof(in));
        out = (npy_short)in;
        memmove(dst, &out, sizeof(out));

        /* Contiguous layout: step by one element on each side. */
        dst += sizeof(npy_short);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

/*
 * The guard below evaluates to true for every value of
 * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * bool -> int cast loop, strided typed-access variant.
 *
 * For each of the N elements, reads one npy_bool directly through src,
 * converts it with a plain C cast, and writes the resulting npy_int
 * directly through dst.  After every element dst advances by dst_stride
 * bytes and src by src_stride bytes; src_itemsize and data are unused.
 *
 * NOTE(review): the direct pointer dereferences presumably rely on the
 * caller guaranteeing suitably aligned src/dst -- confirm at the call
 * sites that select the "_aligned" loops.
 */
static void
_aligned_cast_bool_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in = *(npy_bool *)src;

        *(npy_int *)dst = (npy_int)in;

        /* Strided layout: step by the caller-supplied byte strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/*
 * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS
 * evaluates to 0.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * bool -> int cast loop, strided byte-copy variant.
 *
 * For each of the N elements, the source bytes are copied with memmove
 * into a local npy_bool, converted with a plain C cast, and the
 * resulting npy_int is copied with memmove into dst.  src and dst are
 * never dereferenced as typed pointers.  After every element dst
 * advances by dst_stride bytes and src by src_stride bytes;
 * src_itemsize and data are unused.
 */
static void
_cast_bool_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_int out;

    for (; N != 0; --N) {
        /* Stage the element through locals instead of typed access. */
        memmove(&in, src, sizeof(in));
        out = (npy_int)in;
        memmove(dst, &out, sizeof(out));

        /* Strided layout: step by the caller-supplied byte strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/*
 * The guard below evaluates to true for every value of
 * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * bool -> int cast loop, contiguous typed-access variant.
 *
 * For each of the N elements, reads one npy_bool directly through src,
 * converts it with a plain C cast, and writes the resulting npy_int
 * directly through dst.  Both pointers advance by their element size,
 * so dst_stride and src_stride are not consulted; src_itemsize and data
 * are unused.
 *
 * NOTE(review): the direct pointer dereferences presumably rely on the
 * caller guaranteeing suitably aligned src/dst -- confirm at the call
 * sites that select the "_aligned" loops.
 */
static void
_aligned_contig_cast_bool_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in = *(npy_bool *)src;

        *(npy_int *)dst = (npy_int)in;

        /* Contiguous layout: step by one element on each side. */
        dst += sizeof(npy_int);
        src += sizeof(npy_bool);
    }
}

#endif


#line 706

/*
 * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS
 * evaluates to 0.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * bool -> int cast loop, contiguous byte-copy variant.
 *
 * For each of the N elements, the source bytes are copied with memmove
 * into a local npy_bool, converted with a plain C cast, and the
 * resulting npy_int is copied with memmove into dst.  src and dst are
 * never dereferenced as typed pointers.  Both pointers advance by their
 * element size, so dst_stride and src_stride are not consulted;
 * src_itemsize and data are unused.
 */
static void
_contig_cast_bool_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in;
    npy_int out;

    for (; N != 0; --N) {
        /* Stage the element through locals instead of typed access. */
        memmove(&in, src, sizeof(in));
        out = (npy_int)in;
        memmove(dst, &out, sizeof(out));

        /* Contiguous layout: step by one element on each side. */
        dst += sizeof(npy_int);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

/*
 * The guard below evaluates to true for every value of
 * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
 */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * bool -> long cast loop, strided typed-access variant.
 *
 * For each of the N elements, reads one npy_bool directly through src,
 * converts it with a plain C cast, and writes the resulting npy_long
 * directly through dst.  After every element dst advances by dst_stride
 * bytes and src by src_stride bytes; src_itemsize and data are unused.
 *
 * NOTE(review): the direct pointer dereferences presumably rely on the
 * caller guaranteeing suitably aligned src/dst -- confirm at the call
 * sites that select the "_aligned" loops.
 */
static void
_aligned_cast_bool_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_bool in = *(npy_bool *)src;

        *(npy_long *)dst = (npy_long)in;

        /* Strided layout: step by the caller-supplied byte strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source and destination element types for this instantiation, and the
 * per-element conversion (a plain C cast to _TYPE2).
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_long
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Strided npy_bool -> npy_long cast loop, variant without the "aligned"
 * prefix.
 *
 * Each of the N elements is copied byte-wise from `src` into a local,
 * converted with _CONVERT_FN, and copied byte-wise from a local into
 * `dst`; src and dst are never dereferenced as typed pointers. After each
 * element, dst and src advance by dst_stride and src_stride bytes.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_cast_bool_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* Stage the input in a local instead of reading through src. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Write the result back byte-wise as well. */
        memmove(dst, &out_elem, sizeof(out_elem));

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source and destination element types for this instantiation, and the
 * per-element conversion (a plain C cast to _TYPE2).
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_long
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous npy_bool -> npy_long cast loop, "aligned" variant.
 *
 * For each of the N elements, one _TYPE1 is read directly through `src`,
 * converted with _CONVERT_FN, and stored directly through `dst` as a
 * _TYPE2. The pointers then advance by the element sizes; dst_stride and
 * src_stride are accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_contig_cast_bool_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Load, convert and store in a single typed access. */
        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);

        /* Packed layout: step by one element on each side. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source and destination element types for this instantiation, and the
 * per-element conversion (a plain C cast to _TYPE2).
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_long
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous npy_bool -> npy_long cast loop, variant without the
 * "aligned" prefix.
 *
 * Each of the N elements is copied byte-wise from `src` into a local,
 * converted with _CONVERT_FN, and copied byte-wise from a local into
 * `dst`. The pointers then advance by the element sizes; dst_stride and
 * src_stride are accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_contig_cast_bool_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* Stage the input in a local instead of reading through src. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Write the result back byte-wise as well. */
        memmove(dst, &out_elem, sizeof(out_elem));

        /* Packed layout: step by one element on each side. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source and destination element types for this instantiation, and the
 * per-element conversion (a plain C cast to _TYPE2).
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longlong
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Strided npy_bool -> npy_longlong cast loop, "aligned" variant.
 *
 * For each of the N elements, one _TYPE1 is read directly through `src`,
 * converted with _CONVERT_FN, and stored directly through `dst` as a
 * _TYPE2. No staging copy is made, so both pointers are dereferenced as
 * typed pointers. After each element, dst and src advance by dst_stride
 * and src_stride bytes respectively.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_cast_bool_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Load, convert and store in a single typed access. */
        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source and destination element types for this instantiation, and the
 * per-element conversion (a plain C cast to _TYPE2).
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longlong
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Strided npy_bool -> npy_longlong cast loop, variant without the
 * "aligned" prefix.
 *
 * Each of the N elements is copied byte-wise from `src` into a local,
 * converted with _CONVERT_FN, and copied byte-wise from a local into
 * `dst`; src and dst are never dereferenced as typed pointers. After each
 * element, dst and src advance by dst_stride and src_stride bytes.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_cast_bool_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* Stage the input in a local instead of reading through src. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Write the result back byte-wise as well. */
        memmove(dst, &out_elem, sizeof(out_elem));

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source and destination element types for this instantiation, and the
 * per-element conversion (a plain C cast to _TYPE2).
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longlong
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous npy_bool -> npy_longlong cast loop, "aligned" variant.
 *
 * For each of the N elements, one _TYPE1 is read directly through `src`,
 * converted with _CONVERT_FN, and stored directly through `dst` as a
 * _TYPE2. The pointers then advance by the element sizes; dst_stride and
 * src_stride are accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_contig_cast_bool_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Load, convert and store in a single typed access. */
        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);

        /* Packed layout: step by one element on each side. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source and destination element types for this instantiation, and the
 * per-element conversion (a plain C cast to _TYPE2).
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longlong
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous npy_bool -> npy_longlong cast loop, variant without the
 * "aligned" prefix.
 *
 * Each of the N elements is copied byte-wise from `src` into a local,
 * converted with _CONVERT_FN, and copied byte-wise from a local into
 * `dst`. The pointers then advance by the element sizes; dst_stride and
 * src_stride are accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_contig_cast_bool_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* Stage the input in a local instead of reading through src. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Write the result back byte-wise as well. */
        memmove(dst, &out_elem, sizeof(out_elem));

        /* Packed layout: step by one element on each side. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source and destination element types for this instantiation. The
 * per-element conversion casts the source value to float and passes it
 * to npy_float_to_half.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_half
#define _CONVERT_FN(x) npy_float_to_half((float)x)

/*
 * Strided npy_bool -> npy_half cast loop, "aligned" variant.
 *
 * For each of the N elements, one _TYPE1 is read directly through `src`,
 * converted with _CONVERT_FN, and stored directly through `dst` as a
 * _TYPE2. No staging copy is made, so both pointers are dereferenced as
 * typed pointers. After each element, dst and src advance by dst_stride
 * and src_stride bytes respectively.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_cast_bool_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Load, convert and store in a single typed access. */
        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source and destination element types for this instantiation. The
 * per-element conversion casts the source value to float and passes it
 * to npy_float_to_half.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_half
#define _CONVERT_FN(x) npy_float_to_half((float)x)

/*
 * Strided npy_bool -> npy_half cast loop, variant without the "aligned"
 * prefix.
 *
 * Each of the N elements is copied byte-wise from `src` into a local,
 * converted with _CONVERT_FN, and copied byte-wise from a local into
 * `dst`; src and dst are never dereferenced as typed pointers. After each
 * element, dst and src advance by dst_stride and src_stride bytes.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_cast_bool_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* Stage the input in a local instead of reading through src. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Write the result back byte-wise as well. */
        memmove(dst, &out_elem, sizeof(out_elem));

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source and destination element types for this instantiation. The
 * per-element conversion casts the source value to float and passes it
 * to npy_float_to_half.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_half
#define _CONVERT_FN(x) npy_float_to_half((float)x)

/*
 * Contiguous npy_bool -> npy_half cast loop, "aligned" variant.
 *
 * For each of the N elements, one _TYPE1 is read directly through `src`,
 * converted with _CONVERT_FN, and stored directly through `dst` as a
 * _TYPE2. The pointers then advance by the element sizes; dst_stride and
 * src_stride are accepted but never read.
 *
 * The src_itemsize and data arguments are not used.
 */
static void
_aligned_contig_cast_bool_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Load, convert and store in a single typed access. */
        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);

        /* Packed layout: step by one element on each side. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast N consecutive npy_bool elements at `src` into npy_half elements at
 * `dst`.
 *
 * Every element is moved byte-wise (memmove) into and out of a local
 * temporary, so `src` and `dst` are never dereferenced through a typed
 * pointer.  The value is converted with npy_float_to_half() after a plain
 * C cast to float.
 *
 *   dst, src                 output / input buffers; advanced by
 *                            sizeof(npy_half) and sizeof(npy_bool) per
 *                            element.
 *   dst_stride, src_stride   not read by this variant.
 *   N                        number of elements to convert.
 *   src_itemsize, data       unused.
 *
 * The surrounding guard compiles this routine out when
 * NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_contig_cast_bool_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_val;
    npy_half out_val;
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Fetch one input element without a typed load. */
        memmove(&in_val, src, sizeof(in_val));

        out_val = npy_float_to_half((float)in_val);

        /* Write the result back without a typed store. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += sizeof(npy_half);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast N npy_bool elements read from `src` into npy_float elements written
 * to `dst`, stepping both pointers by their byte strides.
 *
 * `src` and `dst` are dereferenced directly as npy_bool* / npy_float*;
 * presumably the caller guarantees suitable alignment, as the name
 * suggests (not verifiable from this routine alone).
 *
 *   dst, dst_stride      output buffer and its per-element step in bytes.
 *   src, src_stride      input buffer and its per-element step in bytes.
 *   N                    number of elements to convert.
 *   src_itemsize, data   unused.
 *
 * The guard's second operand is the constant !1, so the condition always
 * holds and this routine is always compiled.
 */
static void
_aligned_cast_bool_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Typed load, plain C conversion, typed store. */
        *(npy_float *)dst = (npy_float)(*(npy_bool *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast N npy_bool elements read from `src` into npy_float elements written
 * to `dst`, stepping both pointers by their byte strides.
 *
 * Every element is moved byte-wise (memmove) into and out of a local
 * temporary, so `src` and `dst` are never dereferenced through a typed
 * pointer.  The conversion itself is a plain C cast.
 *
 *   dst, dst_stride      output buffer and its per-element step in bytes.
 *   src, src_stride      input buffer and its per-element step in bytes.
 *   N                    number of elements to convert.
 *   src_itemsize, data   unused.
 *
 * The surrounding guard compiles this routine out when
 * NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_cast_bool_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_val;
    npy_float out_val;
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Fetch one input element without a typed load. */
        memmove(&in_val, src, sizeof(in_val));

        out_val = (npy_float)in_val;

        /* Write the result back without a typed store. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast N consecutive npy_bool elements at `src` into N consecutive
 * npy_float elements at `dst`.
 *
 * `src` and `dst` are dereferenced directly as npy_bool* / npy_float*;
 * presumably the caller guarantees suitable alignment, as the name
 * suggests (not verifiable from this routine alone).
 *
 *   dst, src                 output / input buffers; advanced by
 *                            sizeof(npy_float) and sizeof(npy_bool) per
 *                            element.
 *   dst_stride, src_stride   not read by this variant.
 *   N                        number of elements to convert.
 *   src_itemsize, data       unused.
 *
 * The guard's second operand is the constant !1, so the condition always
 * holds and this routine is always compiled.
 */
static void
_aligned_contig_cast_bool_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Typed load, plain C conversion, typed store. */
        *(npy_float *)dst = (npy_float)(*(npy_bool *)src);

        dst += sizeof(npy_float);
        src += sizeof(npy_bool);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast N consecutive npy_bool elements at `src` into N consecutive
 * npy_float elements at `dst`.
 *
 * Every element is moved byte-wise (memmove) into and out of a local
 * temporary, so `src` and `dst` are never dereferenced through a typed
 * pointer.  The conversion itself is a plain C cast.
 *
 *   dst, src                 output / input buffers; advanced by
 *                            sizeof(npy_float) and sizeof(npy_bool) per
 *                            element.
 *   dst_stride, src_stride   not read by this variant.
 *   N                        number of elements to convert.
 *   src_itemsize, data       unused.
 *
 * The surrounding guard compiles this routine out when
 * NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_contig_cast_bool_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_val;
    npy_float out_val;
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Fetch one input element without a typed load. */
        memmove(&in_val, src, sizeof(in_val));

        out_val = (npy_float)in_val;

        /* Write the result back without a typed store. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += sizeof(npy_float);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast N npy_bool elements read from `src` into npy_double elements
 * written to `dst`, stepping both pointers by their byte strides.
 *
 * `src` and `dst` are dereferenced directly as npy_bool* / npy_double*;
 * presumably the caller guarantees suitable alignment, as the name
 * suggests (not verifiable from this routine alone).
 *
 *   dst, dst_stride      output buffer and its per-element step in bytes.
 *   src, src_stride      input buffer and its per-element step in bytes.
 *   N                    number of elements to convert.
 *   src_itemsize, data   unused.
 *
 * The guard's second operand is the constant !1, so the condition always
 * holds and this routine is always compiled.
 */
static void
_aligned_cast_bool_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Typed load, plain C conversion, typed store. */
        *(npy_double *)dst = (npy_double)(*(npy_bool *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast N npy_bool elements read from `src` into npy_double elements
 * written to `dst`, stepping both pointers by their byte strides.
 *
 * Every element is moved byte-wise (memmove) into and out of a local
 * temporary, so `src` and `dst` are never dereferenced through a typed
 * pointer.  The conversion itself is a plain C cast.
 *
 *   dst, dst_stride      output buffer and its per-element step in bytes.
 *   src, src_stride      input buffer and its per-element step in bytes.
 *   N                    number of elements to convert.
 *   src_itemsize, data   unused.
 *
 * The surrounding guard compiles this routine out when
 * NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_cast_bool_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_val;
    npy_double out_val;
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Fetch one input element without a typed load. */
        memmove(&in_val, src, sizeof(in_val));

        out_val = (npy_double)in_val;

        /* Write the result back without a typed store. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast N consecutive npy_bool elements at `src` into N consecutive
 * npy_double elements at `dst`.
 *
 * `src` and `dst` are dereferenced directly as npy_bool* / npy_double*;
 * presumably the caller guarantees suitable alignment, as the name
 * suggests (not verifiable from this routine alone).
 *
 *   dst, src                 output / input buffers; advanced by
 *                            sizeof(npy_double) and sizeof(npy_bool) per
 *                            element.
 *   dst_stride, src_stride   not read by this variant.
 *   N                        number of elements to convert.
 *   src_itemsize, data       unused.
 *
 * The guard's second operand is the constant !1, so the condition always
 * holds and this routine is always compiled.
 */
static void
_aligned_contig_cast_bool_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Typed load, plain C conversion, typed store. */
        *(npy_double *)dst = (npy_double)(*(npy_bool *)src);

        dst += sizeof(npy_double);
        src += sizeof(npy_bool);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast N consecutive npy_bool elements at `src` into N consecutive
 * npy_double elements at `dst`.
 *
 * Every element is moved byte-wise (memmove) into and out of a local
 * temporary, so `src` and `dst` are never dereferenced through a typed
 * pointer.  The conversion itself is a plain C cast.
 *
 *   dst, src                 output / input buffers; advanced by
 *                            sizeof(npy_double) and sizeof(npy_bool) per
 *                            element.
 *   dst_stride, src_stride   not read by this variant.
 *   N                        number of elements to convert.
 *   src_itemsize, data       unused.
 *
 * The surrounding guard compiles this routine out when
 * NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_contig_cast_bool_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_val;
    npy_double out_val;
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Fetch one input element without a typed load. */
        memmove(&in_val, src, sizeof(in_val));

        out_val = (npy_double)in_val;

        /* Write the result back without a typed store. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += sizeof(npy_double);
        src += sizeof(npy_bool);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast N npy_bool elements read from `src` into npy_longdouble elements
 * written to `dst`, stepping both pointers by their byte strides.
 *
 * `src` and `dst` are dereferenced directly as npy_bool* /
 * npy_longdouble*; presumably the caller guarantees suitable alignment,
 * as the name suggests (not verifiable from this routine alone).
 *
 *   dst, dst_stride      output buffer and its per-element step in bytes.
 *   src, src_stride      input buffer and its per-element step in bytes.
 *   N                    number of elements to convert.
 *   src_itemsize, data   unused.
 *
 * The guard's second operand is the constant !1, so the condition always
 * holds and this routine is always compiled.
 */
static void
_aligned_cast_bool_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Typed load, plain C conversion, typed store. */
        *(npy_longdouble *)dst = (npy_longdouble)(*(npy_bool *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast N npy_bool elements read from `src` into npy_longdouble elements
 * written to `dst`, stepping both pointers by their byte strides.
 *
 * Every element is moved byte-wise (memmove) into and out of a local
 * temporary, so `src` and `dst` are never dereferenced through a typed
 * pointer.  The conversion itself is a plain C cast.
 *
 *   dst, dst_stride      output buffer and its per-element step in bytes.
 *   src, src_stride      input buffer and its per-element step in bytes.
 *   N                    number of elements to convert.
 *   src_itemsize, data   unused.
 *
 * The surrounding guard compiles this routine out when
 * NPY_USE_UNALIGNED_ACCESS is nonzero.
 */
static void
_cast_bool_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_bool in_val;
    npy_longdouble out_val;
    npy_intp left;

    for (left = N; left != 0; --left) {
        /* Fetch one input element without a typed load. */
        memmove(&in_val, src, sizeof(in_val));

        out_val = (npy_longdouble)in_val;

        /* Write the result back without a typed store. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Source/destination element types and the conversion for this instance. */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longdouble
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements at src into npy_longdouble elements at dst.
 *
 * Both buffers are walked as packed arrays: dst advances by
 * sizeof(npy_longdouble) and src by sizeof(npy_bool) per element, and the
 * dst_stride / src_stride arguments are never consulted.  Elements are
 * read and written through typed pointers, so the caller presumably has
 * to guarantee suitable alignment of both buffers (as the "_aligned"
 * prefix suggests) -- NOTE(review): confirm against the dispatch code.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_contig_cast_bool_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);

        dst += sizeof(npy_longdouble);
        src += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Source/destination element types and the conversion for this instance. */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longdouble
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements at src into npy_longdouble elements at dst.
 *
 * Both buffers are walked as packed arrays: dst advances by
 * sizeof(npy_longdouble) and src by sizeof(npy_bool) per element, and the
 * dst_stride / src_stride arguments are never consulted.  Every element
 * is moved through a local temporary with memmove(), so no typed load or
 * store is issued against src or dst themselves.
 *
 * src_itemsize and data are unused.
 */
static void
_contig_cast_bool_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* Fetch one source element byte-wise. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Store the converted element byte-wise. */
        memmove(dst, &out_elem, sizeof(out_elem));

        dst += sizeof(npy_longdouble);
        src += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_float
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements read from src into pairs of npy_float values
 * written to dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cfloat).  After each element, dst advances by
 * dst_stride bytes and src by src_stride bytes.  Elements are read and
 * written through typed pointers, so the caller presumably has to
 * guarantee suitable alignment (as the "_aligned" prefix suggests) --
 * NOTE(review): confirm against the dispatch code.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_cast_bool_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        _TYPE2 *out = (_TYPE2 *)dst;

        /* The source is read before anything is written to dst. */
        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
        out_pair[1] = 0;

        out[0] = out_pair[0];
        out[1] = out_pair[1];

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_float
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements read from src into pairs of npy_float values
 * written to dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cfloat).  After each element, dst advances by
 * dst_stride bytes and src by src_stride bytes.  Data is moved through
 * local temporaries with memmove(), so no typed load or store is issued
 * against src or dst themselves.
 *
 * src_itemsize and data are unused.
 */
static void
_cast_bool_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        /* Fetch one source element byte-wise. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        /* Store both values of the pair in one byte-wise copy. */
        memmove(dst, out_pair, sizeof(out_pair));

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_float
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements at src into pairs of npy_float values at dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cfloat).  Both buffers are walked as packed
 * arrays: dst advances by sizeof(npy_cfloat) and src by sizeof(npy_bool)
 * per element, and dst_stride / src_stride are never consulted.  Elements
 * are read and written through typed pointers, so the caller presumably
 * has to guarantee suitable alignment (as the "_aligned" prefix suggests)
 * -- NOTE(review): confirm against the dispatch code.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_contig_cast_bool_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        _TYPE2 *out = (_TYPE2 *)dst;

        /* The source is read before anything is written to dst. */
        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
        out_pair[1] = 0;

        out[0] = out_pair[0];
        out[1] = out_pair[1];

        dst += sizeof(npy_cfloat);
        src += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_float
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements at src into pairs of npy_float values at dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cfloat).  Both buffers are walked as packed
 * arrays: dst advances by sizeof(npy_cfloat) and src by sizeof(npy_bool)
 * per element, and dst_stride / src_stride are never consulted.  Data is
 * moved through local temporaries with memmove(), so no typed load or
 * store is issued against src or dst themselves.
 *
 * src_itemsize and data are unused.
 */
static void
_contig_cast_bool_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        /* Fetch one source element byte-wise. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        /* Store both values of the pair in one byte-wise copy. */
        memmove(dst, out_pair, sizeof(out_pair));

        dst += sizeof(npy_cfloat);
        src += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_double
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements read from src into pairs of npy_double values
 * written to dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cdouble).  After each element, dst advances
 * by dst_stride bytes and src by src_stride bytes.  Elements are read and
 * written through typed pointers, so the caller presumably has to
 * guarantee suitable alignment (as the "_aligned" prefix suggests) --
 * NOTE(review): confirm against the dispatch code.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_cast_bool_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        _TYPE2 *out = (_TYPE2 *)dst;

        /* The source is read before anything is written to dst. */
        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
        out_pair[1] = 0;

        out[0] = out_pair[0];
        out[1] = out_pair[1];

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_double
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements read from src into pairs of npy_double values
 * written to dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cdouble).  After each element, dst advances
 * by dst_stride bytes and src by src_stride bytes.  Data is moved through
 * local temporaries with memmove(), so no typed load or store is issued
 * against src or dst themselves.
 *
 * src_itemsize and data are unused.
 */
static void
_cast_bool_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        /* Fetch one source element byte-wise. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        /* Store both values of the pair in one byte-wise copy. */
        memmove(dst, out_pair, sizeof(out_pair));

        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_double
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements at src into pairs of npy_double values at dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cdouble).  Both buffers are walked as packed
 * arrays: dst advances by sizeof(npy_cdouble) and src by sizeof(npy_bool)
 * per element, and dst_stride / src_stride are never consulted.  Elements
 * are read and written through typed pointers, so the caller presumably
 * has to guarantee suitable alignment (as the "_aligned" prefix suggests)
 * -- NOTE(review): confirm against the dispatch code.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_contig_cast_bool_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        _TYPE2 *out = (_TYPE2 *)dst;

        /* The source is read before anything is written to dst. */
        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
        out_pair[1] = 0;

        out[0] = out_pair[0];
        out[1] = out_pair[1];

        dst += sizeof(npy_cdouble);
        src += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source element type, per-component destination type, and the conversion
 * for this instance.
 */
#define _TYPE1 npy_bool
#define _TYPE2 npy_double
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Casts N npy_bool elements at src into pairs of npy_double values at dst.
 *
 * For each element the first value of the pair receives the converted
 * source value and the second is set to 0 (presumably the real and
 * imaginary parts of an npy_cdouble).  Both buffers are walked as packed
 * arrays: dst advances by sizeof(npy_cdouble) and src by sizeof(npy_bool)
 * per element, and dst_stride / src_stride are never consulted.  Data is
 * moved through local temporaries with memmove(), so no typed load or
 * store is issued against src or dst themselves.
 *
 * src_itemsize and data are unused.
 */
static void
_contig_cast_bool_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        /* Fetch one source element byte-wise. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        /* Store both values of the pair in one byte-wise copy. */
        memmove(dst, out_pair, sizeof(out_pair));

        dst += sizeof(npy_cdouble);
        src += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longdouble

/* Ordinary C conversion of the source value to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Strided cast of N npy_bool elements into pairs of npy_longdouble.
 *
 * For every element the converted source value is stored in the first
 * npy_longdouble slot of dst and the second slot is set to 0 (going by
 * the "clongdouble" name these are presumably the real and imaginary
 * parts).  src and dst are accessed directly through typed pointers,
 * without memmove staging.  After each element dst advances by
 * dst_stride bytes and src by src_stride bytes.
 *
 * src_itemsize and data are not used.
 */
static void
_aligned_cast_bool_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE2 *out;
    _TYPE2 first;

    for (; N != 0; --N) {
        /* Fetch and convert the input before any store to dst. */
        first = _CONVERT_FN(*(_TYPE1 *)src);

        out = (_TYPE2 *)dst;
        out[0] = first;
        out[1] = 0;

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longdouble

/* Ordinary C conversion of the source value to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Strided cast of N npy_bool elements into pairs of npy_longdouble,
 * staging every element through local temporaries.
 *
 * Each source element is copied into a local with memmove, converted
 * into the first slot of a two-element local array whose second slot is
 * set to 0, and that array is then copied to dst with memmove.  Because
 * of the staging, src and dst themselves are never dereferenced through
 * typed pointers.  After each element dst advances by dst_stride bytes
 * and src by src_stride bytes.
 *
 * src_itemsize and data are not used.  This section is compiled only
 * when NPY_USE_UNALIGNED_ACCESS evaluates to zero.
 */
static void
_cast_bool_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        memmove(dst, out_pair, sizeof(out_pair));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longdouble

/* Ordinary C conversion of the source value to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Contiguous cast of N npy_bool elements into pairs of npy_longdouble.
 *
 * For every element the converted source value is stored in the first
 * npy_longdouble slot of dst and the second slot is set to 0.  src and
 * dst are accessed directly through typed pointers, without memmove
 * staging.  The cursors advance by fixed element sizes,
 * sizeof(npy_clongdouble) for dst and sizeof(npy_bool) for src, so the
 * dst_stride and src_stride arguments are ignored.
 *
 * src_itemsize and data are not used.
 */
static void
_aligned_contig_cast_bool_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE2 *out;
    _TYPE2 first;

    for (; N != 0; --N) {
        /* Fetch and convert the input before any store to dst. */
        first = _CONVERT_FN(*(_TYPE1 *)src);

        out = (_TYPE2 *)dst;
        out[0] = first;
        out[1] = 0;

        src += sizeof(npy_bool);
        dst += sizeof(npy_clongdouble);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_bool
#define _TYPE2 npy_longdouble

/* Ordinary C conversion of the source value to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Contiguous cast of N npy_bool elements into pairs of npy_longdouble,
 * staging every element through local temporaries.
 *
 * Each source element is copied into a local with memmove, converted
 * into the first slot of a two-element local array whose second slot is
 * set to 0, and that array is then copied to dst with memmove.  The
 * cursors advance by fixed element sizes, sizeof(npy_clongdouble) for
 * dst and sizeof(npy_bool) for src, so the dst_stride and src_stride
 * arguments are ignored.
 *
 * src_itemsize and data are not used.  This section is compiled only
 * when NPY_USE_UNALIGNED_ACCESS evaluates to zero.
 */
static void
_contig_cast_bool_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N != 0; --N) {
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        memmove(dst, out_pair, sizeof(out_pair));

        src += sizeof(npy_bool);
        dst += sizeof(npy_clongdouble);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif






#line 671

#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_ubyte
#define _TYPE2 npy_bool

/* Truth test: the result of comparing the value against 0, as npy_bool. */
#define _CONVERT_FN(x) ((npy_bool)((x) != 0))

/*
 * Strided cast of N npy_ubyte elements to npy_bool.
 *
 * Each output element is the result of "source != 0".  src and dst are
 * accessed directly through typed pointers, without memmove staging.
 * After each element dst advances by dst_stride bytes and src by
 * src_stride bytes.
 *
 * src_itemsize and data are not used.
 */
static void
_aligned_cast_ubyte_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 in_elem = *(_TYPE1 *)src;

        *(_TYPE2 *)dst = _CONVERT_FN(in_elem);

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_ubyte
#define _TYPE2 npy_bool

/* Truth test: the result of comparing the value against 0, as npy_bool. */
#define _CONVERT_FN(x) ((npy_bool)((x) != 0))

/*
 * Strided cast of N npy_ubyte elements to npy_bool, staging every
 * element through local temporaries.
 *
 * Each source element is copied into a local with memmove, reduced to
 * the result of "value != 0", and the result is copied to dst with
 * memmove.  After each element dst advances by dst_stride bytes and src
 * by src_stride bytes.
 *
 * src_itemsize and data are not used.  This section is compiled only
 * when NPY_USE_UNALIGNED_ACCESS evaluates to zero.
 */
static void
_cast_ubyte_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_ubyte
#define _TYPE2 npy_bool

/* Truth test: the result of comparing the value against 0, as npy_bool. */
#define _CONVERT_FN(x) ((npy_bool)((x) != 0))

/*
 * Contiguous cast of N npy_ubyte elements to npy_bool.
 *
 * Each output element is the result of "source != 0".  src and dst are
 * accessed directly through typed pointers, without memmove staging.
 * The cursors advance by fixed element sizes, sizeof(npy_bool) for dst
 * and sizeof(npy_ubyte) for src, so the dst_stride and src_stride
 * arguments are ignored.
 *
 * src_itemsize and data are not used.
 */
static void
_aligned_contig_cast_ubyte_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 in_elem = *(_TYPE1 *)src;

        *(_TYPE2 *)dst = _CONVERT_FN(in_elem);

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_ubyte
#define _TYPE2 npy_bool

/* Truth test: the result of comparing the value against 0, as npy_bool. */
#define _CONVERT_FN(x) ((npy_bool)((x) != 0))

/*
 * Contiguous cast of N npy_ubyte elements to npy_bool, staging every
 * element through local temporaries.
 *
 * Each source element is copied into a local with memmove, reduced to
 * the result of "value != 0", and the result is copied to dst with
 * memmove.  The cursors advance by fixed element sizes, sizeof(npy_bool)
 * for dst and sizeof(npy_ubyte) for src, so the dst_stride and
 * src_stride arguments are ignored.
 *
 * src_itemsize and data are not used.  This section is compiled only
 * when NPY_USE_UNALIGNED_ACCESS evaluates to zero.
 */
static void
_contig_cast_ubyte_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_bool);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ubyte

/* Ordinary C conversion of the source value to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Strided cast of N npy_ubyte elements to npy_ubyte.
 *
 * Source and destination types are the same here, so each element is
 * simply read from src and stored to dst.  Both are accessed directly
 * through typed pointers, without memmove staging.  After each element
 * dst advances by dst_stride bytes and src by src_stride bytes.
 *
 * src_itemsize and data are not used.
 */
static void
_aligned_cast_ubyte_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 in_elem = *(_TYPE1 *)src;

        *(_TYPE2 *)dst = _CONVERT_FN(in_elem);

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ubyte

/* Ordinary C conversion of the source value to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Strided cast of N npy_ubyte elements to npy_ubyte, staging every
 * element through local temporaries.
 *
 * Each source element is copied into a local with memmove, converted
 * (a same-type conversion here), and copied to dst with memmove.  After
 * each element dst advances by dst_stride bytes and src by src_stride
 * bytes.
 *
 * src_itemsize and data are not used.  This section is compiled only
 * when NPY_USE_UNALIGNED_ACCESS evaluates to zero.
 */
static void
_cast_ubyte_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Scalar types of this instantiation: _TYPE1 is read, _TYPE2 is written. */
#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ubyte

/* Ordinary C conversion of the source value to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Contiguous cast of N npy_ubyte elements to npy_ubyte.
 *
 * Source and destination types are the same here, so each element is
 * simply read from src and stored to dst.  Both are accessed directly
 * through typed pointers, without memmove staging.  The cursors advance
 * by sizeof(npy_ubyte) each, so the dst_stride and src_stride arguments
 * are ignored.
 *
 * src_itemsize and data are not used.
 */
static void
_aligned_contig_cast_ubyte_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 in_elem = *(_TYPE1 *)src;

        *(_TYPE2 *)dst = _CONVERT_FN(in_elem);

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_ubyte);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Unaligned, contiguous cast loop: npy_ubyte -> npy_ubyte.
 *
 * Only compiled when NPY_USE_UNALIGNED_ACCESS is 0 (see the guard).
 *
 * Each element is copied out of src into a local with memmove,
 * converted, and copied into dst with memmove, so src and dst are
 * never dereferenced as typed pointers.
 *
 * dst_stride and src_stride are accepted but never read: both cursors
 * advance by the element size on every iteration.
 */
static void
_contig_cast_ubyte_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_ubyte out_val;

    while (N-- != 0) {
        /* Stage the input element in a local. */
        memmove(&in_val, src, sizeof(in_val));

        /* Plain C conversion of one element. */
        out_val = (npy_ubyte)in_val;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += sizeof(npy_ubyte);
        src += sizeof(npy_ubyte);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, strided cast loop: npy_ubyte -> npy_ushort.
 *
 * Converts N elements.  For each one the source is read through a
 * npy_ubyte pointer and the result is written through a npy_ushort
 * pointer, with no memmove staging; afterwards dst and src advance by
 * dst_stride and src_stride bytes respectively.
 *
 * The guard above always holds for this variant (its `!1` term is 0).
 */
static void
_aligned_cast_ubyte_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N-- != 0) {
        /* Plain C conversion of one element, stored in place. */
        *(npy_ushort *)dst = (npy_ushort)(*(npy_ubyte *)src);

        /* Strides are byte offsets applied to the char cursors. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Unaligned, strided cast loop: npy_ubyte -> npy_ushort.
 *
 * Only compiled when NPY_USE_UNALIGNED_ACCESS is 0 (see the guard).
 *
 * Each element is copied out of src into a local with memmove,
 * converted, and copied into dst with memmove, so src and dst are
 * never dereferenced as typed pointers.  After each element dst and
 * src advance by dst_stride and src_stride bytes respectively.
 */
static void
_cast_ubyte_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_ushort out_val;

    while (N-- != 0) {
        /* Stage the input element in a local. */
        memmove(&in_val, src, sizeof(in_val));

        /* Plain C conversion of one element. */
        out_val = (npy_ushort)in_val;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, contiguous cast loop: npy_ubyte -> npy_ushort.
 *
 * Converts N consecutive source elements and stores the results in
 * consecutive destination slots.  Elements are loaded and stored by
 * dereferencing typed pointers directly, with no memmove staging.
 *
 * dst_stride and src_stride are accepted but never read: the source
 * cursor advances by sizeof(npy_ubyte) and the destination cursor by
 * sizeof(npy_ushort) per iteration.
 *
 * The guard above always holds for this variant (its `!1` term is 0).
 */
static void
_aligned_contig_cast_ubyte_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort *out = (npy_ushort *)dst;
    npy_ubyte *in = (npy_ubyte *)src;

    while (N-- != 0) {
        /* Plain C conversion of one element. */
        *out++ = (npy_ushort)*in++;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Unaligned, contiguous cast loop: npy_ubyte -> npy_ushort.
 *
 * Only compiled when NPY_USE_UNALIGNED_ACCESS is 0 (see the guard).
 *
 * Each element is copied out of src into a local with memmove,
 * converted, and copied into dst with memmove, so src and dst are
 * never dereferenced as typed pointers.
 *
 * dst_stride and src_stride are accepted but never read: the source
 * cursor advances by sizeof(npy_ubyte) and the destination cursor by
 * sizeof(npy_ushort) per iteration.
 */
static void
_contig_cast_ubyte_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_ushort out_val;

    while (N-- != 0) {
        /* Stage the input element in a local. */
        memmove(&in_val, src, sizeof(in_val));

        /* Plain C conversion of one element. */
        out_val = (npy_ushort)in_val;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += sizeof(npy_ushort);
        src += sizeof(npy_ubyte);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, strided cast loop: npy_ubyte -> npy_uint.
 *
 * Converts N elements.  For each one the source is read through a
 * npy_ubyte pointer and the result is written through a npy_uint
 * pointer, with no memmove staging; afterwards dst and src advance by
 * dst_stride and src_stride bytes respectively.
 *
 * The guard above always holds for this variant (its `!1` term is 0).
 */
static void
_aligned_cast_ubyte_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N-- != 0) {
        /* Plain C conversion of one element, stored in place. */
        *(npy_uint *)dst = (npy_uint)(*(npy_ubyte *)src);

        /* Strides are byte offsets applied to the char cursors. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Unaligned, strided cast loop: npy_ubyte -> npy_uint.
 *
 * Only compiled when NPY_USE_UNALIGNED_ACCESS is 0 (see the guard).
 *
 * Each element is copied out of src into a local with memmove,
 * converted, and copied into dst with memmove, so src and dst are
 * never dereferenced as typed pointers.  After each element dst and
 * src advance by dst_stride and src_stride bytes respectively.
 */
static void
_cast_ubyte_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_uint out_val;

    while (N-- != 0) {
        /* Stage the input element in a local. */
        memmove(&in_val, src, sizeof(in_val));

        /* Plain C conversion of one element. */
        out_val = (npy_uint)in_val;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, contiguous cast loop: npy_ubyte -> npy_uint.
 *
 * Converts N consecutive source elements and stores the results in
 * consecutive destination slots.  Elements are loaded and stored by
 * dereferencing typed pointers directly, with no memmove staging.
 *
 * dst_stride and src_stride are accepted but never read: the source
 * cursor advances by sizeof(npy_ubyte) and the destination cursor by
 * sizeof(npy_uint) per iteration.
 *
 * The guard above always holds for this variant (its `!1` term is 0).
 */
static void
_aligned_contig_cast_ubyte_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_uint *out = (npy_uint *)dst;
    npy_ubyte *in = (npy_ubyte *)src;

    while (N-- != 0) {
        /* Plain C conversion of one element. */
        *out++ = (npy_uint)*in++;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Unaligned, contiguous cast loop: npy_ubyte -> npy_uint.
 *
 * Only compiled when NPY_USE_UNALIGNED_ACCESS is 0 (see the guard).
 *
 * Each element is copied out of src into a local with memmove,
 * converted, and copied into dst with memmove, so src and dst are
 * never dereferenced as typed pointers.
 *
 * dst_stride and src_stride are accepted but never read: the source
 * cursor advances by sizeof(npy_ubyte) and the destination cursor by
 * sizeof(npy_uint) per iteration.
 */
static void
_contig_cast_ubyte_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_uint out_val;

    while (N-- != 0) {
        /* Stage the input element in a local. */
        memmove(&in_val, src, sizeof(in_val));

        /* Plain C conversion of one element. */
        out_val = (npy_uint)in_val;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out_val, sizeof(out_val));

        dst += sizeof(npy_uint);
        src += sizeof(npy_ubyte);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Aligned, strided cast loop: npy_ubyte -> npy_ulong.
 *
 * Converts N elements.  For each one the source is read through a
 * npy_ubyte pointer and the result is written through a npy_ulong
 * pointer, with no memmove staging; afterwards dst and src advance by
 * dst_stride and src_stride bytes respectively.
 *
 * The guard above always holds for this variant (its `!1` term is 0).
 */
static void
_aligned_cast_ubyte_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    while (N-- != 0) {
        /* Plain C conversion of one element, stored in place. */
        *(npy_ulong *)dst = (npy_ulong)(*(npy_ubyte *)src);

        /* Strides are byte offsets applied to the char cursors. */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ulong
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ulong

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop, npy_ubyte -> npy_ulong, without typed dereferences
 * of src or dst.
 *
 *   dst, dst_stride  output pointer and its per-element step in bytes
 *   src, src_stride  input pointer and its per-element step in bytes
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * Each element is copied byte-wise into a local _TYPE1, converted with
 * _CONVERT_FN (a plain C cast to _TYPE2 here), and the result is copied
 * byte-wise out to dst.
 */
static void
_cast_ubyte_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* stage through locals so src/dst are only touched as raw bytes */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ulong
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ulong

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop, npy_ubyte -> npy_ulong, using direct typed loads
 * and stores.
 *
 *   dst, src         output / input pointers
 *   dst_stride       accepted for signature compatibility, not read
 *   src_stride       accepted for signature compatibility, not read
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * After each element dst moves forward by sizeof(npy_ulong) and src by
 * sizeof(npy_ubyte).  _CONVERT_FN is a plain C cast to _TYPE2 here.
 */
static void
_aligned_contig_cast_ubyte_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        out[0] = _CONVERT_FN(in[0]);

        /* fixed item-size steps; the stride arguments are ignored */
        src += sizeof(npy_ubyte);
        dst += sizeof(npy_ulong);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ulong
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ulong

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop, npy_ubyte -> npy_ulong, without typed
 * dereferences of src or dst.
 *
 *   dst, src         output / input pointers
 *   dst_stride       accepted for signature compatibility, not read
 *   src_stride       accepted for signature compatibility, not read
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * Each element is copied byte-wise into a local _TYPE1, converted with
 * _CONVERT_FN (a plain C cast to _TYPE2 here) and copied byte-wise out.
 * dst then moves forward by sizeof(npy_ulong), src by sizeof(npy_ubyte).
 */
static void
_contig_cast_ubyte_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        /* fixed item-size steps; the stride arguments are ignored */
        src += sizeof(npy_ubyte);
        dst += sizeof(npy_ulong);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ulonglong
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ulonglong

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop, npy_ubyte -> npy_ulonglong, using direct typed loads
 * and stores.
 *
 *   dst, dst_stride  output pointer and its per-element step in bytes
 *   src, src_stride  input pointer and its per-element step in bytes
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * src and dst are dereferenced as _TYPE1 / _TYPE2 pointers without any
 * staging copy; per the "_aligned" name the caller is presumably expected
 * to pass suitably aligned pointers.  _CONVERT_FN is a plain C cast to
 * _TYPE2 here.
 */
static void
_aligned_cast_ubyte_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        out[0] = _CONVERT_FN(in[0]);

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ulonglong
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ulonglong

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop, npy_ubyte -> npy_ulonglong, without typed
 * dereferences of src or dst.
 *
 *   dst, dst_stride  output pointer and its per-element step in bytes
 *   src, src_stride  input pointer and its per-element step in bytes
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * Each element is copied byte-wise into a local _TYPE1, converted with
 * _CONVERT_FN (a plain C cast to _TYPE2 here), and the result is copied
 * byte-wise out to dst.
 */
static void
_cast_ubyte_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* stage through locals so src/dst are only touched as raw bytes */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ulonglong
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ulonglong

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop, npy_ubyte -> npy_ulonglong, using direct typed
 * loads and stores.
 *
 *   dst, src         output / input pointers
 *   dst_stride       accepted for signature compatibility, not read
 *   src_stride       accepted for signature compatibility, not read
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * After each element dst moves forward by sizeof(npy_ulonglong) and src
 * by sizeof(npy_ubyte).  _CONVERT_FN is a plain C cast to _TYPE2 here.
 */
static void
_aligned_contig_cast_ubyte_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        out[0] = _CONVERT_FN(in[0]);

        /* fixed item-size steps; the stride arguments are ignored */
        src += sizeof(npy_ubyte);
        dst += sizeof(npy_ulonglong);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ulonglong
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_ulonglong

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop, npy_ubyte -> npy_ulonglong, without typed
 * dereferences of src or dst.
 *
 *   dst, src         output / input pointers
 *   dst_stride       accepted for signature compatibility, not read
 *   src_stride       accepted for signature compatibility, not read
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * Each element is copied byte-wise into a local _TYPE1, converted with
 * _CONVERT_FN (a plain C cast to _TYPE2 here) and copied byte-wise out.
 * dst then moves forward by sizeof(npy_ulonglong), src by
 * sizeof(npy_ubyte).
 */
static void
_contig_cast_ubyte_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        /* fixed item-size steps; the stride arguments are ignored */
        src += sizeof(npy_ubyte);
        dst += sizeof(npy_ulonglong);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_byte
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_byte

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop, npy_ubyte -> npy_byte, using direct typed loads and
 * stores.
 *
 *   dst, dst_stride  output pointer and its per-element step in bytes
 *   src, src_stride  input pointer and its per-element step in bytes
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * src and dst are dereferenced as _TYPE1 / _TYPE2 pointers without any
 * staging copy.  _CONVERT_FN is a plain C cast to _TYPE2 here.
 */
static void
_aligned_cast_ubyte_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        out[0] = _CONVERT_FN(in[0]);

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_byte
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_byte

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop, npy_ubyte -> npy_byte, without typed dereferences of
 * src or dst.
 *
 *   dst, dst_stride  output pointer and its per-element step in bytes
 *   src, src_stride  input pointer and its per-element step in bytes
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * Each element is copied byte-wise into a local _TYPE1, converted with
 * _CONVERT_FN (a plain C cast to _TYPE2 here), and the result is copied
 * byte-wise out to dst.
 */
static void
_cast_ubyte_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* stage through locals so src/dst are only touched as raw bytes */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_byte
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_byte

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop, npy_ubyte -> npy_byte, using direct typed loads
 * and stores.
 *
 *   dst, src         output / input pointers
 *   dst_stride       accepted for signature compatibility, not read
 *   src_stride       accepted for signature compatibility, not read
 *   N                number of elements to convert
 *   src_itemsize     unused
 *   data             unused
 *
 * After each element dst moves forward by sizeof(npy_byte) and src by
 * sizeof(npy_ubyte).  _CONVERT_FN is a plain C cast to _TYPE2 here.
 */
static void
_aligned_contig_cast_ubyte_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        out[0] = _CONVERT_FN(in[0]);

        /* fixed item-size steps; the stride arguments are ignored */
        src += sizeof(npy_ubyte);
        dst += sizeof(npy_byte);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Convert N consecutive npy_ubyte elements at src into npy_byte elements
 * at dst.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    not read; the pointers advance by the element sizes
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Each element is copied through a local temporary with memmove(), so
 * neither buffer is dereferenced through a typed pointer.
 */
static void
_contig_cast_ubyte_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in;
    npy_byte out;

    for (; N != 0; N--) {
        /* load one source element */
        memmove(&in, src, sizeof(in));

        /* plain C cast between the two integer types */
        out = (npy_byte)in;

        /* store the converted element */
        memmove(dst, &out, sizeof(out));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_byte);
    }
}

#endif




#line 700

#line 706

/*
 * Convert N npy_ubyte elements read from src into npy_short elements
 * written to dst, stepping each pointer by its own byte stride.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    byte offsets added to dst / src after every element
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Both buffers are accessed directly through typed pointers.
 * Compiled regardless of the value of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_cast_ubyte_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; N--) {
        /* read first, then write, one element at a time */
        const npy_ubyte in = *(npy_ubyte *)src;

        *(npy_short *)dst = (npy_short)in;

        src += src_stride;
        dst += dst_stride;
    }
}


#line 706

#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Convert N npy_ubyte elements read from src into npy_short elements
 * written to dst, stepping each pointer by its own byte stride.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    byte offsets added to dst / src after every element
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Each element is copied through a local temporary with memmove(), so
 * neither buffer is dereferenced through a typed pointer.
 */
static void
_cast_ubyte_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in;
    npy_short out;

    for (; N != 0; N--) {
        /* load one source element */
        memmove(&in, src, sizeof(in));

        /* plain C cast between the two integer types */
        out = (npy_short)in;

        /* store the converted element */
        memmove(dst, &out, sizeof(out));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

/*
 * Convert N consecutive npy_ubyte elements at src into consecutive
 * npy_short elements at dst.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    not read; the pointers advance by the element sizes
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Both buffers are accessed directly through typed pointers.
 * Compiled regardless of the value of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_contig_cast_ubyte_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; N--) {
        /* read first, then write, one element at a time */
        const npy_ubyte in = *(npy_ubyte *)src;

        *(npy_short *)dst = (npy_short)in;

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_short);
    }
}


#line 706

#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Convert N consecutive npy_ubyte elements at src into consecutive
 * npy_short elements at dst.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    not read; the pointers advance by the element sizes
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Each element is copied through a local temporary with memmove(), so
 * neither buffer is dereferenced through a typed pointer.
 */
static void
_contig_cast_ubyte_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in;
    npy_short out;

    for (; N != 0; N--) {
        /* load one source element */
        memmove(&in, src, sizeof(in));

        /* plain C cast between the two integer types */
        out = (npy_short)in;

        /* store the converted element */
        memmove(dst, &out, sizeof(out));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_short);
    }
}

#endif




#line 700

#line 706

/*
 * Convert N npy_ubyte elements read from src into npy_int elements
 * written to dst, stepping each pointer by its own byte stride.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    byte offsets added to dst / src after every element
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Both buffers are accessed directly through typed pointers.
 * Compiled regardless of the value of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_cast_ubyte_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; N--) {
        /* read first, then write, one element at a time */
        const npy_ubyte in = *(npy_ubyte *)src;

        *(npy_int *)dst = (npy_int)in;

        src += src_stride;
        dst += dst_stride;
    }
}


#line 706

#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Convert N npy_ubyte elements read from src into npy_int elements
 * written to dst, stepping each pointer by its own byte stride.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    byte offsets added to dst / src after every element
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Each element is copied through a local temporary with memmove(), so
 * neither buffer is dereferenced through a typed pointer.
 */
static void
_cast_ubyte_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in;
    npy_int out;

    for (; N != 0; N--) {
        /* load one source element */
        memmove(&in, src, sizeof(in));

        /* plain C cast between the two integer types */
        out = (npy_int)in;

        /* store the converted element */
        memmove(dst, &out, sizeof(out));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

/*
 * Convert N consecutive npy_ubyte elements at src into consecutive
 * npy_int elements at dst.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    not read; the pointers advance by the element sizes
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Both buffers are accessed directly through typed pointers.
 * Compiled regardless of the value of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_contig_cast_ubyte_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; N--) {
        /* read first, then write, one element at a time */
        const npy_ubyte in = *(npy_ubyte *)src;

        *(npy_int *)dst = (npy_int)in;

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_int);
    }
}


#line 706

#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Convert N consecutive npy_ubyte elements at src into consecutive
 * npy_int elements at dst.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    not read; the pointers advance by the element sizes
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Each element is copied through a local temporary with memmove(), so
 * neither buffer is dereferenced through a typed pointer.
 */
static void
_contig_cast_ubyte_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in;
    npy_int out;

    for (; N != 0; N--) {
        /* load one source element */
        memmove(&in, src, sizeof(in));

        /* plain C cast between the two integer types */
        out = (npy_int)in;

        /* store the converted element */
        memmove(dst, &out, sizeof(out));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_int);
    }
}

#endif




#line 700

#line 706

/*
 * Convert N npy_ubyte elements read from src into npy_long elements
 * written to dst, stepping each pointer by its own byte stride.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    byte offsets added to dst / src after every element
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Both buffers are accessed directly through typed pointers.
 * Compiled regardless of the value of NPY_USE_UNALIGNED_ACCESS.
 */
static void
_aligned_cast_ubyte_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; N--) {
        /* read first, then write, one element at a time */
        const npy_ubyte in = *(npy_ubyte *)src;

        *(npy_long *)dst = (npy_long)in;

        src += src_stride;
        dst += dst_stride;
    }
}


#line 706

#if !NPY_USE_UNALIGNED_ACCESS

/*
 * Convert N npy_ubyte elements read from src into npy_long elements
 * written to dst, stepping each pointer by its own byte stride.
 *
 * dst, src      byte pointers to the first destination / source element
 * dst_stride,
 * src_stride    byte offsets added to dst / src after every element
 * N             number of elements to convert
 * src_itemsize,
 * data          unused
 *
 * Each element is copied through a local temporary with memmove(), so
 * neither buffer is dereferenced through a typed pointer.
 */
static void
_cast_ubyte_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in;
    npy_long out;

    for (; N != 0; N--) {
        /* load one source element */
        memmove(&in, src, sizeof(in));

        /* plain C cast between the two integer types */
        out = (npy_long)in;

        /* store the converted element */
        memmove(dst, &out, sizeof(out));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)
/* The `!1` term is 0, so this guard holds for either flag value. */

/*
 * Contiguous cast loop: npy_ubyte -> npy_long, direct typed access.
 *
 * Converts N elements with a plain C cast. Source and destination are
 * read and written through typed pointers, and each pointer moves forward
 * by exactly one element per iteration; dst_stride and src_stride are not
 * consulted.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_contig_cast_ubyte_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte *in = (npy_ubyte *)src;
    npy_long *out = (npy_long *)dst;

    for (; N != 0; --N) {
        *out++ = (npy_long)*in++;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)
/* Compiled only while NPY_USE_UNALIGNED_ACCESS evaluates to 0. */

/*
 * Contiguous cast loop: npy_ubyte -> npy_long, byte-wise access.
 *
 * Converts N elements with a plain C cast. Each element is moved through
 * local temporaries with memmove instead of being dereferenced through a
 * typed pointer. The pointers advance by the element sizes
 * (sizeof(npy_ubyte) / sizeof(npy_long)); dst_stride and src_stride are
 * not consulted.
 *
 * src_itemsize and data are unused.
 */
static void
_contig_cast_ubyte_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        npy_ubyte in_val;
        npy_long out_val;

        memmove(&in_val, src, sizeof(in_val));
        out_val = (npy_long)in_val;
        memmove(dst, &out_val, sizeof(out_val));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_long);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)
/* The `!1` term is 0, so this guard holds for either flag value. */

/*
 * Strided cast loop: npy_ubyte -> npy_longlong, direct typed access.
 *
 * Converts N elements with a plain C cast. Each element is read through a
 * npy_ubyte pointer and written through a npy_longlong pointer; afterwards
 * `src` advances by src_stride bytes and `dst` by dst_stride bytes.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_cast_ubyte_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        npy_ubyte *in = (npy_ubyte *)src;
        npy_longlong *out = (npy_longlong *)dst;

        *out = (npy_longlong)*in;

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)
/* Compiled only while NPY_USE_UNALIGNED_ACCESS evaluates to 0. */

/*
 * Strided cast loop: npy_ubyte -> npy_longlong, byte-wise access.
 *
 * Converts N elements with a plain C cast. Each element is moved through
 * local temporaries with memmove instead of being dereferenced through a
 * typed pointer. After each element `src` advances by src_stride bytes
 * and `dst` by dst_stride bytes.
 *
 * src_itemsize and data are unused.
 */
static void
_cast_ubyte_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        npy_ubyte in_val;
        npy_longlong out_val;

        memmove(&in_val, src, sizeof(in_val));
        out_val = (npy_longlong)in_val;
        memmove(dst, &out_val, sizeof(out_val));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)
/* The `!1` term is 0, so this guard holds for either flag value. */

/*
 * Contiguous cast loop: npy_ubyte -> npy_longlong, direct typed access.
 *
 * Converts N elements with a plain C cast. Source and destination are
 * read and written through typed pointers, and each pointer moves forward
 * by exactly one element per iteration; dst_stride and src_stride are not
 * consulted.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_contig_cast_ubyte_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte *in = (npy_ubyte *)src;
    npy_longlong *out = (npy_longlong *)dst;

    for (; N != 0; --N) {
        *out++ = (npy_longlong)*in++;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)
/* Compiled only while NPY_USE_UNALIGNED_ACCESS evaluates to 0. */

/*
 * Contiguous cast loop: npy_ubyte -> npy_longlong, byte-wise access.
 *
 * Converts N elements with a plain C cast. Each element is moved through
 * local temporaries with memmove instead of being dereferenced through a
 * typed pointer. The pointers advance by the element sizes
 * (sizeof(npy_ubyte) / sizeof(npy_longlong)); dst_stride and src_stride
 * are not consulted.
 *
 * src_itemsize and data are unused.
 */
static void
_contig_cast_ubyte_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        npy_ubyte in_val;
        npy_longlong out_val;

        memmove(&in_val, src, sizeof(in_val));
        out_val = (npy_longlong)in_val;
        memmove(dst, &out_val, sizeof(out_val));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_longlong);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)
/* The `!1` term is 0, so this guard holds for either flag value. */

/*
 * Strided cast loop: npy_ubyte -> npy_half, direct typed access.
 *
 * Converts N elements. Each source value is cast to float and handed to
 * npy_float_to_half(); the result is stored through a npy_half pointer.
 * After each element `src` advances by src_stride bytes and `dst` by
 * dst_stride bytes.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_cast_ubyte_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        npy_ubyte *in = (npy_ubyte *)src;
        npy_half *out = (npy_half *)dst;

        *out = npy_float_to_half((float)*in);

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)
/* Compiled only while NPY_USE_UNALIGNED_ACCESS evaluates to 0. */

/*
 * Strided cast loop: npy_ubyte -> npy_half, byte-wise access.
 *
 * Converts N elements. Each source element is copied into a local with
 * memmove, cast to float, converted with npy_float_to_half(), and the
 * result copied into `dst` with memmove. After each element `src`
 * advances by src_stride bytes and `dst` by dst_stride bytes.
 *
 * src_itemsize and data are unused.
 */
static void
_cast_ubyte_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        npy_ubyte in_val;
        npy_half out_val;

        memmove(&in_val, src, sizeof(in_val));
        out_val = npy_float_to_half((float)in_val);
        memmove(dst, &out_val, sizeof(out_val));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)
/* The `!1` term is 0, so this guard holds for either flag value. */

/*
 * Contiguous cast loop: npy_ubyte -> npy_half, direct typed access.
 *
 * Converts N elements. Each source value is cast to float and handed to
 * npy_float_to_half(). Source and destination are accessed through typed
 * pointers that move forward by exactly one element per iteration;
 * dst_stride and src_stride are not consulted.
 *
 * src_itemsize and data are unused.
 */
static void
_aligned_contig_cast_ubyte_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte *in = (npy_ubyte *)src;
    npy_half *out = (npy_half *)dst;

    for (; N != 0; --N) {
        *out++ = npy_float_to_half((float)*in++);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)
/* Compiled only while NPY_USE_UNALIGNED_ACCESS evaluates to 0. */

/*
 * Contiguous cast loop: npy_ubyte -> npy_half, byte-wise access.
 *
 * Converts N elements. Each source element is copied into a local with
 * memmove, cast to float, converted with npy_float_to_half(), and the
 * result copied into `dst` with memmove. The pointers advance by the
 * element sizes (sizeof(npy_ubyte) / sizeof(npy_half)); dst_stride and
 * src_stride are not consulted.
 *
 * src_itemsize and data are unused.
 */
static void
_contig_cast_ubyte_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        npy_ubyte in_val;
        npy_half out_val;

        memmove(&in_val, src, sizeof(in_val));
        out_val = npy_float_to_half((float)in_val);
        memmove(dst, &out_val, sizeof(out_val));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_half);
    }
}

#endif




#line 700

#line 706

/*
 * Aligned, strided cast loop: npy_ubyte -> npy_float.
 *
 * For each of the N elements, loads one npy_ubyte through `src`, converts
 * it with a plain C cast and stores the resulting npy_float through `dst`.
 * After every element `src` advances by `src_stride` bytes and `dst` by
 * `dst_stride` bytes.
 *
 * Both pointers are dereferenced directly as their element type, so `dst`
 * has to be aligned for npy_float (this is the `_aligned_` variant).
 * `src_itemsize` and `data` are not used.
 *
 * The template guard for aligned variants,
 * !(NPY_USE_UNALIGNED_ACCESS && !1), holds for every value of the flag,
 * so this function is compiled unconditionally.
 */
static void
_aligned_cast_ubyte_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        const npy_ubyte in_elem = *(const npy_ubyte *)src;

        *(npy_float *)dst = (npy_float)in_elem;

        src += src_stride;
        dst += dst_stride;
    }
}


#line 706

/* The unaligned variants are only built when direct unaligned access is off */
#if !(NPY_USE_UNALIGNED_ACCESS)

/*
 * Unaligned, strided cast loop: npy_ubyte -> npy_float.
 *
 * For each of the N elements, copies one npy_ubyte out of `src` into a
 * local, converts it with a plain C cast and copies the resulting
 * npy_float into `dst`.  After every element `src` advances by
 * `src_stride` bytes and `dst` by `dst_stride` bytes.
 *
 * All memory traffic goes through memmove() on local temporaries, so
 * neither `src` nor `dst` is ever dereferenced as its element type.
 * `src_itemsize` and `data` are not used.
 */
static void
_cast_ubyte_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_elem;
    npy_float out_elem;
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = (npy_float)in_elem;
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

/*
 * Aligned, contiguous cast loop: npy_ubyte -> npy_float.
 *
 * For each of the N elements, loads one npy_ubyte through `src`, converts
 * it with a plain C cast and stores the resulting npy_float through `dst`.
 * The pointers then step by the element sizes, sizeof(npy_ubyte) and
 * sizeof(npy_float); `dst_stride` and `src_stride` are accepted but never
 * read.
 *
 * Both pointers are dereferenced directly as their element type, so `dst`
 * has to be aligned for npy_float (this is the `_aligned_` variant).
 * `src_itemsize` and `data` are not used.
 *
 * The template guard for aligned variants,
 * !(NPY_USE_UNALIGNED_ACCESS && !1), holds for every value of the flag,
 * so this function is compiled unconditionally.
 */
static void
_aligned_contig_cast_ubyte_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        const npy_ubyte in_elem = *(const npy_ubyte *)src;

        *(npy_float *)dst = (npy_float)in_elem;

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_float);
    }
}


#line 706

/* The unaligned variants are only built when direct unaligned access is off */
#if !(NPY_USE_UNALIGNED_ACCESS)

/*
 * Unaligned, contiguous cast loop: npy_ubyte -> npy_float.
 *
 * For each of the N elements, copies one npy_ubyte out of `src` into a
 * local, converts it with a plain C cast and copies the resulting
 * npy_float into `dst`.  The pointers then step by the element sizes,
 * sizeof(npy_ubyte) and sizeof(npy_float); `dst_stride` and `src_stride`
 * are accepted but never read.
 *
 * All memory traffic goes through memmove() on local temporaries, so
 * neither `src` nor `dst` is ever dereferenced as its element type.
 * `src_itemsize` and `data` are not used.
 */
static void
_contig_cast_ubyte_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_elem;
    npy_float out_elem;
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = (npy_float)in_elem;
        memmove(dst, &out_elem, sizeof(out_elem));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_float);
    }
}

#endif




#line 700

#line 706

/*
 * Aligned, strided cast loop: npy_ubyte -> npy_double.
 *
 * For each of the N elements, loads one npy_ubyte through `src`, converts
 * it with a plain C cast and stores the resulting npy_double through
 * `dst`.  After every element `src` advances by `src_stride` bytes and
 * `dst` by `dst_stride` bytes.
 *
 * Both pointers are dereferenced directly as their element type, so `dst`
 * has to be aligned for npy_double (this is the `_aligned_` variant).
 * `src_itemsize` and `data` are not used.
 *
 * The template guard for aligned variants,
 * !(NPY_USE_UNALIGNED_ACCESS && !1), holds for every value of the flag,
 * so this function is compiled unconditionally.
 */
static void
_aligned_cast_ubyte_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        const npy_ubyte in_elem = *(const npy_ubyte *)src;

        *(npy_double *)dst = (npy_double)in_elem;

        src += src_stride;
        dst += dst_stride;
    }
}


#line 706

/* The unaligned variants are only built when direct unaligned access is off */
#if !(NPY_USE_UNALIGNED_ACCESS)

/*
 * Unaligned, strided cast loop: npy_ubyte -> npy_double.
 *
 * For each of the N elements, copies one npy_ubyte out of `src` into a
 * local, converts it with a plain C cast and copies the resulting
 * npy_double into `dst`.  After every element `src` advances by
 * `src_stride` bytes and `dst` by `dst_stride` bytes.
 *
 * All memory traffic goes through memmove() on local temporaries, so
 * neither `src` nor `dst` is ever dereferenced as its element type.
 * `src_itemsize` and `data` are not used.
 */
static void
_cast_ubyte_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_elem;
    npy_double out_elem;
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = (npy_double)in_elem;
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

/*
 * Aligned, contiguous cast loop: npy_ubyte -> npy_double.
 *
 * For each of the N elements, loads one npy_ubyte through `src`, converts
 * it with a plain C cast and stores the resulting npy_double through
 * `dst`.  The pointers then step by the element sizes, sizeof(npy_ubyte)
 * and sizeof(npy_double); `dst_stride` and `src_stride` are accepted but
 * never read.
 *
 * Both pointers are dereferenced directly as their element type, so `dst`
 * has to be aligned for npy_double (this is the `_aligned_` variant).
 * `src_itemsize` and `data` are not used.
 *
 * The template guard for aligned variants,
 * !(NPY_USE_UNALIGNED_ACCESS && !1), holds for every value of the flag,
 * so this function is compiled unconditionally.
 */
static void
_aligned_contig_cast_ubyte_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        const npy_ubyte in_elem = *(const npy_ubyte *)src;

        *(npy_double *)dst = (npy_double)in_elem;

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_double);
    }
}


#line 706

/* The unaligned variants are only built when direct unaligned access is off */
#if !(NPY_USE_UNALIGNED_ACCESS)

/*
 * Unaligned, contiguous cast loop: npy_ubyte -> npy_double.
 *
 * For each of the N elements, copies one npy_ubyte out of `src` into a
 * local, converts it with a plain C cast and copies the resulting
 * npy_double into `dst`.  The pointers then step by the element sizes,
 * sizeof(npy_ubyte) and sizeof(npy_double); `dst_stride` and `src_stride`
 * are accepted but never read.
 *
 * All memory traffic goes through memmove() on local temporaries, so
 * neither `src` nor `dst` is ever dereferenced as its element type.
 * `src_itemsize` and `data` are not used.
 */
static void
_contig_cast_ubyte_to_double(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_elem;
    npy_double out_elem;
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = (npy_double)in_elem;
        memmove(dst, &out_elem, sizeof(out_elem));

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_double);
    }
}

#endif




#line 700

#line 706

/*
 * Aligned, strided cast loop: npy_ubyte -> npy_longdouble.
 *
 * For each of the N elements, loads one npy_ubyte through `src`, converts
 * it with a plain C cast and stores the resulting npy_longdouble through
 * `dst`.  After every element `src` advances by `src_stride` bytes and
 * `dst` by `dst_stride` bytes.
 *
 * Both pointers are dereferenced directly as their element type, so `dst`
 * has to be aligned for npy_longdouble (this is the `_aligned_` variant).
 * `src_itemsize` and `data` are not used.
 *
 * The template guard for aligned variants,
 * !(NPY_USE_UNALIGNED_ACCESS && !1), holds for every value of the flag,
 * so this function is compiled unconditionally.
 */
static void
_aligned_cast_ubyte_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        const npy_ubyte in_elem = *(const npy_ubyte *)src;

        *(npy_longdouble *)dst = (npy_longdouble)in_elem;

        src += src_stride;
        dst += dst_stride;
    }
}


#line 706

/* The unaligned variants are only built when direct unaligned access is off */
#if !(NPY_USE_UNALIGNED_ACCESS)

/*
 * Unaligned, strided cast loop: npy_ubyte -> npy_longdouble.
 *
 * For each of the N elements, copies one npy_ubyte out of `src` into a
 * local, converts it with a plain C cast and copies the resulting
 * npy_longdouble into `dst`.  After every element `src` advances by
 * `src_stride` bytes and `dst` by `dst_stride` bytes.
 *
 * All memory traffic goes through memmove() on local temporaries, so
 * neither `src` nor `dst` is ever dereferenced as its element type.
 * `src_itemsize` and `data` are not used.
 */
static void
_cast_ubyte_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_elem;
    npy_longdouble out_elem;
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = (npy_longdouble)in_elem;
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#endif


#line 706

/*
 * Aligned, contiguous cast loop: npy_ubyte -> npy_longdouble.
 *
 * For each of the N elements, loads one npy_ubyte through `src`, converts
 * it with a plain C cast and stores the resulting npy_longdouble through
 * `dst`.  The pointers then step by the element sizes, sizeof(npy_ubyte)
 * and sizeof(npy_longdouble); `dst_stride` and `src_stride` are accepted
 * but never read.
 *
 * Both pointers are dereferenced directly as their element type, so `dst`
 * has to be aligned for npy_longdouble (this is the `_aligned_` variant).
 * `src_itemsize` and `data` are not used.
 *
 * The template guard for aligned variants,
 * !(NPY_USE_UNALIGNED_ACCESS && !1), holds for every value of the flag,
 * so this function is compiled unconditionally.
 */
static void
_aligned_contig_cast_ubyte_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_intp remaining;

    for (remaining = N; remaining != 0; --remaining) {
        const npy_ubyte in_elem = *(const npy_ubyte *)src;

        *(npy_longdouble *)dst = (npy_longdouble)in_elem;

        src += sizeof(npy_ubyte);
        dst += sizeof(npy_longdouble);
    }
}


#line 706

/* Emitted only while NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Convert N npy_ubyte elements into npy_longdouble elements.
 *
 * Every element is staged through a local with memmove, so neither buffer
 * is dereferenced through a typed pointer.  The cursors advance by the
 * element sizes; dst_stride and src_stride are not consulted.
 *
 *   dst - receives N npy_longdouble values, packed back to back
 *   src - supplies N npy_ubyte values, packed back to back
 *   N   - number of elements to convert
 */
static void
_contig_cast_ubyte_to_longdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_longdouble out_val;

    for (; N != 0; --N) {
        memmove(&in_val, src, sizeof(in_val));
        out_val = (npy_longdouble)in_val;
        memmove(dst, &out_val, sizeof(out_val));

        dst += sizeof(npy_longdouble);
        src += sizeof(npy_ubyte);
    }
}

#endif




#line 700

#line 706

/* This guard is always true: the "&& !1" term makes the inner test false. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Convert N npy_ubyte elements into pairs of npy_float values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cfloat result).  Both buffers are
 * accessed through typed pointers, so the caller must supply addresses
 * that are valid for such accesses.
 *
 *   dst, dst_stride - destination cursor and its per-element byte step
 *   src, src_stride - source cursor and its per-element byte step
 *   N               - number of elements to convert
 */
static void
_aligned_cast_ubyte_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Read the source before touching the destination. */
        const npy_float first = (npy_float)*(npy_ubyte *)src;
        npy_float *out = (npy_float *)dst;

        out[0] = first;
        out[1] = 0;

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* Emitted only while NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Convert N npy_ubyte elements into pairs of npy_float values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cfloat result).  Data is staged
 * through locals with memmove, so neither buffer is dereferenced through
 * a typed pointer.
 *
 *   dst, dst_stride - destination cursor and its per-element byte step
 *   src, src_stride - source cursor and its per-element byte step
 *   N               - number of elements to convert
 */
static void
_cast_ubyte_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_float out_pair[2];

    for (; N != 0; --N) {
        memmove(&in_val, src, sizeof(in_val));

        out_pair[0] = (npy_float)in_val;
        out_pair[1] = 0;

        memmove(dst, out_pair, sizeof(out_pair));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* This guard is always true: the "&& !1" term makes the inner test false. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Convert N npy_ubyte elements into pairs of npy_float values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cfloat result).  Both buffers are
 * accessed through typed pointers, so the caller must supply addresses
 * that are valid for such accesses.  The cursors advance by
 * sizeof(npy_cfloat) and sizeof(npy_ubyte); dst_stride and src_stride are
 * not consulted.
 *
 *   dst - receives N packed results
 *   src - supplies N packed npy_ubyte values
 *   N   - number of elements to convert
 */
static void
_aligned_contig_cast_ubyte_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Read the source before touching the destination. */
        const npy_float first = (npy_float)*(npy_ubyte *)src;
        npy_float *out = (npy_float *)dst;

        out[0] = first;
        out[1] = 0;

        dst += sizeof(npy_cfloat);
        src += sizeof(npy_ubyte);
    }
}

#endif


#line 706

/* Emitted only while NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Convert N npy_ubyte elements into pairs of npy_float values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cfloat result).  Data is staged
 * through locals with memmove, so neither buffer is dereferenced through
 * a typed pointer.  The cursors advance by sizeof(npy_cfloat) and
 * sizeof(npy_ubyte); dst_stride and src_stride are not consulted.
 *
 *   dst - receives N packed results
 *   src - supplies N packed npy_ubyte values
 *   N   - number of elements to convert
 */
static void
_contig_cast_ubyte_to_cfloat(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_float out_pair[2];

    for (; N != 0; --N) {
        memmove(&in_val, src, sizeof(in_val));

        out_pair[0] = (npy_float)in_val;
        out_pair[1] = 0;

        memmove(dst, out_pair, sizeof(out_pair));

        dst += sizeof(npy_cfloat);
        src += sizeof(npy_ubyte);
    }
}

#endif




#line 700

#line 706

/* This guard is always true: the "&& !1" term makes the inner test false. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Convert N npy_ubyte elements into pairs of npy_double values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cdouble result).  Both buffers are
 * accessed through typed pointers, so the caller must supply addresses
 * that are valid for such accesses.
 *
 *   dst, dst_stride - destination cursor and its per-element byte step
 *   src, src_stride - source cursor and its per-element byte step
 *   N               - number of elements to convert
 */
static void
_aligned_cast_ubyte_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Read the source before touching the destination. */
        const npy_double first = (npy_double)*(npy_ubyte *)src;
        npy_double *out = (npy_double *)dst;

        out[0] = first;
        out[1] = 0;

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* Emitted only while NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Convert N npy_ubyte elements into pairs of npy_double values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cdouble result).  Data is staged
 * through locals with memmove, so neither buffer is dereferenced through
 * a typed pointer.
 *
 *   dst, dst_stride - destination cursor and its per-element byte step
 *   src, src_stride - source cursor and its per-element byte step
 *   N               - number of elements to convert
 */
static void
_cast_ubyte_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_double out_pair[2];

    for (; N != 0; --N) {
        memmove(&in_val, src, sizeof(in_val));

        out_pair[0] = (npy_double)in_val;
        out_pair[1] = 0;

        memmove(dst, out_pair, sizeof(out_pair));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* This guard is always true: the "&& !1" term makes the inner test false. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Convert N npy_ubyte elements into pairs of npy_double values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cdouble result).  Both buffers are
 * accessed through typed pointers, so the caller must supply addresses
 * that are valid for such accesses.  The cursors advance by
 * sizeof(npy_cdouble) and sizeof(npy_ubyte); dst_stride and src_stride
 * are not consulted.
 *
 *   dst - receives N packed results
 *   src - supplies N packed npy_ubyte values
 *   N   - number of elements to convert
 */
static void
_aligned_contig_cast_ubyte_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Read the source before touching the destination. */
        const npy_double first = (npy_double)*(npy_ubyte *)src;
        npy_double *out = (npy_double *)dst;

        out[0] = first;
        out[1] = 0;

        dst += sizeof(npy_cdouble);
        src += sizeof(npy_ubyte);
    }
}

#endif


#line 706

/* Emitted only while NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Convert N npy_ubyte elements into pairs of npy_double values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the cdouble result).  Data is staged
 * through locals with memmove, so neither buffer is dereferenced through
 * a typed pointer.  The cursors advance by sizeof(npy_cdouble) and
 * sizeof(npy_ubyte); dst_stride and src_stride are not consulted.
 *
 *   dst - receives N packed results
 *   src - supplies N packed npy_ubyte values
 *   N   - number of elements to convert
 */
static void
_contig_cast_ubyte_to_cdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ubyte in_val;
    npy_double out_pair[2];

    for (; N != 0; --N) {
        memmove(&in_val, src, sizeof(in_val));

        out_pair[0] = (npy_double)in_val;
        out_pair[1] = 0;

        memmove(dst, out_pair, sizeof(out_pair));

        dst += sizeof(npy_cdouble);
        src += sizeof(npy_ubyte);
    }
}

#endif




#line 700

#line 706

/* This guard is always true: the "&& !1" term makes the inner test false. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Convert N npy_ubyte elements into pairs of npy_longdouble values.
 *
 * For each element the converted source value is stored first and a zero
 * second (the two components of the clongdouble result).  Both buffers
 * are accessed through typed pointers, so the caller must supply
 * addresses that are valid for such accesses.
 *
 *   dst, dst_stride - destination cursor and its per-element byte step
 *   src, src_stride - source cursor and its per-element byte step
 *   N               - number of elements to convert
 */
static void
_aligned_cast_ubyte_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Read the source before touching the destination. */
        const npy_longdouble first = (npy_longdouble)*(npy_ubyte *)src;
        npy_longdouble *out = (npy_longdouble *)dst;

        out[0] = first;
        out[1] = 0;

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_longdouble
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_longdouble

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop that moves data with memmove instead of typed
 * dereferences, so no particular alignment of src or dst is required.
 *
 * For each of the N elements, one _TYPE1 (npy_ubyte) value is copied out of
 * src, converted with _CONVERT_FN (a C cast to _TYPE2, npy_longdouble), and
 * written to dst as a pair of _TYPE2 values whose second member is 0
 * (judging by the function name, the imaginary part of the complex result).
 *
 * dst, src               - current element pointers.
 * dst_stride, src_stride - byte offsets added to dst / src per element.
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_cast_ubyte_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N-- != 0; dst += dst_stride, src += src_stride) {
        /* Byte-wise load into a properly typed local. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        /* Byte-wise store of both components at once. */
        memmove(dst, out_pair, sizeof(out_pair));
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_longdouble
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_longdouble

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop that dereferences src and dst through typed
 * pointers.
 *
 * For each of the N elements, one _TYPE1 (npy_ubyte) value is read from
 * src, converted with _CONVERT_FN (a C cast to _TYPE2, npy_longdouble), and
 * stored as the first of two consecutive _TYPE2 values at dst; the second
 * value is set to 0 (judging by the function name, the imaginary part).
 *
 * dst, src               - current element pointers; because they are
 *                          dereferenced as _TYPE2* / _TYPE1*, they must be
 *                          suitably aligned for those types.
 * dst_stride, src_stride - not read by this variant; the pointers advance
 *                          by sizeof(npy_clongdouble) and sizeof(npy_ubyte).
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_aligned_contig_cast_ubyte_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE2 parts[2];

    for (; N-- != 0;
             dst += sizeof(npy_clongdouble), src += sizeof(npy_ubyte)) {
        _TYPE2 *out = (_TYPE2 *)dst;

        /* Only the first component carries the converted value. */
        parts[0] = _CONVERT_FN(*(_TYPE1 *)src);
        parts[1] = 0;

        out[0] = parts[0];
        out[1] = parts[1];
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ubyte
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_longdouble
#  endif

#else

#define _TYPE1 npy_ubyte
#define _TYPE2 npy_longdouble

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop that moves data with memmove instead of typed
 * dereferences, so no particular alignment of src or dst is required.
 *
 * For each of the N elements, one _TYPE1 (npy_ubyte) value is copied out of
 * src, converted with _CONVERT_FN (a C cast to _TYPE2, npy_longdouble), and
 * written to dst as a pair of _TYPE2 values whose second member is 0
 * (judging by the function name, the imaginary part of the complex result).
 *
 * dst, src               - current element pointers.
 * dst_stride, src_stride - not read by this variant; the pointers advance
 *                          by sizeof(npy_clongdouble) and sizeof(npy_ubyte).
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_contig_cast_ubyte_to_clongdouble(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_pair[2];

    for (; N-- != 0;
             dst += sizeof(npy_clongdouble), src += sizeof(npy_ubyte)) {
        /* Byte-wise load into a properly typed local. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_pair[0] = _CONVERT_FN(in_elem);
        out_pair[1] = 0;

        /* Byte-wise store of both components at once. */
        memmove(dst, out_pair, sizeof(out_pair));
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif






#line 671

#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ushort
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_bool
#  endif

#else

#define _TYPE1 npy_ushort
#define _TYPE2 npy_bool

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 1
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 1
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop that dereferences src and dst through typed pointers.
 *
 * For each of the N elements, one _TYPE1 (npy_ushort) value is read from
 * src and the result of _CONVERT_FN -- (npy_bool)(value != 0) -- is stored
 * as a single _TYPE2 (npy_bool) at dst.
 *
 * dst, src               - current element pointers; because they are
 *                          dereferenced as _TYPE2* / _TYPE1*, they must be
 *                          suitably aligned for those types.
 * dst_stride, src_stride - byte offsets added to dst / src per element.
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_aligned_cast_ushort_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N-- != 0; dst += dst_stride, src += src_stride) {
        _TYPE1 *in = (_TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ushort
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_bool
#  endif

#else

#define _TYPE1 npy_ushort
#define _TYPE2 npy_bool

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 1
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 1
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop that moves data with memmove instead of typed
 * dereferences, so no particular alignment of src or dst is required.
 *
 * For each of the N elements, one _TYPE1 (npy_ushort) value is copied out
 * of src, and the result of _CONVERT_FN -- (npy_bool)(value != 0) -- is
 * copied into dst as a single _TYPE2 (npy_bool).
 *
 * dst, src               - current element pointers.
 * dst_stride, src_stride - byte offsets added to dst / src per element.
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_cast_ushort_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N-- != 0; dst += dst_stride, src += src_stride) {
        /* Byte-wise load into a properly typed local. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out_elem, sizeof(out_elem));
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ushort
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_bool
#  endif

#else

#define _TYPE1 npy_ushort
#define _TYPE2 npy_bool

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 1
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 1
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop that dereferences src and dst through typed
 * pointers.
 *
 * For each of the N elements, one _TYPE1 (npy_ushort) value is read from
 * src and the result of _CONVERT_FN -- (npy_bool)(value != 0) -- is stored
 * as a single _TYPE2 (npy_bool) at dst.
 *
 * dst, src               - current element pointers; because they are
 *                          dereferenced as _TYPE2* / _TYPE1*, they must be
 *                          suitably aligned for those types.
 * dst_stride, src_stride - not read by this variant; the pointers advance
 *                          by sizeof(npy_bool) and sizeof(npy_ushort).
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_aligned_contig_cast_ushort_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N-- != 0; dst += sizeof(npy_bool), src += sizeof(npy_ushort)) {
        _TYPE1 *in = (_TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ushort
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_bool
#  endif

#else

#define _TYPE1 npy_ushort
#define _TYPE2 npy_bool

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 1
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 1
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop that moves data with memmove instead of typed
 * dereferences, so no particular alignment of src or dst is required.
 *
 * For each of the N elements, one _TYPE1 (npy_ushort) value is copied out
 * of src, and the result of _CONVERT_FN -- (npy_bool)(value != 0) -- is
 * copied into dst as a single _TYPE2 (npy_bool).
 *
 * dst, src               - current element pointers.
 * dst_stride, src_stride - not read by this variant; the pointers advance
 *                          by sizeof(npy_bool) and sizeof(npy_ushort).
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_contig_cast_ushort_to_bool(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N-- != 0; dst += sizeof(npy_bool), src += sizeof(npy_ushort)) {
        /* Byte-wise load into a properly typed local. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out_elem, sizeof(out_elem));
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ushort
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ubyte
#  endif

#else

#define _TYPE1 npy_ushort
#define _TYPE2 npy_ubyte

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop that dereferences src and dst through typed pointers.
 *
 * For each of the N elements, one _TYPE1 (npy_ushort) value is read from
 * src, converted with _CONVERT_FN (a plain C cast to _TYPE2, npy_ubyte),
 * and stored as a single _TYPE2 at dst.
 *
 * dst, src               - current element pointers; because they are
 *                          dereferenced as _TYPE2* / _TYPE1*, they must be
 *                          suitably aligned for those types.
 * dst_stride, src_stride - byte offsets added to dst / src per element.
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_aligned_cast_ushort_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N-- != 0; dst += dst_stride, src += src_stride) {
        _TYPE1 *in = (_TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ushort
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ubyte
#  endif

#else

#define _TYPE1 npy_ushort
#define _TYPE2 npy_ubyte

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Strided cast loop that moves data with memmove instead of typed
 * dereferences, so no particular alignment of src or dst is required.
 *
 * For each of the N elements, one _TYPE1 (npy_ushort) value is copied out
 * of src, converted with _CONVERT_FN (a plain C cast to _TYPE2, npy_ubyte),
 * and copied into dst as a single _TYPE2.
 *
 * dst, src               - current element pointers.
 * dst_stride, src_stride - byte offsets added to dst / src per element.
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_cast_ushort_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N-- != 0; dst += dst_stride, src += src_stride) {
        /* Byte-wise load into a properly typed local. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out_elem, sizeof(out_elem));
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* For half types, don't use actual double/float types in conversion */
#if 0 || 0

#  if 0
#    define _TYPE1 npy_uint32
#  elif 0
#    define _TYPE1 npy_uint64
#  else
#    define _TYPE1 npy_ushort
#  endif

#  if 0
#    define _TYPE2 npy_uint32
#  elif 0
#    define _TYPE2 npy_uint64
#  else
#    define _TYPE2 npy_ubyte
#  endif

#else

#define _TYPE1 npy_ushort
#define _TYPE2 npy_ubyte

#endif

/* Determine an appropriate casting conversion function */
#if 0

#  if 0
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
#  endif

#elif 0

#  if 0
#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
#  else
#    define _CONVERT_FN(x) npy_float_to_half((float)x)
#  endif

#else

#  if 0
#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
#  else
#    define _CONVERT_FN(x) ((_TYPE2)x)
#  endif

#endif

/*
 * Contiguous cast loop that dereferences src and dst through typed
 * pointers.
 *
 * For each of the N elements, one _TYPE1 (npy_ushort) value is read from
 * src, converted with _CONVERT_FN (a plain C cast to _TYPE2, npy_ubyte),
 * and stored as a single _TYPE2 at dst.
 *
 * dst, src               - current element pointers; because they are
 *                          dereferenced as _TYPE2* / _TYPE1*, they must be
 *                          suitably aligned for those types.
 * dst_stride, src_stride - not read by this variant; the pointers advance
 *                          by sizeof(npy_ubyte) and sizeof(npy_ushort).
 * N                      - number of elements to convert.
 * src_itemsize, data     - unused.
 */
static void
_aligned_contig_cast_ushort_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N-- != 0; dst += sizeof(npy_ubyte), src += sizeof(npy_ushort)) {
        _TYPE1 *in = (_TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop from npy_ushort to npy_ubyte, contiguous variant that moves
 * every element through local temporaries.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ubyte
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element.
 * dst_stride, src_stride -- not read by this variant; both pointers
 *                           advance by their element size instead.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_contig_cast_ushort_to_ubyte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /*
         * Go through memmove instead of dereferencing typed pointers;
         * presumably because src/dst are not guaranteed to be aligned
         * in this (non-"_aligned") variant.
         */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        /* Contiguous layout: step by one element on each side. */
        src += sizeof(npy_ushort);
        dst += sizeof(npy_ubyte);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop from npy_ushort to npy_ushort, strided variant that accesses
 * the buffers directly through typed pointers.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ushort
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element; they are dereferenced as
 *                           _TYPE2 * / _TYPE1 *, so the caller presumably
 *                           guarantees suitable alignment (TODO confirm).
 * dst_stride, src_stride -- byte offsets added to dst / src after each
 *                           element.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_aligned_cast_ushort_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        _TYPE1 *from = (_TYPE1 *)src;
        _TYPE2 *to = (_TYPE2 *)dst;

        *to = _CONVERT_FN(*from);

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop from npy_ushort to npy_ushort, strided variant that moves
 * every element through local temporaries.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ushort
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element.
 * dst_stride, src_stride -- byte offsets added to dst / src after each
 *                           element.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_cast_ushort_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /*
         * Go through memmove instead of dereferencing typed pointers;
         * presumably because src/dst are not guaranteed to be aligned
         * in this (non-"_aligned") variant.
         */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop from npy_ushort to npy_ushort, contiguous variant that
 * accesses the buffers directly through typed pointers.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ushort
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element; they are dereferenced as
 *                           _TYPE2 * / _TYPE1 *, so the caller presumably
 *                           guarantees suitable alignment (TODO confirm).
 * dst_stride, src_stride -- not read by this variant; both pointers
 *                           advance by their element size instead.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_aligned_contig_cast_ushort_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        _TYPE1 *from = (_TYPE1 *)src;
        _TYPE2 *to = (_TYPE2 *)dst;

        *to = _CONVERT_FN(*from);

        /* Contiguous layout: step by one element on each side. */
        src += sizeof(npy_ushort);
        dst += sizeof(npy_ushort);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop from npy_ushort to npy_ushort, contiguous variant that moves
 * every element through local temporaries.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ushort
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element.
 * dst_stride, src_stride -- not read by this variant; both pointers
 *                           advance by their element size instead.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_contig_cast_ushort_to_ushort(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /*
         * Go through memmove instead of dereferencing typed pointers;
         * presumably because src/dst are not guaranteed to be aligned
         * in this (non-"_aligned") variant.
         */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        /* Contiguous layout: step by one element on each side. */
        src += sizeof(npy_ushort);
        dst += sizeof(npy_ushort);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop from npy_ushort to npy_uint, strided variant that accesses
 * the buffers directly through typed pointers.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_uint
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element; they are dereferenced as
 *                           _TYPE2 * / _TYPE1 *, so the caller presumably
 *                           guarantees suitable alignment (TODO confirm).
 * dst_stride, src_stride -- byte offsets added to dst / src after each
 *                           element.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_aligned_cast_ushort_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        _TYPE1 *from = (_TYPE1 *)src;
        _TYPE2 *to = (_TYPE2 *)dst;

        *to = _CONVERT_FN(*from);

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop from npy_ushort to npy_uint, strided variant that moves
 * every element through local temporaries.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_uint
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element.
 * dst_stride, src_stride -- byte offsets added to dst / src after each
 *                           element.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_cast_ushort_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /*
         * Go through memmove instead of dereferencing typed pointers;
         * presumably because src/dst are not guaranteed to be aligned
         * in this (non-"_aligned") variant.
         */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop from npy_ushort to npy_uint, contiguous variant that
 * accesses the buffers directly through typed pointers.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_uint
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element; they are dereferenced as
 *                           _TYPE2 * / _TYPE1 *, so the caller presumably
 *                           guarantees suitable alignment (TODO confirm).
 * dst_stride, src_stride -- not read by this variant; both pointers
 *                           advance by their element size instead.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_aligned_contig_cast_ushort_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        _TYPE1 *from = (_TYPE1 *)src;
        _TYPE2 *to = (_TYPE2 *)dst;

        *to = _CONVERT_FN(*from);

        /* Contiguous layout: step by one element on each side. */
        src += sizeof(npy_ushort);
        dst += sizeof(npy_uint);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop from npy_ushort to npy_uint, contiguous variant that moves
 * every element through local temporaries.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_uint
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element.
 * dst_stride, src_stride -- not read by this variant; both pointers
 *                           advance by their element size instead.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_contig_cast_ushort_to_uint(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /*
         * Go through memmove instead of dereferencing typed pointers;
         * presumably because src/dst are not guaranteed to be aligned
         * in this (non-"_aligned") variant.
         */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        /* Contiguous layout: step by one element on each side. */
        src += sizeof(npy_ushort);
        dst += sizeof(npy_uint);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop from npy_ushort to npy_ulong, strided variant that accesses
 * the buffers directly through typed pointers.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulong
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element; they are dereferenced as
 *                           _TYPE2 * / _TYPE1 *, so the caller presumably
 *                           guarantees suitable alignment (TODO confirm).
 * dst_stride, src_stride -- byte offsets added to dst / src after each
 *                           element.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_aligned_cast_ushort_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        _TYPE1 *from = (_TYPE1 *)src;
        _TYPE2 *to = (_TYPE2 *)dst;

        *to = _CONVERT_FN(*from);

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop from npy_ushort to npy_ulong, strided variant that moves
 * every element through local temporaries.
 *
 * _TYPE1 is the source element type, _TYPE2 the destination element type
 * and _CONVERT_FN the per-element conversion (a plain C cast here).  All
 * three are undefined again right after the function.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulong
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Converts N elements read from src and stores the results at dst.
 *
 * dst, src               -- byte pointers to the current destination and
 *                           source element.
 * dst_stride, src_stride -- byte offsets added to dst / src after each
 *                           element.
 * N                      -- number of elements to convert.
 * src_itemsize, data     -- unused.
 */
static void
_cast_ushort_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /*
         * Go through memmove instead of dereferencing typed pointers;
         * presumably because src/dst are not guaranteed to be aligned
         * in this (non-"_aligned") variant.
         */
        memmove(&in_elem, src, sizeof(in_elem));
        out_elem = _CONVERT_FN(in_elem);
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* `!1` is 0, so this guard is always true: the aligned loop is always built. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulong

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous cast loop, npy_ushort -> npy_ulong, for aligned data.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride accepted for signature compatibility, not used:
 *                        the cursors advance by the element sizes instead
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Elements are read and written directly through typed pointers.
 */
static void
_aligned_contig_cast_ushort_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);

        /* Packed layout: the next element follows immediately. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* Built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulong

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous cast loop, npy_ushort -> npy_ulong, with no alignment
 * assumption.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride accepted for signature compatibility, not used:
 *                        the cursors advance by the element sizes instead
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Each element is staged through a typed local using memmove, so src and
 * dst are never dereferenced through a typed pointer.
 */
static void
_contig_cast_ushort_to_ulong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in;
    _TYPE2 out;

    for (; N != 0; --N) {
        /* Byte-wise load into the typed temporary. */
        memmove(&in, src, sizeof(in));

        out = _CONVERT_FN(in);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out, sizeof(out));

        /* Packed layout: the next element follows immediately. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

/* `!1` is 0, so this guard is always true: the aligned loop is always built. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulonglong

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Strided cast loop, npy_ushort -> npy_ulonglong, for aligned data.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride byte distance between consecutive elements
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Elements are read and written directly through typed pointers.
 */
static void
_aligned_cast_ushort_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);

        /* Step both cursors by the caller-supplied strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* Built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulonglong

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Strided cast loop, npy_ushort -> npy_ulonglong, with no alignment
 * assumption.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride byte distance between consecutive elements
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Each element is staged through a typed local using memmove, so src and
 * dst are never dereferenced through a typed pointer.
 */
static void
_cast_ushort_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in;
    _TYPE2 out;

    for (; N != 0; --N) {
        /* Byte-wise load into the typed temporary. */
        memmove(&in, src, sizeof(in));

        out = _CONVERT_FN(in);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out, sizeof(out));

        /* Step both cursors by the caller-supplied strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* `!1` is 0, so this guard is always true: the aligned loop is always built. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulonglong

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous cast loop, npy_ushort -> npy_ulonglong, for aligned data.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride accepted for signature compatibility, not used:
 *                        the cursors advance by the element sizes instead
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Elements are read and written directly through typed pointers.
 */
static void
_aligned_contig_cast_ushort_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);

        /* Packed layout: the next element follows immediately. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* Built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_ulonglong

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous cast loop, npy_ushort -> npy_ulonglong, with no alignment
 * assumption.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride accepted for signature compatibility, not used:
 *                        the cursors advance by the element sizes instead
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Each element is staged through a typed local using memmove, so src and
 * dst are never dereferenced through a typed pointer.
 */
static void
_contig_cast_ushort_to_ulonglong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in;
    _TYPE2 out;

    for (; N != 0; --N) {
        /* Byte-wise load into the typed temporary. */
        memmove(&in, src, sizeof(in));

        out = _CONVERT_FN(in);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out, sizeof(out));

        /* Packed layout: the next element follows immediately. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

/* `!1` is 0, so this guard is always true: the aligned loop is always built. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_byte

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Strided cast loop, npy_ushort -> npy_byte, for aligned data.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride byte distance between consecutive elements
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Elements are read and written directly through typed pointers.
 */
static void
_aligned_cast_ushort_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);

        /* Step both cursors by the caller-supplied strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* Built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_byte

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Strided cast loop, npy_ushort -> npy_byte, with no alignment assumption.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride byte distance between consecutive elements
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Each element is staged through a typed local using memmove, so src and
 * dst are never dereferenced through a typed pointer.
 */
static void
_cast_ushort_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in;
    _TYPE2 out;

    for (; N != 0; --N) {
        /* Byte-wise load into the typed temporary. */
        memmove(&in, src, sizeof(in));

        out = _CONVERT_FN(in);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out, sizeof(out));

        /* Step both cursors by the caller-supplied strides. */
        dst += dst_stride;
        src += src_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* `!1` is 0, so this guard is always true: the aligned loop is always built. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_byte

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous cast loop, npy_ushort -> npy_byte, for aligned data.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride accepted for signature compatibility, not used:
 *                        the cursors advance by the element sizes instead
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Elements are read and written directly through typed pointers.
 */
static void
_aligned_contig_cast_ushort_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        *out = _CONVERT_FN(*in);

        /* Packed layout: the next element follows immediately. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

/* Built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/* Element types on the source and destination side of the cast. */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_byte

/* Integer-to-integer conversion: a plain C cast to the destination type. */
#define _CONVERT_FN(x) ((_TYPE2)x)

/*
 * Contiguous cast loop, npy_ushort -> npy_byte, with no alignment
 * assumption.
 *
 * dst, src               byte pointers to the first output / input element
 * dst_stride, src_stride accepted for signature compatibility, not used:
 *                        the cursors advance by the element sizes instead
 * N                      number of elements to convert
 * src_itemsize, data     not referenced by this loop
 *
 * Each element is staged through a typed local using memmove, so src and
 * dst are never dereferenced through a typed pointer.
 */
static void
_contig_cast_ushort_to_byte(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in;
    _TYPE2 out;

    for (; N != 0; --N) {
        /* Byte-wise load into the typed temporary. */
        memmove(&in, src, sizeof(in));

        out = _CONVERT_FN(in);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out, sizeof(out));

        /* Packed layout: the next element follows immediately. */
        dst += sizeof(_TYPE2);
        src += sizeof(_TYPE1);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif




#line 700

#line 706

/* `!1` is 0, so this condition is true for any NPY_USE_UNALIGNED_ACCESS. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Strided cast loop: npy_ushort -> npy_short, direct-access variant.
 *
 * Each element is loaded by dereferencing `src` as an npy_ushort pointer
 * and stored by dereferencing `dst` as an npy_short pointer, so both
 * cursors must be suitably aligned for those types at every step.
 *
 *   dst, dst_stride    - output cursor and its per-element step in bytes
 *   src, src_stride    - input cursor and its per-element step in bytes
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_aligned_cast_ushort_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Plain C conversion of the loaded value to the output type. */
        *(npy_short *)dst = (npy_short)(*(npy_ushort *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast loop: npy_ushort -> npy_short, byte-copy variant.
 *
 * Each element is moved between the buffers and typed locals with
 * memmove(), so `src` and `dst` are never dereferenced as typed pointers.
 *
 *   dst, dst_stride    - output cursor and its per-element step in bytes
 *   src, src_stride    - input cursor and its per-element step in bytes
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_cast_ushort_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_short out;

    for (; N != 0; --N) {
        /* Fetch the raw bytes of one source element. */
        memmove(&in, src, sizeof(in));

        /* Plain C conversion to the output type. */
        out = (npy_short)in;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* `!1` is 0, so this condition is true for any NPY_USE_UNALIGNED_ACCESS. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Contiguous cast loop: npy_ushort -> npy_short, direct-access variant.
 *
 * Each element is loaded by dereferencing `src` as an npy_ushort pointer
 * and stored by dereferencing `dst` as an npy_short pointer, so both
 * cursors must be suitably aligned for those types. The cursors advance
 * by the element sizes; `dst_stride` and `src_stride` are not read.
 *
 *   dst                - output cursor, stepped by sizeof(npy_short)
 *   src                - input cursor, stepped by sizeof(npy_ushort)
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_aligned_contig_cast_ushort_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Plain C conversion of the loaded value to the output type. */
        *(npy_short *)dst = (npy_short)(*(npy_ushort *)src);

        dst += sizeof(npy_short);
        src += sizeof(npy_ushort);
    }
}

#endif


#line 706

/* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Contiguous cast loop: npy_ushort -> npy_short, byte-copy variant.
 *
 * Each element is moved between the buffers and typed locals with
 * memmove(), so `src` and `dst` are never dereferenced as typed pointers.
 * The cursors advance by the element sizes; `dst_stride` and `src_stride`
 * are not read.
 *
 *   dst                - output cursor, stepped by sizeof(npy_short)
 *   src                - input cursor, stepped by sizeof(npy_ushort)
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_contig_cast_ushort_to_short(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_short out;

    for (; N != 0; --N) {
        /* Fetch the raw bytes of one source element. */
        memmove(&in, src, sizeof(in));

        /* Plain C conversion to the output type. */
        out = (npy_short)in;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_short);
        src += sizeof(npy_ushort);
    }
}

#endif




#line 700

#line 706

/* `!1` is 0, so this condition is true for any NPY_USE_UNALIGNED_ACCESS. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Strided cast loop: npy_ushort -> npy_int, direct-access variant.
 *
 * Each element is loaded by dereferencing `src` as an npy_ushort pointer
 * and stored by dereferencing `dst` as an npy_int pointer, so both
 * cursors must be suitably aligned for those types at every step.
 *
 *   dst, dst_stride    - output cursor and its per-element step in bytes
 *   src, src_stride    - input cursor and its per-element step in bytes
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_aligned_cast_ushort_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Plain C conversion of the loaded value to the output type. */
        *(npy_int *)dst = (npy_int)(*(npy_ushort *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast loop: npy_ushort -> npy_int, byte-copy variant.
 *
 * Each element is moved between the buffers and typed locals with
 * memmove(), so `src` and `dst` are never dereferenced as typed pointers.
 *
 *   dst, dst_stride    - output cursor and its per-element step in bytes
 *   src, src_stride    - input cursor and its per-element step in bytes
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_cast_ushort_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_int out;

    for (; N != 0; --N) {
        /* Fetch the raw bytes of one source element. */
        memmove(&in, src, sizeof(in));

        /* Plain C conversion to the output type. */
        out = (npy_int)in;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* `!1` is 0, so this condition is true for any NPY_USE_UNALIGNED_ACCESS. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Contiguous cast loop: npy_ushort -> npy_int, direct-access variant.
 *
 * Each element is loaded by dereferencing `src` as an npy_ushort pointer
 * and stored by dereferencing `dst` as an npy_int pointer, so both
 * cursors must be suitably aligned for those types. The cursors advance
 * by the element sizes; `dst_stride` and `src_stride` are not read.
 *
 *   dst                - output cursor, stepped by sizeof(npy_int)
 *   src                - input cursor, stepped by sizeof(npy_ushort)
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_aligned_contig_cast_ushort_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Plain C conversion of the loaded value to the output type. */
        *(npy_int *)dst = (npy_int)(*(npy_ushort *)src);

        dst += sizeof(npy_int);
        src += sizeof(npy_ushort);
    }
}

#endif


#line 706

/* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Contiguous cast loop: npy_ushort -> npy_int, byte-copy variant.
 *
 * Each element is moved between the buffers and typed locals with
 * memmove(), so `src` and `dst` are never dereferenced as typed pointers.
 * The cursors advance by the element sizes; `dst_stride` and `src_stride`
 * are not read.
 *
 *   dst                - output cursor, stepped by sizeof(npy_int)
 *   src                - input cursor, stepped by sizeof(npy_ushort)
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_contig_cast_ushort_to_int(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_int out;

    for (; N != 0; --N) {
        /* Fetch the raw bytes of one source element. */
        memmove(&in, src, sizeof(in));

        /* Plain C conversion to the output type. */
        out = (npy_int)in;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_int);
        src += sizeof(npy_ushort);
    }
}

#endif




#line 700

#line 706

/* `!1` is 0, so this condition is true for any NPY_USE_UNALIGNED_ACCESS. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Strided cast loop: npy_ushort -> npy_long, direct-access variant.
 *
 * Each element is loaded by dereferencing `src` as an npy_ushort pointer
 * and stored by dereferencing `dst` as an npy_long pointer, so both
 * cursors must be suitably aligned for those types at every step.
 *
 *   dst, dst_stride    - output cursor and its per-element step in bytes
 *   src, src_stride    - input cursor and its per-element step in bytes
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_aligned_cast_ushort_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Plain C conversion of the loaded value to the output type. */
        *(npy_long *)dst = (npy_long)(*(npy_ushort *)src);

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Strided cast loop: npy_ushort -> npy_long, byte-copy variant.
 *
 * Each element is moved between the buffers and typed locals with
 * memmove(), so `src` and `dst` are never dereferenced as typed pointers.
 *
 *   dst, dst_stride    - output cursor and its per-element step in bytes
 *   src, src_stride    - input cursor and its per-element step in bytes
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_cast_ushort_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_long out;

    for (; N != 0; --N) {
        /* Fetch the raw bytes of one source element. */
        memmove(&in, src, sizeof(in));

        /* Plain C conversion to the output type. */
        out = (npy_long)in;

        /* Write the converted element back byte-wise. */
        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

/* `!1` is 0, so this condition is true for any NPY_USE_UNALIGNED_ACCESS. */
#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Contiguous cast loop: npy_ushort -> npy_long, direct-access variant.
 *
 * Each element is loaded by dereferencing `src` as an npy_ushort pointer
 * and stored by dereferencing `dst` as an npy_long pointer, so both
 * cursors must be suitably aligned for those types. The cursors advance
 * by the element sizes; `dst_stride` and `src_stride` are not read.
 *
 *   dst                - output cursor, stepped by sizeof(npy_long)
 *   src                - input cursor, stepped by sizeof(npy_ushort)
 *   N                  - element count; the loop runs until it reaches 0
 *   src_itemsize, data - unused
 */
static void
_aligned_contig_cast_ushort_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        /* Plain C conversion of the loaded value to the output type. */
        *(npy_long *)dst = (npy_long)(*(npy_ushort *)src);

        dst += sizeof(npy_long);
        src += sizeof(npy_ushort);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop npy_ushort -> npy_long, contiguous variant that does not
 * dereference src/dst through typed pointers.
 *
 * Each item is copied into a local with memmove, converted with a plain
 * C cast, and copied back out with memmove.  The cursors advance by the
 * item size of their type, so dst_stride and src_stride are never read.
 */
static void
_contig_cast_ushort_to_long(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_long out;

    for (; N != 0; --N) {
        /* byte-wise load into a properly typed local */
        memmove(&in, src, sizeof(in));

        out = (npy_long)in;

        /* byte-wise store of the converted value */
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_long);
        src += sizeof(npy_ushort);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop npy_ushort -> npy_longlong, aligned strided variant.
 *
 * For each of the N items, reads an npy_ushort through a typed pointer at
 * src, converts it with a plain C cast, and stores the npy_longlong
 * through a typed pointer at dst.  src and dst then advance by src_stride
 * and dst_stride bytes respectively.
 */
static void
_aligned_cast_ushort_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_ushort in = *(npy_ushort *)src;

        *(npy_longlong *)dst = (npy_longlong)in;

        /* strides are byte offsets applied to the char cursors */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop npy_ushort -> npy_longlong, strided variant that does not
 * dereference src/dst through typed pointers.
 *
 * Each item is copied into a local with memmove, converted with a plain
 * C cast, and copied back out with memmove.  src and dst then advance by
 * src_stride and dst_stride bytes respectively.
 */
static void
_cast_ushort_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_longlong out;

    for (; N != 0; --N) {
        /* byte-wise load into a properly typed local */
        memmove(&in, src, sizeof(in));

        out = (npy_longlong)in;

        /* byte-wise store of the converted value */
        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop npy_ushort -> npy_longlong, aligned and contiguous variant.
 *
 * Reads N npy_ushort items starting at src and stores N npy_longlong
 * items starting at dst, using direct typed loads and stores and a plain
 * C cast.  Both cursors move forward by one item of their own type per
 * iteration, so dst_stride and src_stride are accepted but never read.
 */
static void
_aligned_contig_cast_ushort_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort *in = (npy_ushort *)src;
    npy_longlong *out = (npy_longlong *)dst;

    for (; N != 0; --N) {
        /* load one source item, convert, store, then step both cursors */
        *out++ = (npy_longlong)*in++;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop npy_ushort -> npy_longlong, contiguous variant that does not
 * dereference src/dst through typed pointers.
 *
 * Each item is copied into a local with memmove, converted with a plain
 * C cast, and copied back out with memmove.  The cursors advance by the
 * item size of their type, so dst_stride and src_stride are never read.
 */
static void
_contig_cast_ushort_to_longlong(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_longlong out;

    for (; N != 0; --N) {
        /* byte-wise load into a properly typed local */
        memmove(&in, src, sizeof(in));

        out = (npy_longlong)in;

        /* byte-wise store of the converted value */
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_longlong);
        src += sizeof(npy_ushort);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop npy_ushort -> npy_half, aligned strided variant.
 *
 * For each of the N items, reads an npy_ushort through a typed pointer at
 * src, casts it to float, passes that to npy_float_to_half(), and stores
 * the resulting npy_half through a typed pointer at dst.  src and dst
 * then advance by src_stride and dst_stride bytes respectively.
 */
static void
_aligned_cast_ushort_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_ushort in = *(npy_ushort *)src;

        /* the integer goes through float on its way to the half value */
        *(npy_half *)dst = npy_float_to_half((float)in);

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop npy_ushort -> npy_half, strided variant that does not
 * dereference src/dst through typed pointers.
 *
 * Each item is copied into a local with memmove, cast to float, passed
 * to npy_float_to_half(), and the resulting npy_half is copied out with
 * memmove.  src and dst then advance by src_stride and dst_stride bytes
 * respectively.
 */
static void
_cast_ushort_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_half out;

    for (; N != 0; --N) {
        /* byte-wise load into a properly typed local */
        memmove(&in, src, sizeof(in));

        /* the integer goes through float on its way to the half value */
        out = npy_float_to_half((float)in);

        /* byte-wise store of the converted value */
        memmove(dst, &out, sizeof(out));

        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop npy_ushort -> npy_half, aligned and contiguous variant.
 *
 * Reads N npy_ushort items starting at src and stores N npy_half items
 * starting at dst through typed pointers.  Each value is cast to float
 * and handed to npy_float_to_half().  Both cursors move forward by one
 * item of their own type per iteration, so dst_stride and src_stride are
 * accepted but never read.
 */
static void
_aligned_contig_cast_ushort_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort *in = (npy_ushort *)src;
    npy_half *out = (npy_half *)dst;

    for (; N != 0; --N) {
        /* the integer goes through float on its way to the half value */
        *out++ = npy_float_to_half((float)*in++);
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Cast loop npy_ushort -> npy_half, contiguous variant that does not
 * dereference src/dst through typed pointers.
 *
 * Each item is copied into a local with memmove, cast to float, passed
 * to npy_float_to_half(), and the resulting npy_half is copied out with
 * memmove.  The cursors advance by the item size of their type, so
 * dst_stride and src_stride are never read.
 */
static void
_contig_cast_ushort_to_half(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    npy_ushort in;
    npy_half out;

    for (; N != 0; --N) {
        /* byte-wise load into a properly typed local */
        memmove(&in, src, sizeof(in));

        /* the integer goes through float on its way to the half value */
        out = npy_float_to_half((float)in);

        /* byte-wise store of the converted value */
        memmove(dst, &out, sizeof(out));

        dst += sizeof(npy_half);
        src += sizeof(npy_ushort);
    }
}

#endif




#line 700

#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Cast loop npy_ushort -> npy_float, aligned strided variant.
 *
 * For each of the N items, reads an npy_ushort through a typed pointer at
 * src, converts it with a plain C cast, and stores the npy_float through
 * a typed pointer at dst.  src and dst then advance by src_stride and
 * dst_stride bytes respectively.
 */
static void
_aligned_cast_ushort_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const npy_ushort in = *(npy_ushort *)src;

        *(npy_float *)dst = (npy_float)in;

        /* strides are byte offsets applied to the char cursors */
        dst += dst_stride;
        src += src_stride;
    }
}

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Element types and conversion used by the cast loop that follows.
 *
 * Neither side of this cast is a half-float, so no bit-pattern
 * (npy_uint32/npy_uint64) stand-in types and no npy_half conversion
 * helpers are needed: the loop works on the real C types with a plain
 * C cast.  _TYPE1 is the source element type, _TYPE2 the destination.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_float

/*
 * Convert one source value to the destination type.  The argument is
 * parenthesized so that a compound expression is converted as a whole
 * instead of having the cast bind to its first operand only.
 */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Strided element-wise cast loop that makes no alignment assumptions.
 *
 * For each of the N elements, the source bytes are copied into a local
 * _TYPE1 with memmove, converted with _CONVERT_FN, and the _TYPE2 result
 * is copied out to dst with memmove.  Because src and dst are only ever
 * touched as raw bytes, they are never dereferenced through a typed
 * pointer.  Both pointers then advance by their byte strides.
 *
 * src_itemsize and data are not used.
 */
static void
_cast_ushort_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    _TYPE1 in_elem;
    _TYPE2 out_elem;

    for (; N != 0; --N) {
        /* Byte-wise load into a properly typed local. */
        memmove(&in_elem, src, sizeof(in_elem));

        out_elem = _CONVERT_FN(in_elem);

        /* Byte-wise store of the converted value. */
        memmove(dst, &out_elem, sizeof(out_elem));

        src += src_stride;
        dst += dst_stride;
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !1)

/*
 * Element types and conversion used by the cast loop that follows.
 *
 * Neither side of this cast is a half-float, so no bit-pattern
 * (npy_uint32/npy_uint64) stand-in types and no npy_half conversion
 * helpers are needed: the loop works on the real C types with a plain
 * C cast.  _TYPE1 is the source element type, _TYPE2 the destination.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_float

/*
 * Convert one source value to the destination type.  The argument is
 * parenthesized so that a compound expression is converted as a whole
 * instead of having the cast bind to its first operand only.
 */
#define _CONVERT_FN(x) ((_TYPE2)(x))

/*
 * Element-wise cast loop over densely packed data.
 *
 * For each of the N elements, reads one _TYPE1 value from src, converts it
 * with _CONVERT_FN and stores the _TYPE2 result at dst.  After each element
 * src moves forward by sizeof(npy_ushort) and dst by sizeof(npy_float);
 * dst_stride and src_stride are accepted but never read.
 *
 * Loads and stores go straight through typed pointers, so src and dst must
 * be suitably aligned for _TYPE1 and _TYPE2 respectively.
 *
 * src_itemsize and data are not used.
 */
static void
_aligned_contig_cast_ushort_to_float(
                        char *dst, npy_intp dst_stride,
                        char *src, npy_intp src_stride,
                        npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                        NpyAuxData *NPY_UNUSED(data))
{
    for (; N != 0; --N) {
        const _TYPE1 *in = (const _TYPE1 *)src;
        _TYPE2 *out = (_TYPE2 *)dst;

        /* Direct typed load, cast and store; no temporaries are needed. */
        *out = _CONVERT_FN(*in);

        /* Fixed element-sized steps instead of the caller's strides. */
        src += sizeof(npy_ushort);
        dst += sizeof(npy_float);
    }
}

#undef _CONVERT_FN
#undef _TYPE2
#undef _TYPE1

#endif


#line 706

#if !(NPY_USE_UNALIGNED_ACCESS && !0)

/*
 * Source (_TYPE1) and destination (_TYPE2) element types for the cast
 * that follows.  Neither is a half-float here, so the real C types are
 * used rather than npy_uint32/npy_uint64 bit-pattern stand-ins.
 */
#define _TYPE1 npy_ushort
#define _TYPE2 npy_float

/* Determine an appropriate casting conversion function */
#if 0

#  if 1
#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
#  elif 0
#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
#  elif 0
#    define _CONVERT_FN(x) (x)
#  elif 0
#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
#  else
#    de