/*-----------------------------------------------------------------------*/

/*   Product:   LIBJACKET                                                */

/*   Copyright (c) AccelerEyes LLC. All rights reserved.                 */
/*   See http://www.accelereyes.com/eula for details.                    */

/*   This software is distributed WITHOUT ANY WARRANTY; without even     */
/*   the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR */
/*   PURPOSE.  See the above copyright notices for more information.     */

/*-----------------------------------------------------------------------*/

#ifndef __JACKET_H
#define __JACKET_H

#include <cuComplex.h>

#ifndef __cplusplus
#include <stdbool.h>   /* the extern "C" prototypes below use bool */
#endif

/*
 * Status codes returned by the functions declared in this header.
 *
 * JKT_SUCCESS is 0; the remaining enumerators take consecutive values in
 * declaration order (JKT_ERR_NOMEM == 1 ... JKT_ERR_MIXEDTYPE == 9).
 * The last enumerator carries no trailing comma so the header is also
 * accepted by strict C89 / C++98 compilers.
 */
typedef enum jktError {
    JKT_SUCCESS = 0,    /* No error */
    JKT_ERR_NOMEM,      /* Out of memory */
    JKT_ERR_ACCESS,     /* Permission denied */
    JKT_ERR_FAULT,      /* Bad address */
    JKT_ERR_INVAL,      /* Invalid argument */
    JKT_ERR_NODEV,      /* No hardware support */
    JKT_ERR_NOTSUP,     /* Operation (parameter types) unsupported */
    JKT_ERR_IO,         /* I/O error (processing hw/lib exception) */
    JKT_ERR_NOJIT,      /* DISCUSS: Will be supported after JIT port */
    JKT_ERR_MIXEDTYPE   /* Unsupported mixed-type arguments */
} jktError_t;

/*
 * JKTAPI decorates every public declaration in this header.
 * With _WIN32 defined it expands to __declspec(dllexport); otherwise it
 * expands to the "default" symbol-visibility attribute.
 */
#if defined(_WIN32)
#  define JKTAPI  __declspec(dllexport)
#else
#  define JKTAPI  __attribute__ ((visibility("default")))
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Returns a C string for the given status code.
 * NOTE(review): presumably a human-readable message in the spirit of
 * strerror(); the lifetime of the returned string is not visible from
 * this header -- confirm against the implementation.
 */
JKTAPI const char *jkt_strerror(jktError_t e);

/*
 * "all" entry points.
 *
 * Each name suffix spells the destination element type followed by the
 * source element type: b = bool, f32 = float, f64 = double,
 * c32 = cuFloatComplex, c64 = cuDoubleComplex.
 * NOTE(review): the h_ / d_ pointer prefixes presumably distinguish host
 * from device memory -- confirm against the implementation.
 */

/* all vector */
JKTAPI jktError_t jkt_all_vector_bf32(bool *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_all_vector_bb(bool *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_all_vector_bc32(bool *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_all_vector_bf64(bool *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_all_vector_bc64(bool *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* all columns */
JKTAPI jktError_t jkt_all_columns_bf32(bool *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_columns_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_columns_bc32(bool *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_columns_bf64(bool *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_columns_bc64(bool *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* all rows */
JKTAPI jktError_t jkt_all_rows_bf32(bool *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_rows_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_rows_bc32(bool *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_rows_bf64(bool *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_all_rows_bc64(bool *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/*
 * all over a stride (most general case).
 * Buffers are passed untyped (void *) together with cls_dst / cls_src.
 * NOTE(review): the meaning of the cls_* codes and of the batch/stride
 * arguments is not visible from this header -- confirm before use.
 */
JKTAPI jktError_t all_stride_r(unsigned int nout, unsigned int numel,
                               unsigned int stride, unsigned int batch,
                               unsigned int outer_stride, unsigned int outer_batch,
                               void *d_dst, unsigned int cls_dst,
                               void *d_src, unsigned int cls_src,
                               float *d_idx, bool cplx,
                               bool idx_out, bool device_result);

#ifdef __cplusplus
}
#endif




#ifdef __cplusplus
extern "C" {
#endif

/*
 * "any" entry points.
 *
 * Each name suffix spells the destination element type followed by the
 * source element type: b = bool, f32 = float, f64 = double,
 * c32 = cuFloatComplex, c64 = cuDoubleComplex.
 * NOTE(review): the h_ / d_ pointer prefixes presumably distinguish host
 * from device memory -- confirm against the implementation.
 */

/* any vector */
JKTAPI jktError_t jkt_any_vector_bf32(bool *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector_bb(bool *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector_bc32(bool *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector_bf64(bool *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector_bc64(bool *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* any columns */
JKTAPI jktError_t jkt_any_columns_bf32(bool *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns_bc32(bool *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns_bf64(bool *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns_bc64(bool *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* any rows */
JKTAPI jktError_t jkt_any_rows_bf32(bool *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows_bc32(bool *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows_bf64(bool *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows_bc64(bool *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/*
 * any over a stride (most general case).
 * Buffers are passed untyped (void *) together with cls_dst / cls_src.
 * NOTE(review): the meaning of the cls_* codes and of the batch/stride
 * arguments is not visible from this header -- confirm before use.
 */
JKTAPI jktError_t any_stride_r(unsigned int nout, unsigned int numel,
                               unsigned int stride, unsigned int batch,
                               unsigned int outer_stride, unsigned int outer_batch,
                               void *d_dst, unsigned int cls_dst,
                               void *d_src, unsigned int cls_src,
                               float *d_idx, bool cplx,
                               bool idx_out, bool device_result);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
/*
 * C++-only overloads of the "any" entry points; the parameter types mirror
 * the suffixed C declarations (jkt_any_*_bf32, _bb, _bc32, _bf64, _bc64).
 * NOTE(review): the column/row overloads name the destination d_dst while
 * the suffixed C declarations name it h_dst -- confirm which memory space
 * the destination is expected to live in.
 */

/* any vector */
JKTAPI jktError_t jkt_any_vector(bool *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector(bool *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector(bool *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector(bool *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_any_vector(bool *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* any columns */
JKTAPI jktError_t jkt_any_columns(bool *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns(bool *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns(bool *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns(bool *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_columns(bool *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* any rows */
JKTAPI jktError_t jkt_any_rows(bool *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows(bool *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows(bool *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows(bool *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_any_rows(bool *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);
#endif /* __cplusplus */



#ifdef __cplusplus
extern "C" {
#endif

/*
 * Bessel-function entry points (besselh, besseli, besselj, besselk, bessely).
 *
 * Every function takes an output pointer, an input pointer _d_Z, an order
 * argument nu and a count n; besselh takes one extra argument k.
 * NOTE(review): the _z32 / _z64 variants take plain float / double
 * pointers, unlike besselj's _c32 / _c64 variants which take cuComplex /
 * cuDoubleComplex -- the expected data layout for the z variants is not
 * visible from this header; confirm against the implementation.
 * NOTE(review): the meaning of besselh's k is likewise not visible here.
 */

/* besselh */
JKTAPI jktError_t jkt_besselh_f32(float *_d_J, float *_d_Z, float nu, unsigned int n, unsigned int k);
JKTAPI jktError_t jkt_besselh_z32(float *_d_J, float *_d_Z, float nu, unsigned int n, unsigned int k);
JKTAPI jktError_t jkt_besselh_f64(double *_d_J, double *_d_Z, double nu, unsigned int n, unsigned int k);
JKTAPI jktError_t jkt_besselh_z64(double *_d_J, double *_d_Z, double nu, unsigned int n, unsigned int k);

/* besseli */
JKTAPI jktError_t jkt_besseli_f32(float *_d_J, float *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_besseli_z32(float *_d_J, float *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_besseli_f64(double *_d_J, double *_d_Z, double nu, unsigned int n);
JKTAPI jktError_t jkt_besseli_z64(double *_d_J, double *_d_Z, double nu, unsigned int n);

/* besselj */
JKTAPI jktError_t jkt_besselj_f32(float *_d_J, float *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_besselj_c32(cuComplex *_d_J, cuComplex *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_besselj_f64(double *_d_J, double *_d_Z, double nu, unsigned int n);
JKTAPI jktError_t jkt_besselj_c64(cuDoubleComplex *_d_J, cuDoubleComplex *_d_Z, double nu, unsigned int n);

/* besselk */
JKTAPI jktError_t jkt_besselk_f32(float *_d_J, float *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_besselk_z32(float *_d_J, float *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_besselk_f64(double *_d_J, double *_d_Z, double nu, unsigned int n);
JKTAPI jktError_t jkt_besselk_z64(double *_d_J, double *_d_Z, double nu, unsigned int n);

/* bessely */
JKTAPI jktError_t jkt_bessely_f32(float *_d_Y, float *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_bessely_z32(float *_d_Y, float *_d_Z, float nu, unsigned int n);
JKTAPI jktError_t jkt_bessely_f64(double *_d_Y, double *_d_Z, double nu, unsigned int n);
JKTAPI jktError_t jkt_bessely_z64(double *_d_Y, double *_d_Z, double nu, unsigned int n);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
extern "C" {
#endif

/*
 * ctranspose entry points: read-only input d_A, output d_B, extents w and h.
 * Suffixes: b = bool, f32 = float, f64 = double, c32 = cuComplex,
 * c64 = cuDoubleComplex.
 * NOTE(review): whether w / h describe the input or the output, and the
 * storage order, is not visible from this header -- confirm before use.
 */

/* ctranspose */
JKTAPI jktError_t jkt_ctranspose_b(const bool *d_A, bool *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_ctranspose_f32(const float *d_A, float *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_ctranspose_f64(const double *d_A, double *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_ctranspose_c32(const cuComplex *d_A, cuComplex *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_ctranspose_c64(const cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned int w, unsigned int h);

/* ctranspose (gfor): same signatures plus a trailing ngfor argument */
JKTAPI jktError_t jkt_ctranspose_b_gfor(const bool *d_A, bool *d_B,
                                        unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_ctranspose_f32_gfor(const float *d_A, float *d_B,
                                          unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_ctranspose_f64_gfor(const double *d_A, double *d_B,
                                          unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_ctranspose_c32_gfor(const cuComplex *d_A, cuComplex *d_B,
                                          unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_ctranspose_c64_gfor(const cuDoubleComplex *d_A, cuDoubleComplex *d_B,
                                          unsigned int w, unsigned int h, unsigned int ngfor);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
extern "C" {
#endif

/*
 * diff entry points for float (f32) and double (f64) data.
 * d_Y / d_X are each paired with a dims array and a dimension count;
 * order and dim are passed as plain ints.
 * NOTE(review): which of d_Y / d_X is the output, and the valid ranges of
 * order and dim, are not visible from this header -- confirm before use.
 */
JKTAPI jktError_t jkt_diff_f32(float *d_Y, float *d_X,
                               int *dims_Y, int ndims_Y,
                               int *dims_X, int ndims_X,
                               int order, int dim);
JKTAPI jktError_t jkt_diff_f64(double *d_Y, double *d_X,
                               int *dims_Y, int ndims_Y,
                               int *dims_X, int ndims_X,
                               int order, int dim);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
extern "C" {
#endif

/*
 * gradient entry points: pointers d_dx, d_dy and d_A of one element type,
 * followed by the extents ny and nx.
 * Suffixes: b = bool, u32 = unsigned int, i32 = int, f32 = float,
 * f64 = double.
 * NOTE(review): presumably d_A is the input and d_dx / d_dy the outputs --
 * confirm against the implementation.
 */
JKTAPI jktError_t jkt_grad_b(bool *d_dx, bool *d_dy, bool *d_A,
                             unsigned int ny, unsigned int nx);
JKTAPI jktError_t jkt_grad_u32(unsigned int *d_dx, unsigned int *d_dy, unsigned int *d_A,
                               unsigned int ny, unsigned int nx);
JKTAPI jktError_t jkt_grad_i32(int *d_dx, int *d_dy, int *d_A,
                               unsigned int ny, unsigned int nx);
JKTAPI jktError_t jkt_grad_f32(float *d_dx, float *d_dy, float *d_A,
                               unsigned int ny, unsigned int nx);
JKTAPI jktError_t jkt_grad_f64(double *d_dx, double *d_dy, double *d_A,
                               unsigned int ny, unsigned int nx);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
extern "C" {
#endif

/*
 * rand entry points.
 *
 * jkt_grand_f32 / jkt_grand_f64 take a float / double buffer d_Y, an
 * element count numel and an int pointer bytes.
 * NOTE(review): the meaning of *bytes (input, output or both) is not
 * visible from this header -- confirm against the implementation.
 */
JKTAPI jktError_t jkt_grand_f32(float *d_Y, unsigned numel, int *bytes);
JKTAPI jktError_t jkt_grand_f64(double *d_Y, unsigned numel, int *bytes);

/* Seed accessors: the setter takes the seed, the getter returns one. */
JKTAPI void jkt_grand_set_seed(unsigned seed);
/*
 * Declared with (void) rather than (): in C an empty parameter list is a
 * non-prototype declaration whose arguments go unchecked, while in C++
 * the two spellings are identical.
 */
JKTAPI unsigned jkt_grand_get_seed(void);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
extern "C" {
#endif

/*
 * hankel entry points.
 *
 * Signature shape: (d_H, nc, d_C, nr, d_R).  The three-part name suffix
 * spells the element types of d_H, d_C and d_R in that order:
 * f32 = float, f64 = double, c32 = cuFloatComplex, c64 = cuDoubleComplex.
 * NOTE(review): presumably d_H is the output built from d_C (nc elements)
 * and d_R (nr elements) -- confirm against the implementation.
 */

/* d_H, d_C and d_R all single precision */
JKTAPI jktError_t jkt_hankel_f32f32f32(float *d_H, unsigned int nc, float *d_C, unsigned int nr, float *d_R);
JKTAPI jktError_t jkt_hankel_c32f32c32(cuFloatComplex *d_H, unsigned int nc, float *d_C, unsigned int nr, cuFloatComplex *d_R);
JKTAPI jktError_t jkt_hankel_c32c32f32(cuFloatComplex *d_H, unsigned int nc, cuFloatComplex *d_C, unsigned int nr, float *d_R);
JKTAPI jktError_t jkt_hankel_c32c32c32(cuFloatComplex *d_H, unsigned int nc, cuFloatComplex *d_C, unsigned int nr, cuFloatComplex *d_R);

/* single-precision d_H and d_C, double-precision d_R */
JKTAPI jktError_t jkt_hankel_f32f32f64(float *d_H, unsigned int nc, float *d_C, unsigned int nr, double *d_R);
JKTAPI jktError_t jkt_hankel_c32f32c64(cuFloatComplex *d_H, unsigned int nc, float *d_C, unsigned int nr, cuDoubleComplex *d_R);
JKTAPI jktError_t jkt_hankel_c32c32f64(cuFloatComplex *d_H, unsigned int nc, cuFloatComplex *d_C, unsigned int nr, double *d_R);
JKTAPI jktError_t jkt_hankel_c32c32c64(cuFloatComplex *d_H, unsigned int nc, cuFloatComplex *d_C, unsigned int nr, cuDoubleComplex *d_R);

/* single-precision d_H and d_R, double-precision d_C */
JKTAPI jktError_t jkt_hankel_f32f64f32(float *d_H, unsigned int nc, double *d_C, unsigned int nr, float *d_R);
JKTAPI jktError_t jkt_hankel_c32f64c32(cuFloatComplex *d_H, unsigned int nc, double *d_C, unsigned int nr, cuFloatComplex *d_R);
JKTAPI jktError_t jkt_hankel_c32c64f32(cuFloatComplex *d_H, unsigned int nc, cuDoubleComplex *d_C, unsigned int nr, float *d_R);
JKTAPI jktError_t jkt_hankel_c32c64c32(cuFloatComplex *d_H, unsigned int nc, cuDoubleComplex *d_C, unsigned int nr, cuFloatComplex *d_R);

/* double-precision d_H and d_R, single-precision d_C */
JKTAPI jktError_t jkt_hankel_f64f32f64(double *d_H, unsigned int nc, float *d_C, unsigned int nr, double *d_R);
JKTAPI jktError_t jkt_hankel_c64f32c64(cuDoubleComplex *d_H, unsigned int nc, float *d_C, unsigned int nr, cuDoubleComplex *d_R);
JKTAPI jktError_t jkt_hankel_c64c32f64(cuDoubleComplex *d_H, unsigned int nc, cuFloatComplex *d_C, unsigned int nr, double *d_R);
JKTAPI jktError_t jkt_hankel_c64c32c64(cuDoubleComplex *d_H, unsigned int nc, cuFloatComplex *d_C, unsigned int nr, cuDoubleComplex *d_R);

/* double-precision d_H and d_C, single-precision d_R */
JKTAPI jktError_t jkt_hankel_f64f64f32(double *d_H, unsigned int nc, double *d_C, unsigned int nr, float *d_R);
JKTAPI jktError_t jkt_hankel_c64f64c32(cuDoubleComplex *d_H, unsigned int nc, double *d_C, unsigned int nr, cuFloatComplex *d_R);
JKTAPI jktError_t jkt_hankel_c64c64f32(cuDoubleComplex *d_H, unsigned int nc, cuDoubleComplex *d_C, unsigned int nr, float *d_R);
JKTAPI jktError_t jkt_hankel_c64c64c32(cuDoubleComplex *d_H, unsigned int nc, cuDoubleComplex *d_C, unsigned int nr, cuFloatComplex *d_R);

/* d_H, d_C and d_R all double precision */
JKTAPI jktError_t jkt_hankel_f64f64f64(double *d_H, unsigned int nc, double *d_C, unsigned int nr, double *d_R);
JKTAPI jktError_t jkt_hankel_c64f64c64(cuDoubleComplex *d_H, unsigned int nc, double *d_C, unsigned int nr, cuDoubleComplex *d_R);
JKTAPI jktError_t jkt_hankel_c64c64f64(cuDoubleComplex *d_H, unsigned int nc, cuDoubleComplex *d_C, unsigned int nr, double *d_R);
JKTAPI jktError_t jkt_hankel_c64c64c64(cuDoubleComplex *d_H, unsigned int nc, cuDoubleComplex *d_C, unsigned int nr, cuDoubleComplex *d_R);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
extern "C" {
#endif

/*
 * "max" entry points.
 *
 * Each name suffix spells the destination element type followed by the
 * source element type: b = bool, f32 = float, f64 = double,
 * c32 = cuFloatComplex, c64 = cuDoubleComplex.
 * NOTE(review): the h_ / d_ pointer prefixes presumably distinguish host
 * from device memory -- confirm against the implementation.
 */

/* max vector */
JKTAPI jktError_t jkt_max_vector_f32f32(float *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector_bb(bool *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector_f64f64(double *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* max columns */
JKTAPI jktError_t jkt_max_columns_f32f32(float *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns_f64f64(double *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* max rows */
JKTAPI jktError_t jkt_max_rows_f32f32(float *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows_f64f64(double *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/*
 * max over a stride (most general case).
 * Buffers are passed untyped (void *) together with cls_dst / cls_src.
 * NOTE(review): the meaning of the cls_* codes and of the batch/stride
 * arguments is not visible from this header -- confirm before use.
 */
JKTAPI jktError_t max_stride_r(unsigned int nout, unsigned int numel,
                               unsigned int stride, unsigned int batch,
                               unsigned int outer_stride, unsigned int outer_batch,
                               void *d_dst, unsigned int cls_dst,
                               void *d_src, unsigned int cls_src,
                               float *d_idx, bool cplx,
                               bool idx_out, bool device_result);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
/*
 * C++-only overloads of the "max" entry points; the parameter types mirror
 * the suffixed C declarations (jkt_max_*_f32f32, _bb, _c32c32, _f64f64,
 * _c64c64).
 * NOTE(review): the column/row overloads name the destination d_dst while
 * the suffixed C declarations name it h_dst -- confirm which memory space
 * the destination is expected to live in.
 */

/* max vector */
JKTAPI jktError_t jkt_max_vector(float *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector(bool *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector(double *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_max_vector(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* max columns */
JKTAPI jktError_t jkt_max_columns(float *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns(bool *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns(double *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_columns(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* max rows */
JKTAPI jktError_t jkt_max_rows(float *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows(bool *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows(double *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_max_rows(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);
#endif /* __cplusplus */



#ifdef __cplusplus
extern "C" {
#endif

/*
 * "min" entry points.
 *
 * Each name suffix spells the destination element type followed by the
 * source element type: b = bool, f32 = float, f64 = double,
 * c32 = cuFloatComplex, c64 = cuDoubleComplex.
 * NOTE(review): the h_ / d_ pointer prefixes presumably distinguish host
 * from device memory -- confirm against the implementation.
 */

/* min vector */
JKTAPI jktError_t jkt_min_vector_f32f32(float *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector_bb(bool *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector_f64f64(double *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* min columns */
JKTAPI jktError_t jkt_min_columns_f32f32(float *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns_f64f64(double *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* min rows */
JKTAPI jktError_t jkt_min_rows_f32f32(float *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows_bb(bool *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows_f64f64(double *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/*
 * min over a stride (most general case).
 * Buffers are passed untyped (void *) together with cls_dst / cls_src.
 * NOTE(review): the meaning of the cls_* codes and of the batch/stride
 * arguments is not visible from this header -- confirm before use.
 */
JKTAPI jktError_t min_stride_r(unsigned int nout, unsigned int numel,
                               unsigned int stride, unsigned int batch,
                               unsigned int outer_stride, unsigned int outer_batch,
                               void *d_dst, unsigned int cls_dst,
                               void *d_src, unsigned int cls_src,
                               float *d_idx, bool cplx,
                               bool idx_out, bool device_result);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
/*
 * C++-only overloads of the "min" entry points; the parameter types mirror
 * the suffixed C declarations (jkt_min_*_f32f32, _bb, _c32c32, _f64f64,
 * _c64c64).
 * NOTE(review): the column/row overloads name the destination d_dst while
 * the suffixed C declarations name it h_dst -- confirm which memory space
 * the destination is expected to live in.
 */

/* min vector */
JKTAPI jktError_t jkt_min_vector(float *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector(bool *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector(double *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_min_vector(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* min columns */
JKTAPI jktError_t jkt_min_columns(float *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns(bool *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns(double *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_columns(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* min rows */
JKTAPI jktError_t jkt_min_rows(float *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows(bool *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows(double *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_min_rows(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);
#endif /* __cplusplus */





#ifdef __cplusplus
extern "C" {
#endif

/*
 * "sum" entry points.
 *
 * Each name suffix spells the destination element type followed by the
 * source element type: b = bool, f32 = float, f64 = double,
 * c32 = cuFloatComplex, c64 = cuDoubleComplex.  For bool sources the
 * destination is float (the _f32b variants).
 * NOTE(review): the h_ / d_ pointer prefixes presumably distinguish host
 * from device memory -- confirm against the implementation.
 */

/* sum vector */
JKTAPI jktError_t jkt_sum_vector_f32f32(float *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector_f32b(float *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector_f64f64(double *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* sum columns */
JKTAPI jktError_t jkt_sum_columns_f32f32(float *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns_f32b(float *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns_f64f64(double *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* sum rows */
JKTAPI jktError_t jkt_sum_rows_f32f32(float *h_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows_f32b(float *h_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows_c32c32(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows_f64f64(double *h_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows_c64c64(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/*
 * sum over a stride (most general case).
 * Buffers are passed untyped (void *) together with cls_dst / cls_src.
 * NOTE(review): the meaning of the cls_* codes and of the batch/stride
 * arguments is not visible from this header -- confirm before use.
 */
JKTAPI jktError_t sum_stride_r(unsigned int nout, unsigned int numel,
                               unsigned int stride, unsigned int batch,
                               unsigned int outer_stride, unsigned int outer_batch,
                               void *d_dst, unsigned int cls_dst,
                               void *d_src, unsigned int cls_src,
                               float *d_idx, bool cplx,
                               bool idx_out, bool device_result);

#ifdef __cplusplus
}
#endif



#ifdef __cplusplus
/*
 * C++-only overloads of the "sum" entry points; the parameter types mirror
 * the suffixed C declarations (jkt_sum_*_f32f32, _f32b, _c32c32, _f64f64,
 * _c64c64).
 * NOTE(review): the column/row overloads name the destination d_dst while
 * the suffixed C declarations name it h_dst -- confirm which memory space
 * the destination is expected to live in.
 */

/* sum vector */
JKTAPI jktError_t jkt_sum_vector(float *h_dst, float *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector(float *h_dst, bool *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector(double *h_dst, double *d_src, unsigned int numel);
JKTAPI jktError_t jkt_sum_vector(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned int numel);

/* sum columns */
JKTAPI jktError_t jkt_sum_columns(float *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns(float *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns(double *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_columns(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);

/* sum rows */
JKTAPI jktError_t jkt_sum_rows(float *d_dst, float *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows(float *d_dst, bool *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows(double *d_dst, double *d_src, unsigned int nrows, unsigned int ncols);
JKTAPI jktError_t jkt_sum_rows(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned int nrows, unsigned int ncols);
#endif /* __cplusplus */



#ifdef __cplusplus
extern "C" {
#endif

/*
 * transpose entry points: read-only input d_A, output d_B, extents w and h.
 * Suffixes: b = bool, f32 = float, f64 = double, c32 = cuComplex,
 * c64 = cuDoubleComplex.
 * NOTE(review): whether w / h describe the input or the output, and the
 * storage order, is not visible from this header -- confirm before use.
 */

/* transpose */
JKTAPI jktError_t jkt_transpose_b(const bool *d_A, bool *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_transpose_f32(const float *d_A, float *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_transpose_f64(const double *d_A, double *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_transpose_c32(const cuComplex *d_A, cuComplex *d_B, unsigned int w, unsigned int h);
JKTAPI jktError_t jkt_transpose_c64(const cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned int w, unsigned int h);

/* transpose (gfor): same signatures plus a trailing ngfor argument */
JKTAPI jktError_t jkt_transpose_b_gfor(const bool *d_A, bool *d_B,
                                       unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_transpose_f32_gfor(const float *d_A, float *d_B,
                                         unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_transpose_f64_gfor(const double *d_A, double *d_B,
                                         unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_transpose_c32_gfor(const cuComplex *d_A, cuComplex *d_B,
                                         unsigned int w, unsigned int h, unsigned int ngfor);
JKTAPI jktError_t jkt_transpose_c64_gfor(const cuDoubleComplex *d_A, cuDoubleComplex *d_B,
                                         unsigned int w, unsigned int h, unsigned int ngfor);

#ifdef __cplusplus
}
#endif


#endif
