/*-----------------------------------------------------------------------*/

/*   Product:   LIBJACKET                                                */

/*   Copyright (c) AccelerEyes LLC. All rights reserved.                 */
/*   See http://www.accelereyes.com/eula for details.                    */

/*   This software is distributed WITHOUT ANY WARRANTY; without even     */
/*   the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR */
/*   PURPOSE.  See the above copyright notices for more information.     */

/*-----------------------------------------------------------------------*/

#ifndef __JACKET_H
#define __JACKET_H

#include <cuComplex.h>
#include "jacket_defines.h"

#ifdef __cplusplus
extern "C" {
#endif

    // Returns a C string for the given jktError_t value.
    // NOTE(review): by analogy with strerror() this is presumably a human-readable
    // message in static storage -- confirm ownership before freeing or caching it.
    JKTAPI const char *jkt_strerror(jktError_t e);

    // "all" reductions, declared per source element type.
    // Suffix letters mirror the parameter types below: the first letter is the
    // destination type (B = bool), the second is the source element type
    // (S = float, B = bool, C = cuFloatComplex, D = double, Z = cuDoubleComplex).
    // NOTE(review): the h_/d_ prefixes presumably mark host/device pointers, and the
    // required length of h_dst for the columns/rows variants is not visible here --
    // confirm both against the implementation.

    // all vector
    JKTAPI jktError_t jkt_all_vector_BS(bool *h_dst, float *d_src, unsigned numel);
    JKTAPI jktError_t jkt_all_vector_BB(bool *h_dst, bool *d_src, unsigned numel);
    JKTAPI jktError_t jkt_all_vector_BC(bool *h_dst, cuFloatComplex *d_src, unsigned numel);
    JKTAPI jktError_t jkt_all_vector_BD(bool *h_dst, double *d_src, unsigned numel);
    JKTAPI jktError_t jkt_all_vector_BZ(bool *h_dst, cuDoubleComplex *d_src, unsigned numel);

    // all columns
    JKTAPI jktError_t jkt_all_columns_BS(bool *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_columns_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_columns_BC(bool *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_columns_BD(bool *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_columns_BZ(bool *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

    // all rows
    JKTAPI jktError_t jkt_all_rows_BS(bool *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_rows_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_rows_BC(bool *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_rows_BD(bool *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_all_rows_BZ(bool *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

    // all over a stride (most general case)
    // Source and destination are untyped (void *); cls_src/cls_dst and cplx are passed
    // alongside them.
    // NOTE(review): cls_* presumably select the element class and idx_out/device_result
    // presumably control whether d_idx is written and where the result lives -- the valid
    // values are not visible in this header; confirm in jacket_defines.h / the implementation.
    // NOTE(review): this exported symbol carries no jkt_ prefix.
    JKTAPI jktError_t all_stride_r(unsigned nout, unsigned numel, unsigned stride, unsigned batch,
                                   unsigned outer_stride, unsigned outer_batch, void *d_dst, unsigned cls_dst,
                                   void *d_src, unsigned cls_src, float *d_idx, bool cplx,
                                   bool idx_out, bool device_result);

    // "any" reductions, declared per source element type.
    // Suffix letters mirror the parameter types below: the first letter is the
    // destination type (B = bool), the second is the source element type
    // (S = float, B = bool, C = cuFloatComplex, D = double, Z = cuDoubleComplex).
    // NOTE(review): the h_/d_ prefixes presumably mark host/device pointers, and the
    // required length of h_dst for the columns/rows variants is not visible here --
    // confirm both against the implementation.

    // any vector
    JKTAPI jktError_t jkt_any_vector_BS(bool *h_dst, float *d_src, unsigned numel);
    JKTAPI jktError_t jkt_any_vector_BB(bool *h_dst, bool *d_src, unsigned numel);
    JKTAPI jktError_t jkt_any_vector_BC(bool *h_dst, cuFloatComplex *d_src, unsigned numel);
    JKTAPI jktError_t jkt_any_vector_BD(bool *h_dst, double *d_src, unsigned numel);
    JKTAPI jktError_t jkt_any_vector_BZ(bool *h_dst, cuDoubleComplex *d_src, unsigned numel);

    // any columns
    JKTAPI jktError_t jkt_any_columns_BS(bool *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_columns_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_columns_BC(bool *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_columns_BD(bool *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_columns_BZ(bool *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

    // any rows
    JKTAPI jktError_t jkt_any_rows_BS(bool *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_rows_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_rows_BC(bool *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_rows_BD(bool *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_any_rows_BZ(bool *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

    // any over a stride (most general case)
    // Source and destination are untyped (void *); cls_src/cls_dst and cplx are passed
    // alongside them.
    // NOTE(review): cls_* presumably select the element class and idx_out/device_result
    // presumably control whether d_idx is written and where the result lives -- the valid
    // values are not visible in this header; confirm in jacket_defines.h / the implementation.
    // NOTE(review): this exported symbol carries no jkt_ prefix.
    JKTAPI jktError_t any_stride_r(unsigned nout, unsigned numel, unsigned stride, unsigned batch,
                                   unsigned outer_stride, unsigned outer_batch, void *d_dst, unsigned cls_dst,
                                   void *d_src, unsigned cls_src, float *d_idx, bool cplx,
                                   bool idx_out, bool device_result);

    // Bessel-function family. Each function takes two pointers (_D_J or _D_Y, and _D_Z),
    // a scalar nu and a count n; besselh additionally takes k.
    // NOTE(review): which pointer is the output and which the argument, and the meaning
    // of k, are not visible in this header -- confirm against the implementation.
    // NOTE(review): the _C/_Z variants of besselh/besseli/besselk/bessely are declared
    // with plain float*/double* pointers, whereas besselj_C/_Z use cuFloatComplex*/
    // cuDoubleComplex*. Presumably the former treat the buffers as interleaved re/im
    // pairs -- confirm before passing complex data.

    // besselh
    JKTAPI jktError_t jkt_besselh_S(float *_D_J, float *_D_Z, float nu, unsigned n, unsigned k);
    JKTAPI jktError_t jkt_besselh_C(float *_D_J, float *_D_Z, float nu, unsigned n, unsigned k);
    JKTAPI jktError_t jkt_besselh_D(double *_D_J, double *_D_Z, double nu, unsigned n, unsigned k);
    JKTAPI jktError_t jkt_besselh_Z(double *_D_J, double *_D_Z, double nu, unsigned n, unsigned k);

    // besseli
    JKTAPI jktError_t jkt_besseli_S(float *_D_J, float *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_besseli_C(float *_D_J, float *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_besseli_D(double *_D_J, double *_D_Z, double nu, unsigned n);
    JKTAPI jktError_t jkt_besseli_Z(double *_D_J, double *_D_Z, double nu, unsigned n);

    // besselj
    JKTAPI jktError_t jkt_besselj_S(float *_D_J, float *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_besselj_C(cuFloatComplex *_D_J, cuFloatComplex *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_besselj_D(double *_D_J, double *_D_Z, double nu, unsigned n);
    JKTAPI jktError_t jkt_besselj_Z(cuDoubleComplex *_D_J, cuDoubleComplex *_D_Z, double nu, unsigned n);

    // besselk
    JKTAPI jktError_t jkt_besselk_S(float *_D_J, float *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_besselk_C(float *_D_J, float *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_besselk_D(double *_D_J, double *_D_Z, double nu, unsigned n);
    JKTAPI jktError_t jkt_besselk_Z(double *_D_J, double *_D_Z, double nu, unsigned n);

    // bessely
    JKTAPI jktError_t jkt_bessely_S(float *_D_Y, float *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_bessely_C(float *_D_Y, float *_D_Z, float nu, unsigned n);
    JKTAPI jktError_t jkt_bessely_D(double *_D_Y, double *_D_Z, double nu, unsigned n);
    JKTAPI jktError_t jkt_bessely_Z(double *_D_Y, double *_D_Z, double nu, unsigned n);

    // matrix multiply, sum, min, max columns, general case
    // Operands are untyped (const void *); the is_Acplx/is_Bcplx and is_Adbl/is_Bdbl
    // flags are passed per operand.
    // NOTE(review): the flags presumably describe whether each operand is complex and/or
    // double precision; the meaning of m/k/n, nA/nB, is_reduce and ri is not visible
    // here -- confirm against the implementation.
    // NOTE(review): this exported symbol carries no jkt_ prefix.
    JKTAPI jktError_t blas(const bool is_Acplx, const bool is_Bcplx, const bool is_Adbl,
                           const bool is_Bdbl, const void* d_a, const void* d_B, void* d_c,
                           const unsigned m, const unsigned k, const unsigned n,
                           const unsigned nA, const unsigned nB, const int is_reduce, void *ri);

    // ctranspose: const input d_A, writable output d_B, dimensions w and h.
    // Suffix = element type of both buffers (B = bool, S = float, D = double,
    // C = cuFloatComplex, Z = cuDoubleComplex).
    // NOTE(review): whether w/h describe the input or the output, and the storage
    // order, are not visible here -- confirm against the implementation.
    JKTAPI jktError_t jkt_ctranspose_B(const bool *d_A, bool *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_ctranspose_S(const float *d_A, float *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_ctranspose_D(const double *d_A, double *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_ctranspose_C(const cuFloatComplex *d_A, cuFloatComplex *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_ctranspose_Z(const cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned w, unsigned h);

    // ctranspose (gfor)
    // Same signatures with an extra ngfor argument.
    // NOTE(review): ngfor is presumably the number of batched matrices -- confirm.
    JKTAPI jktError_t jkt_ctranspose_B_gfor(const bool *d_A, bool *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_ctranspose_S_gfor(const float *d_A, float *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_ctranspose_D_gfor(const double *d_A, double *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_ctranspose_C_gfor(const cuFloatComplex *d_A, cuFloatComplex *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_ctranspose_Z_gfor(const cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned w, unsigned h, unsigned ngfor);

    // diff
    // Takes d_Y and d_X with their dimension arrays (dims_*, ndims_* entries each),
    // plus an order and a dim. Declared for float (S) and double (D) only.
    // NOTE(review): d_Y is presumably the output and d_X the input, with the caller
    // supplying the already-computed output dimensions -- confirm.
    JKTAPI jktError_t jkt_diff_S(float *d_Y, float *d_X, int *dims_Y, int ndims_Y, int *dims_X, int ndims_X, int order, int dim);
    JKTAPI jktError_t jkt_diff_D(double *d_Y, double *d_X, int *dims_Y, int ndims_Y, int *dims_X, int ndims_X, int order, int dim);

    // find (single index output)
    // Suffix: first letter = type of the index buffer *d_I (S = float, D = double),
    // second letter = type of d_X (S = float, B = bool, U = unsigned, I = int, D = double).
    // n_out and d_I are passed by address.
    // NOTE(review): presumably the function allocates *d_I and stores the number of
    // hits in *n_out, leaving the caller to free the buffer -- confirm ownership.
    JKTAPI jktError_t jkt_find_SS(unsigned *n_out, float **d_I, unsigned n_in, float *d_X);
    JKTAPI jktError_t jkt_find_SB(unsigned *n_out, float **d_I, unsigned n_in, bool *d_X);
    JKTAPI jktError_t jkt_find_SU(unsigned *n_out, float **d_I, unsigned n_in, unsigned *d_X);
    JKTAPI jktError_t jkt_find_SI(unsigned *n_out, float **d_I, unsigned n_in, int *d_X);
    JKTAPI jktError_t jkt_find_DD(unsigned *n_out, double **d_I, unsigned n_in, double *d_X);


    // find (two index outputs)
    // Same as above with a second index buffer d_J and a rows argument; the first two
    // suffix letters give the types of *d_I and *d_J, the third the type of d_X.
    // NOTE(review): d_I/d_J are presumably row/column subscripts derived using rows --
    // confirm against the implementation.
    JKTAPI jktError_t jkt_find_SSS(unsigned *n_out, float **d_I, float **d_J, unsigned n_in, unsigned rows, float *d_X);
    JKTAPI jktError_t jkt_find_SSB(unsigned *n_out, float **d_I, float **d_J, unsigned n_in, unsigned rows, bool *d_X);
    JKTAPI jktError_t jkt_find_SSU(unsigned *n_out, float **d_I, float **d_J, unsigned n_in, unsigned rows, unsigned *d_X);
    JKTAPI jktError_t jkt_find_SSI(unsigned *n_out, float **d_I, float **d_J, unsigned n_in, unsigned rows, int *d_X);
    JKTAPI jktError_t jkt_find_DDD(unsigned *n_out, double **d_I, double **d_J, unsigned n_in, unsigned rows, double *d_X);

    // grad
    // Takes two buffers d_dx and d_dy, a buffer d_A, sizes ny/nx and a batch count.
    // Suffix = element type of all three buffers (B = bool, U = unsigned, I = int,
    // S = float, D = double).
    // NOTE(review): d_dx/d_dy are presumably the outputs and d_A the ny-by-nx input --
    // confirm against the implementation.
    JKTAPI jktError_t jkt_grad_B(bool *d_dx, bool *d_dy, bool *d_A, unsigned ny, unsigned nx, unsigned batch);
    JKTAPI jktError_t jkt_grad_U(unsigned *d_dx, unsigned *d_dy, unsigned *d_A, unsigned ny, unsigned nx, unsigned batch);
    JKTAPI jktError_t jkt_grad_I(int *d_dx, int *d_dy, int *d_A, unsigned ny, unsigned nx, unsigned batch);
    JKTAPI jktError_t jkt_grad_S(float *d_dx, float *d_dy, float *d_A, unsigned ny, unsigned nx, unsigned batch);
    JKTAPI jktError_t jkt_grad_D(double *d_dx, double *d_dy, double *d_A, unsigned ny, unsigned nx, unsigned batch);

    // FFT family (1-D, 2-D, and dim3-sized variants).
    // Suffix: first letter = type of d_I, second letter = type of d_O
    // (S = float, D = double, C = cuFloatComplex, Z = cuDoubleComplex).
    // Sizes are passed separately for d_I and d_O (nin/nout, xi,yi/xo,yo, dims_I/dims_O).
    // NOTE(review): d_I/d_O are presumably input/output; how differing input and output
    // sizes are handled (padding/truncation) is not visible here -- confirm.

    // fft
    // use the direction parameter from CUFFT
    JKTAPI jktError_t jkt_fft_SC(float  *d_I, unsigned nin, 
                                 cuFloatComplex  *d_O, unsigned nout,
                                 unsigned batch, int direction);
    JKTAPI jktError_t jkt_fft_DZ(double *d_I, unsigned nin,
                                 cuDoubleComplex *d_O, unsigned nout,
                                 unsigned batch, int direction);
    JKTAPI jktError_t jkt_fft_CC(cuFloatComplex  *d_I, unsigned nin,
                                 cuFloatComplex  *d_O, unsigned nout,
                                 unsigned batch, int direction);
    JKTAPI jktError_t jkt_fft_ZZ(cuDoubleComplex *d_I, unsigned nin,
                                 cuDoubleComplex *d_O, unsigned nout,
                                 unsigned batch, int direction);

    // fft2
    JKTAPI jktError_t jkt_fft2_SC(float  *d_I, unsigned xi, unsigned yi, 
                                  cuFloatComplex  *d_O, unsigned xo, unsigned yo,
                                  unsigned batch, int direction);
    JKTAPI jktError_t jkt_fft2_DZ(double *d_I, unsigned xi, unsigned yi, 
                                  cuDoubleComplex *d_O, unsigned xo, unsigned yo,
                                  unsigned batch, int direction);
    JKTAPI jktError_t jkt_fft2_CC(cuFloatComplex  *d_I, unsigned xi, unsigned yi, 
                                  cuFloatComplex  *d_O, unsigned xo, unsigned yo,
                                  unsigned batch, int direction);
    JKTAPI jktError_t jkt_fft2_ZZ(cuDoubleComplex *d_I, unsigned xi, unsigned yi, 
                                  cuDoubleComplex *d_O, unsigned xo, unsigned yo,
                                  unsigned batch, int direction);

    // fftn
    // Sizes are passed by value as dim3, so these variants carry at most three extents.
    JKTAPI jktError_t jkt_fftn_SC(float  *d_I, dim3 dims_I,
                                  cuFloatComplex  *d_O, dim3 dims_O,
                                  unsigned batch, int direction);
    JKTAPI jktError_t jkt_fftn_DZ(double *d_I, dim3 dims_I,
                                  cuDoubleComplex *d_O, dim3 dims_O,
                                  unsigned batch, int direction);
    JKTAPI jktError_t jkt_fftn_CC(cuFloatComplex  *d_I, dim3 dims_I,
                                  cuFloatComplex  *d_O, dim3 dims_O,
                                  unsigned batch, int direction);
    JKTAPI jktError_t jkt_fftn_ZZ(cuDoubleComplex *d_I, dim3 dims_I,
                                  cuDoubleComplex *d_O, dim3 dims_O,
                                  unsigned batch, int direction);

    // fft plans
    // You can tune how many plans to cache
    // jkt_fft_setplans takes the maximum number of plans; jkt_fft_clrplans takes no arguments.
    // NOTE(review): clrplans presumably releases every cached plan -- confirm.
    JKTAPI jktError_t jkt_fft_setplans(int max_plans);
    JKTAPI jktError_t jkt_fft_clrplans(void);


    // LU Decomposition
    // Declared for S (float), D (double), C (cuFloatComplex), Z (cuDoubleComplex).
    // Each variant takes sizes m and n, a buffer d_LU, an int pivot buffer d_piv,
    // separate d_L and d_U buffers, a size k and a batch count.
    // NOTE(review): whether d_LU is factorised in place, which of d_L/d_U may be NULL,
    // and the meaning of k (presumably min(m, n)) are not visible here -- confirm.
    JKTAPI jktError_t jkt_lu_S(unsigned m, unsigned n, float *d_LU, int *d_piv,
                               float  *d_L, float  *d_U, unsigned k, unsigned batch);
    JKTAPI jktError_t jkt_lu_D(unsigned m, unsigned n, double *d_LU, int *d_piv,
                               double *d_L, double *d_U, unsigned k, unsigned batch);
    JKTAPI jktError_t jkt_lu_C(unsigned m, unsigned n, 
                               cuFloatComplex *d_LU, int *d_piv,
                               cuFloatComplex  *d_L, cuFloatComplex  *d_U,
                               unsigned k, unsigned batch);
    JKTAPI jktError_t jkt_lu_Z(unsigned m, unsigned n, 
                               cuDoubleComplex *d_LU, int *d_piv,
                               cuDoubleComplex *d_L, cuDoubleComplex *d_U, 
                               unsigned k, unsigned batch);

    // Get final locations of pivot indices
    // All variants read an int pivot buffer d_piv; the suffix is the element type of
    // d_out (I = int, U = unsigned, S = float, D = double).
    // NOTE(review): d_piv is presumably the pivot buffer produced by jkt_lu_* and the
    // length of d_out is not visible here -- confirm against the implementation.
    JKTAPI jktError_t jkt_piv_final_I(unsigned m, unsigned k, int *d_piv, 
                                      int      *d_out, unsigned batch);
    JKTAPI jktError_t jkt_piv_final_U(unsigned m, unsigned k, int *d_piv, 
                                      unsigned *d_out, unsigned batch);
    JKTAPI jktError_t jkt_piv_final_S(unsigned m, unsigned k, int *d_piv, 
                                      float    *d_out, unsigned batch);
    JKTAPI jktError_t jkt_piv_final_D(unsigned m, unsigned k, int *d_piv, 
                                      double   *d_out, unsigned batch);

    // QR
    // Declared for S (float), D (double), C (cuFloatComplex), Z (cuDoubleComplex).
    // Each variant takes sizes m and n, buffers d_Q, d_tau and d_R of the same element
    // type, a size k and a batch count.
    // NOTE(review): d_Q presumably holds the input matrix on entry and d_tau the
    // Householder scalars (LAPACK geqrf convention) -- confirm against the implementation.
    JKTAPI jktError_t jkt_qr_S(unsigned m, unsigned n, float  *d_Q, float  *d_tau,
                               float  *d_R, unsigned k, unsigned batch);
    JKTAPI jktError_t jkt_qr_D(unsigned m, unsigned n, double *d_Q, double *d_tau, 
                               double *d_R, unsigned k, unsigned batch);
    JKTAPI jktError_t jkt_qr_C(unsigned m, unsigned n, 
                               cuFloatComplex *d_Q,  cuFloatComplex *d_tau, 
                               cuFloatComplex  *d_R, unsigned k, unsigned batch);
    JKTAPI jktError_t jkt_qr_Z(unsigned m, unsigned n, 
                               cuDoubleComplex *d_Q, cuDoubleComplex *d_tau, 
                               cuDoubleComplex *d_R, unsigned k, unsigned batch);

    // CHOLESKY
    // Declared for S (float), D (double), C (cuFloatComplex), Z (cuDoubleComplex).
    // Each variant takes a bool ISUP, a size n, buffers d_A and d_R, an unsigned *INFO
    // and a batch count.
    // NOTE(review): ISUP presumably selects the upper-triangular factor and INFO
    // presumably reports a non-positive-definite input (LAPACK potrf convention);
    // whether INFO is a host or device pointer is not visible here -- confirm.
    JKTAPI jktError_t jkt_cholesky_S(bool ISUP, unsigned n, float *d_A, float *d_R,
                                     unsigned *INFO, unsigned batch);
    JKTAPI jktError_t jkt_cholesky_D(bool ISUP, unsigned n, double *d_A, double *d_R,
                                     unsigned *INFO, unsigned batch);
    JKTAPI jktError_t jkt_cholesky_C(bool ISUP, unsigned n, 
                                     cuFloatComplex *d_A, cuFloatComplex *d_R, 
                                     unsigned *INFO, unsigned batch);
    JKTAPI jktError_t jkt_cholesky_Z(bool ISUP, unsigned n,
                                     cuDoubleComplex *d_A, cuDoubleComplex *d_R,
                                     unsigned *INFO, unsigned batch);
    // HESSENBERG
    // Declared for S (float), D (double), C (cuFloatComplex), Z (cuDoubleComplex).
    // Each variant takes a size n, buffers d_A, d_H and d_Q of the same element type,
    // and a batch count.
    // NOTE(review): d_A is presumably the n-by-n input with d_H and d_Q the outputs;
    // whether d_Q may be NULL is not visible here -- confirm.
    JKTAPI jktError_t jkt_hessenberg_S(unsigned n, float *d_A, float *d_H,
                                     float *d_Q, unsigned batch);
    JKTAPI jktError_t jkt_hessenberg_D(unsigned n, double *d_A, double *d_H,
                                     double *d_Q, unsigned batch);
    JKTAPI jktError_t jkt_hessenberg_C(unsigned n, 
                                     cuFloatComplex *d_A, cuFloatComplex *d_H, 
                                     cuFloatComplex *d_Q, unsigned batch);
    JKTAPI jktError_t jkt_hessenberg_Z(unsigned n,
                                     cuDoubleComplex *d_A, cuDoubleComplex *d_H,
                                     cuDoubleComplex *d_Q, unsigned batch);

    // EIG
    // The real variants (S, D) and the complex variants (C, Z) have different shapes:
    //   - S/D take d_Val and d_Vec as void ** and add a bool *is_imag argument;
    //   - C/Z take d_Val and d_Vec as typed complex pointers and have no is_imag.
    // NOTE(review): for S/D the function presumably allocates *d_Val / *d_Vec and sets
    // *is_imag when the results are complex, so the caller must inspect is_imag before
    // interpreting the buffers -- confirm type, ownership and the meaning of the `type`
    // and is_diag arguments against the implementation.
    JKTAPI jktError_t jkt_eig_S(int type, unsigned n, float *d_A, void **d_Val,
                                void **d_Vec, unsigned batch,
                                bool is_diag, bool *is_imag);
    JKTAPI jktError_t jkt_eig_D(int type, unsigned n, double *d_A, void **d_Val,
                                void **d_Vec, unsigned batch,
                                bool is_diag, bool *is_imag);
    JKTAPI jktError_t jkt_eig_C(int type, unsigned n, 
                                cuFloatComplex  *d_A, cuFloatComplex *d_Val,
                                cuFloatComplex  *d_Vec, unsigned batch, bool is_diag);
    JKTAPI jktError_t jkt_eig_Z(int type, unsigned n,
                                cuDoubleComplex *d_A, cuDoubleComplex *d_Val,
                                cuDoubleComplex *d_Vec, unsigned batch,bool is_diag);

    // SVD 
    // Declared for S (float), D (double), C (cuFloatComplex), Z (cuDoubleComplex).
    // Each variant takes job flags jobU/jobV, sizes m and n, buffers d_A, d_S, d_U,
    // d_V, a second pair of sizes m_/n_, a batch count and is_diag.
    // NOTE(review): jobU/jobV presumably follow the LAPACK gesvd job characters and
    // m_/n_ presumably size d_U/d_V -- confirm against the implementation.
    // NOTE(review): d_S is declared with the complex element type in the _C/_Z
    // variants although singular values are real-valued; confirm how d_S is filled.
    JKTAPI jktError_t jkt_svd_S(char jobU, char jobV, unsigned m, unsigned n,
                                float *d_A, float *d_S, float *d_U, float *d_V,
                                unsigned m_, unsigned n_, unsigned batch, bool is_diag);
    JKTAPI jktError_t jkt_svd_D(char jobU, char jobV, unsigned m, unsigned n,
                                double *d_A, double *d_S, double *d_U, double *d_V,
                                unsigned m_, unsigned n_, unsigned batch, bool is_diag);
    JKTAPI jktError_t jkt_svd_C(char jobU, char jobV, unsigned m, unsigned n,
                                cuFloatComplex *d_A, cuFloatComplex *d_S, 
                                cuFloatComplex *d_U, cuFloatComplex *d_V,
                                unsigned m_, unsigned n_, unsigned batch, bool is_diag);
    JKTAPI jktError_t jkt_svd_Z(char jobU, char jobV, unsigned m, unsigned n,
                                cuDoubleComplex *d_A, cuDoubleComplex *d_S, 
                                cuDoubleComplex *d_U, cuDoubleComplex *d_V,
                                unsigned m_, unsigned n_, unsigned batch, bool is_diag);

    // INV
    // Takes a size n, buffers d_X and d_Y of the same element type, and a batch count.
    // Suffix = element type (S = float, D = double, C = cuFloatComplex, Z = cuDoubleComplex).
    // NOTE(review): d_X is presumably the n-by-n input and d_Y the inverse; whether
    // d_X is overwritten is not visible here -- confirm.
    JKTAPI jktError_t jkt_inv_S(unsigned n, float  *d_X, float  *d_Y, unsigned batch);
    JKTAPI jktError_t jkt_inv_D(unsigned n, double *d_X, double *d_Y, unsigned batch);
    JKTAPI jktError_t jkt_inv_C(unsigned n, cuFloatComplex  *d_X, cuFloatComplex  *d_Y, unsigned batch);
    JKTAPI jktError_t jkt_inv_Z(unsigned n, cuDoubleComplex *d_X, cuDoubleComplex *d_Y, unsigned batch);

    // DET
    // Takes a size n, a buffer d_X, a result pointer res of the same element type,
    // a batch count and an inplace flag.
    // Suffix = element type (S = float, D = double, C = cuFloatComplex, Z = cuDoubleComplex).
    // NOTE(review): inplace presumably allows d_X to be overwritten; res has no h_/d_
    // prefix, so whether it is host or device memory is not visible here -- confirm.
    JKTAPI jktError_t jkt_det_S(unsigned n, float *d_X, float *res, unsigned batch, bool inplace);
    JKTAPI jktError_t jkt_det_D(unsigned n, double *d_X, double *res, unsigned batch, bool inplace);
    JKTAPI jktError_t jkt_det_C(unsigned n, cuFloatComplex *d_X, cuFloatComplex *res, unsigned batch, bool inplace);
    JKTAPI jktError_t jkt_det_Z(unsigned n, cuDoubleComplex *d_X, cuDoubleComplex *res, unsigned batch, bool inplace);

    // MATPOW
    // Takes a size n, a buffer d_X, a scalar y, a buffer d_Z and a batch count.
    // Suffix = element type of d_X/d_Z (S = float, D = double, C = cuFloatComplex,
    // Z = cuDoubleComplex).
    // NOTE(review): y is declared float in every variant, including the
    // double-precision _D/_Z ones -- confirm this is intentional and not a lost
    // `double y`. d_Z is presumably the result of raising d_X to the power y.
    JKTAPI jktError_t jkt_matrixPower_S(unsigned n, float  *d_X, float y, float  *d_Z, unsigned batch);
    JKTAPI jktError_t jkt_matrixPower_D(unsigned n, double *d_X, float y, double *d_Z, unsigned batch);
    JKTAPI jktError_t jkt_matrixPower_C(unsigned n, cuFloatComplex  *d_X, float y, cuFloatComplex  *d_Z, unsigned batch);
    JKTAPI jktError_t jkt_matrixPower_Z(unsigned n, cuDoubleComplex *d_X, float y, cuDoubleComplex *d_Z, unsigned batch);
 
    // LINSOLVE
    // Use options specified in jacket_defines.h
    // e.g OPTIONS = MAT_POSDEF | MAT_UP_TRI | MAT_TRANS;
    // Each variant takes sizes m and n with buffer d_A, a size k with buffer d_B, a
    // buffer d_X, the OPTIONS bit mask and separate batch counts for A and B.
    // Suffix = element type (S = float, D = double, C = cuFloatComplex, Z = cuDoubleComplex).
    // NOTE(review): presumably solves A*X = B with A m-by-n and B having k columns --
    // confirm dimensions and whether d_A/d_B are overwritten.
    JKTAPI jktError_t jkt_linearSolve_S(unsigned m, unsigned n, float *d_A, unsigned k, float *d_B, float *d_X, unsigned OPTIONS, unsigned batchA, unsigned batchB);
    JKTAPI jktError_t jkt_linearSolve_D(unsigned m, unsigned n, double *d_A, unsigned k, double *d_B, double *d_X, unsigned OPTIONS, unsigned batchA, unsigned batchB);
    JKTAPI jktError_t jkt_linearSolve_C(unsigned m, unsigned n, cuFloatComplex *d_A, unsigned k, cuFloatComplex *d_B, cuFloatComplex *d_X, unsigned OPTIONS, unsigned batchA, unsigned batchB);
    JKTAPI jktError_t jkt_linearSolve_Z(unsigned m, unsigned n, cuDoubleComplex *d_A, unsigned k, cuDoubleComplex *d_B, cuDoubleComplex *d_X, unsigned OPTIONS, unsigned batchA, unsigned batchB);

    // rand
    // jkt_rand_S / jkt_rand_D take the output buffer by address (float ** / double **)
    // together with an element count.
    // NOTE(review): the double pointer suggests the function allocates *d_Y and the
    // caller releases it -- confirm ownership against the implementation.
    JKTAPI jktError_t jkt_rand_S(float **d_Y, unsigned numel);
    JKTAPI jktError_t jkt_rand_D(double **d_Y, unsigned numel);
    // Seed accessors: the setter takes the seed, the getter returns an unsigned.
    JKTAPI void jkt_rand_set_seed(unsigned seed);
    // Declared with (void): in C an empty parameter list leaves the arguments
    // unspecified and disables argument checking. Identical meaning in C++.
    JKTAPI unsigned jkt_rand_get_seed(void);

    // randn
    // jkt_randn_S / jkt_randn_D take the output buffer by address (float ** / double **)
    // together with an element count.
    // NOTE(review): the double pointer suggests the function allocates *d_Y and the
    // caller releases it -- confirm ownership against the implementation.
    JKTAPI jktError_t jkt_randn_S(float **d_Y, unsigned numel);
    JKTAPI jktError_t jkt_randn_D(double **d_Y, unsigned numel);
    // Seed accessors: the setter takes the seed, the getter returns an unsigned.
    JKTAPI void jkt_randn_set_seed(unsigned seed);
    // Declared with (void): in C an empty parameter list leaves the arguments
    // unspecified and disables argument checking. Identical meaning in C++.
    JKTAPI unsigned jkt_randn_get_seed(void);

    // hankel
    // Every variant takes a buffer d_H, a count nc with buffer d_C, and a count nr
    // with buffer d_R. The three suffix letters give the element types of d_H, d_C and
    // d_R in that order (S = float, D = double, C = cuFloatComplex, Z = cuDoubleComplex).
    // NOTE(review): d_H is presumably the output matrix built from first column d_C
    // (nc entries) and last row d_R (nr entries) -- confirm sizes and layout against
    // the implementation.
    JKTAPI jktError_t jkt_hankel_SSS(float *d_H, unsigned nc, float *d_C, unsigned nr, float *d_R);
    JKTAPI jktError_t jkt_hankel_CSC(cuFloatComplex *d_H, unsigned nc, float *d_C, unsigned nr, cuFloatComplex *d_R);
    JKTAPI jktError_t jkt_hankel_CCS(cuFloatComplex *d_H, unsigned nc, cuFloatComplex *d_C, unsigned nr, float *d_R);
    JKTAPI jktError_t jkt_hankel_CCC(cuFloatComplex *d_H, unsigned nc, cuFloatComplex *d_C, unsigned nr, cuFloatComplex *d_R);
    JKTAPI jktError_t jkt_hankel_SSD(float *d_H, unsigned nc, float *d_C, unsigned nr, double *d_R);
    JKTAPI jktError_t jkt_hankel_CSZ(cuFloatComplex *d_H, unsigned nc, float *d_C, unsigned nr, cuDoubleComplex *d_R);
    JKTAPI jktError_t jkt_hankel_CCD(cuFloatComplex *d_H, unsigned nc, cuFloatComplex *d_C, unsigned nr, double *d_R);
    JKTAPI jktError_t jkt_hankel_CCZ(cuFloatComplex *d_H, unsigned nc, cuFloatComplex *d_C, unsigned nr, cuDoubleComplex *d_R);
    JKTAPI jktError_t jkt_hankel_SDS(float *d_H, unsigned nc, double *d_C, unsigned nr, float *d_R);
    JKTAPI jktError_t jkt_hankel_CDC(cuFloatComplex *d_H, unsigned nc, double *d_C, unsigned nr, cuFloatComplex *d_R);
    JKTAPI jktError_t jkt_hankel_CZS(cuFloatComplex *d_H, unsigned nc, cuDoubleComplex *d_C, unsigned nr, float *d_R);
    JKTAPI jktError_t jkt_hankel_CZC(cuFloatComplex *d_H, unsigned nc, cuDoubleComplex *d_C, unsigned nr, cuFloatComplex *d_R);
    JKTAPI jktError_t jkt_hankel_DSD(double *d_H, unsigned nc, float *d_C, unsigned nr, double *d_R);
    JKTAPI jktError_t jkt_hankel_ZSZ(cuDoubleComplex *d_H, unsigned nc, float *d_C, unsigned nr, cuDoubleComplex *d_R);
    JKTAPI jktError_t jkt_hankel_ZCD(cuDoubleComplex *d_H, unsigned nc, cuFloatComplex *d_C, unsigned nr, double *d_R);
    JKTAPI jktError_t jkt_hankel_ZCZ(cuDoubleComplex *d_H, unsigned nc, cuFloatComplex *d_C, unsigned nr, cuDoubleComplex *d_R);
    JKTAPI jktError_t jkt_hankel_DDS(double *d_H, unsigned nc, double *d_C, unsigned nr, float *d_R);
    JKTAPI jktError_t jkt_hankel_ZDC(cuDoubleComplex *d_H, unsigned nc, double *d_C, unsigned nr, cuFloatComplex *d_R);
    JKTAPI jktError_t jkt_hankel_ZZS(cuDoubleComplex *d_H, unsigned nc, cuDoubleComplex *d_C, unsigned nr, float *d_R);
    JKTAPI jktError_t jkt_hankel_ZZC(cuDoubleComplex *d_H, unsigned nc, cuDoubleComplex *d_C, unsigned nr, cuFloatComplex *d_R);
    JKTAPI jktError_t jkt_hankel_DDD(double *d_H, unsigned nc, double *d_C, unsigned nr, double *d_R);
    JKTAPI jktError_t jkt_hankel_ZDZ(cuDoubleComplex *d_H, unsigned nc, double *d_C, unsigned nr, cuDoubleComplex *d_R);
    JKTAPI jktError_t jkt_hankel_ZZD(cuDoubleComplex *d_H, unsigned nc, cuDoubleComplex *d_C, unsigned nr, double *d_R);
    JKTAPI jktError_t jkt_hankel_ZZZ(cuDoubleComplex *d_H, unsigned nc, cuDoubleComplex *d_C, unsigned nr, cuDoubleComplex *d_R);

    // Non zero element count
    // Takes an element count numel, a buffer mat, an unsigned *count and a batch count.
    // Suffix = element type of mat (S = float, B = bool, D = double).
    // NOTE(review): mat and count carry no h_/d_ prefix, so their memory space is not
    // visible here; count presumably receives one value per batch -- confirm.
    JKTAPI jktError_t jkt_nnz_S(unsigned numel, float *mat, 
                                unsigned *count, unsigned batch);
    JKTAPI jktError_t jkt_nnz_B(unsigned numel, bool *mat, 
                                unsigned *count, unsigned batch);
    JKTAPI jktError_t jkt_nnz_D(unsigned numel, double *mat, 
                                unsigned *count, unsigned batch);


    // "max" reductions, declared per element type.
    // Suffix letters mirror the parameter types below: first letter = type of h_dst,
    // second letter = type of d_src (S = float, B = bool, C = cuFloatComplex,
    // D = double, Z = cuDoubleComplex). Destination and source types match in every variant.
    // NOTE(review): the h_/d_ prefixes presumably mark host/device pointers; the
    // ordering used for the complex variants and the required length of h_dst for the
    // columns/rows variants are not visible here -- confirm against the implementation.

    // max vector
    JKTAPI jktError_t jkt_max_vector_SS(float *h_dst, float *d_src, unsigned numel);
    JKTAPI jktError_t jkt_max_vector_BB(bool *h_dst, bool *d_src, unsigned numel);
    JKTAPI jktError_t jkt_max_vector_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned numel);
    JKTAPI jktError_t jkt_max_vector_DD(double *h_dst, double *d_src, unsigned numel);
    JKTAPI jktError_t jkt_max_vector_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned numel);



    // max columns
    JKTAPI jktError_t jkt_max_columns_SS(float *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_columns_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_columns_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_columns_DD(double *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_columns_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);



    // max rows
    JKTAPI jktError_t jkt_max_rows_SS(float *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_rows_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_rows_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_rows_DD(double *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_max_rows_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);



    // max over a stride (most general case)
    // Source and destination are untyped (void *); cls_src/cls_dst and cplx are passed
    // alongside them, and a float index buffer d_idx is accepted.
    // NOTE(review): cls_* presumably select the element class, idx_out presumably
    // requests the positions of the maxima in d_idx, and device_result presumably
    // selects where the result lives -- the valid values are not visible in this
    // header; confirm in jacket_defines.h / the implementation.
    // NOTE(review): this exported symbol carries no jkt_ prefix.
    JKTAPI jktError_t max_stride_r(unsigned nout, unsigned numel, unsigned stride, unsigned batch,
                                   unsigned outer_stride, unsigned outer_batch, void *d_dst, unsigned cls_dst,
                                   void *d_src, unsigned cls_src, float *d_idx, bool cplx,
                                   bool idx_out, bool device_result);

    // "min" reductions, declared per element type.
    // Suffix letters mirror the parameter types below: first letter = type of h_dst,
    // second letter = type of d_src (S = float, B = bool, C = cuFloatComplex,
    // D = double, Z = cuDoubleComplex). Destination and source types match in every variant.
    // NOTE(review): the h_/d_ prefixes presumably mark host/device pointers; the
    // ordering used for the complex variants and the required length of h_dst for the
    // columns/rows variants are not visible here -- confirm against the implementation.

    // min vector
    JKTAPI jktError_t jkt_min_vector_SS(float *h_dst, float *d_src, unsigned numel);
    JKTAPI jktError_t jkt_min_vector_BB(bool *h_dst, bool *d_src, unsigned numel);
    JKTAPI jktError_t jkt_min_vector_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned numel);
    JKTAPI jktError_t jkt_min_vector_DD(double *h_dst, double *d_src, unsigned numel);
    JKTAPI jktError_t jkt_min_vector_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned numel);



    // min columns
    JKTAPI jktError_t jkt_min_columns_SS(float *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_columns_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_columns_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_columns_DD(double *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_columns_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);



    // min rows
    JKTAPI jktError_t jkt_min_rows_SS(float *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_rows_BB(bool *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_rows_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_rows_DD(double *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_min_rows_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);



    // min over a stride (most general case)
    // Source and destination are untyped (void *); cls_src/cls_dst and cplx are passed
    // alongside them, and a float index buffer d_idx is accepted.
    // NOTE(review): cls_* presumably select the element class, idx_out presumably
    // requests the positions of the minima in d_idx, and device_result presumably
    // selects where the result lives -- the valid values are not visible in this
    // header; confirm in jacket_defines.h / the implementation.
    // NOTE(review): this exported symbol carries no jkt_ prefix.
    JKTAPI jktError_t min_stride_r(unsigned nout, unsigned numel, unsigned stride, unsigned batch,
                                   unsigned outer_stride, unsigned outer_batch, void *d_dst, unsigned cls_dst,
                                   void *d_src, unsigned cls_src, float *d_idx, bool cplx,
                                   bool idx_out, bool device_result);

    // "sum" reductions, declared per element type.
    // Suffix letters mirror the parameter types below: first letter = type of h_dst,
    // second letter = type of d_src (S = float, B = bool, C = cuFloatComplex,
    // D = double, Z = cuDoubleComplex). Note the bool source variants (_SB) write a
    // float destination; all other variants keep the source type.
    // NOTE(review): the h_/d_ prefixes presumably mark host/device pointers, and the
    // required length of h_dst for the columns/rows variants is not visible here --
    // confirm both against the implementation.

    // sum vector
    JKTAPI jktError_t jkt_sum_vector_SS(float *h_dst, float *d_src, unsigned numel);
    JKTAPI jktError_t jkt_sum_vector_SB(float *h_dst, bool *d_src, unsigned numel);
    JKTAPI jktError_t jkt_sum_vector_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned numel);
    JKTAPI jktError_t jkt_sum_vector_DD(double *h_dst, double *d_src, unsigned numel);
    JKTAPI jktError_t jkt_sum_vector_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned numel);



    // sum columns
    JKTAPI jktError_t jkt_sum_columns_SS(float *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_columns_SB(float *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_columns_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_columns_DD(double *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_columns_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);



    // sum rows
    JKTAPI jktError_t jkt_sum_rows_SS(float *h_dst, float *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_rows_SB(float *h_dst, bool *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_rows_CC(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_rows_DD(double *h_dst, double *d_src, unsigned nrows, unsigned ncols);
    JKTAPI jktError_t jkt_sum_rows_ZZ(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);



    // sum over a stride (most general case)
    // Source and destination are untyped (void *); cls_src/cls_dst and cplx are passed
    // alongside them.
    // NOTE(review): cls_* presumably select the element class and idx_out/device_result
    // presumably control whether d_idx is written and where the result lives -- the valid
    // values are not visible in this header; confirm in jacket_defines.h / the implementation.
    // NOTE(review): this exported symbol carries no jkt_ prefix.
    JKTAPI jktError_t sum_stride_r(unsigned nout, unsigned numel, unsigned stride, unsigned batch,
                                   unsigned outer_stride, unsigned outer_batch, void *d_dst, unsigned cls_dst,
                                   void *d_src, unsigned cls_src, float *d_idx, bool cplx,
                                   bool idx_out, bool device_result);

    // transpose: const input d_A, writable output d_B, dimensions w and h.
    // Suffix = element type of both buffers (B = bool, S = float, D = double,
    // C = cuFloatComplex, Z = cuDoubleComplex).
    // NOTE(review): whether w/h describe the input or the output, and the storage
    // order, are not visible here -- confirm against the implementation.
    JKTAPI jktError_t jkt_transpose_B(const bool *d_A, bool *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_transpose_S(const float *d_A, float *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_transpose_D(const double *d_A, double *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_transpose_C(const cuFloatComplex *d_A, cuFloatComplex *d_B, unsigned w, unsigned h);
    JKTAPI jktError_t jkt_transpose_Z(const cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned w, unsigned h);



    // transpose (gfor)
    // Same signatures with an extra ngfor argument.
    // NOTE(review): ngfor is presumably the number of batched matrices -- confirm.
    JKTAPI jktError_t jkt_transpose_B_gfor(const bool *d_A, bool *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_transpose_S_gfor(const float *d_A, float *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_transpose_D_gfor(const double *d_A, double *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_transpose_C_gfor(const cuFloatComplex *d_A, cuFloatComplex *d_B, unsigned w, unsigned h, unsigned ngfor);
    JKTAPI jktError_t jkt_transpose_Z_gfor(const cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned w, unsigned h, unsigned ngfor);


    // BLAS Level 3
    // General matrix-matrix multiply entry points. The argument list mirrors
    // the BLAS xGEMM order: TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB,
    // BETA, C, LDC.
    // NOTE(review): jkt_dgemm, jkt_cgemm and jkt_zgemm are declared with the
    // very same float scalars and float pointers as jkt_sgemm. BLAS naming
    // would suggest double, cuFloatComplex and cuDoubleComplex respectively.
    // This looks like a copy-paste of the sgemm prototype -- confirm against
    // the implementation before relying on (or changing) these declarations.
    JKTAPI jktError_t jkt_sgemm(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC);
    JKTAPI jktError_t jkt_dgemm(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC);
    JKTAPI jktError_t jkt_cgemm(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC);
    JKTAPI jktError_t jkt_zgemm(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC);


    // BLAS Level 3 gfor
    // gfor variants of the GEMM entry points: the xGEMM-style argument list
    // followed by two counts, ngforA and ngforB.
    // NOTE(review): ngforA / ngforB are presumably the gfor batch counts of
    // the A and B operands -- confirm.
    // NOTE(review): as with the non-gfor prototypes, the d/c/z variants are
    // declared with float scalars and float pointers identical to the s
    // variant; BLAS naming would suggest double / cuFloatComplex /
    // cuDoubleComplex. Confirm against the implementation.
    JKTAPI jktError_t jkt_sgemm_gfor(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC, unsigned ngforA, unsigned ngforB);
    JKTAPI jktError_t jkt_dgemm_gfor(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC, unsigned ngforA, unsigned ngforB);
    JKTAPI jktError_t jkt_cgemm_gfor(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC, unsigned ngforA, unsigned ngforB);
    JKTAPI jktError_t jkt_zgemm_gfor(char TRANSA, char TRANSB, int M, int N, int K, float ALPHA, float *A, int LDA, float *B, int LDB, float BETA, float *C, int LDC, unsigned ngforA, unsigned ngforB);

    // Sort (S = float values, D = double values). Each variant takes an input
    // value buffer, an output value buffer and a float buffer d_outIndices;
    // note that d_outIndices is float in the double variant as well.
    // NOTE(review): DIR presumably selects the sort direction, dim the
    // dimension sorted along, and d_outIndices the resulting permutation --
    // confirm against the implementation.
    JKTAPI jktError_t jkt_sort_S(
        unsigned *dims, unsigned ndims, bool DIR,
        float *d_inVals, float *d_outVals, float *d_outIndices,
        unsigned dim, unsigned batch);
    JKTAPI jktError_t jkt_sort_D(
        unsigned *dims, unsigned ndims, bool DIR,
        double *d_inVals, double *d_outVals, float *d_outIndices,
        unsigned dim, unsigned batch);

    // Sortedness check (D = double input, S = float input). The answer is
    // delivered through res, an unsigned char buffer.
    // NOTE(review): is_row presumably chooses between checking rows and
    // checking columns of an m-by-n input -- confirm.
    JKTAPI jktError_t jkt_isSorted_D(
        unsigned m, unsigned n, double *in,
        unsigned char *res, bool is_row, unsigned batch);
    JKTAPI jktError_t jkt_isSorted_S(
        unsigned m, unsigned n, float *in,
        unsigned char *res, bool is_row, unsigned batch);

    // SORT Keys (the declaration below is commented out and therefore not part of the API)
    /* JKTAPI jktError_t jkt_sortByKey_S(unsigned *dims, unsigned ndims, bool DIR, */
    /*                                   float *d_inVals,  float *d_inKeys, */
    /*                                   float *d_outVals, float *d_outKeys, */
    /*                                   unsigned dim, unsigned batch); */


    // Triangular matrix, one entry point per element type
    // (S = float, C = cuFloatComplex, D = double, Z = cuDoubleComplex).
    // Takes an m-by-n sized problem with buffers d_A and d_T of the same
    // element type.
    // NOTE(review): d_A is presumably the input and d_T the triangular
    // result; diag presumably selects the diagonal offset and direction
    // upper vs. lower -- confirm.
    JKTAPI jktError_t jkt_triMat_S(
        unsigned m, unsigned n,
        float *d_A, float *d_T,
        int diag, bool direction, unsigned batch);
    JKTAPI jktError_t jkt_triMat_C(
        unsigned m, unsigned n,
        cuFloatComplex *d_A, cuFloatComplex *d_T,
        int diag, bool direction, unsigned batch);
    JKTAPI jktError_t jkt_triMat_D(
        unsigned m, unsigned n,
        double *d_A, double *d_T,
        int diag, bool direction, unsigned batch);
    JKTAPI jktError_t jkt_triMat_Z(
        unsigned m, unsigned n,
        cuDoubleComplex *d_A, cuDoubleComplex *d_T,
        int diag, bool direction, unsigned batch);


    // Inclusive sum (D = double, S = float) over an m-by-n sized problem.
    // NOTE(review): d_X is presumably the input, d_Y the output and dim the
    // dimension scanned along -- confirm.
    JKTAPI jktError_t jkt_inclusiveSum_D(
        unsigned m, unsigned n,
        double *d_X, double *d_Y,
        unsigned dim, unsigned batch);
    JKTAPI jktError_t jkt_inclusiveSum_S(
        unsigned m, unsigned n,
        float *d_X, float *d_Y,
        unsigned dim, unsigned batch);

    // Inclusive product (D = double, S = float) over an m-by-n sized problem.
    // NOTE(review): d_X is presumably the input, d_Y the output and dim the
    // dimension scanned along -- confirm.
    JKTAPI jktError_t jkt_inclusiveProduct_D(
        unsigned m, unsigned n,
        double *d_X, double *d_Y,
        unsigned dim, unsigned batch);
    JKTAPI jktError_t jkt_inclusiveProduct_S(
        unsigned m, unsigned n,
        float *d_X, float *d_Y,
        unsigned dim, unsigned batch);

    // Matrix shift, one entry point per element type
    // (B = bool, U = unsigned, I = int, S = float, D = double,
    //  C = cuFloatComplex, Z = cuDoubleComplex).
    // Every variant takes a dims array with ndims entries, a shifts array
    // with nshifts entries, and two buffers d_A / d_B of the element type.
    // NOTE(review): d_A is presumably the input and d_B the shifted output;
    // whether dims / shifts live on the host is not visible here -- confirm.
    JKTAPI jktError_t jkt_matrixShift_B(
        unsigned *dims, unsigned ndims, int *shifts, int nshifts,
        bool *d_A, bool *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShift_U(
        unsigned *dims, unsigned ndims, int *shifts, int nshifts,
        unsigned *d_A, unsigned *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShift_I(
        unsigned *dims, unsigned ndims, int *shifts, int nshifts,
        int *d_A, int *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShift_S(
        unsigned *dims, unsigned ndims, int *shifts, int nshifts,
        float *d_A, float *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShift_D(
        unsigned *dims, unsigned ndims, int *shifts, int nshifts,
        double *d_A, double *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShift_C(
        unsigned *dims, unsigned ndims, int *shifts, int nshifts,
        cuFloatComplex *d_A, cuFloatComplex *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShift_Z(
        unsigned *dims, unsigned ndims, int *shifts, int nshifts,
        cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned batch);

    // Matrix shuffle, one entry point per element type
    // (B = bool, U = unsigned, I = int, S = float, D = double,
    //  C = cuFloatComplex, Z = cuDoubleComplex).
    // Every variant takes a dims array with ndims entries, a shuffles array
    // with nshuffles entries, and two buffers d_A / d_B of the element type.
    // NOTE(review): d_A is presumably the input and d_B the shuffled output;
    // the exact meaning of the shuffles entries is not visible here -- confirm.
    JKTAPI jktError_t jkt_matrixShuffle_B(
        unsigned *dims, unsigned ndims, int *shuffles, int nshuffles,
        bool *d_A, bool *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShuffle_U(
        unsigned *dims, unsigned ndims, int *shuffles, int nshuffles,
        unsigned *d_A, unsigned *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShuffle_I(
        unsigned *dims, unsigned ndims, int *shuffles, int nshuffles,
        int *d_A, int *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShuffle_S(
        unsigned *dims, unsigned ndims, int *shuffles, int nshuffles,
        float *d_A, float *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShuffle_D(
        unsigned *dims, unsigned ndims, int *shuffles, int nshuffles,
        double *d_A, double *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShuffle_C(
        unsigned *dims, unsigned ndims, int *shuffles, int nshuffles,
        cuFloatComplex *d_A, cuFloatComplex *d_B, unsigned batch);
    JKTAPI jktError_t jkt_matrixShuffle_Z(
        unsigned *dims, unsigned ndims, int *shuffles, int nshuffles,
        cuDoubleComplex *d_A, cuDoubleComplex *d_B, unsigned batch);

    // 2-D interpolation, single precision only.
    // NOTE(review): (Z, X, Y) with sizes m, n are presumably the sampled
    // surface and its grid, (ZI, XI, YI) with sizes mi, ni the query points
    // and result, Z0 a fill value, and NZ / NX / NY batch counts for the
    // respective arrays -- none of this is visible in the header; confirm.
    JKTAPI jktError_t jkt_interp2D_S(
        unsigned m, unsigned n,
        float *Z, float *X, float *Y,
        unsigned mi, unsigned ni,
        float *ZI, float *XI, float *YI,
        float Z0, unsigned method,
        unsigned NZ, unsigned NX, unsigned NY);

    // Histogram (single precision): takes a data buffer with its length, a
    // bins buffer with nbins, and a float hist buffer.
    // NOTE(review): the exported symbol is spelled "historgram" (sic).
    // Renaming it would break existing callers, so the spelling is kept;
    // consider adding a correctly spelled alias.
    // NOTE(review): hist is presumably the output count buffer -- confirm.
    JKTAPI jktError_t jkt_historgram_S(unsigned length, float *data,
                                       unsigned nbins , float *bins,
                                       float *hist, unsigned batch);

    // 2-D scaling (S = float data, c = char data; note the lower-case suffix).
    // ZI is passed as a pointer-to-pointer.
    // NOTE(review): the double indirection suggests the callee allocates the
    // output buffer and returns it through ZI -- confirm who owns and frees it.
    JKTAPI jktError_t jkt_scale2D_S(
        unsigned md, unsigned nd,
        float *Z, float **ZI,
        int times, unsigned method, unsigned batch);
    JKTAPI jktError_t jkt_scale2D_c(
        unsigned md, unsigned nd,
        char *Z, char **ZI,
        int times, unsigned method, unsigned batch);

    // 1-D interpolation, one entry point per value type
    // (S = float, C = cuFloatComplex, D = double, Z = cuDoubleComplex).
    // In the complex variants only Y / YI are complex; X, XI and Y0 stay
    // real (float for C, double for Z).
    // NOTE(review): (Y, X) of length m are presumably the samples, (YI, XI)
    // of length mi the result and query points, Y0 a fill value, and NY / NX
    // batch counts -- confirm against the implementation.
    JKTAPI jktError_t jkt_interp1D_S(
        unsigned m, float *Y, float *X,
        unsigned mi, float *YI, float *XI,
        float Y0, unsigned method,
        unsigned NY, unsigned NX);
    JKTAPI jktError_t jkt_interp1D_C(
        unsigned m, cuFloatComplex *Y, float *X,
        unsigned mi, cuFloatComplex *YI, float *XI,
        float Y0, unsigned method,
        unsigned NY, unsigned NX);
    JKTAPI jktError_t jkt_interp1D_D(
        unsigned m, double *Y, double *X,
        unsigned mi, double *YI, double *XI,
        double Y0, unsigned method,
        unsigned NY, unsigned NX);
    JKTAPI jktError_t jkt_interp1D_Z(
        unsigned m, cuDoubleComplex *Y, double *X,
        unsigned mi, cuDoubleComplex *YI, double *XI,
        double Y0, unsigned method,
        unsigned NY, unsigned NX);

    // Filtering (single precision).
    // NOTE(review): going by the h_ / d_ prefixes, the coefficient arrays
    // h_x (xlen entries) and h_y (ylen entries) are presumably host buffers
    // while d_init / d_final are device buffers -- confirm, and confirm the
    // memory space of in / out, which carry no prefix.
    JKTAPI jktError_t jkt_filter_S(
        unsigned ilen, float *in, float *out,
        unsigned xlen, float *h_x,
        unsigned ylen, float *h_y,
        float *d_init, float *d_final,
        unsigned Nin, unsigned Ninit);
                                   

    // Convolution, 2-D, single precision.
    // NOTE(review): m, n are presumably the signal dimensions and x, y the
    // filter dimensions; the memory space of filter is not indicated by its
    // name -- confirm.
    JKTAPI jktError_t jkt_convolve2D_S(
        unsigned m, unsigned n,
        float *d_in, float *d_out,
        unsigned x, unsigned y, float *filter,
        unsigned type, unsigned batch);

    // N-dimensional convolution (S = float, D = double). Takes two
    // (dims, ndims) pairs; the second pair (dims_, ndims_) precedes h_filter.
    // NOTE(review): dims / ndims presumably describe the signal and
    // dims_ / ndims_ the filter; the h_ prefix suggests the filter is a host
    // buffer -- confirm.
    JKTAPI jktError_t jkt_convolveND_S(
        unsigned *dims, unsigned ndims,
        float *d_in, float *d_out,
        unsigned *dims_, unsigned ndims_, float *h_filter,
        unsigned type, unsigned batch);
    JKTAPI jktError_t jkt_convolveND_D(
        unsigned *dims, unsigned ndims,
        double *d_in, double *d_out,
        unsigned *dims_, unsigned ndims_, double *h_filter,
        unsigned type, unsigned batch);

    // Image processing

    // Image filter (single precision).
    // NOTE(review): x, y are presumably the image dimensions and xf, yf the
    // filter dimensions; type / value presumably describe boundary handling,
    // conv selects convolution vs. correlation and full the output size --
    // none of this is visible in the header; confirm.
    JKTAPI jktError_t jkt_imageFilter_S(
        unsigned x, unsigned y,
        float *in, float *out,
        unsigned xf, unsigned yf, float *filter,
        unsigned type, float value,
        bool conv, bool full,
        unsigned batch);
    // Image rotate (single precision). Input and output each come with their
    // own pair of dimensions: (xa, ya) for in, (xb, yb) for out.
    // NOTE(review): the unit of theta (radians vs. degrees) is not visible
    // here -- confirm.
    JKTAPI jktError_t jkt_imageRotate_S(
        unsigned xa, unsigned ya, float *in,
        unsigned xb, unsigned yb, float *out,
        float theta, unsigned batch);


    // Image erode (S = float image, D = double image). The neighbourhood
    // nhood is a float buffer in both variants.
    // NOTE(review): mI, nI are presumably the image dimensions and mH, nH
    // the neighbourhood dimensions -- confirm.
    JKTAPI jktError_t jkt_imageErode_S(
        unsigned mI, unsigned nI,
        float *in, float *out,
        unsigned mH, unsigned nH, float *nhood,
        unsigned batch);
    JKTAPI jktError_t jkt_imageErode_D(
        unsigned mI, unsigned nI,
        double *in, double *out,
        unsigned mH, unsigned nH, float *nhood,
        unsigned batch);

    // Image dilate (S = float image, D = double image). The neighbourhood
    // nhood is a float buffer in both variants.
    // NOTE(review): mI, nI are presumably the image dimensions and mH, nH
    // the neighbourhood dimensions -- confirm.
    JKTAPI jktError_t jkt_imageDilate_S(
        unsigned mI, unsigned nI,
        float *in, float *out,
        unsigned mH, unsigned nH, float *nhood,
        unsigned batch);
    JKTAPI jktError_t jkt_imageDilate_D(
        unsigned mI, unsigned nI,
        double *in, double *out,
        unsigned mH, unsigned nH, float *nhood,
        unsigned batch);
    
    
    // Image morph on bool images.
    // NOTE(review): opts presumably selects the morphological operation and
    // N the number of repetitions -- confirm against the implementation.
    JKTAPI jktError_t jkt_imageMorph_B(
        unsigned mI, unsigned nI,
        bool *in, bool *out,
        unsigned opts, unsigned N,
        unsigned batch);


    // Matrix products
    //
    // Kronecker product, jkt_kronProd_<TA><TB>. The two-letter suffix gives
    // the element type of A, then of B (S = float, C = cuFloatComplex,
    // D = double, Z = cuDoubleComplex). As declared, the type of C is complex
    // when either operand is complex, and double precision only when both
    // operands are double precision.
    // NOTE(review): mA x nA and mB x nB are presumably the operand
    // dimensions, and NA / NB batch counts for A and B -- confirm.
    JKTAPI jktError_t jkt_kronProd_SS(
        unsigned mA, unsigned nA, float *A,
        unsigned mB, unsigned nB, float *B,
        float *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_SC(
        unsigned mA, unsigned nA, float *A,
        unsigned mB, unsigned nB, cuFloatComplex *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_CS(
        unsigned mA, unsigned nA, cuFloatComplex *A,
        unsigned mB, unsigned nB, float *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_CC(
        unsigned mA, unsigned nA, cuFloatComplex *A,
        unsigned mB, unsigned nB, cuFloatComplex *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_SD(
        unsigned mA, unsigned nA, float *A,
        unsigned mB, unsigned nB, double *B,
        float *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_SZ(
        unsigned mA, unsigned nA, float *A,
        unsigned mB, unsigned nB, cuDoubleComplex *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_CD(
        unsigned mA, unsigned nA, cuFloatComplex *A,
        unsigned mB, unsigned nB, double *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_CZ(
        unsigned mA, unsigned nA, cuFloatComplex *A,
        unsigned mB, unsigned nB, cuDoubleComplex *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_DS(
        unsigned mA, unsigned nA, double *A,
        unsigned mB, unsigned nB, float *B,
        float *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_DC(
        unsigned mA, unsigned nA, double *A,
        unsigned mB, unsigned nB, cuFloatComplex *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_ZS(
        unsigned mA, unsigned nA, cuDoubleComplex *A,
        unsigned mB, unsigned nB, float *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_ZC(
        unsigned mA, unsigned nA, cuDoubleComplex *A,
        unsigned mB, unsigned nB, cuFloatComplex *B,
        cuFloatComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_DD(
        unsigned mA, unsigned nA, double *A,
        unsigned mB, unsigned nB, double *B,
        double *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_DZ(
        unsigned mA, unsigned nA, double *A,
        unsigned mB, unsigned nB, cuDoubleComplex *B,
        cuDoubleComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_ZD(
        unsigned mA, unsigned nA, cuDoubleComplex *A,
        unsigned mB, unsigned nB, double *B,
        cuDoubleComplex *C, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_kronProd_ZZ(
        unsigned mA, unsigned nA, cuDoubleComplex *A,
        unsigned mB, unsigned nB, cuDoubleComplex *B,
        cuDoubleComplex *C, unsigned NA, unsigned NB);

    // Cross product, jkt_crossProd_<TA><TB>. The two-letter suffix gives the
    // element type of A, then of B (S = float, C = cuFloatComplex,
    // D = double, Z = cuDoubleComplex). As declared, the type of C is complex
    // when either operand is complex, and double precision only when both
    // operands are double precision.
    // NOTE(review): the roles of numel and offset, and of the NA / NB
    // counts, are not visible in this header -- confirm.
    JKTAPI jktError_t jkt_crossProd_SS(
        unsigned numel, unsigned offset,
        float *A, float *B, float *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_SC(
        unsigned numel, unsigned offset,
        float *A, cuFloatComplex *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_CS(
        unsigned numel, unsigned offset,
        cuFloatComplex *A, float *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_CC(
        unsigned numel, unsigned offset,
        cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_SD(
        unsigned numel, unsigned offset,
        float *A, double *B, float *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_SZ(
        unsigned numel, unsigned offset,
        float *A, cuDoubleComplex *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_CD(
        unsigned numel, unsigned offset,
        cuFloatComplex *A, double *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_CZ(
        unsigned numel, unsigned offset,
        cuFloatComplex *A, cuDoubleComplex *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_DS(
        unsigned numel, unsigned offset,
        double *A, float *B, float *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_DC(
        unsigned numel, unsigned offset,
        double *A, cuFloatComplex *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_ZS(
        unsigned numel, unsigned offset,
        cuDoubleComplex *A, float *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_ZC(
        unsigned numel, unsigned offset,
        cuDoubleComplex *A, cuFloatComplex *B, cuFloatComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_DD(
        unsigned numel, unsigned offset,
        double *A, double *B, double *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_DZ(
        unsigned numel, unsigned offset,
        double *A, cuDoubleComplex *B, cuDoubleComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_ZD(
        unsigned numel, unsigned offset,
        cuDoubleComplex *A, double *B, cuDoubleComplex *C,
        unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_crossProd_ZZ(
        unsigned numel, unsigned offset,
        cuDoubleComplex *A, cuDoubleComplex *B, cuDoubleComplex *C,
        unsigned NA, unsigned NB);


    // Toeplitz matrix construction, jkt_toeplitz_<TC><TR>. The two-letter
    // suffix gives the element type of Col, then of Row (S = float,
    // C = cuFloatComplex, D = double, Z = cuDoubleComplex). As declared, the
    // type of the TPLZ buffer is complex when either input is complex, and
    // double precision only when both inputs are double precision.
    //
    // The first data parameter was previously spelled "TPLZol" in every
    // prototype, apparently the residue of a search-and-replace on "Col";
    // it is named Col here to pair with numelC, as Row pairs with numelR.
    // Parameter names in prototypes do not affect callers.
    //
    // NOTE(review): Col / Row are presumably the first column and first row
    // of the result written to TPLZ, and NA / NB batch counts -- confirm
    // against the implementation.
    JKTAPI jktError_t jkt_toeplitz_SS(
        unsigned numelC, float *Col,
        unsigned numelR, float *Row,
        float *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_SC(
        unsigned numelC, float *Col,
        unsigned numelR, cuFloatComplex *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_CS(
        unsigned numelC, cuFloatComplex *Col,
        unsigned numelR, float *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_CC(
        unsigned numelC, cuFloatComplex *Col,
        unsigned numelR, cuFloatComplex *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_SD(
        unsigned numelC, float *Col,
        unsigned numelR, double *Row,
        float *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_SZ(
        unsigned numelC, float *Col,
        unsigned numelR, cuDoubleComplex *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_CD(
        unsigned numelC, cuFloatComplex *Col,
        unsigned numelR, double *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_CZ(
        unsigned numelC, cuFloatComplex *Col,
        unsigned numelR, cuDoubleComplex *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_DS(
        unsigned numelC, double *Col,
        unsigned numelR, float *Row,
        float *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_DC(
        unsigned numelC, double *Col,
        unsigned numelR, cuFloatComplex *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_ZS(
        unsigned numelC, cuDoubleComplex *Col,
        unsigned numelR, float *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_ZC(
        unsigned numelC, cuDoubleComplex *Col,
        unsigned numelR, cuFloatComplex *Row,
        cuFloatComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_DD(
        unsigned numelC, double *Col,
        unsigned numelR, double *Row,
        double *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_DZ(
        unsigned numelC, double *Col,
        unsigned numelR, cuDoubleComplex *Row,
        cuDoubleComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_ZD(
        unsigned numelC, cuDoubleComplex *Col,
        unsigned numelR, double *Row,
        cuDoubleComplex *TPLZ, unsigned NA, unsigned NB);
    JKTAPI jktError_t jkt_toeplitz_ZZ(
        unsigned numelC, cuDoubleComplex *Col,
        unsigned numelR, cuDoubleComplex *Row,
        cuDoubleComplex *TPLZ, unsigned NA, unsigned NB);

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus

// "Any" over a vector of numel elements: C++ overloads selected by the
// element type of d_src (float, bool, cuFloatComplex, double,
// cuDoubleComplex). Every overload takes a bool result pointer, h_dst.
// NOTE(review): going by the h_ / d_ prefixes, h_dst is presumably a host
// pointer and d_src a device pointer -- confirm.
JKTAPI jktError_t jkt_any_vector(
    bool *h_dst, float *d_src, unsigned numel);
JKTAPI jktError_t jkt_any_vector(
    bool *h_dst, bool *d_src, unsigned numel);
JKTAPI jktError_t jkt_any_vector(
    bool *h_dst, cuFloatComplex *d_src, unsigned numel);
JKTAPI jktError_t jkt_any_vector(
    bool *h_dst, double *d_src, unsigned numel);
JKTAPI jktError_t jkt_any_vector(
    bool *h_dst, cuDoubleComplex *d_src, unsigned numel);

// "Any" over the columns of an nrows-by-ncols input: C++ overloads selected
// by the element type of d_src (float, bool, cuFloatComplex, double,
// cuDoubleComplex). Every overload takes a bool result pointer, d_dst.
// NOTE(review): the d_ prefix suggests the result stays on the device,
// unlike the vector form's h_dst -- confirm.
JKTAPI jktError_t jkt_any_columns(
    bool *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_columns(
    bool *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_columns(
    bool *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_columns(
    bool *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_columns(
    bool *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

// "Any" over the rows of an nrows-by-ncols input: C++ overloads selected by
// the element type of d_src (float, bool, cuFloatComplex, double,
// cuDoubleComplex). Every overload takes a bool result pointer, d_dst.
// NOTE(review): the d_ prefix suggests the result stays on the device,
// unlike the vector form's h_dst -- confirm.
JKTAPI jktError_t jkt_any_rows(
    bool *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_rows(
    bool *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_rows(
    bool *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_rows(
    bool *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_any_rows(
    bool *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

// max vector
// C++-only overload set of jkt_max_vector. In every overload the destination
// element type equals the source element type (float, bool, cuFloatComplex,
// double, cuDoubleComplex). Arguments are the destination pointer, the source
// pointer and an element count (numel); the return value is a jktError_t
// status code.
// NOTE(review): the h_/d_ prefixes presumably mark host and device pointers;
// how complex values are ordered is not visible in this header -- confirm
// both against the implementation.
JKTAPI jktError_t jkt_max_vector(float *h_dst, float *d_src, unsigned numel);
JKTAPI jktError_t jkt_max_vector(bool *h_dst, bool *d_src, unsigned numel);
JKTAPI jktError_t jkt_max_vector(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned numel);
JKTAPI jktError_t jkt_max_vector(double *h_dst, double *d_src, unsigned numel);
JKTAPI jktError_t jkt_max_vector(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned numel);

// max columns
// C++-only overload set of jkt_max_columns. In every overload the destination
// element type equals the source element type (float, bool, cuFloatComplex,
// double, cuDoubleComplex). Arguments are the destination pointer, the source
// pointer and the matrix extents (nrows, ncols); the return value is a
// jktError_t status code.
// NOTE(review): both pointers carry the d_ prefix, so the result presumably
// stays in device memory; the required length of d_dst and the ordering used
// for complex values are not visible in this header -- TODO confirm.
JKTAPI jktError_t jkt_max_columns(float *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_columns(bool *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_columns(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_columns(double *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_columns(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

// max rows
// C++-only overload set of jkt_max_rows. In every overload the destination
// element type equals the source element type (float, bool, cuFloatComplex,
// double, cuDoubleComplex). Arguments are the destination pointer, the source
// pointer and the matrix extents (nrows, ncols); the return value is a
// jktError_t status code.
// NOTE(review): both pointers carry the d_ prefix, so the result presumably
// stays in device memory; the required length of d_dst and the ordering used
// for complex values are not visible in this header -- TODO confirm.
JKTAPI jktError_t jkt_max_rows(float *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_rows(bool *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_rows(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_rows(double *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_max_rows(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

// min vector
// C++-only overload set of jkt_min_vector. In every overload the destination
// element type equals the source element type (float, bool, cuFloatComplex,
// double, cuDoubleComplex). Arguments are the destination pointer, the source
// pointer and an element count (numel); the return value is a jktError_t
// status code.
// NOTE(review): the h_/d_ prefixes presumably mark host and device pointers;
// how complex values are ordered is not visible in this header -- confirm
// both against the implementation.
JKTAPI jktError_t jkt_min_vector(float *h_dst, float *d_src, unsigned numel);
JKTAPI jktError_t jkt_min_vector(bool *h_dst, bool *d_src, unsigned numel);
JKTAPI jktError_t jkt_min_vector(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned numel);
JKTAPI jktError_t jkt_min_vector(double *h_dst, double *d_src, unsigned numel);
JKTAPI jktError_t jkt_min_vector(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned numel);

// min columns
// C++-only overload set of jkt_min_columns. In every overload the destination
// element type equals the source element type (float, bool, cuFloatComplex,
// double, cuDoubleComplex). Arguments are the destination pointer, the source
// pointer and the matrix extents (nrows, ncols); the return value is a
// jktError_t status code.
// NOTE(review): both pointers carry the d_ prefix, so the result presumably
// stays in device memory; the required length of d_dst and the ordering used
// for complex values are not visible in this header -- TODO confirm.
JKTAPI jktError_t jkt_min_columns(float *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_columns(bool *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_columns(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_columns(double *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_columns(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

// min rows
// C++-only overload set of jkt_min_rows. In every overload the destination
// element type equals the source element type (float, bool, cuFloatComplex,
// double, cuDoubleComplex). Arguments are the destination pointer, the source
// pointer and the matrix extents (nrows, ncols); the return value is a
// jktError_t status code.
// NOTE(review): both pointers carry the d_ prefix, so the result presumably
// stays in device memory; the required length of d_dst and the ordering used
// for complex values are not visible in this header -- TODO confirm.
JKTAPI jktError_t jkt_min_rows(float *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_rows(bool *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_rows(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_rows(double *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_min_rows(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

// sum vector
// C++-only overload set of jkt_sum_vector, one overload per source element
// type. The destination type matches the source type for float,
// cuFloatComplex, double and cuDoubleComplex; the bool-source overload is the
// exception and takes a float destination.
// Arguments are the destination pointer, the source pointer and an element
// count (numel); the return value is a jktError_t status code.
// NOTE(review): the h_/d_ prefixes presumably mark host and device pointers
// respectively -- confirm against the implementation.
JKTAPI jktError_t jkt_sum_vector(float *h_dst, float *d_src, unsigned numel);
JKTAPI jktError_t jkt_sum_vector(float *h_dst, bool *d_src, unsigned numel);
JKTAPI jktError_t jkt_sum_vector(cuFloatComplex *h_dst, cuFloatComplex *d_src, unsigned numel);
JKTAPI jktError_t jkt_sum_vector(double *h_dst, double *d_src, unsigned numel);
JKTAPI jktError_t jkt_sum_vector(cuDoubleComplex *h_dst, cuDoubleComplex *d_src, unsigned numel);

// sum columns
// C++-only overload set of jkt_sum_columns, one overload per source element
// type. The destination type matches the source type for float,
// cuFloatComplex, double and cuDoubleComplex; the bool-source overload is the
// exception and takes a float destination.
// Arguments are the destination pointer, the source pointer and the matrix
// extents (nrows, ncols); the return value is a jktError_t status code.
// NOTE(review): both pointers carry the d_ prefix, so the result presumably
// stays in device memory; the required length of d_dst is not visible in
// this header -- TODO confirm against the implementation.
JKTAPI jktError_t jkt_sum_columns(float *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_columns(float *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_columns(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_columns(double *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_columns(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);

// sum rows
// C++-only overload set of jkt_sum_rows, one overload per source element
// type. The destination type matches the source type for float,
// cuFloatComplex, double and cuDoubleComplex; the bool-source overload is the
// exception and takes a float destination.
// Arguments are the destination pointer, the source pointer and the matrix
// extents (nrows, ncols); the return value is a jktError_t status code.
// NOTE(review): both pointers carry the d_ prefix, so the result presumably
// stays in device memory; the required length of d_dst is not visible in
// this header -- TODO confirm against the implementation.
JKTAPI jktError_t jkt_sum_rows(float *d_dst, float *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_rows(float *d_dst, bool *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_rows(cuFloatComplex *d_dst, cuFloatComplex *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_rows(double *d_dst, double *d_src, unsigned nrows, unsigned ncols);
JKTAPI jktError_t jkt_sum_rows(cuDoubleComplex *d_dst, cuDoubleComplex *d_src, unsigned nrows, unsigned ncols);
#endif // ifdef __cplusplus


#endif  // __JACKET_H
