Convolution Functions

riscv_nmsis_nn_status riscv_convolve_1_x_n_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)
riscv_nmsis_nn_status riscv_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_1x1_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)
riscv_nmsis_nn_status riscv_convolve_1x1_s8_fast(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)
riscv_nmsis_nn_status riscv_convolve_fast_s16(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int64_t *bias_data, const nmsis_nn_dims *output_dims, int16_t *output_data)
riscv_nmsis_nn_status riscv_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_convolve_s16(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int64_t *bias_data, const nmsis_nn_dims *output_dims, int16_t *output_data)
riscv_nmsis_nn_status riscv_convolve_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)
riscv_nmsis_nn_status riscv_convolve_wrapper_s16(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int64_t *bias_data, const nmsis_nn_dims *output_dims, int16_t *output_data)
riscv_nmsis_nn_status riscv_convolve_wrapper_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)
riscv_nmsis_nn_status riscv_depthwise_conv_3x3_s8(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)
riscv_nmsis_nn_status riscv_depthwise_conv_fast_s16(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int64_t *bias, const nmsis_nn_dims *output_dims, int16_t *output)
static void __attribute__ ((unused))
static void depthwise_conv_s16_generic_s16(const int16_t *input, const uint16_t input_batches, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const int8_t *kernel, const uint16_t ch_mult, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int64_t *bias, int16_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y)
riscv_nmsis_nn_status riscv_depthwise_conv_s16(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int64_t *bias, const nmsis_nn_dims *output_dims, int16_t *output)
static void depthwise_conv_s8_mult_4(const int8_t *input, const int32_t input_x, const int32_t input_y, const int32_t input_ch, const int8_t *kernel, const int32_t output_ch, const int32_t ch_mult, const int32_t kernel_x, const int32_t kernel_y, const int32_t pad_x, const int32_t pad_y, const int32_t stride_x, const int32_t stride_y, const int32_t *bias, int8_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t output_x, const int32_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max)
static void depthwise_conv_s8_generic(const int8_t *input, const uint16_t input_batches, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const int8_t *kernel, const uint16_t output_ch, const uint16_t ch_mult, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, int8_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y)
riscv_nmsis_nn_status riscv_depthwise_conv_s8(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)
riscv_nmsis_nn_status riscv_depthwise_conv_s8_opt(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)
static void depthwise_conv_u8_mult_4(const uint8_t *input, const int32_t input_x, const int32_t input_y, const int32_t input_ch, const uint8_t *kernel, const int32_t output_ch, const int32_t ch_mult, const int32_t kernel_x, const int32_t kernel_y, const int32_t pad_x, const int32_t pad_y, const int32_t stride_x, const int32_t stride_y, const int32_t *bias, uint8_t *output, const int32_t output_shift, const int32_t output_mult, const int32_t output_x, const int32_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t filter_offset, const int32_t output_activation_min, const int32_t output_activation_max)
static void depthwise_conv_u8_generic(const uint8_t *input, const int32_t input_x, const int32_t input_y, const int32_t input_ch, const uint8_t *kernel, const int32_t output_ch, const int32_t ch_mult, const int32_t kernel_x, const int32_t kernel_y, const int32_t pad_x, const int32_t pad_y, const int32_t stride_x, const int32_t stride_y, const int32_t *bias, uint8_t *output, const int32_t output_shift, const int32_t output_mult, const int32_t output_x, const int32_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t filter_offset, const int32_t output_activation_min, const int32_t output_activation_max)
riscv_nmsis_nn_status riscv_depthwise_conv_u8_basic_ver1(const uint8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint8_t *kernel, const uint16_t kernel_x, const uint16_t kernel_y, const int16_t ch_mult, const int16_t pad_x, const int16_t pad_y, const int16_t stride_x, const int16_t stride_y, const int16_t dilation_x, const int16_t dilation_y, const int32_t *bias, const int32_t input_offset, const int32_t filter_offset, const int32_t output_offset, uint8_t *output, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const int32_t output_shift, const int32_t output_mult)
riscv_nmsis_nn_status riscv_depthwise_conv_wrapper_s16(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input, const nmsis_nn_dims *filter_dims, const int8_t *filter, const nmsis_nn_dims *bias_dims, const int64_t *bias, const nmsis_nn_dims *output_dims, int16_t *output)
riscv_nmsis_nn_status riscv_depthwise_conv_wrapper_s8(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *filter, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)
riscv_nmsis_nn_status riscv_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)
riscv_nmsis_nn_status riscv_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)
group NNConv

Collection of convolution, depthwise convolution functions and their variants.

The convolution is implemented in 2 steps: im2col and General Matrix Multiplication (GEMM).

im2col is a process of converting each patch of image data into a column. After im2col, the convolution is computed as matrix-matrix multiplication.

To reduce the memory footprint, the im2col is performed partially. In each iteration, only a few columns (i.e., patches) are generated, followed by GEMM.

Functions

riscv_nmsis_nn_status riscv_convolve_1_x_n_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)

1xn convolution

  • Supported framework : TensorFlow Lite Micro

  • The following constraints on the arguments apply

    1. input_dims->n equals 1

    2. output_dims->w is a multiple of 4

    3. Explicit constraints (since it is for 1xN convolution): input_dims->h equals 1; output_dims->h equals 1; filter_dims->h equals 1

      Todo:

      Remove constraint on output_dims->w to make the function generic.

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). Range of conv_params->input_offset : [-127, 128] Range of conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal spatial filter dimension

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Optional bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int8

Returns

The function returns RISCV_NMSIS_NN_ARG_ERROR if argument constraints fail, or RISCV_NMSIS_NN_SUCCESS on successful completion.

riscv_nmsis_nn_status riscv_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)

Fast Q7 version of 1x1 convolution (non-square shape)

This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1 and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise separable convolution.

This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in is a multiple of 4, and ch_im_out is a multiple of 2.

[1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in_x[in] input tensor dimension x

  • dim_im_in_y[in] input tensor dimension y

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel_x[in] filter kernel size x

  • dim_kernel_y[in] filter kernel size y

  • padding_x[in] padding size x

  • padding_y[in] padding size y

  • stride_x[in] convolution stride x

  • stride_y[in] convolution stride y

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out_x[in] output tensor dimension x

  • dim_im_out_y[in] output tensor dimension y

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.

riscv_nmsis_nn_status riscv_convolve_1x1_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)

s8 version for 1x1 convolution with support for non-unity stride values

  • Supported framework : TensorFlow Lite Micro

  • The following constraints on the arguments apply

    1. conv_params->padding.w = conv_params->padding.h = 0

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. None is required by this function.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). Range of conv_params->input_offset : [-127, 128] Range of conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Optional bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int8

Returns

The function returns RISCV_NMSIS_NN_ARG_ERROR if argument constraints fail, or RISCV_NMSIS_NN_SUCCESS on successful completion.

riscv_nmsis_nn_status riscv_convolve_1x1_s8_fast(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)

Fast s8 version for 1x1 convolution (non-square shape)

  • Supported framework : TensorFlow Lite Micro

  • The following constraints on the arguments apply

    1. conv_params->padding.w = conv_params->padding.h = 0

    2. conv_params->stride.w = conv_params->stride.h = 1

  • Supported framework : TensorFlow Lite Micro

  • The following constraints on the arguments apply

    1. input_dims->c is a multiple of 4

    2. conv_params->padding.w = conv_params->padding.h = 0

    3. conv_params->stride.w = conv_params->stride.h = 1

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). Range of conv_params->input_offset : [-127, 128] Range of conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Optional bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int8

  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). Range of conv_params->input_offset : [-127, 128] Range of conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Optional bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int8

Returns

The function returns RISCV_NMSIS_NN_ARG_ERROR if argument constraints fail, or RISCV_NMSIS_NN_SUCCESS on successful completion.

Returns

The function returns RISCV_NMSIS_NN_SIZE_MISMATCH if argument constraints fail, or RISCV_NMSIS_NN_SUCCESS on successful completion.

riscv_nmsis_nn_status riscv_convolve_fast_s16(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int64_t *bias_data, const nmsis_nn_dims *output_dims, int16_t *output_data)

Optimized s16 convolution function.

  1. Supported framework: TensorFlow Lite micro

  2. Additional memory is required for optimization. Refer to argument ‘ctx’ for details.

  3. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_fast_s16_get_buffer_size will return the buffer_size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). conv_params->input_offset : Not used conv_params->output_offset : Not used

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int16

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must be less than 512

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Optional bias data pointer. Data type: int64

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int16

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

riscv_nmsis_nn_status riscv_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)

Basic Q15 convolution function.

Buffer size:

bufferA size: ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

This basic version is designed to work for any input tensor and weight dimension.

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in[in] input tensor dimension

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel[in] filter kernel size

  • padding[in] padding sizes

  • stride[in] convolution stride

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out[in] output tensor dimension

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

riscv_nmsis_nn_status riscv_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)

Fast Q15 convolution function.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in is a multiple of 2

ch_im_out is a multiple of 2

dim_im_out is a multiple of 2

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in[in] input tensor dimension

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel[in] filter kernel size

  • padding[in] padding sizes

  • stride[in] convolution stride

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out[in] output tensor dimension

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.

riscv_nmsis_nn_status riscv_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)

Fast Q15 convolution function (non-square shape)

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 2

ch_im_out is multiple of 2

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in_x[in] input tensor dimension x

  • dim_im_in_y[in] input tensor dimension y

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel_x[in] filter kernel size x

  • dim_kernel_y[in] filter kernel size y

  • padding_x[in] padding size x

  • padding_y[in] padding size y

  • stride_x[in] convolution stride x

  • stride_y[in] convolution stride y

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out_x[in] output tensor dimension x

  • dim_im_out_y[in] output tensor dimension y

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.

riscv_nmsis_nn_status riscv_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)

Basic Q7 convolution function.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

This basic version is designed to work for any input tensor and weight dimension.

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in[in] input tensor dimension

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel[in] filter kernel size

  • padding[in] padding sizes

  • stride[in] convolution stride

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out[in] output tensor dimension

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

riscv_nmsis_nn_status riscv_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)

Basic Q7 convolution function (non-square shape)

Basic Q7 convolution function (non-square shape)

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in_x[in] input tensor dimension x

  • dim_im_in_y[in] input tensor dimension y

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel_x[in] filter kernel size x

  • dim_kernel_y[in] filter kernel size y

  • padding_x[in] padding size x

  • padding_y[in] padding size y

  • stride_x[in] convolution stride x

  • stride_y[in] convolution stride y

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out_x[in] output tensor dimension x

  • dim_im_out_y[in] output tensor dimension y

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

riscv_nmsis_nn_status riscv_convolve_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)

Fast Q7 convolution function.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )

ch_im_out is multiple of 2 ( because of the 2x2 mat_mult kernel )

The im2col converts the Q7 tensor input into Q15 column, which is stored in bufferA. There is reordering happening during this im2col process with riscv_q7_to_q15_reordered_no_shift. For every four elements, the second and third elements are swapped.

The computation kernel riscv_nn_mat_mult_kernel_q7_q15_reordered does the GEMM computation with the reordered columns.

To speed-up the determination of the padding condition, we split the computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. This reduces the total number of boundary condition checks and improves the data copying performance.

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in[in] input tensor dimension

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel[in] filter kernel size

  • padding[in] padding sizes

  • stride[in] convolution stride

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out[in] output tensor dimension

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.

riscv_nmsis_nn_status riscv_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)

Fast Q7 convolution function (non-square shape)

This function is the version with full list of optimization tricks, but with some constraints: ch_im_in is multiple of 4; ch_im_out is multiple of 2.

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in_x[in] input tensor dimension x

  • dim_im_in_y[in] input tensor dimension y

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel_x[in] filter kernel size x

  • dim_kernel_y[in] filter kernel size y

  • padding_x[in] padding size x

  • padding_y[in] padding size y

  • stride_x[in] convolution stride x

  • stride_y[in] convolution stride y

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out_x[in] output tensor dimension x

  • dim_im_out_y[in] output tensor dimension y

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.

riscv_nmsis_nn_status riscv_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)

Q7 convolution function for RGB image.

Q7 version of convolution for RGB image.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in equals 3

This kernel is written exclusively for convolution with ch_im_in equals 3. This applies on the first layer of CNNs which has input image with RGB format.

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in[in] input tensor dimension

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel[in] filter kernel size

  • padding[in] padding sizes

  • stride[in] convolution stride

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out[in] output tensor dimension

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.

riscv_nmsis_nn_status riscv_convolve_s16(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int64_t *bias_data, const nmsis_nn_dims *output_dims, int16_t *output_data)

Basic s16 convolution function.

  1. Supported framework: TensorFlow Lite micro

  2. Additional memory is required for optimization. Refer to argument ‘ctx’ for details.

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_s16_get_buffer_size will return the buffer_size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). conv_params->input_offset : Not used conv_params->output_offset : Not used

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int16

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial filter dimensions

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Optional bias data pointer. Data type: int64

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int16

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

riscv_nmsis_nn_status riscv_convolve_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)

Basic s8 convolution function.

  1. Supported framework: TensorFlow Lite micro

  2. Additional memory is required for optimization. Refer to argument ‘ctx’ for details.

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_s8_get_buffer_size will return the buffer_size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). Range of conv_params->input_offset : [-127, 128] Range of conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial filter dimensions

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Optional bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int8

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

riscv_nmsis_nn_status riscv_convolve_wrapper_s16(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int64_t *bias_data, const nmsis_nn_dims *output_dims, int16_t *output_data)

s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in nmsis-nn to perform the convolution.

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_wrapper_s16_get_buffer_size will return the buffer_size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). conv_params->input_offset : Not used conv_params->output_offset : Not used

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int16

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial filter dimensions

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Bias data pointer. Data type: int64

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int16

Returns

The function returns either RISCV_NMSIS_NN_ARG_ERROR if argument constraints fail, or RISCV_NMSIS_NN_SUCCESS on successful completion.

riscv_nmsis_nn_status riscv_convolve_wrapper_s8(const nmsis_nn_context *ctx, const nmsis_nn_conv_params *conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input_data, const nmsis_nn_dims *filter_dims, const int8_t *filter_data, const nmsis_nn_dims *bias_dims, const int32_t *bias_data, const nmsis_nn_dims *output_dims, int8_t *output_data)

s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in nmsis-nn to perform the convolution.

Parameters
  • ctx[inout] Function context that contains the additional buffer if required by the function. riscv_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • conv_params[in] Convolution parameters (e.g. strides, dilations, pads,…). Range of conv_params->input_offset : [-127, 128] Range of conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN]

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial filter dimensions

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[out] Output data pointer. Data type: int8

Returns

The function returns either RISCV_NMSIS_NN_ARG_ERROR if argument constraints fail, or RISCV_NMSIS_NN_SUCCESS on successful completion.

riscv_nmsis_nn_status riscv_depthwise_conv_3x3_s8(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)

Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on the input arguments (documented below). Refer to riscv_depthwise_conv_s8() for function argument details.

  • Supported framework : TensorFlow Lite Micro

  • The following constraints on the arguments apply

    1. Number of input channel equals number of output channels

    2. Filter height and width equals 3

    3. Padding along x is either 0 or 1.

Returns

The function returns one of the following:

  • RISCV_NMSIS_NN_ARG_ERROR - Unsupported dimension of tensors, or unsupported pad size along the x axis

  • RISCV_NMSIS_NN_SUCCESS - Successful operation

riscv_nmsis_nn_status riscv_depthwise_conv_fast_s16(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int64_t *bias, const nmsis_nn_dims *output_dims, int16_t *output)

Optimized s16 depthwise convolution function with constraint that in_channel equals out_channel. Refer to riscv_depthwise_conv_s16() for function argument details.

RISCV_NMSIS_NN_SUCCESS - Successful operation

  • Supported framework: TensorFlow Lite

  • The following constraints on the arguments apply

    1. Number of input channel equals number of output channels or ch_mult equals 1

  • Recommended when number of channels is 4 or greater.

Returns

The function returns one of the following: RISCV_NMSIS_NN_ARG_ERROR - ctx->buf == NULL and riscv_depthwise_conv_fast_s16_get_buffer_size() > 0, or input channel != output channel, or ch_mult != 1

static void __attribute__ ((unused))
static void depthwise_conv_s16_generic_s16(const int16_t *input, const uint16_t input_batches, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const int8_t *kernel, const uint16_t ch_mult, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int64_t *bias, int16_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y)
riscv_nmsis_nn_status riscv_depthwise_conv_s16(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int64_t *bias, const nmsis_nn_dims *output_dims, int16_t *output)

Basic s16 depthwise convolution function that doesn’t have any constraints on the input dimensions.

  • Supported framework: TensorFlow Lite

Parameters
  • ctx[inout] Function context (e.g. temporary buffer). Check the function definition file to see if an additional buffer is required. Optional function {API}_get_buffer_size() provides the buffer size if an additional buffer is required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • dw_conv_params[in] Depthwise convolution parameters (e.g. strides, dilations, pads,…) dw_conv_params->input_offset : Not used dw_conv_params->output_offset : Not used

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN] Batch argument N is not used.

  • input_data[in] Input (activation) data pointer. Data type: int16

  • filter_dims[in] Filter tensor dimensions. Format: [1, H, W, C_OUT]

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Bias data pointer. Data type: int64

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[inout] Output data pointer. Data type: int16

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

static void depthwise_conv_s8_mult_4(const int8_t *input, const int32_t input_x, const int32_t input_y, const int32_t input_ch, const int8_t *kernel, const int32_t output_ch, const int32_t ch_mult, const int32_t kernel_x, const int32_t kernel_y, const int32_t pad_x, const int32_t pad_y, const int32_t stride_x, const int32_t stride_y, const int32_t *bias, int8_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t output_x, const int32_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max)
static void depthwise_conv_s8_generic(const int8_t *input, const uint16_t input_batches, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const int8_t *kernel, const uint16_t output_ch, const uint16_t ch_mult, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, int8_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y)
riscv_nmsis_nn_status riscv_depthwise_conv_s8(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)

Basic s8 depthwise convolution function that doesn’t have any constraints on the input dimensions.

  • Supported framework: TensorFlow Lite

Parameters
  • ctx[inout] Function context (e.g. temporary buffer). Check the function definition file to see if an additional buffer is required. Optional function {API}_get_buffer_size() provides the buffer size if an additional buffer is required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • dw_conv_params[in] Depthwise convolution parameters (e.g. strides, dilations, pads,…) dw_conv_params->dilation is not used. Range of dw_conv_params->input_offset : [-127, 128] Range of dw_conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [N, H, W, C_IN] Batch argument N is not used.

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [1, H, W, C_OUT]

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [N, H, W, C_OUT]

  • output_data[inout] Output data pointer. Data type: int8

Returns

The function returns RISCV_NMSIS_NN_SUCCESS

riscv_nmsis_nn_status riscv_depthwise_conv_s8_opt(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *kernel, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)

Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. Refer to riscv_depthwise_conv_s8() for function argument details.

  • Supported framework: TensorFlow Lite

  • The following constraints on the arguments apply

    1. Number of input channel equals number of output channels or ch_mult equals 1

  • Recommended when number of channels is 4 or greater.

Note

If number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read out for the following if MVE optimizations (Arm Helium Technology) are used.

  • Output shift

  • Output multiplier

  • Output bias

  • kernel

Returns

The function returns one of the following: RISCV_NMSIS_NN_ARG_ERROR - input channel != output channel or ch_mult != 1; RISCV_NMSIS_NN_SUCCESS - Successful operation

static void depthwise_conv_u8_mult_4(const uint8_t *input, const int32_t input_x, const int32_t input_y, const int32_t input_ch, const uint8_t *kernel, const int32_t output_ch, const int32_t ch_mult, const int32_t kernel_x, const int32_t kernel_y, const int32_t pad_x, const int32_t pad_y, const int32_t stride_x, const int32_t stride_y, const int32_t *bias, uint8_t *output, const int32_t output_shift, const int32_t output_mult, const int32_t output_x, const int32_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t filter_offset, const int32_t output_activation_min, const int32_t output_activation_max)
static void depthwise_conv_u8_generic(const uint8_t *input, const int32_t input_x, const int32_t input_y, const int32_t input_ch, const uint8_t *kernel, const int32_t output_ch, const int32_t ch_mult, const int32_t kernel_x, const int32_t kernel_y, const int32_t pad_x, const int32_t pad_y, const int32_t stride_x, const int32_t stride_y, const int32_t *bias, uint8_t *output, const int32_t output_shift, const int32_t output_mult, const int32_t output_x, const int32_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t filter_offset, const int32_t output_activation_min, const int32_t output_activation_max)
riscv_nmsis_nn_status riscv_depthwise_conv_u8_basic_ver1(const uint8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint8_t *kernel, const uint16_t kernel_x, const uint16_t kernel_y, const int16_t ch_mult, const int16_t pad_x, const int16_t pad_y, const int16_t stride_x, const int16_t stride_y, const int16_t dilation_x, const int16_t dilation_y, const int32_t *bias, const int32_t input_offset, const int32_t filter_offset, const int32_t output_offset, uint8_t *output, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const int32_t output_shift, const int32_t output_mult)

uint8 depthwise convolution function with asymmetric quantization

uint8 depthwise convolution function with asymmetric quantization Unless specified otherwise, arguments are mandatory.

Parameters
  • input[in] Pointer to input tensor

  • input_x[in] Width of input tensor

  • input_y[in] Height of input tensor

  • input_ch[in] Channels in input tensor

  • kernel[in] Pointer to kernel weights

  • kernel_x[in] Width of kernel

  • kernel_y[in] Height of kernel

  • ch_mult[in] Number of channel multiplier

  • pad_x[in] Padding sizes x

  • pad_y[in] Padding sizes y

  • stride_x[in] Convolution stride along the width

  • stride_y[in] Convolution stride along the height

  • dilation_x[in] Dilation along width. Not used and intended for future enhancement.

  • dilation_y[in] Dilation along height. Not used and intended for future enhancement.

  • bias[in] Pointer to optional bias values. If no bias is available, NULL is expected

  • input_offset[in] Input tensor zero offset

  • filter_offset[in] Kernel tensor zero offset

  • output_offset[in] Output tensor zero offset

  • output[inout] Pointer to output tensor

  • output_x[in] Width of output tensor

  • output_y[in] Height of output tensor

  • output_activation_min[in] Minimum value to clamp the output to. Range : {0, 255}

  • output_activation_max[in] Maximum value to clamp the output to. Range : {0, 255}

  • output_shift[in] Amount of right-shift for output

  • output_mult[in] Output multiplier for requantization

Returns

The function returns one of the following: RISCV_NMSIS_NN_SIZE_MISMATCH - Not supported dimension of tensors; RISCV_NMSIS_NN_SUCCESS - Successful operation; RISCV_NMSIS_NN_ARG_ERROR - Implementation not available

riscv_nmsis_nn_status riscv_depthwise_conv_wrapper_s16(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int16_t *input, const nmsis_nn_dims *filter_dims, const int8_t *filter, const nmsis_nn_dims *bias_dims, const int64_t *bias, const nmsis_nn_dims *output_dims, int16_t *output)

Wrapper function to pick the right optimized s16 depthwise convolution function.

  • Supported framework: TensorFlow Lite

  • Picks one of the following functions

    1. riscv_depthwise_conv_s16()

    2. riscv_depthwise_conv_fast_s16() - RISC-V CPUs with DSP extension only

Parameters
  • ctx[inout] Function context (e.g. temporary buffer). Check the function definition file to see if an additional buffer is required. Optional function {API}_get_buffer_size() provides the buffer size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • dw_conv_params[in] Depthwise convolution parameters (e.g. strides, dilations, pads,…) dw_conv_params->dilation is not used. Range of dw_conv_params->input_offset : Not used Range of dw_conv_params->output_offset : Not used

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [H, W, C_IN] Batch argument N is not used and assumed to be 1.

  • input_data[in] Input (activation) data pointer. Data type: int16

  • filter_dims[in] Filter tensor dimensions. Format: [1, H, W, C_OUT]

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Bias data pointer. Data type: int64

  • output_dims[in] Output tensor dimensions. Format: [1, H, W, C_OUT]

  • output_data[inout] Output data pointer. Data type: int16

Returns

The function returns RISCV_NMSIS_NN_SUCCESS - Successful completion.

riscv_nmsis_nn_status riscv_depthwise_conv_wrapper_s8(const nmsis_nn_context *ctx, const nmsis_nn_dw_conv_params *dw_conv_params, const nmsis_nn_per_channel_quant_params *quant_params, const nmsis_nn_dims *input_dims, const int8_t *input, const nmsis_nn_dims *filter_dims, const int8_t *filter, const nmsis_nn_dims *bias_dims, const int32_t *bias, const nmsis_nn_dims *output_dims, int8_t *output)

Wrapper function to pick the right optimized s8 depthwise convolution function.

  • Supported framework: TensorFlow Lite

  • Picks one of the following functions

    1. riscv_depthwise_conv_s8()

    2. riscv_depthwise_conv_3x3_s8() - RISC-V CPUs with DSP extension only

    3. riscv_depthwise_conv_s8_opt()

  • Check details of riscv_depthwise_conv_s8_opt() for potential data that can be accessed outside of the boundary.

Parameters
  • ctx[inout] Function context (e.g. temporary buffer). Check the function definition file to see if an additional buffer is required. Optional function {API}_get_buffer_size() provides the buffer size if required. The caller is expected to clear the buffer, if applicable, for security reasons.

  • dw_conv_params[in] Depthwise convolution parameters (e.g. strides, dilations, pads,…) dw_conv_params->dilation is not used. Range of dw_conv_params->input_offset : [-127, 128] Range of dw_conv_params->output_offset : [-128, 127]

  • quant_params[in] Per-channel quantization info. It contains the multiplier and shift values to be applied to each output channel

  • input_dims[in] Input (activation) tensor dimensions. Format: [H, W, C_IN] Batch argument N is not used and assumed to be 1.

  • input_data[in] Input (activation) data pointer. Data type: int8

  • filter_dims[in] Filter tensor dimensions. Format: [1, H, W, C_OUT]

  • filter_data[in] Filter data pointer. Data type: int8

  • bias_dims[in] Bias tensor dimensions. Format: [C_OUT]

  • bias_data[in] Bias data pointer. Data type: int32

  • output_dims[in] Output tensor dimensions. Format: [1, H, W, C_OUT]

  • output_data[inout] Output data pointer. Data type: int8

Returns

The function returns RISCV_NMSIS_NN_SUCCESS - Successful completion.

riscv_nmsis_nn_status riscv_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB)

Q7 depthwise separable convolution function.

Buffer size:

bufferA size: 2*ch_im_in*dim_kernel*dim_kernel

bufferB size: 0

Input dimension constraints:

ch_im_in equals ch_im_out

Implementation: There are 3 nested loops here. Inner loop: calculate each output value with MAC instruction over an accumulator. Mid loop: loop over different output channels. Outer loop: loop over different output positions (x, y).

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in[in] input tensor dimension

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel[in] filter kernel size

  • padding[in] padding sizes

  • stride[in] convolution stride

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out[in] output tensor dimension

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.

riscv_nmsis_nn_status riscv_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB)

Q7 depthwise separable convolution function (non-square shape)

This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in is equal to ch_im_out.

Parameters
  • Im_in[in] pointer to input tensor

  • dim_im_in_x[in] input tensor dimension x

  • dim_im_in_y[in] input tensor dimension y

  • ch_im_in[in] number of input tensor channels

  • wt[in] pointer to kernel weights

  • ch_im_out[in] number of filters, i.e., output tensor channels

  • dim_kernel_x[in] filter kernel size x

  • dim_kernel_y[in] filter kernel size y

  • padding_x[in] padding sizes x

  • padding_y[in] padding sizes y

  • stride_x[in] convolution stride x

  • stride_y[in] convolution stride y

  • bias[in] pointer to bias

  • bias_shift[in] amount of left-shift for bias

  • out_shift[in] amount of right-shift for output

  • Im_out[inout] pointer to output tensor

  • dim_im_out_x[in] output tensor dimension x

  • dim_im_out_y[in] output tensor dimension y

  • bufferA[inout] pointer to buffer space for input

  • bufferB[inout] pointer to buffer space for output

Returns

The function returns either RISCV_NMSIS_NN_SIZE_MISMATCH or RISCV_NMSIS_NN_SUCCESS based on the outcome of size checking.