You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1802 lines
67 KiB

/**
********************************************************************************
* Copyright (C) 2021 NEXTCHIP Inc. All rights reserved.
* This software is the confidential and proprietary information of
* NEXTCHIP, Inc. ("Confidential Information"). You shall not disclose such
* Confidential Information and shall use it only in accordance with
* the terms of the license agreement you entered into with NEXTCHIP.
********************************************************************************
*/
/**
********************************************************************************
* @file : nc_neon.c
*
* @brief : frame_mixel api c code
*
* @author : SW Solution team. NextChip Inc.
*
* @date : 2022.09.02.
*
* @version : 1.0.0
********************************************************************************
* @note
* 09.02.2022 / 1.0.0 / Initial release.
*
********************************************************************************
*/
/*
********************************************************************************
* INCLUDES
********************************************************************************
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arm_neon.h>
#include "nc_neon.h"
#include "nc_types.h"
/*
********************************************************************************
* DEFINES
********************************************************************************
*/
#define INTER_RESIZE_COEF_BITS (11)
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define MAX_ESIZE (16)
#define MIN(a, b) ((a) > (b) ? (b) : (a))
#define MAX(a, b) ((a) < (b) ? (b) : (a))
#define VRESIZE_LINEAR_MASK_TABLE_SIZE (7)
#define BITS (INTER_RESIZE_COEF_BITS * 2)
#define DELTA (1 << ((INTER_RESIZE_COEF_BITS * 2) - 1))
/*
********************************************************************************
* FUNCTION DEFINITIONS
********************************************************************************
*/
/**
 * @brief Round @p sz up to the next multiple of @p n.
 *
 * The result is produced by masking with -n, so it is only a true
 * "round up" when @p n is a power of two.
 *
 * @param sz size to align
 * @param n  alignment
 * @return aligned size
 */
static inline uint32_t neon_align_size(int32_t sz, int32_t n)
{
    const int32_t mask = -n;
    const int32_t bumped = sz + n - 1;

    return (uint32_t)(bumped & mask);
}
/**
 * @brief Floor of a float, returned as a 32-bit integer.
 *
 * The value is truncated toward zero and then corrected by one when the
 * truncation rounded up (i.e. for negative non-integers). Unlike the
 * previous "negative -> truncate - 1" rule, exact negative integers are
 * returned unchanged (floor(-1.0) == -1, not -2).
 *
 * @param a value to floor; must be representable as int32_t
 * @return largest integer not greater than @p a
 */
static inline int32_t neon_floor(float a)
{
    const int32_t truncated = (int32_t)a;

    /* Truncation rounds negative fractions up; step back down in that case. */
    return truncated - (((float)truncated > a) ? 1 : 0);
}
/**
 * @brief Clamp @p x into the half-open range [a, b).
 *
 * @param x value to clamp
 * @param a inclusive lower bound
 * @param b exclusive upper bound
 * @return a when x < a, b - 1 when x >= b, otherwise x
 */
static inline int32_t neon_clip(int32_t x, int32_t a, int32_t b)
{
    if (x < a)
    {
        return a;
    }
    if (x < b)
    {
        return x;
    }
    return b - 1;
}
/**
 * @brief Convert a fixed-point accumulator to an 8-bit sample.
 *
 * The accumulator carries 2 * INTER_RESIZE_COEF_BITS fractional bits.
 * Half an LSB is added before the shift so the result is rounded, and the
 * shifted value is saturated to [0, 255].
 *
 * @param val fixed-point accumulator
 * @return rounded, saturated 8-bit value
 */
static inline uint8_t neon_cast_op(int32_t val)
{
    const int32_t frac_bits = INTER_RESIZE_COEF_BITS * 2;
    const int32_t half_lsb = 1 << (frac_bits - 1);
    int32_t sample = (val + half_lsb) >> frac_bits;

    if (sample < 0)
    {
        sample = 0;
    }
    if (sample > 255)
    {
        sample = 255;
    }
    return (uint8_t)sample;
}
/**
 * @brief Scalar horizontal linear resize of one or two source rows.
 *
 * For every destination column below @p xmax the output is
 * src[xofs] * alpha[2*col] + src[xofs + cn] * alpha[2*col + 1].
 * Columns from @p xmax up to @p dwidth take the single source sample
 * scaled by INTER_RESIZE_COEF_SCALE. Only @p count values 1 and 2 are
 * handled; any other value leaves @p dst untouched.
 *
 * @param src    array of @p count source row pointers
 * @param dst    array of @p count destination row pointers (int32 fixed point)
 * @param count  number of rows to process (1 or 2)
 * @param xofs   source offset for each destination column
 * @param alpha  two coefficients per destination column
 * @param dwidth number of destination columns
 * @param cn     distance, in source elements, to the right-hand neighbour
 * @param xmax   first column that is replicated instead of interpolated
 */
static void img_hresize_linear_c(const uint8_t **src,
                                 int32_t **dst,
                                 int32_t count,
                                 const int32_t *xofs,
                                 const int16_t *alpha,
                                 int32_t dwidth,
                                 int32_t cn,
                                 int32_t xmax)
{
    int32_t col;

    if (count == 2)
    {
        const uint8_t *in_a = src[0];
        const uint8_t *in_b = src[1];
        int32_t *out_a = dst[0];
        int32_t *out_b = dst[1];

        /* Interpolated columns: both rows share the offset and weights. */
        for (col = 0; col < xmax; col++)
        {
            const int32_t left = xofs[col];
            const int32_t right = left + cn;
            const int32_t w_left = alpha[col * 2];
            const int32_t w_right = alpha[col * 2 + 1];

            out_a[col] = in_a[left] * w_left + in_a[right] * w_right;
            out_b[col] = in_b[left] * w_left + in_b[right] * w_right;
        }
        /* Remaining columns: replicate the source sample at full weight. */
        for (; col < dwidth; col++)
        {
            const int32_t left = xofs[col];

            out_a[col] = (int32_t) in_a[left] * INTER_RESIZE_COEF_SCALE;
            out_b[col] = (int32_t) in_b[left] * INTER_RESIZE_COEF_SCALE;
        }
    }
    else if (count == 1)
    {
        const uint8_t *in = src[0];
        int32_t *out = dst[0];

        for (col = 0; col < xmax; col++)
        {
            const int32_t left = xofs[col];

            out[col] = in[left] * alpha[col * 2] + in[left + cn] * alpha[col * 2 + 1];
        }
        for (; col < dwidth; col++)
        {
            out[col] = (int32_t) in[xofs[col]] * INTER_RESIZE_COEF_SCALE;
        }
    }
}
/**
 * @brief Scalar vertical linear blend of two fixed-point rows into 8-bit output.
 *
 * Each output byte is neon_cast_op(src[0][x] * beta[0] + src[1][x] * beta[1]).
 * Only the first width / 2 columns are produced.
 *
 * @param src   two int32 fixed-point rows
 * @param dst   8-bit destination row
 * @param beta  two vertical weights
 * @param width width value; width / 2 outputs are written
 */
void nc_img_vresize_linear_c(const int32_t **src, uint8_t *dst, const int16_t *beta, int32_t width)
{
    const int32_t w_top = beta[0];
    const int32_t w_bottom = beta[1];
    const int32_t *top = src[0];
    const int32_t *bottom = src[1];
    const int32_t limit = width / 2;
    int32_t x = 0;

    /* Groups of four columns first, mirroring the unrolled main loop. */
    while (x <= (limit - 4))
    {
        int32_t lane;

        for (lane = 0; lane < 4; lane++)
        {
            const int32_t acc = top[x + lane] * w_top + bottom[x + lane] * w_bottom;

            dst[x + lane] = neon_cast_op(acc);
        }
        x += 4;
    }
    /* Leftover columns one at a time. */
    while (x < limit)
    {
        dst[x] = neon_cast_op(top[x] * w_top + bottom[x] * w_bottom);
        x++;
    }
}
/**
 * @brief Build the horizontal and vertical lookup tables for linear resize.
 *
 * Horizontal: for each destination pixel the source position is
 * (dx + 0.5) * (srcw / dstw) - 0.5. Its floor (times @p channels, one entry
 * per channel) goes to @p xofs and the two weights, scaled by
 * INTER_RESIZE_COEF_SCALE and repeated for every channel, go to @p ialpha.
 * Positions left of the image are clamped to column 0 and positions at or
 * beyond the last column are clamped to srcw - 1, both with weight pair
 * (1, 0). @p xmin and @p xmax are updated to bracket the destination
 * columns whose two taps lie inside the source row.
 *
 * Vertical: the same mapping is applied per destination row; the
 * (unclamped) source row goes to @p yofs and the weights to @p ibeta.
 *
 * @param xofs     out: dstw * channels source offsets
 * @param ialpha   out: dstw * channels * ksize horizontal weights
 * @param yofs     out: dsth source row indices (may be negative)
 * @param ibeta    out: dsth * ksize vertical weights
 * @param xmin     in/out: set to dx + 1 for columns whose left tap is clamped
 * @param xmax     in/out: lowered to the first column whose right tap is outside
 * @param ksize    number of filter taps written per entry
 * @param ksize2   ksize / 2
 * @param srcw     source width in pixels
 * @param srch     source height in rows
 * @param dstw     destination width in pixels
 * @param dsth     destination height in rows
 * @param channels interleaved channels per pixel
 */
static void img_resize_cal_offset_linear(int32_t *xofs,
                                         int16_t *ialpha,
                                         int32_t *yofs,
                                         int16_t *ibeta,
                                         int32_t *xmin,
                                         int32_t *xmax,
                                         int32_t ksize,
                                         int32_t ksize2,
                                         int32_t srcw,
                                         int32_t srch,
                                         int32_t dstw,
                                         int32_t dsth,
                                         int32_t channels)
{
    const float ratio_x = (float) dstw / (float)srcw;
    const float ratio_y = (float) dsth / (float)srch;
    const float step_x = (float)(1. / ratio_x);
    const float step_y = (float)(1. / ratio_y);
    float weights[MAX_ESIZE];
    int32_t col, row, tap;

    /* Horizontal table. */
    for (col = 0; col < dstw; col++)
    {
        int32_t *ofs = &xofs[col * channels];
        int16_t *coef = &ialpha[(col * channels) * ksize];
        float frac = (float) (((col + 0.5) * step_x) - 0.5);
        int32_t left = neon_floor(frac);

        frac -= (float)left;

        if (left < (ksize2 - 1))
        {
            *xmin = col + 1;
            if (left < 0)
            {
                frac = 0;
                left = 0;
            }
        }
        if ((left + ksize2) >= srcw)
        {
            if (*xmax > col)
            {
                *xmax = col;
            }
            if (left >= (srcw - 1))
            {
                frac = 0;
                left = srcw - 1;
            }
        }

        /* One offset per interleaved channel. */
        left *= channels;
        for (tap = 0; tap < channels; tap++)
        {
            ofs[tap] = left + tap;
        }

        weights[0] = 1.f - frac;
        weights[1] = frac;
        for (tap = 0; tap < ksize; tap++)
        {
            coef[tap] = (int16_t) (weights[tap] * INTER_RESIZE_COEF_SCALE);
        }
        /* Repeat the weight set for the remaining channels. */
        for (; tap < channels * ksize; tap++)
        {
            coef[tap] = coef[tap - ksize];
        }
    }

    /* Vertical table. */
    for (row = 0; row < dsth; row++)
    {
        int16_t *coef = &ibeta[row * ksize];
        float frac = (float) ((row + 0.5) * step_y - 0.5);
        const int32_t top = neon_floor(frac);

        frac -= (float)top;
        yofs[row] = top;

        weights[0] = 1.f - frac;
        weights[1] = frac;
        for (tap = 0; tap < ksize; tap++)
        {
            coef[tap] = (int16_t) (weights[tap] * INTER_RESIZE_COEF_SCALE);
        }
    }
}
/* Interpolate one 4-channel pixel:
 * D[c] = S[sx + c] * a0 + S[sx + 4 + c] * a1, where alpha_px holds the
 * pixel's {a0, a1} pair repeated once per channel. */
static inline void nc_hresize4_lerp_pixel(const uint8_t *S, int32_t *D, int32_t sx, const int16_t *alpha_px)
{
    /* De-interleaving load: val[0] = a0 x4, val[1] = a1 x4. */
    const int16x4x2_t coef = vld2_s16(alpha_px);
    /* 8 bytes = this pixel (low half) and its right neighbour (high half). */
    const int16x8_t taps = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(&S[sx])));
    int32x4_t acc = vmull_s16(vget_low_s16(taps), coef.val[0]);

    acc = vmlal_s16(acc, vget_high_s16(taps), coef.val[1]);
    vst1q_s32(D, acc);
}

/* Replicate one 4-channel pixel at full weight: D[c] = S[sx + c] * scale.
 * NOTE(review): still loads 8 bytes although only the low 4 are used, so it
 * touches 4 bytes past the pixel - same as the original tail loop. */
static inline void nc_hresize4_copy_pixel(const uint8_t *S, int32_t *D, int32_t sx, int16x4_t scale)
{
    const int16x8_t taps = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(&S[sx])));

    vst1q_s32(D, vmull_s16(vget_low_s16(taps), scale));
}

/* Horizontal pass over a single row. */
static inline void nc_hresize4_row(const uint8_t *S, int32_t *D,
                                   const int32_t *xofs, const int16_t *alpha,
                                   int32_t dwidth, int32_t xmax, int16x4_t scale)
{
    int32_t dx = 0;

    /* Four pixels per iteration, only while all four are below xmax. */
    for (; dx + 16 <= xmax; dx += 16)
    {
        nc_hresize4_lerp_pixel(S, &D[dx],      xofs[dx],      &alpha[dx * 2]);
        nc_hresize4_lerp_pixel(S, &D[dx + 4],  xofs[dx + 4],  &alpha[(dx + 4) * 2]);
        nc_hresize4_lerp_pixel(S, &D[dx + 8],  xofs[dx + 8],  &alpha[(dx + 8) * 2]);
        nc_hresize4_lerp_pixel(S, &D[dx + 12], xofs[dx + 12], &alpha[(dx + 12) * 2]);
    }
    /* Remaining interpolated pixels, one at a time. */
    for (; dx < xmax; dx += 4)
    {
        nc_hresize4_lerp_pixel(S, &D[dx], xofs[dx], &alpha[dx * 2]);
    }
    /* Columns at or beyond xmax have no valid right neighbour: replicate. */
    for (; dx < dwidth; dx += 4)
    {
        nc_hresize4_copy_pixel(S, &D[dx], xofs[dx], scale);
    }
}

/**
 * @brief NEON horizontal linear resize for 4-channel (4 bytes per pixel) rows.
 *
 * All indices are in channel units (pixel index * 4). Columns below
 * @p xmax are interpolated between the pixel at xofs[dx] and its right
 * neighbour; columns from @p xmax to @p dwidth replicate the pixel at
 * xofs[dx] scaled by INTER_RESIZE_COEF_SCALE.
 *
 * Differences from the previous implementation:
 *  - every pixel is blended with its own alpha pair (the old unrolled loop
 *    loaded four alpha sets but applied the first one to all four pixels);
 *  - the unrolled loop no longer runs past @p xmax, so it does not read
 *    xofs/alpha/source entries beyond the interpolated range;
 *  - each row is processed exactly once (the old k++ pair loop caused the
 *    last row to be computed twice).
 *
 * @param src    array of @p count source row pointers
 * @param dst    array of @p count int32 destination row pointers
 * @param count  number of rows
 * @param xofs   source byte offset per destination channel column
 * @param alpha  two weights per destination channel column
 * @param dwidth destination width in channel units
 * @param xmax   first replicated column, in channel units
 */
static void img_hresize_4channels_linear_neon(const uint8_t **src, int32_t **dst, int32_t count,
                                              const int32_t *xofs, const int16_t *alpha,
                                              int32_t dwidth, int32_t xmax)
{
    const int16x4_t dCoeff = vdup_n_s16(INTER_RESIZE_COEF_SCALE);
    int32_t k;

    for (k = 0; k < count; k++)
    {
        nc_hresize4_row(src[k], dst[k], xofs, alpha, dwidth, xmax, dCoeff);
    }
}
/* Blend four columns: clamp((top * w0 + bottom * w1 + DELTA) >> BITS, 0, 255),
 * narrowed to 16 bits. */
static inline int16x4_t nc_vresize_blend4(const int32_t *top, const int32_t *bottom, int32x2_t weights)
{
    int32x4_t acc = vmulq_lane_s32(vld1q_s32(top), weights, 0);

    acc = vmlaq_lane_s32(acc, vld1q_s32(bottom), weights, 1);
    acc = vaddq_s32(acc, vdupq_n_s32(DELTA));
    acc = vshrq_n_s32(acc, BITS);
    acc = vmaxq_s32(acc, vdupq_n_s32(0));
    acc = vminq_s32(acc, vdupq_n_s32(255));
    return vmovn_s32(acc);
}

/* Blend eight columns and narrow them to bytes. */
static inline uint8x8_t nc_vresize_blend8(const int32_t *top, const int32_t *bottom, int32x2_t weights)
{
    const int16x4_t lo = nc_vresize_blend4(top, bottom, weights);
    const int16x4_t hi = nc_vresize_blend4(top + 4, bottom + 4, weights);

    return vmovn_u16(vreinterpretq_u16_s16(vcombine_s16(lo, hi)));
}

/**
 * @brief NEON vertical linear blend of two fixed-point rows into 8-bit output.
 *
 * Produces width / 2 output bytes, each equal to
 * clamp((src[0][x] * beta[0] + src[1][x] * beta[1] + DELTA) >> BITS, 0, 255).
 * Columns are handled 16 at a time, then 8 at a time; a final group of
 * 1..7 columns is computed as a full 8-lane vector and merged with the
 * bytes already in @p dst through a byte mask, so the 8-byte load and
 * store at that position cover bytes past the last produced column.
 *
 * @param src   two int32 fixed-point rows
 * @param dst   8-bit destination row
 * @param beta  two vertical weights
 * @param width width value; width / 2 outputs are produced
 */
static void img_vresize_linear_neon(const int32_t **src, uint8_t *dst, const int16_t *beta, int32_t width)
{
    /* Entry i selects the low (i + 1) bytes of an 8-byte vector. */
    const uint64_t residual_mask[VRESIZE_LINEAR_MASK_TABLE_SIZE] =
    {
        0x00000000000000FF, 0x000000000000FFFF,
        0x0000000000FFFFFF, 0x00000000FFFFFFFF,
        0x000000FFFFFFFFFF, 0x0000FFFFFFFFFFFF,
        0x00FFFFFFFFFFFFFF
    };
    const int32_t *top = src[0];
    const int32_t *bottom = src[1];
    const int32_t limit = width / 2;
    /* Lane 0 = beta[0], lane 1 = beta[1]. */
    const int32x2_t weights = vset_lane_s32((int32_t)(beta[1]), vdup_n_s32((int32_t)(beta[0])), 1);
    int32_t x = 0;

    /* 16 columns per iteration. */
    while (x <= (limit - 16))
    {
        const uint8x8_t first = nc_vresize_blend8(&top[x], &bottom[x], weights);
        const uint8x8_t second = nc_vresize_blend8(&top[x + 8], &bottom[x + 8], weights);

        vst1_u8(&dst[x], first);
        vst1_u8(&dst[x + 8], second);
        x += 16;
    }

    /* 8 columns per iteration. */
    while (x <= (limit - 8))
    {
        vst1_u8(&dst[x], nc_vresize_blend8(&top[x], &bottom[x], weights));
        x += 8;
    }

    /* 1..7 leftover columns: compute 8 lanes, keep only the valid ones. */
    if (x < limit)
    {
        const uint8x8_t select = vld1_u8((const uint8_t *)(&residual_mask[limit - x - 1]));
        const uint8x8_t existing = vld1_u8(&dst[x]);
        const uint8x8_t fresh = nc_vresize_blend8(&top[x], &bottom[x], weights);

        vst1_u8(&dst[x], vbsl_u8(select, fresh, existing));
    }
}
/**
 * @brief Two-pass (horizontal, then vertical) linear resize driver.
 *
 * For every destination row the needed source rows are selected from
 * @p yofs (clamped to [0, srch)), run through the horizontal pass into
 * int32 scratch rows and blended vertically into @p dst. Scratch rows
 * whose source row did not change since the previous destination row are
 * reused instead of recomputed.
 *
 * Changes from the previous implementation:
 *  - the scratch allocation is checked; on failure the function returns
 *    without touching @p dst (it has no way to report the error);
 *  - @p ksize, @p channels, @p dstw and @p srch are validated before being
 *    used to index the fixed-size row arrays / clamp rows;
 *  - the horizontal pass is limited to the columns the vertical pass
 *    actually consumes. img_vresize_linear_neon() only reads the first
 *    dstw * channels / 2 scratch entries, so computing the rest was wasted
 *    work and made the horizontal pass read source bytes far beyond the
 *    consumed part of each row;
 *  - the scratch buffer is zero-initialised because the vertical pass
 *    loads whole 8-lane vectors that can extend past the computed columns.
 *
 * @param src      source image
 * @param dst      destination image
 * @param xofs     horizontal source offsets (channel units)
 * @param _alpha   horizontal weights
 * @param yofs     source row per destination row
 * @param _beta    vertical weights, @p ksize per destination row
 * @param xmin     unused
 * @param xmax     first replicated destination pixel
 * @param ksize    filter taps (2 for linear); at most MAX_ESIZE
 * @param srcw     unused
 * @param srch     source height in rows
 * @param srcstep  source stride in bytes
 * @param dstw     destination width in pixels
 * @param dsth     destination height in rows
 * @param dststep  destination stride in bytes
 * @param channels interleaved channels per pixel
 */
static void img_resize_generic_linear_neon(uint8_t *src,
                                           uint8_t *dst,
                                           const int32_t *xofs,
                                           const int16_t *_alpha,
                                           const int32_t *yofs,
                                           const int16_t *_beta,
                                           int32_t xmin,
                                           int32_t xmax,
                                           int32_t ksize,
                                           int32_t srcw,
                                           int32_t srch,
                                           int32_t srcstep,
                                           int32_t dstw,
                                           int32_t dsth,
                                           int32_t dststep,
                                           int32_t channels)
{
    const int16_t *alpha = _alpha;
    const int16_t *beta = _beta;
    const int32_t cn = channels;
    const uint8_t *srows[MAX_ESIZE];
    int32_t *rows[MAX_ESIZE];
    int32_t prev_sy[MAX_ESIZE];
    int32_t *buffer_;
    int32_t bufstep, hwidth, hxmax;
    int32_t k, dy;

    (void)xmin;
    (void)srcw;

    /* srows/rows/prev_sy hold at most MAX_ESIZE entries. */
    if((ksize <= 0) || (ksize > MAX_ESIZE) || (cn <= 0) || (dstw <= 0) || (srch <= 0))
        return;

    dstw *= cn;
    xmax *= cn;
    bufstep = (int32_t)neon_align_size(dstw, 16);

    /* Columns consumed by the vertical pass, rounded up to a whole pixel. */
    hwidth = (((dstw / 2) + cn - 1) / cn) * cn;
    hxmax = MIN (xmax, hwidth);

    buffer_ = calloc((size_t)bufstep * (size_t)ksize, sizeof(*buffer_));
    if(buffer_ == NULL)
        return;

    for(k = 0; k < ksize; k++)
    {
        prev_sy[k] = -1;
        rows[k] = buffer_ + (bufstep * k);
    }

    for(dy = 0; dy < dsth; dy++, beta += ksize)
    {
        const int32_t sy0 = yofs[dy];
        const int32_t ksize2 = ksize / 2;
        int32_t k0 = ksize;   /* first scratch row that must be recomputed */
        int32_t k1 = 0;

        for(k = 0; k < ksize; k++)
        {
            const int32_t sy = neon_clip(sy0 - ksize2 + 1 + k, 0, srch);

            /* Look for an already-computed scratch row for this source row. */
            for(k1 = MAX (k1, k); k1 < ksize; k1++)
            {
                if(sy == prev_sy[k1])
                {
                    if(k1 > k)
                        memcpy(rows[k], rows[k1], bufstep * sizeof (rows[0][0]));
                    break;
                }
            }
            if(k1 == ksize)
                k0 = MIN (k0, k);
            srows[k] = (const uint8_t*) (src + (srcstep * sy));
            prev_sy[k] = sy;
        }

        if(k0 < ksize)
        {
            if(cn == 4)
            {
                img_hresize_4channels_linear_neon(srows + k0, rows + k0, ksize - k0, xofs, alpha,
                                                  hwidth, hxmax);
            }
            else
            {
                img_hresize_linear_c(srows + k0, rows + k0, ksize - k0, xofs, alpha,
                                     hwidth, cn, hxmax);
            }
        }
        img_vresize_linear_neon((const int32_t**)rows, (uint8_t*)(dst + (dststep * dy)), beta, dstw);
    }

    free(buffer_);
}
/**
 * @brief Bilinear resize of a YUYV image.
 *
 * The image is processed as 4 interleaved byte channels. Lookup tables for
 * both directions are built in one temporary allocation and then handed to
 * img_resize_generic_linear_neon().
 *
 * Compared with the previous version, NULL images, zero dimensions,
 * dimensions that do not fit the int32 arithmetic used internally, and a
 * failed allocation are now rejected instead of being dereferenced.
 *
 * @param src        source image
 * @param src_stride source stride in bytes
 * @param src_width  source width in pixels
 * @param src_height source height in rows
 * @param dst        destination image
 * @param dst_stride destination stride in bytes
 * @param dst_width  destination width in pixels
 * @param dst_height destination height in rows
 * @return 0 on success, -1 on invalid arguments or allocation failure
 */
int32_t nc_resize_image_bilinear_yuyv(uint8_t *src, uint32_t src_stride, uint32_t src_width, uint32_t src_height,
                                      uint8_t *dst, uint32_t dst_stride, uint32_t dst_width, uint32_t dst_height)
{
    const int32_t cn = 4;
    const int32_t ksize = 2;
    const int32_t ksize2 = ksize / 2;
    int32_t xmin = 0;
    int32_t xmax;
    int32_t width;
    size_t entries;
    uint8_t *buffer_;
    int32_t *xofs;
    int32_t *yofs;
    int16_t *ialpha;
    int16_t *ibeta;

    if((src == NULL) || (dst == NULL))
        return -1;
    if((src_width == 0u) || (src_height == 0u) || (dst_width == 0u) || (dst_height == 0u))
        return -1;
    /* Everything below is int32 arithmetic; refuse sizes that would overflow it. */
    if((src_width > (uint32_t)(INT32_MAX / cn)) || (dst_width > (uint32_t)(INT32_MAX / cn)) ||
       (src_height > (uint32_t)INT32_MAX) || (dst_height > (uint32_t)INT32_MAX) ||
       (src_stride > (uint32_t)INT32_MAX) || (dst_stride > (uint32_t)INT32_MAX))
        return -1;

    xmax = (int32_t)dst_width;
    width = (int32_t)dst_width * cn;

    /* Same sizing as before: more than the 8 bytes per entry the four tables need. */
    entries = (size_t)width + (size_t)dst_height;
    buffer_ = malloc(entries * (sizeof(int32_t) + (sizeof(float) * (size_t)ksize)));
    if(buffer_ == NULL)
        return -1;

    /* Layout: xofs[width] | yofs[dst_height] | ialpha[width * ksize] | ibeta[dst_height * ksize] */
    xofs = (int32_t*) buffer_;
    yofs = xofs + width;
    ialpha = (int16_t*) (yofs + dst_height);
    ibeta = ialpha + width * ksize;

    img_resize_cal_offset_linear(xofs, ialpha, yofs, ibeta, &xmin, &xmax, ksize, ksize2,
                                 (int32_t)src_width, (int32_t)src_height,
                                 (int32_t)dst_width, (int32_t)dst_height, cn);
    img_resize_generic_linear_neon(src, dst, xofs, ialpha, yofs, ibeta, xmin, xmax, ksize,
                                   (int32_t)src_width, (int32_t)src_height, (int32_t)src_stride,
                                   (int32_t)dst_width, (int32_t)dst_height, (int32_t)dst_stride, cn);

    free(buffer_);
    return 0;
}
/* example */
/**
 * @brief Scale a YUYV image and place it inside a larger YUYV plane
 *        (picture-in-picture).
 *
 * The source is resized to trg_w x trg_h and written into @p pip_plane
 * with its top-left corner at (x_ofs, y_ofs). Both images use 2 bytes per
 * pixel and tightly packed rows (stride = width * 2).
 *
 * In addition to the original right/bottom range check, NULL pointers,
 * non-positive sizes and negative offsets are now rejected (a negative
 * offset previously passed the check and wrote before the plane), and a
 * failure of the resize step is propagated.
 *
 * @param src       source image
 * @param src_w     source width in pixels
 * @param src_h     source height in rows
 * @param trg_w     width of the scaled image in pixels
 * @param trg_h     height of the scaled image in rows
 * @param pip_plane destination plane
 * @param pip_w     destination plane width in pixels
 * @param pip_h     destination plane height in rows
 * @param x_ofs     horizontal position of the scaled image in the plane
 * @param y_ofs     vertical position of the scaled image in the plane
 * @return OK on success, ERR on invalid arguments or resize failure
 */
int32_t nc_pip_image_bilinear_yuyv(uint8_t *src, int32_t src_w, int32_t src_h, int32_t trg_w, int32_t trg_h,
                                   uint8_t *pip_plane, int32_t pip_w, int32_t pip_h,
                                   int32_t x_ofs, int32_t y_ofs)
{
    /* 2 bytes per YUYV pixel */
    const int32_t bytes_per_pixel = 2;
    int32_t pip_stride;
    int32_t src_stride;
    uint8_t *dst_origin;

    if((src == NULL) || (pip_plane == NULL))
        return ERR;
    if((src_w <= 0) || (src_h <= 0) || (trg_w <= 0) || (trg_h <= 0) || (pip_w <= 0) || (pip_h <= 0))
        return ERR;

    /* The scaled image must lie completely inside the plane. */
    if((x_ofs < 0) || (y_ofs < 0))
        return ERR;
    if((trg_w > pip_w) || (x_ofs > (pip_w - trg_w)))
        return ERR;
    if((trg_h > pip_h) || (y_ofs > (pip_h - trg_h)))
        return ERR;

    pip_stride = pip_w * bytes_per_pixel;
    src_stride = src_w * bytes_per_pixel;
    dst_origin = pip_plane + ((y_ofs * pip_stride) + (x_ofs * bytes_per_pixel));

    if(nc_resize_image_bilinear_yuyv(src, src_stride, src_w, src_h,
                                     dst_origin, pip_stride, trg_w, trg_h) != 0)
        return ERR;

    return OK;
}
/**
 * @brief 2x2 box downscale of two adjacent YUY2 rows into one output row
 *        (AArch64 inline assembly).
 *
 * Each loop iteration reads 64 bytes (32 pixels) from the row at
 * @p src_ptr and 64 bytes from the row at @p src_ptr + @p src_stride,
 * averages 2x2 luma blocks and the matching chroma samples with rounding,
 * and writes 32 bytes (16 pixels) to @p dst.
 *
 * The loop body always runs at least once and the counter is decremented
 * by 16 per iteration, so data is processed in whole groups of 16 output
 * pixels.
 *
 * @param src_ptr    first source row
 * @param src_stride distance in bytes to the second source row
 * @param dst        destination row
 * @param dst_width  number of output pixels to produce
 */
void nc_ScaleYUY2RowDown2Box_NEON(uint8_t *src_ptr, int32_t src_stride,
uint8_t *dst, int32_t dst_width)
{
uint8_t *src_ptr_nextline = src_ptr + src_stride;
asm volatile (
// src_yuy2
// y0 u01 y1 v01 y2 u23 y3 v23 y4 u45 y5 v45 y6 u67 y7 v67 : 8 pixels (16 bytes)
// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
"1: \n" // 1 loop - src : 64bytes(32pixels) per row | dst : 32bytes(16pixels)
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src_yuy2]], #64 \n" // v0,v2 = y | v1 = u | v3 = v
"ld4 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[src_yuy2_nextline]], #64 \n" // same split for the second row
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // V 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // U += second-row pair sums
"uadalp v3.8h, v19.16b \n" // V += second-row pair sums
"rshrn v1.8b, v1.8h, #2 \n" // round and pack (sum of 4 -> average)
"rshrn v3.8b, v3.8h, #2 \n" // round and pack (sum of 4 -> average)
// v0 <- even luma (y0 y2 ...)
// v1 <- averaged u
// v2 <- odd luma (y1 y3 ...)
// v3 <- averaged v
"zip1 v4.16b, v0.16b, v2.16b \n" // row 1 luma 0..15 back in pixel order
"zip2 v5.16b, v0.16b, v2.16b \n" // row 1 luma 16..31
"zip1 v6.16b, v16.16b, v18.16b \n" // row 2 luma 0..15
"zip2 v7.16b, v16.16b, v18.16b \n" // row 2 luma 16..31
// v4/v5 <- y0 y1 y2 y3 ... in pixel order
"uaddlp v4.8h, v4.16b \n" // row 1 add adjacent
"uaddlp v5.8h, v5.16b \n"
"uadalp v4.8h, v6.16b \n" // += row 2 add adjacent
"uadalp v5.8h, v7.16b \n"
"rshrn v4.8b, v4.8h, #2 \n" // round and pack
"rshrn2 v4.16b, v5.8h, #2 \n" // v4 now holds the 16 output luma bytes
"uzp1 v6.16b, v4.16b, v5.16b \n" // low 8 bytes = even output luma (upper half is not used)
"uzp2 v7.16b, v4.16b, v5.16b \n" // low 8 bytes = odd output luma (upper half is not used)
// store : copy y-even / u / y-odd / v into consecutive registers v8..v11 so st4 emits YUY2 order
"dup v8.2D, v6.D[0] \n"
"dup v9.2D, v1.D[0] \n"
"dup v10.2D, v7.D[0] \n"
"dup v11.2D, v3.D[0] \n"
"st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[dst_yuy2]], #32 \n"
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n" // 16 output pixels done
"b.gt 1b \n"
: [src_yuy2] "+r"(src_ptr)
,[src_yuy2_nextline] "+r"(src_ptr_nextline)
,[dst_yuy2] "+r"(dst)
,[remain_dst_width] "+r"(dst_width)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
);
return;
}
/**
 * @brief Downscale a YUY2 image by 2 in both directions with a 2x2 box filter.
 *
 * Source rows are consumed in pairs (ypos, ypos + 1); each pair produces
 * output row ypos / 2. Only complete pairs are processed: with an odd
 * @p src_height the last source row is skipped, where the previous loop
 * condition made the row kernel read one row past the end of the image.
 *
 * @param src        source image
 * @param src_stride source stride in bytes
 * @param src_height source height in rows
 * @param dst        destination image
 * @param dst_stride destination stride in bytes
 * @param dst_width  destination width in pixels, passed to the row kernel
 */
void nc_ScaleYUY2Down2Box(uint8_t* src, int32_t src_stride, int32_t src_height,
                          uint8_t* dst, int32_t dst_stride, int32_t dst_width)
{
    /* ypos + 1 must be a valid row because the kernel reads two rows. */
    for (int ypos = 0; (ypos + 1) < src_height; ypos += 2)
    {
        nc_ScaleYUY2RowDown2Box_NEON((src + (src_stride * ypos)), src_stride,
                                     (dst + ((dst_stride * ypos) / 2)), dst_width);
    }
    return;
}
/**
 * @brief Halve the width of one YUY2 row by averaging horizontally adjacent
 *        samples (AArch64 inline assembly).
 *
 * Each loop iteration reads 64 bytes (32 pixels) and writes 32 bytes
 * (16 pixels): adjacent luma samples and adjacent U / V samples are summed
 * and halved with rounding. The body always runs at least once and the
 * counter is decremented by 32 source pixels per iteration.
 *
 * @param yuv_src_line source row
 * @param yuv_dst_line destination row
 * @param src_width    number of source pixels in the row
 * @return always OK
 */
static int32_t halfwidth_stripe(uint8_t *yuv_src_line, uint8_t *yuv_dst_line, int32_t src_width)
{
int32_t ret = OK;
asm volatile (
"1: \n" // 1 loop - src : 64bytes(32pixels) | dst : 32bytes(16pixels)
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src_yuy2]], #64 \n" // v0,v2 = y | v1 = u | v3 = v
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // V 16 bytes -> 8 shorts.
"rshrn v1.8b, v1.8h, #1 \n" // round and pack (sum of 2 -> average)
"rshrn v3.8b, v3.8h, #1 \n" // round and pack (sum of 2 -> average)
"zip1 v4.16b, v0.16b, v2.16b \n" // luma 0..15 back in pixel order
"zip2 v5.16b, v0.16b, v2.16b \n" // luma 16..31
"uaddlp v4.8h, v4.16b \n" // add adjacent luma
"uaddlp v5.8h, v5.16b \n"
"rshrn v4.8b, v4.8h, #1 \n" // round and pack
"rshrn2 v4.16b, v5.8h, #1 \n" // v4 now holds the 16 output luma bytes
"uzp1 v6.16b, v4.16b, v5.16b \n" // low 8 bytes = even output luma (upper half is not used)
"uzp2 v7.16b, v4.16b, v5.16b \n" // low 8 bytes = odd output luma (upper half is not used)
// copy y-even / u / y-odd / v into consecutive registers v8..v11 for st4
"dup v8.2D, v6.D[0] \n"
"dup v9.2D, v1.D[0] \n"
"dup v10.2D, v7.D[0] \n"
"dup v11.2D, v3.D[0] \n"
"st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[dst_yuy2]], #32 \n"
"subs %w[remain_src_width], %w[remain_src_width], #32 \n" // 32 source pixels done
"b.gt 1b \n"
: [src_yuy2] "+r"(yuv_src_line)
,[dst_yuy2] "+r"(yuv_dst_line)
,[remain_src_width] "+r"(src_width)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
);
return ret;
}
/**
 * @brief Halve the width of a YUY2 image, row by row.
 *
 * Rows are tightly packed: the source stride is src_width * 2 bytes and
 * the destination stride is half of that. Every source row is passed to
 * halfwidth_stripe().
 *
 * @param yuv_src    source image
 * @param yuv_dst    destination image
 * @param src_width  source width in pixels
 * @param src_height number of rows
 * @return always OK
 */
int32_t nc_resize_yuy2_halfwidth(uint8_t *yuv_src, uint8_t *yuv_dst, int32_t src_width, int32_t src_height)
{
    const int32_t src_stride = src_width * 2;
    const int32_t dst_stride = src_stride / 2;
    int32_t row = 0;

    while (row < src_height)
    {
        halfwidth_stripe(yuv_src + (src_stride * row), yuv_dst + (dst_stride * row), src_width);
        row++;
    }
    return OK;
}
/**
 * @brief Build the vertical lookup table for the two-row stripe scaler.
 *
 * For each destination row dy the source position is
 * (dy + 0.5) * (src_size / dst_size) - 0.5. yofs[dy] receives the upper of
 * the two source rows to blend and ibeta[2*dy], ibeta[2*dy + 1] the
 * weights of rows yofs[dy] and yofs[dy] + 1, scaled by 2^11.
 *
 * The consumer reads rows yofs[dy] and yofs[dy] + 1 unconditionally, so the
 * row pair is clamped to stay inside the image:
 *  - positions above the first row use row 0 with weights (2^11, 0);
 *  - positions on or below the last row use rows (src_size - 2,
 *    src_size - 1) with weights (0, 2^11).
 * Previously yofs could be -1 (upscaling) or src_size - 1 (bottom edge),
 * which made the consumer read a row outside the image.
 *
 * With src_size == 1 no valid row pair exists; yofs is 0 and the weights
 * are (2^11, 0), but the second row is still out of range for a consumer
 * that reads it.
 *
 * @param yofs     out: dst_size source row indices
 * @param ibeta    out: 2 * dst_size weights
 * @param src_size source height in rows
 * @param dst_size destination height in rows
 */
static void img_resize_cal_offset_linear_stripe(int32_t *yofs, uint16_t *ibeta, int32_t src_size, int32_t dst_size)
{
    const float coef_scale = (float)(1 << 11);   /* weights are value x 2^11 */
    float scale_y;
    int32_t last_top;
    int32_t dy;

    if (!yofs || !ibeta || (src_size <= 0) || (dst_size <= 0))
    {
        return;
    }

    scale_y = (float)(1. / ((float)dst_size / (float)src_size));
    /* Highest row that still has a valid row below it. */
    last_top = (src_size >= 2) ? (src_size - 2) : 0;

    for (dy = 0; dy < dst_size; dy++)
    {
        float fy = (float) ((dy + 0.5) * scale_y - 0.5);
        int32_t sy;

        if (fy <= 0.f)
        {
            /* Above the first row: replicate row 0. */
            sy = 0;
            fy = 0.f;
        }
        else
        {
            /* fy is positive here, so truncation equals floor. */
            sy = (int32_t)fy;
            fy -= (float)sy;
        }

        if (sy > last_top)
        {
            /* On/below the last row: put the full weight on the last row. */
            sy = last_top;
            fy = (src_size >= 2) ? 1.f : 0.f;
        }

        yofs[dy] = sy;
        ibeta[(dy * 2)] = (uint16_t) ((1.f - fy) * coef_scale);
        ibeta[(dy * 2) + 1] = (uint16_t) (fy * coef_scale);
    }
}
/**
 * @brief Produce one output row that is half as wide as the source and a
 *        weighted blend of two vertically adjacent YUY2 source rows
 *        (AArch64 inline assembly).
 *
 * The row at @p yuv_src_line and the row src_width * 2 bytes after it are
 * each reduced to half width by averaging adjacent samples, then blended as
 * (row0 * ibeta1 + row1 * ibeta2 + rounding) >> 11.
 *
 * Each loop iteration reads 64 bytes (32 pixels) from both rows and writes
 * 32 bytes (16 pixels). The body always runs at least once and the counter
 * is decremented by 32 source pixels per iteration.
 *
 * Fix: the two weights are now passed to the assembly as declared input
 * operands. The previous code read registers w3 / w4 directly, which only
 * worked while the compiler happened to leave the 4th and 5th function
 * arguments there (and breaks as soon as this static function is inlined
 * or the register allocation changes). The unused pointer operands that
 * accompanied that scheme are removed.
 *
 * @param yuv_src_line upper source row; the next row must follow it
 * @param yuv_dst_line destination row
 * @param src_width    source width in pixels
 * @param ibeta1       weight of the upper row (x 2^11)
 * @param ibeta2       weight of the lower row (x 2^11)
 * @return always OK
 */
static int32_t hafwid_verti_scaledn_stripe(uint8_t *yuv_src_line, uint8_t *yuv_dst_line, int32_t src_width, uint16_t ibeta1, uint16_t ibeta2)
{
    int32_t ret = OK;
    uint8_t *yuv_src_line_next = yuv_src_line + (src_width * 2);
    /* 32-bit copies so the %w operand modifier names a W register. */
    const uint32_t weight_top = ibeta1;
    const uint32_t weight_bottom = ibeta2;

    asm volatile (
        "mov v8.h[0], %w[asm_ibeta1] \n" // v8.h[0] = weight of the upper row
        "mov v8.h[1], %w[asm_ibeta2] \n" // v8.h[1] = weight of the lower row
        "1: \n" // 1 loop - src : 64bytes(32pixels) per row | dst : 32bytes(16pixels)
        "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src_yuy2]], #64 \n" // v0,v2 = y | v1 = u | v3 = v
        "ld4 {v10.16b, v11.16b, v12.16b, v13.16b}, [%[src_yuy2_next]], #64 \n" // v10,v12 = y | v11 = u | v13 = v
        // half width start
        "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
        "uaddlp v3.8h, v3.16b \n" // V 16 bytes -> 8 shorts.
        "uaddlp v11.8h, v11.16b \n" // U 16 bytes -> 8 shorts.
        "uaddlp v13.8h, v13.16b \n" // V 16 bytes -> 8 shorts.
        "rshrn v1.8b, v1.8h, #1 \n" // round and pack
        "rshrn v3.8b, v3.8h, #1 \n" // round and pack
        "rshrn v11.8b, v11.8h, #1 \n" // round and pack
        "rshrn v13.8b, v13.8h, #1 \n" // round and pack
        "zip1 v4.16b, v0.16b, v2.16b \n" // y first
        "zip1 v14.16b, v10.16b, v12.16b \n"
        "zip2 v5.16b, v0.16b, v2.16b \n" // y second
        "zip2 v15.16b, v10.16b, v12.16b \n"
        "uaddlp v4.8h, v4.16b \n" // y first pair add
        "uaddlp v5.8h, v5.16b \n" // y second pair add
        "uaddlp v14.8h, v14.16b \n"
        "uaddlp v15.8h, v15.16b \n"
        "rshrn v4.8b, v4.8h, #1 \n" // round and pack
        "rshrn v14.8b, v14.8h, #1 \n" // round and pack
        "rshrn2 v4.16b, v5.8h, #1 \n"
        "rshrn2 v14.16b, v15.8h, #1 \n"
        // Only the low 8 bytes of each uzp result are used below: they hold the
        // even (uzp1) / odd (uzp2) halved luma samples taken from v4 / v14.
        // The upper 8 bytes come from v5 / v15 and are ignored.
        "uzp1 v0.16b, v4.16b, v5.16b \n"
        "uzp1 v10.16b, v14.16b, v15.16b \n"
        "uzp2 v2.16b, v4.16b, v5.16b \n"
        "uzp2 v12.16b, v14.16b, v15.16b \n"
        // half width until here
        // first line - u8 to u16
        "UXTL v4.8h, v0.8b \n"
        "UXTL v5.8h, v1.8b \n"
        "UXTL v6.8h, v2.8b \n"
        "UXTL v7.8h, v3.8b \n"
        // second line - u8 to u16
        "UXTL v14.8h, v10.8b \n"
        "UXTL v15.8h, v11.8b \n"
        "UXTL v16.8h, v12.8b \n"
        "UXTL v17.8h, v13.8b \n"
        // mul beta1 - first line
        "UMULL v21.4s, v4.4h, v8.h[0] \n"
        "UMULL2 v22.4s, v4.8h, v8.h[0] \n"
        "UMULL v23.4s, v5.4h, v8.h[0] \n"
        "UMULL2 v24.4s, v5.8h, v8.h[0] \n"
        "UMULL v25.4s, v6.4h, v8.h[0] \n"
        "UMULL2 v26.4s, v6.8h, v8.h[0] \n"
        "UMULL v27.4s, v7.4h, v8.h[0] \n"
        "UMULL2 v28.4s, v7.8h, v8.h[0] \n"
        // mla beta2 - second line
        "UMLAL v21.4s, v14.4h, v8.h[1] \n"
        "UMLAL2 v22.4s, v14.8h, v8.h[1] \n"
        "UMLAL v23.4s, v15.4h, v8.h[1] \n"
        "UMLAL2 v24.4s, v15.8h, v8.h[1] \n"
        "UMLAL v25.4s, v16.4h, v8.h[1] \n"
        "UMLAL2 v26.4s, v16.8h, v8.h[1] \n"
        "UMLAL v27.4s, v17.4h, v8.h[1] \n"
        "UMLAL2 v28.4s, v17.8h, v8.h[1] \n"
        // rounding shift by the 11 weight bits, 32 -> 16 bit
        "rshrn v21.4h, v21.4s, #11 \n"
        "rshrn2 v21.8h, v22.4s, #11 \n"
        "rshrn v23.4h, v23.4s, #11 \n"
        "rshrn2 v23.8h, v24.4s, #11 \n"
        "rshrn v25.4h, v25.4s, #11 \n"
        "rshrn2 v25.8h, v26.4s, #11 \n"
        "rshrn v27.4h, v27.4s, #11 \n"
        "rshrn2 v27.8h, v28.4s, #11 \n"
        // compress 16 -> 8 bit
        "XTN v0.8b, v21.8h \n"
        "XTN v1.8b, v23.8h \n"
        "XTN v2.8b, v25.8h \n"
        "XTN v3.8b, v27.8h \n"
        // st : y-even / u / y-odd / v interleaved back to YUY2
        "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst_yuy2]], #32 \n"
        "subs %w[remain_src_width], %w[remain_src_width], #32 \n"
        "b.gt 1b \n"
        : [src_yuy2] "+r"(yuv_src_line)
         ,[src_yuy2_next] "+r"(yuv_src_line_next)
         ,[dst_yuy2] "+r"(yuv_dst_line)
         ,[remain_src_width] "+r"(src_width)
        : [asm_ibeta1] "r"(weight_top)
         ,[asm_ibeta2] "r"(weight_bottom)
        : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
          "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
          "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13"
    );
    return ret;
}
int32_t nc_resize_yuy2_hafwid_verti_scaledn(uint8_t *yuv_src, uint8_t *yuv_dst, int32_t src_width, int32_t src_height, int32_t dst_height)
{
int32_t ret = OK;
int32_t stride = src_width * 2;
uint8_t *buffer_ = (uint8_t*)malloc(dst_height * (sizeof(int32_t) + (sizeof(uint16_t) * 2))); // (Vertical 점의 개수) x (좌표 1개 + 두점간 비 2개)
int32_t *yofs = (int32_t*) buffer_;
uint16_t *ibeta = (uint16_t*) (&yofs[dst_height]);
img_resize_cal_offset_linear_stripe(yofs, ibeta, src_height, dst_height);
for (int ypos = 0; ypos < dst_height; ypos++)
hafwid_verti_scaledn_stripe(yuv_src + (stride * yofs[ypos]), yuv_dst + (stride/2 * ypos), src_width, ibeta[ypos*2], ibeta[ypos*2 + 1]);
if (buffer_)
free(buffer_);
return ret;
}
/**
 * @brief Split a packed YUYV image into three planes (AArch64 inline assembly).
 *
 * Output layout inside @p yuv_Plannar: Y at offset 0, U at width * height,
 * V at width * height + width * height / 2.
 *
 * Each loop iteration reads 64 packed bytes and writes 32 Y, 16 U and 16 V
 * bytes. The counter starts at width * height / 2 and is decremented by 16
 * per iteration; the body always runs at least once.
 *
 * @param yuv_Packed  packed source image
 * @param yuv_Plannar planar destination buffer
 * @param width       image width in pixels
 * @param height      image height in rows
 * @return always OK
 */
int32_t nc_img_yuyv_packed_to_planar(uint8_t *yuv_Packed, uint8_t *yuv_Plannar, int32_t width, int32_t height)
{
int32_t ret = OK;
uint8_t *Yplane = yuv_Plannar;
uint8_t *Uplane = &yuv_Plannar[(width*height)];
uint8_t *Vplane = &yuv_Plannar[(width*height + width*height/2)];
int32_t img_langth = (width*height/2); // loop counter: one unit per 4-byte Y-U-Y-V group
asm volatile (
"1: \n" // 1 loop - src : 64 bytes | dst : 32 Y + 16 U + 16 V bytes
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src]], #64 \n" // de-interleave: v0 = 1st byte of each group, v1 = 2nd, v2 = 3rd, v3 = 4th
"zip1 v4.16b, v0.16b, v2.16b \n" // re-interleave v0/v2 (the two Y samples of each group), low half
"zip2 v5.16b, v0.16b, v2.16b \n" // same, high half
"st1 {v4.16b, v5.16b}, [%[dst_Y]], #32 \n"
"st1 {v1.16b}, [%[dst_U]], #16 \n"
"st1 {v3.16b}, [%[dst_V]], #16 \n"
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n" // 16 groups done
"b.gt 1b \n"
: [src] "+r"(yuv_Packed)
,[dst_Y] "+r"(Yplane)
,[dst_U] "+r"(Uplane)
,[dst_V] "+r"(Vplane)
,[remain_dst_width] "+r"(img_langth)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
);
return ret;
}
/**
 * @brief Merge three planes into a packed 4-byte-group image
 *        (AArch64 inline assembly).
 *
 * Input layout inside @p yuv_Plannar: Y at offset 0, U at width * height,
 * V at width * height + width * height / 2.
 *
 * Each loop iteration reads 32 Y, 16 U and 16 V bytes and writes 64 packed
 * bytes in the order Y-even, U, Y-odd, V. The counter starts at
 * width * height / 2 and is decremented by 16 per iteration; the body
 * always runs at least once.
 *
 * @param yuv_Plannar planar source buffer
 * @param yuv_Packed  packed destination image
 * @param width       image width in pixels
 * @param height      image height in rows
 * @return always OK
 */
int32_t nc_img_yuyv_planar_to_packed(uint8_t *yuv_Plannar, uint8_t *yuv_Packed, int32_t width, int32_t height)
{
int32_t ret = OK;
uint8_t *Yplane = yuv_Plannar;
uint8_t *Uplane = &yuv_Plannar[(width*height)];
uint8_t *Vplane = &yuv_Plannar[(width*height + width*height/2)];
int32_t img_langth = (width*height/2); // loop counter: one unit per 4-byte output group
// NOTE(original author): the YUYV byte order needs to be re-checked
asm volatile (
"1: \n" // 1 loop - src : 32 Y + 16 U + 16 V bytes | dst : 64 bytes
"ld2 {v0.16b, v1.16b}, [%[src_Y]], #32 \n" // v0 = even Y samples, v1 = odd Y samples
"ld1 {v4.16b}, [%[src_U]], #16 \n"
"ld1 {v3.16b}, [%[src_V]], #16 \n"
"mov v2.16b, v1.16b \n" // odd Y -> 3rd byte of each group
"mov v1.16b, v4.16b \n" // U -> 2nd byte of each group
"st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[dst]], #64 \n" // right -> v0 v2 v1 v3
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n" // 16 groups done
"b.gt 1b \n"
: [dst] "+r"(yuv_Packed)
,[src_Y] "+r"(Yplane)
,[src_U] "+r"(Uplane)
,[src_V] "+r"(Vplane)
,[remain_dst_width] "+r"(img_langth)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
);
return ret;
}
/**
 * @brief Convert an NV12-style image (Y plane followed by an interleaved
 *        two-component plane) to packed 4-byte groups (AArch64 inline assembly).
 *
 * For every output row the Y row is read from nv_ptr + width * row and the
 * chroma row from nv_ptr + width * height + width * (row / 2), so each
 * chroma row is used for two consecutive output rows. The output row starts
 * at yuv_ptr + width * 2 * row.
 *
 * Each loop iteration reads 32 Y bytes and 32 chroma bytes and writes
 * 64 bytes ordered [Y0 U0 Y1 V0]. The per-row counter starts at width / 2
 * and is decremented by 16 per iteration; the body always runs at least
 * once per row.
 *
 * @param nv_ptr  source buffer
 * @param yuv_ptr destination buffer
 * @param width   image width in pixels
 * @param height  image height in rows
 * @return always OK
 */
int32_t nc_img_nv12_to_yuv422(uint8_t* nv_ptr, uint8_t* yuv_ptr, int32_t width, int32_t height)
{
int32_t ret = OK;
int32_t lvCnt_H = 0;
uint8_t *DSTplane;
uint8_t *Yplane;
uint8_t *UVplane;
int32_t lvWidth_target;
for(lvCnt_H = 0; lvCnt_H < height; lvCnt_H++)
{
// The asm advances its pointer/counter operands, so they are re-derived for every row.
lvWidth_target = width/2;
DSTplane = &yuv_ptr[(width*2)*(lvCnt_H)];
Yplane = &nv_ptr[0 + (width)*(lvCnt_H)];
UVplane = &nv_ptr[(width * height) + (width)*(lvCnt_H/2)]; // one chroma row per two output rows
asm volatile (
"1: \n"
"ld2 {v0.16b, v1.16b}, [%[src_Y]], #32 \n" // Y0 Y1
"ld2 {v4.16b, v5.16b}, [%[src_UV]], #32 \n" // U0 V0
"mov v2.16b, v1.16b \n" // Y1
"mov v1.16b, v4.16b \n" // U0
"mov v3.16b, v5.16b \n" // V0
"st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[dst]], #64 \n" // [Y0 U0 Y1 V0]
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n" // 16 groups done
"b.gt 1b \n"
: [dst] "+r"(DSTplane)
,[src_Y] "+r"(Yplane)
,[src_UV] "+r"(UVplane)
,[remain_dst_width] "+r"(lvWidth_target)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
);
}
return ret;
}
/* Edge length, in pixels, of one super tile (made of 8 x 8 sub tiles of 8 x 8 pixels). */
#define TILESIZE 64
/**
 * @brief Convert a packed 3-bytes-per-pixel image to three tiled planes
 *        (AArch64 inline assembly for the inner 8-pixel copy).
 *
 * The three output planes start at offsets 0, width * height and
 * 2 * width * height of @p rgb_Tiled and receive the 1st, 2nd and 3rd byte
 * of every pixel respectively (named R, G, B here).
 *
 * Pixels are written in tile order: super tiles of TILESIZE x TILESIZE
 * pixels (row-major), each split into 8 x 8 sub tiles (row-major), each sub
 * tile written as 8 rows of 8 pixels. The plane pointers simply advance by
 * 8 bytes per inner step, so the output is one contiguous stream per plane.
 *
 * Only whole super tiles are converted: the loops run height / TILESIZE and
 * width / TILESIZE times, so any remainder rows/columns are not written.
 *
 * @param rgb_Packed packed source image, stride width * 3 bytes
 * @param rgb_Tiled  destination buffer holding the three planes
 * @param width      image width in pixels
 * @param height     image height in rows
 * @return always OK
 */
int32_t nc_img_rgb24_packed_to_tiled_planar(uint8_t *rgb_Packed, uint8_t *rgb_Tiled, int32_t width, int32_t height)
{ // 64_8x8
int32_t ret = OK;
long int in_x_idx, in_y_idx;
int plane_offset = width * height;
int32_t img_stried = width * 3; // source stride in bytes
uint8_t *RGBplane = rgb_Packed;
uint8_t *Rplane = &rgb_Tiled[plane_offset * 0];
uint8_t *Gplane = &rgb_Tiled[plane_offset * 1];
uint8_t *Bplane = &rgb_Tiled[plane_offset * 2];
for (long y_super = 0; y_super < (height / TILESIZE); y_super++)
{
for (long x_super = 0; x_super < (width / TILESIZE); x_super++)
{
for (long y_sub = 0; y_sub < 8; y_sub++)
{
for (long x_sub = 0; x_sub < 8; x_sub++)
{
for (long y_pix = 0; y_pix < 8; y_pix++)
{
// Byte column and row of the 8-pixel run inside the packed source.
in_x_idx = (x_super * TILESIZE * 3) + (x_sub * 8 * 3);
in_y_idx = (y_super * TILESIZE) + (y_sub * 8) + y_pix;
RGBplane = &rgb_Packed[(in_y_idx * img_stried) + in_x_idx];
asm volatile (
// R G B
"ld3 {v0.8b, v1.8b, v2.8b}, [%[src]], #24 \n" // 8 pixels, de-interleaved per component
"st1 {v0.8b}, [%[dst_R]], #8 \n" // R Plane
"st1 {v1.8b}, [%[dst_G]], #8 \n" // G Plane
"st1 {v2.8b}, [%[dst_B]], #8 \n" // B Plane
:[src] "+r"(RGBplane)
,[dst_R] "+r"(Rplane)
,[dst_G] "+r"(Gplane)
,[dst_B] "+r"(Bplane)
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
);
}
}
}
}
}
return ret;
}
/*
* Full swing for BT.601
* https://en.wikipedia.org/wiki/YUV#Full_swing_for_BT.601
*
* Y = (( 77 * R + 150 * G +  29 * B) >> 8) + 0;
* U = ((-43 * R -  84 * G + 127 * B) >> 8) + 128;
* V = ((127 * R - 106 * G -  21 * B) >> 8) + 128;
*/
/* Blend eight samples: ((alpha + 1) * over + (255 - alpha) * under) >> 8.
 * The weights add up to 0x100, so alpha == 255 returns `over` unchanged and
 * the 16-bit accumulator cannot overflow (maximum 255 * 256). */
static inline uint8x8_t nc_mix_blend8(uint8x8_t alpha, uint8x8_t over, uint8x8_t under)
{
    uint16x8_t acc = vmull_u8(vmvn_u8(alpha), under); /* (255 - alpha) * under */

    acc = vmlal_u8(acc, alpha, over);                 /* + alpha * over */
    acc = vaddw_u8(acc, over);                        /* + over -> (alpha + 1) * over */
    return vshrn_n_u16(acc, 8);
}

/* Alpha-blend eight 4-byte source pixels at `src` into the eight YUYV pixels
 * (16 bytes) at `dst`. Does nothing when all eight alpha bytes are zero. */
static void nc_mix_rgbx8_into_yuyv(const uint8_t *src, uint8_t *dst, int32_t isRgbaNotBgra)
{
    /* De-interleaving load: val[0..2] are the colour channels, val[3] is alpha. */
    const uint8x8x4_t px = vld4_u8(src);
    const uint8x8_t alpha = px.val[3];

    /* View the eight alpha bytes as one 64-bit lane: zero means nothing to draw. */
    if (0 == vget_lane_u64(vreinterpret_u64_u8(alpha), 0))
    {
        return;
    }

    const int red_idx = isRgbaNotBgra ? 0 : 2;
    const uint8x8_t red8 = px.val[red_idx];
    const uint8x8_t green8 = px.val[1];
    const uint8x8_t blue8 = px.val[2 - red_idx];

    /* Y = (77 * R + 150 * G + 29 * B) >> 8.
     * The sum reaches 65280, so it is accumulated in unsigned 16-bit lanes. */
    uint16x8_t luma_acc = vmull_u8(red8, vdup_n_u8(77));
    luma_acc = vmlal_u8(luma_acc, green8, vdup_n_u8(150));
    luma_acc = vmlal_u8(luma_acc, blue8, vdup_n_u8(29));
    uint8x8_t luma = vshrn_n_u16(luma_acc, 8);

    /* Chroma needs signed arithmetic; |sum| <= 127 * 255 fits in int16. */
    const int16x8_t red = vreinterpretq_s16_u16(vmovl_u8(red8));
    const int16x8_t green = vreinterpretq_s16_u16(vmovl_u8(green8));
    const int16x8_t blue = vreinterpretq_s16_u16(vmovl_u8(blue8));
    const int16x8_t bias = vmovq_n_s16(128);

    /* U = ((-43 * R - 84 * G + 127 * B) >> 8) + 128 */
    int16x8_t cb_acc = vmulq_n_s16(red, -43);
    cb_acc = vmlaq_n_s16(cb_acc, green, -84);
    cb_acc = vmlaq_n_s16(cb_acc, blue, 127);
    cb_acc = vsraq_n_s16(bias, cb_acc, 8);

    /* V = ((127 * R - 106 * G - 21 * B) >> 8) + 128 */
    int16x8_t cr_acc = vmulq_n_s16(red, 127);
    cr_acc = vmlaq_n_s16(cr_acc, green, -106);
    cr_acc = vmlaq_n_s16(cr_acc, blue, -21);
    cr_acc = vsraq_n_s16(bias, cr_acc, 8);

    uint8x8_t cb = vmovn_u16(vreinterpretq_u16_s16(cb_acc));
    uint8x8_t cr = vmovn_u16(vreinterpretq_u16_s16(cr_acc));

    /* Load the destination: val[0] = Y0..Y7, val[1] = U0 V0 U1 V1 U2 V2 U3 V3. */
    const uint8x8x2_t under = vld2_u8(dst);
    /* Expand 4:2:2 chroma to one sample per pixel:
     * val[0] = U0 U0 U1 U1 ..., val[1] = V0 V0 V1 V1 ... */
    const uint8x8x2_t under_c = vtrn_u8(under.val[1], under.val[1]);

    luma = nc_mix_blend8(alpha, luma, under.val[0]);
    cb = nc_mix_blend8(alpha, cb, under_c.val[0]);
    cr = nc_mix_blend8(alpha, cr, under_c.val[1]);

    /* Back to 4:2:2: val[0] = u0 v0 u2 v2 ..., val[1] = u1 v1 u3 v3 ...;
     * their halving add averages the chroma of each horizontal pixel pair and
     * leaves it in U V U V order. */
    const uint8x8x2_t mixed_c = vtrn_u8(cb, cr);
    uint8x8x2_t out;

    out.val[0] = luma;
    out.val[1] = vhadd_u8(mixed_c.val[0], mixed_c.val[1]);
    vst2_u8(dst, out);
}

/**
 * @brief Alpha-blend a 4-byte-per-pixel RGB overlay onto a YUYV image in place.
 *
 * Source pixels are converted with the full-swing BT.601 integer formulas and
 * mixed into the destination as
 * ((alpha + 1) * overlay + (255 - alpha) * original) >> 8 per component.
 * Chroma is averaged over each horizontal pixel pair before it is stored.
 * Groups of eight pixels whose alpha bytes are all zero are left untouched.
 *
 * When w * h is not a multiple of 8, the last pixels are processed through
 * zero-padded scratch buffers so nothing beyond w * h pixels is accessed.
 *
 * @param src           Overlay, 4 bytes per pixel; the fourth byte is alpha.
 * @param dst           YUYV image (2 bytes per pixel), read and updated.
 * @param w             Width in pixels.
 * @param h             Height in pixels.
 * @param isRgbaNotBgra Non-zero: byte 0 is red and byte 2 is blue;
 *                      zero: byte 0 is blue and byte 2 is red.
 */
void neon_Mix_Rgbx2Yuyv(uint8_t *src, uint8_t *dst, int32_t w, int32_t h, int32_t isRgbaNotBgra)
{
    const long long pixels = (long long)h * (long long)w;

    if (pixels <= 0)
    {
        return;
    }

    const size_t total = (size_t)pixels;
    const size_t whole = total & ~(size_t)7; /* pixels in complete groups of eight */
    const size_t rest = total - whole;

    for (size_t i = 0; i < whole; i += 8)
    {
        nc_mix_rgbx8_into_yuyv(src + (i * 4U), dst + (i * 2U), isRgbaNotBgra);
    }

    if (rest != 0U)
    {
        uint8_t src_pad[8 * 4] = {0}; /* padding pixels are fully transparent */
        uint8_t dst_pad[8 * 2] = {0};

        memcpy(src_pad, src + (whole * 4U), rest * 4U);
        memcpy(dst_pad, dst + (whole * 2U), rest * 2U);
        nc_mix_rgbx8_into_yuyv(src_pad, dst_pad, isRgbaNotBgra);
        memcpy(dst + (whole * 2U), dst_pad, rest * 2U);
    }
}
/**
 * @brief Blend an RGBA overlay (byte 0 = red) onto a YUYV image.
 *
 * Forwards to neon_Mix_Rgbx2Yuyv() with the RGBA channel order selected.
 */
void neon_Mix_Rgba2Yuyv(uint8_t *src, uint8_t *dst, int32_t w, int32_t h)
{
    const int32_t src_is_rgba = 1;

    neon_Mix_Rgbx2Yuyv(src, dst, w, h, src_is_rgba);
}
/**
 * @brief Blend a BGRA overlay (byte 0 = blue) onto a YUYV image.
 *
 * Forwards to neon_Mix_Rgbx2Yuyv() with the BGRA channel order selected.
 */
void neon_Mix_Bgra2Yuyv(uint8_t *src, uint8_t *dst, int32_t w, int32_t h)
{
    const int32_t src_is_rgba = 0;

    neon_Mix_Rgbx2Yuyv(src, dst, w, h, src_is_rgba);
}
/* Geometry of the tiled buffers read by the converters below:
 * 8 elements form a cell row, 8 cell rows form a cell, and 8x8 cells form a tile. */
#define ELEM_X (8) // elements (bytes) in one cell row
#define ELEM_Y (8) // element rows in one cell
#define CELL_X (8) // cells per tile along x
#define CELL_Y (8) // cells per tile along y
#define CELL8_Y_OFFSET (64) //(ELEM_Y*CELL_Y): element rows covered by one tile
#define CELL_SIZE (8) // edge length of a cell in elements (not CELL_X*CELL_Y)
#define TILE_SIZE (64) // edge length of a tile in elements
// NOTE: the two VALID_CELL_* macros expand to an expression on a variable
// named `tinfo`, so they only compile where such a pointer is in scope.
#define VALID_CELL_X (tinfo->dim.w>>3)//(tinfo->dim.w/ELEM_X), e.g. 40/8=5
#define VALID_CELL_Y (tinfo->dim.h>>3)//(tinfo->dim.h/ELEM_Y), e.g. 24/8=3
#define CELL1_SIZE (64) //(ELEM_X*ELEM_Y): bytes in one cell
#define CELL8_SIZE (512) //(CELL1_SIZE*CELL_X): bytes in one row of cells of a tile
/* Convert one 8-byte tiled row to float, divide every lane by `divisor` and
 * write only the first `count` (1..8) results to `out`. */
static void nc_tiled_row_to_f32(const unsigned char *row, float *out, unsigned int count, int is_signed, float32x4_t divisor)
{
    float32x4_t lo;
    float32x4_t hi;

    if (is_signed) {
        const int16x8_t wide = vmovl_s8(vld1_s8((const int8_t *)row));

        lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16(wide)));
        hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(wide)));
    }
    else {
        const uint16x8_t wide = vmovl_u8(vld1_u8(row));

        lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide)));
        hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide)));
    }
    lo = vdivq_f32(lo, divisor);
    hi = vdivq_f32(hi, divisor);

    if (count >= ELEM_X) {
        vst1q_f32(out, lo);
        vst1q_f32(out + 4, hi);
    }
    else {
        /* Partial row: never touch floats that belong to the next scanline row. */
        float lanes[ELEM_X];

        vst1q_f32(lanes, lo);
        vst1q_f32(lanes + 4, hi);
        memcpy(out, lanes, count * sizeof(float));
    }
}

/**
 * @brief Convert a tiled 8-bit tensor into a linear float tensor.
 *
 * The input is consumed sequentially: channel by channel, tile by tile
 * (64x64 elements, row-major), cell by cell (8x8 elements, row-major inside
 * the tile) and 8 bytes per cell row. Every tile occupies its full padded
 * size in the input; cells, rows and columns outside dim.w x dim.h are
 * skipped. Each value is read as signed when tinfo->sign is non-zero,
 * otherwise as unsigned, and divided by 2^|tinfo->exponent|.
 *
 * The output holds dim.ch planes of dim.w * dim.h floats; only in-range
 * elements are written.
 *
 * NOTE(review): positive and negative exponents both end up as a division by
 * 2^|exponent| - confirm this is intended.
 *
 * @param tinfo    Tensor description (dim.w, dim.h, dim.ch, sign, exponent).
 * @param tiled    Tiled input data.
 * @param scanline Output buffer.
 * @return AIW_SUCCESS, or AIW_ERROR when any pointer is NULL.
 */
int nc_neon_get_data_NCHW_float(aiwTensorInfo *tinfo, unsigned char *tiled, float *scanline)
{
    unsigned int ch_offs = 0;     /* first output index of the current channel */
    unsigned int in_buf_offs = 0; /* read cursor in the tiled input */

    if(!tinfo || !tiled || !scanline)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx, 0x%lx\n", __func__, (uintptr_t)tinfo, (uintptr_t)tiled, (uintptr_t)scanline);
        return AIW_ERROR;
    }

    const unsigned int row_stride = tinfo->dim.w;
    const unsigned int num_x_tiles = (tinfo->dim.w + TILE_SIZE - 1) / TILE_SIZE;
    const unsigned int num_y_tiles = (tinfo->dim.h + TILE_SIZE - 1) / TILE_SIZE;
    const unsigned int plane_size = (tinfo->dim.w * tinfo->dim.h);   /* one channel */
    const unsigned int tile_y_size = (tinfo->dim.w * CELL8_Y_OFFSET); /* output elements per tile row */
    const unsigned int cell_y_size = (CELL1_SIZE * (tinfo->dim.w)/8); /* output elements per cell row */
    unsigned int remain_valid_cell_x = ((tinfo->dim.w % TILE_SIZE)+7)/CELL_X;
    unsigned int remain_valid_cell_y = ((tinfo->dim.h % TILE_SIZE)+7)/CELL_Y;
    unsigned int remain_valid_elem_x = (tinfo->dim.w % ELEM_X);
    unsigned int remain_valid_elem_y = (tinfo->dim.h % ELEM_Y);
    const float multiplier = tinfo->exponent > 0 ? (float)(1 << tinfo->exponent) : (float)(1 << -tinfo->exponent);
    const float32x4_t divisor = vdupq_n_f32(multiplier);
    const int is_signed = tinfo->sign ? 1 : 0;

    /* A remainder of zero means the last tile / cell is completely valid. */
    if(remain_valid_cell_x == 0) { remain_valid_cell_x = CELL_X; }
    if(remain_valid_cell_y == 0) { remain_valid_cell_y = CELL_Y; }
    if(remain_valid_elem_x == 0) { remain_valid_elem_x = ELEM_X; }
    if(remain_valid_elem_y == 0) { remain_valid_elem_y = ELEM_Y; }

    for (long ch = 0; ch < tinfo->dim.ch; ch++)
    {
        for (unsigned int tile_y = 0; tile_y < num_y_tiles; tile_y++)
        {
            /* Only the last tile in each direction can be partially valid. */
            const int last_tile_y = (tile_y == (num_y_tiles - 1));
            const unsigned int valid_cell_y = last_tile_y ? remain_valid_cell_y : CELL_Y;
            const unsigned int offs_tile_y = (tile_y * tile_y_size);

            for (unsigned int tile_x = 0; tile_x < num_x_tiles; tile_x++)
            {
                const int last_tile_x = (tile_x == (num_x_tiles - 1));
                const unsigned int valid_cell_x = last_tile_x ? remain_valid_cell_x : CELL_X;
                const unsigned int offs_tile_x = (tile_x * TILE_SIZE);

                for (unsigned int y_cell = 0; y_cell < CELL_Y; y_cell++)
                {
                    if(y_cell >= valid_cell_y) {
                        in_buf_offs += CELL8_SIZE; /* skip a whole padded row of cells */
                        continue;
                    }
                    const unsigned int valid_ele_y =
                        (last_tile_y && (y_cell == (valid_cell_y-1))) ? remain_valid_elem_y : ELEM_Y;
                    const unsigned int offs_cell_y = (y_cell*cell_y_size);

                    for (unsigned int x_cell = 0; x_cell < CELL_X; x_cell++)
                    {
                        if(x_cell >= valid_cell_x) {
                            in_buf_offs += CELL1_SIZE; /* skip one padded cell */
                            continue;
                        }
                        const unsigned int valid_ele_x =
                            (last_tile_x && (x_cell == (valid_cell_x-1))) ? remain_valid_elem_x : ELEM_X;
                        const unsigned int offs_cell_x = (x_cell*CELL_X);

                        for (unsigned int y = 0; y < ELEM_Y; y++)
                        {
                            if(y < valid_ele_y) {
                                const unsigned int w_idx = ch_offs + offs_tile_y + offs_tile_x
                                                         + offs_cell_y + offs_cell_x + (y*row_stride);

                                nc_tiled_row_to_f32(tiled + in_buf_offs, scanline + w_idx,
                                                    valid_ele_x, is_signed, divisor);
                            }
                            in_buf_offs += ELEM_X; /* 8 input bytes per cell row, valid or padding */
                        }
                    }
                }
            }
        }
        ch_offs += plane_size;
    }
    return AIW_SUCCESS;
}
#define SEG_COLOR (0x400000FFU)
/**
 * @brief Paint a tiled segmentation map onto a larger canvas.
 *
 * cnn_output is read sequentially as 64x64 tiles (row-major) made of 8x8
 * cells (row-major inside a tile) of 8 rows x 8 bytes. Every tile occupies
 * its full padded size; padded cells of the last tile row / column are
 * skipped. For every element whose byte is zero, SEG_COLOR is written to the
 * canvas at the element position scaled by canvas_w / npu_seg_out_w and
 * canvas_h / npu_seg_out_h: four pixels wide and (unsigned)v_ratio rows high.
 * Elements with a non-zero byte are not drawn, and elements of map row 0 are
 * never drawn (the `ypos > 0` test).
 *
 * NOTE(review): the run is always four pixels wide regardless of the
 * horizontal ratio - presumably tuned for a 4x upscale; confirm for other
 * model sizes.
 *
 * @return NC_SUCCESS, or NC_INVALID when canvas or cnn_output is NULL.
 */
int nc_neon_tiled_to_scanline_n_scale_up(unsigned int npu_seg_out_w, unsigned int npu_seg_out_h, unsigned int canvas_w, unsigned int canvas_h, unsigned int *canvas, unsigned char *cnn_output) {
    if(!canvas || !cnn_output)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx\n", __func__, (uintptr_t)canvas, (uintptr_t)cnn_output);
        return NC_INVALID;
    }

    const unsigned int num_x_tiles = (npu_seg_out_w+TILE_SIZE-1)/TILE_SIZE;
    const unsigned int num_y_tiles = (npu_seg_out_h+TILE_SIZE-1)/TILE_SIZE;
    /* Cells that carry data in the last tile of each direction. */
    const unsigned int valid_cell_x = (npu_seg_out_w % TILE_SIZE) ? (((npu_seg_out_w % TILE_SIZE)+7)>>3) : CELL_X;
    const unsigned int valid_cell_y = (npu_seg_out_h % TILE_SIZE) ? (((npu_seg_out_h % TILE_SIZE)+7)>>3) : CELL_Y;
    const float h_ratio = (float)canvas_w/(float)npu_seg_out_w;
    const float v_ratio = (float)canvas_h/(float)npu_seg_out_h;
    const unsigned int rows_per_elem = (unsigned int)v_ratio;
    const size_t canvas_len = (size_t)canvas_w * (size_t)canvas_h;
    const uint32x4_t color4 = vdupq_n_u32(SEG_COLOR);
    size_t outidx = 0; /* read cursor in cnn_output */

    for (unsigned int y_super = 0; y_super < num_y_tiles; y_super++) {
        const int last_tile_y = (y_super == (num_y_tiles-1));

        for (unsigned int x_super = 0; x_super < num_x_tiles; x_super++) {
            const int last_tile_x = (x_super == (num_x_tiles-1));

            for (unsigned int y_sub = 0; y_sub < CELL_Y; y_sub++) {
                if (last_tile_y && (y_sub >= valid_cell_y)) {
                    /* Skip every remaining (padded) row of cells of this tile. */
                    outidx += (size_t)CELL8_SIZE * (CELL_Y - y_sub);
                    break;
                }
                for (unsigned int x_sub = 0; x_sub < CELL_X; x_sub++) {
                    if (last_tile_x && (x_sub >= valid_cell_x)) {
                        /* Skip the remaining (padded) cells of this row of cells. */
                        outidx += (size_t)CELL1_SIZE * (CELL_X - x_sub);
                        break;
                    }
                    const unsigned int in_x_idx = (x_super * TILE_SIZE) + (x_sub * CELL_X);

                    for (unsigned int y = 0; y < ELEM_Y; y++) {
                        const unsigned char *cell_row = cnn_output + outidx;
                        const unsigned int in_y_idx = (y_super * TILE_SIZE) + (y_sub * CELL_Y + y);
                        const float ypos = (float)in_y_idx * v_ratio;

                        outidx += ELEM_X;
                        if (!(ypos > 0)) {
                            continue;
                        }
                        for (unsigned int x = 0; x < ELEM_X; x++) {
                            if (cell_row[x]) {
                                continue; /* non-zero elements are not drawn */
                            }
                            const float xpos = (float)(in_x_idx+x) * h_ratio;

                            for (unsigned int y_up = 0; y_up < rows_per_elem; y_up++) {
                                const size_t pos = (size_t)(unsigned int)((ypos + (float)y_up)* (float)canvas_w +(xpos));

                                if ((pos + 4U) <= canvas_len) {
                                    vst1q_u32(canvas + pos, color4); /* ARGB x 4 pixels */
                                }
                                else {
                                    /* Clamp the run at the end of the canvas. */
                                    for (size_t k = pos; k < canvas_len; k++) {
                                        canvas[k] = SEG_COLOR;
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return NC_SUCCESS;
}
/* Convert one 8-byte cell row to float, divide every lane by `divisor` and
 * write only the first `count` (1..8) results to `out`. */
static void nc_cellrow_row_to_f32(const unsigned char *row, float *out, unsigned int count, int is_signed, float32x4_t divisor)
{
    float32x4_t lo;
    float32x4_t hi;

    if (is_signed) {
        const int16x8_t wide = vmovl_s8(vld1_s8((const int8_t *)row));

        lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16(wide)));
        hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(wide)));
    }
    else {
        const uint16x8_t wide = vmovl_u8(vld1_u8(row));

        lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide)));
        hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide)));
    }
    lo = vdivq_f32(lo, divisor);
    hi = vdivq_f32(hi, divisor);

    if (count >= ELEM_X) {
        vst1q_f32(out, lo);
        vst1q_f32(out + 4, hi);
    }
    else {
        /* Partial row: never touch floats that belong to the next scanline row. */
        float lanes[ELEM_X];

        vst1q_f32(lanes, lo);
        vst1q_f32(lanes + 4, hi);
        memcpy(out, lanes, count * sizeof(float));
    }
}

/**
 * @brief Convert a cell-row ordered 8-bit tensor into a linear float tensor.
 *
 * The input is consumed sequentially: channel by channel, one row of 8x8
 * cells after another, 8 bytes per cell row. A row of cells is padded to a
 * multiple of 8 cells (dim.w rounded up to 64 elements); cells beyond dim.w
 * and element rows beyond dim.h are skipped. Each value is read as signed
 * when tinfo->sign is non-zero, otherwise as unsigned, and divided by
 * 2^|tinfo->exponent|.
 *
 * The output holds dim.ch planes of dim.w * dim.h floats; only in-range
 * elements are written.
 *
 * NOTE(review): positive and negative exponents both end up as a division by
 * 2^|exponent| - confirm this is intended.
 *
 * @param tinfo    Tensor description (dim.w, dim.h, dim.ch, sign, exponent).
 * @param tiled    Input data in cell-row order.
 * @param scanline Output buffer.
 * @return AIW_SUCCESS, or AIW_ERROR when any pointer is NULL.
 */
int nc_neon_get_data_NCHW_float_cellrow(aiwTensorInfo *tinfo, unsigned char *tiled, float *scanline)
{
    unsigned int ch_offs = 0;     /* first output index of the current channel */
    unsigned int in_buf_offs = 0; /* read cursor in the input */

    if(!tinfo || !tiled || !scanline)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx, 0x%lx\n", __func__, (uintptr_t)tinfo, (uintptr_t)tiled, (uintptr_t)scanline);
        return AIW_ERROR;
    }

    const unsigned int row_stride = tinfo->dim.w;
    const unsigned int num_x_cells = (tinfo->dim.w + TILE_SIZE - 1) / TILE_SIZE * (TILE_SIZE / CELL_SIZE);
    const unsigned int num_y_cells = (tinfo->dim.h + CELL_SIZE - 1) / CELL_SIZE;
    const unsigned int plane_size = (tinfo->dim.w * tinfo->dim.h);   /* one channel */
    const unsigned int cell_y_size = (CELL1_SIZE * (tinfo->dim.w)/8); /* output elements per cell row */
    const unsigned int valid_cells_x = (tinfo->dim.w + 7) / CELL_X;   /* cells that carry data */
    unsigned int remain_valid_elem_x = (tinfo->dim.w % ELEM_X);
    unsigned int remain_valid_elem_y = (tinfo->dim.h % ELEM_Y);
    const float multiplier = tinfo->exponent > 0 ? (float)(1 << tinfo->exponent) : (float)(1 << -tinfo->exponent);
    const float32x4_t divisor = vdupq_n_f32(multiplier);
    const int is_signed = tinfo->sign ? 1 : 0;

    /* A remainder of zero means the last cell is completely valid. */
    if(remain_valid_elem_x == 0) { remain_valid_elem_x = ELEM_X; }
    if(remain_valid_elem_y == 0) { remain_valid_elem_y = ELEM_Y; }

    for (long ch = 0; ch < tinfo->dim.ch; ch++)
    {
        for (unsigned int y_cell = 0; y_cell < num_y_cells; y_cell++)
        {
            const unsigned int valid_ele_y = (y_cell == (num_y_cells-1)) ? remain_valid_elem_y : ELEM_Y;
            const unsigned int offs_cell_y = (y_cell*cell_y_size);

            for (unsigned int x_cell = 0; x_cell < num_x_cells; x_cell++)
            {
                if(x_cell >= valid_cells_x) {
                    in_buf_offs += CELL1_SIZE; /* skip one padded cell */
                    continue;
                }
                const unsigned int valid_ele_x = (x_cell == (valid_cells_x-1)) ? remain_valid_elem_x : ELEM_X;
                const unsigned int offs_cell_x = (x_cell*CELL_X);

                for (unsigned int y = 0; y < ELEM_Y; y++)
                {
                    if(y < valid_ele_y) {
                        const unsigned int w_idx = ch_offs + offs_cell_y + offs_cell_x + (y*row_stride);

                        nc_cellrow_row_to_f32(tiled + in_buf_offs, scanline + w_idx,
                                              valid_ele_x, is_signed, divisor);
                    }
                    in_buf_offs += ELEM_X; /* 8 input bytes per cell row, valid or padding */
                }
            }
        }
        ch_offs += plane_size;
    }
    return AIW_SUCCESS;
}
/**
 * @brief Convert a cell-row ordered 8-bit tensor into a linear 8-bit tensor.
 *
 * The input is consumed sequentially: channel by channel, one row of 8x8
 * cells after another, 8 bytes per cell row. A row of cells is padded to a
 * multiple of 8 cells (dim.w rounded up to 64 elements); cells beyond dim.w
 * and element rows beyond dim.h are skipped.
 *
 * The output is addressed as bytes (one byte per element, dim.ch planes of
 * dim.w * dim.h bytes) even though the parameter is declared as int64_t *.
 * Exactly the valid bytes of every cell row are copied, so a partial last
 * cell never overwrites the beginning of the following output row.
 *
 * @param tinfo    Tensor description (dim.w, dim.h, dim.ch).
 * @param tiled    Input data in cell-row order.
 * @param scanline Output buffer, written byte-wise.
 * @return AIW_SUCCESS, or AIW_ERROR when any pointer is NULL.
 */
int nc_neon_get_data_NCHW_uint8_cellrow(aiwTensorInfo *tinfo, unsigned char *tiled, int64_t *scanline)
{
    unsigned int ch_offs = 0;     /* first output index of the current channel */
    unsigned int in_buf_offs = 0; /* read cursor in the input */

    if(!tinfo || !tiled || !scanline)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx, 0x%lx\n", __func__, (uintptr_t)tinfo, (uintptr_t)tiled, (uintptr_t)scanline);
        return AIW_ERROR;
    }

    int8_t *out_bytes = (int8_t*)scanline;
    const unsigned int row_stride = tinfo->dim.w;
    const unsigned int num_x_cells = (tinfo->dim.w + TILE_SIZE - 1) / TILE_SIZE * (TILE_SIZE / CELL_SIZE);
    const unsigned int num_y_cells = (tinfo->dim.h + CELL_SIZE - 1) / CELL_SIZE;
    const unsigned int plane_size = (tinfo->dim.w * tinfo->dim.h);   /* one channel */
    const unsigned int cell_y_size = (CELL1_SIZE * (tinfo->dim.w)/8); /* output elements per cell row */
    const unsigned int valid_cells_x = (tinfo->dim.w + 7) / CELL_X;   /* cells that carry data */
    unsigned int remain_valid_elem_x = (tinfo->dim.w % ELEM_X);
    unsigned int remain_valid_elem_y = (tinfo->dim.h % ELEM_Y);

    /* A remainder of zero means the last cell is completely valid. */
    if(remain_valid_elem_x == 0) { remain_valid_elem_x = ELEM_X; }
    if(remain_valid_elem_y == 0) { remain_valid_elem_y = ELEM_Y; }

    for (long ch = 0; ch < tinfo->dim.ch; ch++)
    {
        for (unsigned int y_cell = 0; y_cell < num_y_cells; y_cell++)
        {
            const unsigned int valid_ele_y = (y_cell == (num_y_cells-1)) ? remain_valid_elem_y : ELEM_Y;
            const unsigned int offs_cell_y = (y_cell*cell_y_size);

            for (unsigned int x_cell = 0; x_cell < num_x_cells; x_cell++)
            {
                if(x_cell >= valid_cells_x) {
                    in_buf_offs += CELL1_SIZE; /* skip one padded cell */
                    continue;
                }
                const unsigned int valid_ele_x = (x_cell == (valid_cells_x-1)) ? remain_valid_elem_x : ELEM_X;
                const unsigned int offs_cell_x = (x_cell*CELL_X);

                for (unsigned int y = 0; y < ELEM_Y; y++)
                {
                    if(y < valid_ele_y) {
                        const unsigned int w_idx = ch_offs + offs_cell_y + offs_cell_x + (y*row_stride);

                        memcpy(out_bytes + w_idx, tiled + in_buf_offs, valid_ele_x);
                    }
                    in_buf_offs += ELEM_X; /* 8 input bytes per cell row, valid or padding */
                }
            }
        }
        ch_offs += plane_size;
    }
    return AIW_SUCCESS;
}
/**
 * @brief Interleave three 8-bit planes into packed R G B triplets.
 *
 * Whole groups of 8 pixels are interleaved with NEON; the remaining
 * (width * height) % 8 pixels are copied one by one, so exactly
 * width * height pixels are read and 3 * width * height bytes are written.
 *
 * @param R, G, B     Source planes of width * height bytes each.
 * @param interleaved Destination, 3 bytes per pixel.
 * @param width       Image width in pixels.
 * @param height      Image height in pixels.
 */
void nc_rgb_planar_to_interleaved_neon(uint8_t* R, uint8_t* G, uint8_t* B, uint8_t* interleaved, int width, int height)
{
    const long long pixels = (long long)width * (long long)height;

    if (pixels <= 0) {
        return;
    }

    const size_t count = (size_t)pixels;
    const size_t whole = count & ~(size_t)7; /* pixels in complete groups of eight */
    size_t i = 0;

    for (; i < whole; i += 8) {
        uint8x8x3_t v;

        v.val[0] = vld1_u8(&R[i]);
        v.val[1] = vld1_u8(&G[i]);
        v.val[2] = vld1_u8(&B[i]);
        vst3_u8(&interleaved[3 * i], v); /* stores R0 G0 B0 R1 G1 B1 ... */
    }

    for (; i < count; i++) {
        interleaved[(3 * i) + 0] = R[i];
        interleaved[(3 * i) + 1] = G[i];
        interleaved[(3 * i) + 2] = B[i];
    }
}
/**
 * @brief Split packed R G B triplets into three 8-bit planes.
 *
 * Whole groups of 8 pixels are de-interleaved with NEON; the remaining
 * (w * h) % 8 pixels are copied one by one, so exactly 3 * w * h bytes are
 * read and w * h bytes are written to each plane.
 *
 * @param interleaved Source, 3 bytes per pixel.
 * @param R, G, B     Destination planes of w * h bytes each.
 * @param w           Image width in pixels.
 * @param h           Image height in pixels.
 */
void nc_rgb_interleaved_to_planar_neon(unsigned char* interleaved, unsigned char* R, unsigned char* G, unsigned char* B, int w, int h)
{
    const long long pixels = (long long)w * (long long)h;

    if (pixels <= 0) {
        return;
    }

    const size_t count = (size_t)pixels;
    const size_t whole = count & ~(size_t)7; /* pixels in complete groups of eight */
    size_t i = 0;

    for (; i < whole; i += 8) {
        const uint8x8x3_t rgb = vld3_u8(interleaved + (i * 3));

        vst1_u8(&R[i], rgb.val[0]);
        vst1_u8(&G[i], rgb.val[1]);
        vst1_u8(&B[i], rgb.val[2]);
    }

    for (; i < count; i++) {
        R[i] = interleaved[(3 * i) + 0];
        G[i] = interleaved[(3 * i) + 1];
        B[i] = interleaved[(3 * i) + 2];
    }
}