You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1802 lines
67 KiB
1802 lines
67 KiB
/**
|
|
********************************************************************************
|
|
* Copyright (C) 2021 NEXTCHIP Inc. All rights reserved.
|
|
* This software is the confidential and proprietary information of
|
|
* NEXTCHIP, Inc. ("Confidential Information"). You shall not disclose such
|
|
* Confidential Information and shall use it only in accordance with
|
|
* the terms of the license agreement you entered into with NEXTCHIP.
|
|
********************************************************************************
|
|
*/
|
|
/**
|
|
********************************************************************************
|
|
* @file : nc_neon.c
|
|
*
|
|
* @brief : frame_mixel api c code
|
|
*
|
|
* @author : SW Solution team. NextChip Inc.
|
|
*
|
|
* @date : 2022.09.02.
|
|
*
|
|
* @version : 1.0.0
|
|
********************************************************************************
|
|
* @note
|
|
* 09.02.2022 / 1.0.0 / Initial released.
|
|
*
|
|
********************************************************************************
|
|
*/
|
|
|
|
|
|
/*
|
|
********************************************************************************
|
|
* INCLUDES
|
|
********************************************************************************
|
|
*/
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <arm_neon.h>
|
|
|
|
#include "nc_neon.h"
|
|
#include "nc_types.h"
|
|
|
|
/*
|
|
********************************************************************************
|
|
* DEFINES
|
|
********************************************************************************
|
|
*/
|
|
/* Number of fractional bits of the fixed-point interpolation coefficients. */
#define INTER_RESIZE_COEF_BITS (11)
|
|
/* Fixed-point representation of 1.0 (coefficients are stored as weight * this value). */
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
|
|
|
|
|
|
/* Capacity of the fixed-size per-tap arrays (coefficient scratch, row pointers, row cache). */
#define MAX_ESIZE (16)
|
|
/* Smaller of a and b. NOTE: each argument is evaluated twice, so avoid side effects. */
#define MIN(a, b) ((a) > (b) ? (b) : (a))
|
|
/* Larger of a and b. NOTE: each argument is evaluated twice, so avoid side effects. */
#define MAX(a, b) ((a) < (b) ? (b) : (a))
|
|
|
|
/* Number of possible partial tail lengths (1..7 bytes) of an 8-byte output vector. */
#define VRESIZE_LINEAR_MASK_TABLE_SIZE (7)
|
|
/* Total right shift after the horizontal and vertical passes (both scaled by 2^11). */
#define BITS (INTER_RESIZE_COEF_BITS * 2)
|
|
/* Rounding offset (one half in the 2*11-bit fixed-point domain) added before shifting by BITS. */
#define DELTA (1 << ((INTER_RESIZE_COEF_BITS * 2) - 1))
|
|
|
|
/*
|
|
********************************************************************************
|
|
* FUNCTION DEFINITIONS
|
|
********************************************************************************
|
|
*/
|
|
/**
 * Round sz up to the next multiple of n.
 *
 * The result is only a multiple of n when n is a power of two, because the
 * rounding is done by masking off the low bits.
 *
 * @param sz  size to round up
 * @param n   alignment (power of two)
 * @return    rounded size
 */
static inline uint32_t neon_align_size(int32_t sz, int32_t n)
{
    const int32_t low_bits_cleared = -n;   /* two's-complement mask: ...111000 */
    const int32_t bumped = sz + n - 1;

    return bumped & low_bits_cleared;
}
|
|
/**
 * Largest integer not greater than a (mathematical floor).
 *
 * The cast truncates toward zero, so the result is lowered by one only when
 * truncation actually rounded up (negative values with a fractional part).
 * Negative whole numbers such as -1.0f are returned unchanged.
 *
 * @param a  value to round down; must fit in int32_t
 * @return   floor(a) as int32_t
 */
static inline int32_t neon_floor(float a)
{
    const int32_t truncated = (int32_t)a;

    return truncated - (int32_t)((float)truncated > a);
}
|
|
/**
 * Clamp x into the half-open range [a, b).
 *
 * @param x  value to clamp
 * @param a  inclusive lower bound
 * @param b  exclusive upper bound
 * @return   a when x < a, b - 1 when x >= b, otherwise x
 */
static inline int32_t neon_clip(int32_t x, int32_t a, int32_t b)
{
    if (x < a)
        return a;

    if (x >= b)
        return b - 1;

    return x;
}
|
|
static inline uint8_t neon_cast_op(int32_t val)
|
|
{
|
|
int32_t bits = INTER_RESIZE_COEF_BITS * 2;
|
|
int32_t lvShift = bits;
|
|
int32_t lvDelta = 1 << (bits - 1);
|
|
int32_t temp = MIN (255, MAX (0, (val + lvDelta) >> lvShift));
|
|
return (uint8_t) (temp);
|
|
}
|
|
|
|
static void img_hresize_linear_c(const uint8_t **src,
|
|
int32_t **dst,
|
|
int32_t count,
|
|
const int32_t *xofs,
|
|
const int16_t *alpha,
|
|
int32_t dwidth,
|
|
int32_t cn,
|
|
int32_t xmax)
|
|
{
|
|
int32_t dx, k;
|
|
int32_t dx0 = 0;
|
|
|
|
if(count == 2)
|
|
{
|
|
k = 0;
|
|
const uint8_t *S0 = src[k], *S1 = src[k + 1];
|
|
int32_t *D0 = dst[k], *D1 = dst[k + 1];
|
|
|
|
for(dx = dx0; dx < xmax; dx++)
|
|
{
|
|
int32_t sx = xofs[dx];
|
|
int32_t a0 = alpha[dx * 2], a1 = alpha[dx * 2 + 1];
|
|
int32_t t0 = S0[sx] * a0;
|
|
int32_t t1 = S1[sx] * a0;
|
|
|
|
t0 += S0[sx + cn] * a1;
|
|
t1 += S1[sx + cn] * a1;
|
|
D0[dx] = t0;
|
|
D1[dx] = t1;
|
|
}
|
|
|
|
for(; dx < dwidth; dx++)
|
|
{
|
|
int32_t sx = xofs[dx];
|
|
D0[dx] = (int32_t) S0[sx] * INTER_RESIZE_COEF_SCALE;
|
|
D1[dx] = (int32_t) S1[sx] * INTER_RESIZE_COEF_SCALE;
|
|
}
|
|
}
|
|
|
|
if(count == 1)
|
|
{
|
|
k = 0;
|
|
const uint8_t *S = src[k];
|
|
int32_t *D = dst[k];
|
|
for(dx = 0; dx < xmax; dx++)
|
|
{
|
|
int32_t sx = xofs[dx];
|
|
|
|
D[dx] = S[sx] * alpha[dx * 2] + S[sx + cn] * alpha[dx * 2 + 1];
|
|
}
|
|
|
|
for(; dx < dwidth; dx++)
|
|
D[dx] = (int32_t) S[xofs[dx]] * INTER_RESIZE_COEF_SCALE;
|
|
}
|
|
}
|
|
|
|
/**
 * Scalar vertical linear-interpolation pass.
 *
 * Blends two intermediate rows with the weights beta[0] / beta[1] and converts
 * each sum to 8 bits with neon_cast_op(). Only the first width / 2 elements
 * are produced.
 *
 * @param src    two intermediate rows (src[0], src[1])
 * @param dst    destination bytes
 * @param beta   two fixed-point vertical weights
 * @param width  element count; width / 2 outputs are written
 */
void nc_img_vresize_linear_c(const int32_t **src, uint8_t *dst, const int16_t *beta, int32_t width)
{
    const int32_t *upper = src[0];
    const int32_t *lower = src[1];
    const int32_t w_upper = beta[0];
    const int32_t w_lower = beta[1];
    const int32_t out_count = width / 2;

    for (int32_t i = 0; i < out_count; i++)
    {
        dst[i] = neon_cast_op(upper[i] * w_upper + lower[i] * w_lower);
    }
}
|
|
|
|
static void img_resize_cal_offset_linear(int32_t *xofs,
|
|
int16_t *ialpha,
|
|
int32_t *yofs,
|
|
int16_t *ibeta,
|
|
int32_t *xmin,
|
|
int32_t *xmax,
|
|
int32_t ksize,
|
|
int32_t ksize2,
|
|
int32_t srcw,
|
|
int32_t srch,
|
|
int32_t dstw,
|
|
int32_t dsth,
|
|
int32_t channels)
|
|
{
|
|
float inv_scale_x = (float) dstw / (float)srcw;
|
|
float inv_scale_y = (float) dsth / (float)srch;
|
|
|
|
int32_t cn = channels;
|
|
float scale_x = (float)(1. / inv_scale_x);
|
|
float scale_y = (float)(1. / inv_scale_y);
|
|
int32_t k, sx, sy, dx, dy;
|
|
|
|
|
|
float fx, fy;
|
|
|
|
float cbuf[MAX_ESIZE];
|
|
|
|
|
|
// horizontal
|
|
for(dx = 0; dx < dstw; dx++)
|
|
{
|
|
fx = (float) (((dx + 0.5) * scale_x) - 0.5);
|
|
sx = neon_floor(fx);
|
|
fx -= (float)sx;
|
|
|
|
if(sx < (ksize2 - 1))
|
|
{
|
|
*xmin = dx + 1;
|
|
if(sx < 0)
|
|
fx = 0, sx = 0;
|
|
}
|
|
|
|
if((sx + ksize2) >= srcw)
|
|
{
|
|
*xmax = MIN (*xmax, dx);
|
|
if(sx >= (srcw - 1))
|
|
fx = 0, sx = srcw - 1;
|
|
}
|
|
|
|
for(k = 0, sx *= cn; k < cn; k++)
|
|
xofs[(dx * cn) + k] = sx + k;
|
|
|
|
cbuf[0] = 1.f - fx;
|
|
cbuf[1] = fx;
|
|
|
|
|
|
for(k = 0; k < ksize; k++)
|
|
ialpha[((dx * cn) * ksize) + k] = (int16_t) (cbuf[k] * INTER_RESIZE_COEF_SCALE);
|
|
|
|
for(; k < cn * ksize; k++)
|
|
ialpha[((dx * cn) * ksize) + k] = ialpha[((dx * cn) * ksize) + k - ksize];
|
|
}
|
|
|
|
// vertical
|
|
for(dy = 0; dy < dsth; dy++)
|
|
{
|
|
fy = (float) ((dy + 0.5) * scale_y - 0.5);
|
|
sy = neon_floor(fy);
|
|
fy -= (float)sy;
|
|
|
|
yofs[dy] = sy;
|
|
|
|
cbuf[0] = 1.f - fy;
|
|
cbuf[1] = fy;
|
|
|
|
for(k = 0; k < ksize; k++)
|
|
ibeta[(dy * ksize) + k] = (int16_t) (cbuf[k] * INTER_RESIZE_COEF_SCALE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Blend one 4-channel pixel with its right neighbour: s[0..3] * w0 + s[4..7] * w1. */
static inline int32x4_t hresize_4ch_blend(const uint8_t *s, int16x4x2_t w)
{
    const int16x8_t px = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

    return vmlal_s16(vmull_s16(vget_low_s16(px), w.val[0]), vget_high_s16(px), w.val[1]);
}

/* Scale one 4-channel pixel by a constant weight: s[0..3] * scale. */
static inline int32x4_t hresize_4ch_scale(const uint8_t *s, int16x4_t scale)
{
    const int16x8_t px = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));

    return vmull_s16(vget_low_s16(px), scale);
}

/**
 * NEON horizontal linear-interpolation pass for rows of 4-channel bytes.
 *
 * For each destination pixel (4 consecutive elements starting at dx):
 *   D[dx + c] = S[xofs[dx] + c]     * alpha[2 * (dx + c)]
 *             + S[xofs[dx] + 4 + c] * alpha[2 * (dx + c) + 1]
 * Elements after the main loop and below dwidth are only scaled by
 * INTER_RESIZE_COEF_SCALE.
 *
 * Every pixel uses its own weight pair, matching img_hresize_linear_c
 * (previously pixels 2-4 of each 16-element group reused the weights of
 * pixel 1), and every row is processed exactly once (previously the second
 * row of a pair was computed twice).
 *
 * NOTE(review): the main loop advances 16 elements at a time and every pixel
 * is fetched with an 8-byte load, so xofs/alpha/source rows may be read
 * slightly past xmax / the last pixel, and D is written up to the next
 * multiple of 16 elements. Callers must size their buffers accordingly.
 *
 * @param src     source row pointers (count entries)
 * @param dst     destination row pointers (count entries)
 * @param count   number of rows to process
 * @param xofs    source element offset for every destination element
 * @param alpha   pairs of fixed-point weights, two per destination element
 * @param dwidth  number of destination elements per row
 * @param xmax    first destination element that must not use a neighbour
 */
static void img_hresize_4channels_linear_neon(const uint8_t **src, int32_t **dst, int32_t count,
                                              const int32_t *xofs, const int16_t *alpha,
                                              int32_t dwidth, int32_t xmax)
{
    const int16x4_t unit = vdup_n_s16(INTER_RESIZE_COEF_SCALE);
    int32_t dx;
    int32_t k;

    if (count <= 0)
    {
        return;
    }

    /* Interpolated part: four pixels (16 elements) per iteration. */
    for (dx = 0; dx < xmax; dx += 16)
    {
        for (int32_t px = dx; px < dx + 16; px += 4)
        {
            const int32_t sx = xofs[px];
            /* De-interleave this pixel's weights: val[0] = left, val[1] = right. */
            const int16x4x2_t w = vld2_s16(&alpha[px * 2]);

            for (k = 0; k < count; k++)
            {
                vst1q_s32(&dst[k][px], hresize_4ch_blend(&src[k][sx], w));
            }
        }
    }

    /* Right border: weight is exactly 1.0, one pixel per iteration. */
    for (; dx < dwidth; dx += 4)
    {
        const int32_t sx = xofs[dx];

        for (k = 0; k < count; k++)
        {
            vst1q_s32(&dst[k][dx], hresize_4ch_scale(&src[k][sx], unit));
        }
    }
}
|
|
|
|
/*
 * Blend 8 elements of two intermediate rows: (s0 * w[0] + s1 * w[1] + DELTA) >> BITS,
 * saturated to 0..255 and narrowed to bytes.
 */
static inline uint8x8_t vresize_blend8(const int32_t *s0, const int32_t *s1, int32x2_t w)
{
    const int32x4_t half = vdupq_n_s32(DELTA);
    const int32x4_t lo_limit = vdupq_n_s32(0);
    const int32x4_t hi_limit = vdupq_n_s32(255);
    int32x4_t acc_lo = vmulq_lane_s32(vld1q_s32(s0), w, 0);
    int32x4_t acc_hi = vmulq_lane_s32(vld1q_s32(s0 + 4), w, 0);

    acc_lo = vmlaq_lane_s32(acc_lo, vld1q_s32(s1), w, 1);
    acc_hi = vmlaq_lane_s32(acc_hi, vld1q_s32(s1 + 4), w, 1);

    acc_lo = vshrq_n_s32(vaddq_s32(acc_lo, half), BITS);
    acc_hi = vshrq_n_s32(vaddq_s32(acc_hi, half), BITS);

    acc_lo = vminq_s32(vmaxq_s32(acc_lo, lo_limit), hi_limit);
    acc_hi = vminq_s32(vmaxq_s32(acc_hi, lo_limit), hi_limit);

    /* Values are already within 0..255, so plain narrowing is lossless. */
    return vmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(acc_lo), vmovn_s32(acc_hi))));
}

/**
 * NEON vertical linear-interpolation pass.
 *
 * Blends the two intermediate rows src[0] / src[1] with beta[0] / beta[1] and
 * writes width / 2 bytes to dst.
 *
 * A partial tail (fewer than 8 bytes left) is computed into a stack temporary
 * and only the remaining bytes are copied, so dst is never read or written
 * past width / 2 bytes. The source rows are still read 8 elements at a time,
 * so they must be readable up to the next multiple of 8 elements.
 *
 * @param src    two intermediate rows
 * @param dst    destination bytes
 * @param beta   two fixed-point vertical weights
 * @param width  element count; width / 2 outputs are written
 */
static void img_vresize_linear_neon(const int32_t **src, uint8_t *dst, const int16_t *beta, int32_t width)
{
    const int32_t *S0 = src[0];
    const int32_t *S1 = src[1];
    const int32_t weights[2] = { beta[0], beta[1] };
    const int32x2_t dBeta = vld1_s32(weights);
    const int32_t out_count = width / 2;
    int32_t x = 0;

    for (; x <= (out_count - 16); x += 16)
    {
        vst1_u8(&dst[x], vresize_blend8(&S0[x], &S1[x], dBeta));
        vst1_u8(&dst[x + 8], vresize_blend8(&S0[x + 8], &S1[x + 8], dBeta));
    }

    for (; x <= (out_count - 8); x += 8)
    {
        vst1_u8(&dst[x], vresize_blend8(&S0[x], &S1[x], dBeta));
    }

    if (x < out_count)
    {
        uint8_t tail[8];

        vst1_u8(tail, vresize_blend8(&S0[x], &S1[x], dBeta));
        memcpy(&dst[x], tail, (size_t)(out_count - x));
    }
}
|
|
|
|
static void img_resize_generic_linear_neon(uint8_t *src,
|
|
uint8_t *dst,
|
|
const int32_t *xofs,
|
|
const int16_t *_alpha,
|
|
const int32_t *yofs,
|
|
const int16_t *_beta,
|
|
int32_t xmin,
|
|
int32_t xmax,
|
|
int32_t ksize,
|
|
int32_t srcw,
|
|
int32_t srch,
|
|
int32_t srcstep,
|
|
int32_t dstw,
|
|
int32_t dsth,
|
|
int32_t dststep,
|
|
int32_t channels)
|
|
{
|
|
const int16_t *alpha = _alpha;
|
|
const int16_t *beta = _beta;
|
|
int32_t cn = channels;
|
|
srcw *= cn;
|
|
dstw *= cn;
|
|
|
|
int32_t bufstep = (int32_t)neon_align_size(dstw, 16);
|
|
int32_t *buffer_ = (int32_t*)malloc(bufstep * ksize * sizeof(int32_t));
|
|
|
|
const uint8_t *srows[MAX_ESIZE];
|
|
int32_t *rows[MAX_ESIZE];
|
|
int32_t prev_sy[MAX_ESIZE];
|
|
int32_t k, dy;
|
|
|
|
xmin *= cn;
|
|
xmax *= cn;
|
|
|
|
for(k = 0; k < ksize; k++)
|
|
{
|
|
prev_sy[k] = -1;
|
|
rows[k] = (int32_t*)(buffer_ + (bufstep * k));
|
|
}
|
|
|
|
for(dy = 0; dy < dsth; dy++, beta += ksize)
|
|
{
|
|
int32_t sy0 = yofs[dy], k, k0 = ksize, k1 = 0, ksize2 = ksize / 2;
|
|
|
|
for(k = 0; k < ksize; k++)
|
|
{
|
|
int32_t sy = neon_clip(sy0 - ksize2 + 1 + k, 0, srch);
|
|
for(k1 = MAX (k1, k); k1 < ksize; k1++)
|
|
{
|
|
if(sy == prev_sy[k1])
|
|
{
|
|
if(k1 > k)
|
|
memcpy(rows[k], rows[k1], bufstep * sizeof (rows[0][0]));
|
|
break;
|
|
}
|
|
}
|
|
if(k1 == ksize)
|
|
k0 = MIN (k0, k);
|
|
srows[k] = (const uint8_t*) (src + (srcstep * sy));
|
|
prev_sy[k] = sy;
|
|
}
|
|
|
|
if(k0 < ksize)
|
|
{
|
|
if(cn == 4)
|
|
{
|
|
img_hresize_4channels_linear_neon(srows + k0, rows + k0, ksize - k0, xofs, alpha,
|
|
dstw, xmax);
|
|
}
|
|
else
|
|
{
|
|
img_hresize_linear_c(srows + k0, rows + k0, ksize - k0, xofs, alpha,
|
|
dstw, cn, xmax);
|
|
}
|
|
}
|
|
|
|
img_vresize_linear_neon((const int32_t**)rows, (uint8_t*)(dst + (dststep * dy)), beta, dstw);
|
|
}
|
|
if(buffer_)
|
|
free(buffer_);
|
|
}
|
|
|
|
/**
 * Bilinear resize of a packed YUYV image.
 *
 * The image is processed as 4-element pixels (cn = 4) and the offset/weight
 * tables are allocated in one temporary block that is released before
 * returning.
 *
 * @param src         source image
 * @param src_stride  source row pitch in bytes
 * @param src_width   source width in pixels
 * @param src_height  source height in rows
 * @param dst         destination image
 * @param dst_stride  destination row pitch in bytes
 * @param dst_width   destination width in pixels
 * @param dst_height  destination height in rows
 * @return 0 on success (including an empty destination), -1 when a pointer is
 *         NULL, a source dimension is zero, or the tables cannot be allocated
 */
int32_t nc_resize_image_bilinear_yuyv(uint8_t *src, uint32_t src_stride, uint32_t src_width, uint32_t src_height,
                                      uint8_t *dst, uint32_t dst_stride, uint32_t dst_width, uint32_t dst_height)
{
    const int32_t cn = 4;
    const int32_t ksize = 2;
    const int32_t ksize2 = ksize / 2;
    const int32_t width = (int32_t)dst_width * cn;
    int32_t xmin = 0;
    int32_t xmax = (int32_t)dst_width;
    size_t entries;
    uint8_t *buffer_;
    int32_t *xofs;
    int32_t *yofs;
    int16_t *ialpha;
    int16_t *ibeta;

    if ((src == NULL) || (dst == NULL) || (src_width == 0) || (src_height == 0))
    {
        return -1;
    }

    if ((dst_width == 0) || (dst_height == 0))
    {
        return 0;   /* nothing to produce */
    }

    /* One block holding xofs[width], yofs[dst_height], ialpha[width * ksize], ibeta[dst_height * ksize]. */
    entries = (size_t)width + (size_t)dst_height;
    buffer_ = malloc(entries * (sizeof(int32_t) + (sizeof(float) * (size_t)ksize)));
    if (buffer_ == NULL)
    {
        return -1;
    }

    xofs = (int32_t *)buffer_;
    yofs = xofs + width;
    ialpha = (int16_t *)(yofs + dst_height);
    ibeta = ialpha + width * ksize;

    img_resize_cal_offset_linear(xofs, ialpha, yofs, ibeta, &xmin, &xmax, ksize, ksize2,
                                 src_width, src_height, dst_width, dst_height, cn);

    img_resize_generic_linear_neon(src, dst, xofs, ialpha, yofs, ibeta, xmin, xmax, ksize,
                                   src_width, src_height, src_stride,
                                   dst_width, dst_height, dst_stride, cn);

    free(buffer_);

    return 0;
}
|
|
|
|
/* example */
|
|
int32_t nc_pip_image_bilinear_yuyv(uint8_t *src, int32_t src_w, int32_t src_h, int32_t trg_w, int32_t trg_h,
|
|
uint8_t *pip_plane, int32_t pip_w, int32_t pip_h,
|
|
int32_t x_ofs, int32_t y_ofs)
|
|
{
|
|
int32_t ret = OK;
|
|
|
|
// source info
|
|
int32_t pip_stride = pip_w * 2;
|
|
int32_t src_stride = src_w * 2;
|
|
|
|
// destination info
|
|
int32_t dst_width = trg_w;
|
|
int32_t dst_height = trg_h;
|
|
|
|
// image start offset
|
|
int32_t dst_start_xpos = x_ofs;
|
|
int32_t dst_start_ypos = y_ofs;
|
|
|
|
// over range prevent code
|
|
if((dst_start_xpos + dst_width) > pip_w)
|
|
return ERR;
|
|
|
|
if((dst_start_ypos + dst_height) > pip_h)
|
|
return ERR;
|
|
|
|
nc_resize_image_bilinear_yuyv(src, src_stride, src_w, src_h,
|
|
(pip_plane + ((dst_start_ypos * pip_stride) + (dst_start_xpos * 2))), pip_stride, dst_width, dst_height);
|
|
|
|
return ret;
|
|
}
|
|
|
|
void nc_ScaleYUY2RowDown2Box_NEON(uint8_t *src_ptr, int32_t src_stride,
|
|
uint8_t *dst, int32_t dst_width)
|
|
{
|
|
uint8_t *src_ptr_nextline = src_ptr + src_stride;
|
|
|
|
asm volatile (
|
|
// src_yuy2
|
|
// y0 u01 y1 v01 y2 u23 y3 v23 y4 u45 y5 v45 y6 u67 y7 v67 : 8 pixels (16 bytes)
|
|
// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
|
"1: \n" // 1 loop - src : 64bytes(32pixels) | dst : 32bytes(16pixels)
|
|
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src_yuy2]], #64 \n" // v0,v2 = y | v1 = u | v3 = v
|
|
"ld4 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[src_yuy2_nextline]], #64 \n"
|
|
|
|
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
|
|
"uaddlp v3.8h, v3.16b \n" // V 16 bytes -> 8 shorts.
|
|
"uadalp v1.8h, v17.16b \n" // U 16 bytes -> 8 shorts.
|
|
"uadalp v3.8h, v19.16b \n" // V 16 bytes -> 8 shorts.
|
|
"rshrn v1.8b, v1.8h, #2 \n" // round and pack
|
|
"rshrn v3.8b, v3.8h, #2 \n" // round and pack
|
|
|
|
// v0 <- y2 y0
|
|
// v1 <- u23 u01
|
|
// v2 <- y3 y1
|
|
// v3 <- u34 v01
|
|
"zip1 v4.16b, v0.16b, v2.16b \n"
|
|
"zip2 v5.16b, v0.16b, v2.16b \n"
|
|
"zip1 v6.16b, v16.16b, v18.16b \n"
|
|
"zip2 v7.16b, v16.16b, v18.16b \n"
|
|
// v0 <- y3 y2 y1 y0
|
|
|
|
"uaddlp v4.8h, v4.16b \n" // row 1 add adjacent
|
|
"uaddlp v5.8h, v5.16b \n"
|
|
"uadalp v4.8h, v6.16b \n" // += row 2 add adjacent
|
|
"uadalp v5.8h, v7.16b \n"
|
|
"rshrn v4.8b, v4.8h, #2 \n" // round and pack
|
|
"rshrn2 v4.16b, v5.8h, #2 \n"
|
|
|
|
"uzp1 v6.16b, v4.16b, v5.16b \n"
|
|
"uzp2 v7.16b, v4.16b, v5.16b \n"
|
|
|
|
// store : v4 v1 v5 v3 order is YUY2
|
|
"dup v8.2D, v6.D[0] \n"
|
|
"dup v9.2D, v1.D[0] \n"
|
|
"dup v10.2D, v7.D[0] \n"
|
|
"dup v11.2D, v3.D[0] \n"
|
|
"st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[dst_yuy2]], #32 \n"
|
|
|
|
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n"
|
|
"b.gt 1b \n"
|
|
: [src_yuy2] "+r"(src_ptr)
|
|
,[src_yuy2_nextline] "+r"(src_ptr_nextline)
|
|
,[dst_yuy2] "+r"(dst)
|
|
,[remain_dst_width] "+r"(dst_width)
|
|
:
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
|
|
);
|
|
|
|
return;
|
|
}
|
|
|
|
/**
 * Down-scale a YUY2 image by two in both directions with a 2x2 box filter.
 *
 * Source rows are consumed in pairs. When src_height is odd, the last row is
 * paired with itself (row distance 0) so nothing is read past the image.
 * Nothing is done when dst_width is not positive, because the row kernel
 * always executes at least one iteration.
 *
 * @param src         source image
 * @param src_stride  source row pitch in bytes
 * @param src_height  source height in rows
 * @param dst         destination image
 * @param dst_stride  destination row pitch in bytes
 * @param dst_width   destination width in pixels
 */
void nc_ScaleYUY2Down2Box(uint8_t* src, int32_t src_stride, int32_t src_height,
                          uint8_t* dst, int32_t dst_stride, int32_t dst_width)
{
    if (dst_width <= 0)
    {
        return;
    }

    for (int32_t ypos = 0; ypos < src_height; ypos += 2)
    {
        /* Distance to the partner row; 0 re-uses the same row for an unpaired last row. */
        const int32_t pair_stride = ((ypos + 1) < src_height) ? src_stride : 0;

        nc_ScaleYUY2RowDown2Box_NEON((src + (src_stride * ypos)), pair_stride,
                                     (dst + ((dst_stride * ypos) / 2)), dst_width);
    }
}
|
|
|
|
static int32_t halfwidth_stripe(uint8_t *yuv_src_line, uint8_t *yuv_dst_line, int32_t src_width)
|
|
{
|
|
int32_t ret = OK;
|
|
|
|
asm volatile (
|
|
"1: \n" // 1 loop - src : 64bytes(32pixels) | dst : 32bytes(16pixels)
|
|
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src_yuy2]], #64 \n" // v0,v2 = y | v1 = u | v3 = v
|
|
|
|
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
|
|
"uaddlp v3.8h, v3.16b \n" // V 16 bytes -> 8 shorts.
|
|
"rshrn v1.8b, v1.8h, #1 \n" // round and pack
|
|
"rshrn v3.8b, v3.8h, #1 \n" // round and pack
|
|
|
|
"zip1 v4.16b, v0.16b, v2.16b \n"
|
|
"zip2 v5.16b, v0.16b, v2.16b \n"
|
|
|
|
"uaddlp v4.8h, v4.16b \n" // row 1 add adjacent
|
|
"uaddlp v5.8h, v5.16b \n"
|
|
"rshrn v4.8b, v4.8h, #1 \n" // round and pack
|
|
"rshrn2 v4.16b, v5.8h, #1 \n"
|
|
|
|
"uzp1 v6.16b, v4.16b, v5.16b \n"
|
|
"uzp2 v7.16b, v4.16b, v5.16b \n"
|
|
|
|
"dup v8.2D, v6.D[0] \n"
|
|
"dup v9.2D, v1.D[0] \n"
|
|
"dup v10.2D, v7.D[0] \n"
|
|
"dup v11.2D, v3.D[0] \n"
|
|
"st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[dst_yuy2]], #32 \n"
|
|
|
|
"subs %w[remain_src_width], %w[remain_src_width], #32 \n"
|
|
"b.gt 1b \n"
|
|
|
|
: [src_yuy2] "+r"(yuv_src_line)
|
|
,[dst_yuy2] "+r"(yuv_dst_line)
|
|
,[remain_src_width] "+r"(src_width)
|
|
:
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
|
|
);
|
|
|
|
return ret;
|
|
}
|
|
|
|
int32_t nc_resize_yuy2_halfwidth(uint8_t *yuv_src, uint8_t *yuv_dst, int32_t src_width, int32_t src_height)
|
|
{
|
|
int32_t ret = OK;
|
|
int32_t stride = src_width * 2;
|
|
|
|
for (int ypos = 0; ypos < src_height; ypos++)
|
|
halfwidth_stripe(yuv_src + (stride * ypos), yuv_dst + (stride/2 * ypos), src_width);
|
|
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
 * Pre-compute the vertical source rows and weights for a linear resize.
 *
 * For every destination row the (unclamped) upper source row is stored in
 * yofs and the weight pair (1 - fy, fy), scaled by 2^11, in ibeta.
 *
 * @param yofs      out: dst_size source row indices
 * @param ibeta     out: dst_size * 2 fixed-point weights
 * @param src_size  source height in rows
 * @param dst_size  destination height in rows
 */
static void img_resize_cal_offset_linear_stripe(int32_t *yofs, uint16_t *ibeta, int32_t src_size, int32_t dst_size)
{
    const float inv_step = (float)dst_size / (float)src_size;
    const float step = (float)(1. / inv_step);
    int32_t row = 0;

    while (row < dst_size)
    {
        uint16_t *pair = &ibeta[row * 2];
        float pos = (float)((row + 0.5) * step - 0.5);
        const int32_t upper = neon_floor(pos);
        float w_upper;
        float w_lower;

        pos -= (float)upper;
        yofs[row] = upper;

        w_upper = 1.f - pos;
        w_lower = pos;

        /* weight * 2^11 (11 fractional bits) */
        pair[0] = (uint16_t)(w_upper * (1 << 11));
        pair[1] = (uint16_t)(w_lower * (1 << 11));

        row++;
    }
}
|
|
|
|
static int32_t hafwid_verti_scaledn_stripe(uint8_t *yuv_src_line, uint8_t *yuv_dst_line, int32_t src_width, uint16_t ibeta1, uint16_t ibeta2)
|
|
{
|
|
int32_t ret = OK;
|
|
uint8_t *yuv_src_line_next = yuv_src_line + (src_width * 2);
|
|
uint16_t c_ibeta1 = ibeta1;
|
|
uint16_t c_ibeta2 = ibeta2;
|
|
uint16_t *ptr_ibeta1;
|
|
uint16_t *ptr_ibeta2;
|
|
ptr_ibeta1 = &c_ibeta1;
|
|
ptr_ibeta2 = &c_ibeta2;
|
|
|
|
asm volatile (
|
|
// "ldr w0, asm_ibeta1 \n"
|
|
// "ldr w1, asm_ibeta2 \n"
|
|
"mov v8.h[0], w3 \n"
|
|
"mov v8.h[1], w4 \n"
|
|
"1: \n" // 1 loop - src : 64bytes(32pixels) | dst : 32bytes(16pixels)
|
|
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src_yuy2]], #64 \n" // v0,v2 = y | v1 = u | v3 = v
|
|
"ld4 {v10.16b, v11.16b, v12.16b, v13.16b}, [%[src_yuy2_next]], #64 \n" // v0,v2 = y | v1 = u | v3 = v
|
|
|
|
// half width start
|
|
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
|
|
"uaddlp v3.8h, v3.16b \n" // V 16 bytes -> 8 shorts.
|
|
"uaddlp v11.8h, v11.16b \n" // U 16 bytes -> 8 shorts.
|
|
"uaddlp v13.8h, v13.16b \n" // V 16 bytes -> 8 shorts.
|
|
"rshrn v1.8b, v1.8h, #1 \n" // round and pack
|
|
"rshrn v3.8b, v3.8h, #1 \n" // round and pack
|
|
"rshrn v11.8b, v11.8h, #1 \n" // round and pack
|
|
"rshrn v13.8b, v13.8h, #1 \n" // round and pack
|
|
|
|
"zip1 v4.16b, v0.16b, v2.16b \n" // y first
|
|
"zip1 v14.16b, v10.16b, v12.16b \n"
|
|
"zip2 v5.16b, v0.16b, v2.16b \n" // y second
|
|
"zip2 v15.16b, v10.16b, v12.16b \n"
|
|
|
|
"uaddlp v4.8h, v4.16b \n" // y first pair add
|
|
"uaddlp v5.8h, v5.16b \n" // y second pair add
|
|
"uaddlp v14.8h, v14.16b \n"
|
|
"uaddlp v15.8h, v15.16b \n"
|
|
|
|
"rshrn v4.8b, v4.8h, #1 \n" // round and pack
|
|
"rshrn v14.8b, v14.8h, #1 \n" // round and pack
|
|
"rshrn2 v4.16b, v5.8h, #1 \n"
|
|
"rshrn2 v14.16b, v15.8h, #1 \n"
|
|
|
|
"uzp1 v0.16b, v4.16b, v5.16b \n" // v5 data 사용 안함 : v0 하단 = v4 홀수번째 + v0 상단 = v5 짝수번째
|
|
"uzp1 v10.16b, v14.16b, v15.16b \n"
|
|
"uzp2 v2.16b, v4.16b, v5.16b \n"
|
|
"uzp2 v12.16b, v14.16b, v15.16b \n"
|
|
// half width until here
|
|
|
|
// fisrt line - u8 to u16
|
|
"UXTL v4.8h, v0.8b \n"
|
|
"UXTL v5.8h, v1.8b \n"
|
|
"UXTL v6.8h, v2.8b \n"
|
|
"UXTL v7.8h, v3.8b \n"
|
|
// second line - u8 to u16
|
|
"UXTL v14.8h, v10.8b \n"
|
|
"UXTL v15.8h, v11.8b \n"
|
|
"UXTL v16.8h, v12.8b \n"
|
|
"UXTL v17.8h, v13.8b \n"
|
|
|
|
// mul beta1 - first line
|
|
"UMULL v21.4s, v4.4h, v8.h[0] \n"
|
|
"UMULL2 v22.4s, v4.8h, v8.h[0] \n"
|
|
"UMULL v23.4s, v5.4h, v8.h[0] \n"
|
|
"UMULL2 v24.4s, v5.8h, v8.h[0] \n"
|
|
"UMULL v25.4s, v6.4h, v8.h[0] \n"
|
|
"UMULL2 v26.4s, v6.8h, v8.h[0] \n"
|
|
"UMULL v27.4s, v7.4h, v8.h[0] \n"
|
|
"UMULL2 v28.4s, v7.8h, v8.h[0] \n"
|
|
|
|
// mla beta2 - second line
|
|
"UMLAL v21.4s, v14.4h, v8.h[1] \n"
|
|
"UMLAL2 v22.4s, v14.8h, v8.h[1] \n"
|
|
"UMLAL v23.4s, v15.4h, v8.h[1] \n"
|
|
"UMLAL2 v24.4s, v15.8h, v8.h[1] \n"
|
|
"UMLAL v25.4s, v16.4h, v8.h[1] \n"
|
|
"UMLAL2 v26.4s, v16.8h, v8.h[1] \n"
|
|
"UMLAL v27.4s, v17.4h, v8.h[1] \n"
|
|
"UMLAL2 v28.4s, v17.8h, v8.h[1] \n"
|
|
|
|
// rshrn
|
|
"rshrn v21.4h, v21.4s, #11 \n"
|
|
"rshrn2 v21.8h, v22.4s, #11 \n"
|
|
"rshrn v23.4h, v23.4s, #11 \n"
|
|
"rshrn2 v23.8h, v24.4s, #11 \n"
|
|
"rshrn v25.4h, v25.4s, #11 \n"
|
|
"rshrn2 v25.8h, v26.4s, #11 \n"
|
|
"rshrn v27.4h, v27.4s, #11 \n"
|
|
"rshrn2 v27.8h, v28.4s, #11 \n"
|
|
|
|
// compress
|
|
"XTN v0.8b, v21.8h \n"
|
|
"XTN v1.8b, v23.8h \n"
|
|
"XTN v2.8b, v25.8h \n"
|
|
"XTN v3.8b, v27.8h \n"
|
|
|
|
// st
|
|
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst_yuy2]], #32 \n"
|
|
|
|
"subs %w[remain_src_width], %w[remain_src_width], #32 \n"
|
|
"b.gt 1b \n"
|
|
|
|
: [src_yuy2] "+r"(yuv_src_line)
|
|
,[src_yuy2_next] "+r"(yuv_src_line_next)
|
|
,[dst_yuy2] "+r"(yuv_dst_line)
|
|
,[asm_ibeta1] "+r"(ptr_ibeta1)
|
|
,[asm_ibeta2] "+r"(ptr_ibeta2)
|
|
,[remain_src_width] "+r"(src_width)
|
|
:
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
|
|
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
|
|
"x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13"
|
|
);
|
|
|
|
return ret;
|
|
}
|
|
|
|
int32_t nc_resize_yuy2_hafwid_verti_scaledn(uint8_t *yuv_src, uint8_t *yuv_dst, int32_t src_width, int32_t src_height, int32_t dst_height)
|
|
{
|
|
int32_t ret = OK;
|
|
int32_t stride = src_width * 2;
|
|
|
|
uint8_t *buffer_ = (uint8_t*)malloc(dst_height * (sizeof(int32_t) + (sizeof(uint16_t) * 2))); // (Vertical 점의 개수) x (좌표 1개 + 두점간 비 2개)
|
|
int32_t *yofs = (int32_t*) buffer_;
|
|
uint16_t *ibeta = (uint16_t*) (&yofs[dst_height]);
|
|
|
|
img_resize_cal_offset_linear_stripe(yofs, ibeta, src_height, dst_height);
|
|
|
|
for (int ypos = 0; ypos < dst_height; ypos++)
|
|
hafwid_verti_scaledn_stripe(yuv_src + (stride * yofs[ypos]), yuv_dst + (stride/2 * ypos), src_width, ibeta[ypos*2], ibeta[ypos*2 + 1]);
|
|
|
|
if (buffer_)
|
|
free(buffer_);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
|
|
int32_t nc_img_yuyv_packed_to_planar(uint8_t *yuv_Packed, uint8_t *yuv_Plannar, int32_t width, int32_t height)
|
|
{
|
|
int32_t ret = OK;
|
|
|
|
uint8_t *Yplane = yuv_Plannar;
|
|
uint8_t *Uplane = &yuv_Plannar[(width*height)];
|
|
uint8_t *Vplane = &yuv_Plannar[(width*height + width*height/2)];
|
|
int32_t img_langth = (width*height/2);
|
|
|
|
|
|
asm volatile (
|
|
"1: \n" // 1 loop - src : 64bytes(32pixels) | dst : 32bytes(16pixels)
|
|
|
|
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[src]], #64 \n"
|
|
|
|
"zip1 v4.16b, v0.16b, v2.16b \n"
|
|
"zip2 v5.16b, v0.16b, v2.16b \n"
|
|
|
|
"st1 {v4.16b, v5.16b}, [%[dst_Y]], #32 \n"
|
|
"st1 {v1.16b}, [%[dst_U]], #16 \n"
|
|
"st1 {v3.16b}, [%[dst_V]], #16 \n"
|
|
|
|
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n"
|
|
"b.gt 1b \n"
|
|
|
|
: [src] "+r"(yuv_Packed)
|
|
,[dst_Y] "+r"(Yplane)
|
|
,[dst_U] "+r"(Uplane)
|
|
,[dst_V] "+r"(Vplane)
|
|
,[remain_dst_width] "+r"(img_langth)
|
|
:
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
|
|
);
|
|
|
|
return ret;
|
|
}
|
|
|
|
int32_t nc_img_yuyv_planar_to_packed(uint8_t *yuv_Plannar, uint8_t *yuv_Packed, int32_t width, int32_t height)
|
|
{
|
|
int32_t ret = OK;
|
|
|
|
uint8_t *Yplane = yuv_Plannar;
|
|
uint8_t *Uplane = &yuv_Plannar[(width*height)];
|
|
uint8_t *Vplane = &yuv_Plannar[(width*height + width*height/2)];
|
|
int32_t img_langth = (width*height/2);
|
|
|
|
// YUYV 순서 제확인 필요
|
|
asm volatile (
|
|
"1: \n" // 1 loop - src : 64bytes(32pixels) | dst : 32bytes(16pixels)
|
|
|
|
"ld2 {v0.16b, v1.16b}, [%[src_Y]], #32 \n"
|
|
"ld1 {v4.16b}, [%[src_U]], #16 \n"
|
|
"ld1 {v3.16b}, [%[src_V]], #16 \n"
|
|
|
|
"mov v2.16b, v1.16b \n"
|
|
"mov v1.16b, v4.16b \n"
|
|
|
|
"st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[dst]], #64 \n" // right -> v0 v2 v1 v3
|
|
|
|
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n"
|
|
"b.gt 1b \n"
|
|
|
|
: [dst] "+r"(yuv_Packed)
|
|
,[src_Y] "+r"(Yplane)
|
|
,[src_U] "+r"(Uplane)
|
|
,[src_V] "+r"(Vplane)
|
|
,[remain_dst_width] "+r"(img_langth)
|
|
:
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
|
|
);
|
|
|
|
return ret;
|
|
}
|
|
|
|
int32_t nc_img_nv12_to_yuv422(uint8_t* nv_ptr, uint8_t* yuv_ptr, int32_t width, int32_t height)
|
|
{
|
|
int32_t ret = OK;
|
|
int32_t lvCnt_H = 0;
|
|
|
|
uint8_t *DSTplane;
|
|
uint8_t *Yplane;
|
|
uint8_t *UVplane;
|
|
|
|
int32_t lvWidth_target;
|
|
|
|
for(lvCnt_H = 0; lvCnt_H < height; lvCnt_H++)
|
|
{
|
|
lvWidth_target = width/2;
|
|
DSTplane = &yuv_ptr[(width*2)*(lvCnt_H)];
|
|
Yplane = &nv_ptr[0 + (width)*(lvCnt_H)];
|
|
UVplane = &nv_ptr[(width * height) + (width)*(lvCnt_H/2)];
|
|
|
|
asm volatile (
|
|
"1: \n"
|
|
|
|
"ld2 {v0.16b, v1.16b}, [%[src_Y]], #32 \n" // Y0 Y1
|
|
"ld2 {v4.16b, v5.16b}, [%[src_UV]], #32 \n" // U0 V0
|
|
|
|
"mov v2.16b, v1.16b \n" // Y1
|
|
"mov v1.16b, v4.16b \n" // U0
|
|
"mov v3.16b, v5.16b \n" // V0
|
|
|
|
"st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%[dst]], #64 \n" // [Y0 U0 Y1 V0]
|
|
|
|
"subs %w[remain_dst_width], %w[remain_dst_width], #16 \n"
|
|
"b.gt 1b \n"
|
|
|
|
|
|
: [dst] "+r"(DSTplane)
|
|
,[src_Y] "+r"(Yplane)
|
|
,[src_UV] "+r"(UVplane)
|
|
,[remain_dst_width] "+r"(lvWidth_target)
|
|
:
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
|
|
);
|
|
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
#define TILESIZE 64
|
|
int32_t nc_img_rgb24_packed_to_tiled_planar(uint8_t *rgb_Packed, uint8_t *rgb_Tiled, int32_t width, int32_t height)
|
|
{ // 64_8x8
|
|
int32_t ret = OK;
|
|
long int in_x_idx, in_y_idx;
|
|
int plane_offset = width * height;
|
|
int32_t img_stried = width * 3;
|
|
|
|
uint8_t *RGBplane = rgb_Packed;
|
|
uint8_t *Rplane = &rgb_Tiled[plane_offset * 0];
|
|
uint8_t *Gplane = &rgb_Tiled[plane_offset * 1];
|
|
uint8_t *Bplane = &rgb_Tiled[plane_offset * 2];
|
|
|
|
|
|
for (long y_super = 0; y_super < (height / TILESIZE); y_super++)
|
|
{
|
|
for (long x_super = 0; x_super < (width / TILESIZE); x_super++)
|
|
{
|
|
for (long y_sub = 0; y_sub < 8; y_sub++)
|
|
{
|
|
for (long x_sub = 0; x_sub < 8; x_sub++)
|
|
{
|
|
for (long y_pix = 0; y_pix < 8; y_pix++)
|
|
{
|
|
in_x_idx = (x_super * TILESIZE * 3) + (x_sub * 8 * 3);
|
|
in_y_idx = (y_super * TILESIZE) + (y_sub * 8) + y_pix;
|
|
RGBplane = &rgb_Packed[(in_y_idx * img_stried) + in_x_idx];
|
|
|
|
asm volatile (
|
|
// R G B
|
|
"ld3 {v0.8b, v1.8b, v2.8b}, [%[src]], #24 \n"
|
|
"st1 {v0.8b}, [%[dst_R]], #8 \n" // R Plane
|
|
"st1 {v1.8b}, [%[dst_G]], #8 \n" // G Plane
|
|
"st1 {v2.8b}, [%[dst_B]], #8 \n" // B Plane
|
|
:[src] "+r"(RGBplane)
|
|
,[dst_R] "+r"(Rplane)
|
|
,[dst_G] "+r"(Gplane)
|
|
,[dst_B] "+r"(Bplane)
|
|
:
|
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19"
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Full swing for BT.601
 * https://en.wikipedia.org/wiki/YUV#Full_swing_for_BT.601
 *
 *   Y = (( 77 * R + 150 * G +  29 * B) >> 8) +   0
 *   U = ((-43 * R -  84 * G + 127 * B) >> 8) + 128
 *   V = ((127 * R - 106 * G -  21 * B) >> 8) + 128
 */

/* wr*r + wg*g + wb*b on eight 16-bit lanes (lanes wrap at 16 bits). */
static inline int16x8_t mix_rgbx_weighted_sum(int16x8_t r, int16x8_t g, int16x8_t b,
                                              int16_t wr, int16_t wg, int16_t wb)
{
    int16x8_t acc = vmulq_n_s16(r, wr);
    acc = vmlaq_n_s16(acc, g, wg);
    return vmlaq_n_s16(acc, b, wb);
}

/* (inv_alpha * background + alpha * overlay) >> 8 on eight 8-bit lanes. */
static inline uint8x8_t mix_rgbx_blend(uint8x8_t alpha, uint8x8_t inv_alpha,
                                       uint8x8_t overlay, uint8x8_t background)
{
    uint16x8_t acc = vmull_u8(inv_alpha, background);
    acc = vmlal_u8(acc, alpha, overlay);
    return vshrn_n_u16(acc, 8);
}

/**
 * @brief  Alpha-blend a 4-byte-per-pixel overlay onto a packed YUYV image.
 *
 * Pixels are handled in groups of 8: the overlay group is converted to
 * Y/U/V with the BT.601 full-swing weights above, blended with the existing
 * destination using the overlay's alpha channel, and written back in place.
 * A group whose 8 alpha bytes are all zero is skipped without touching dst.
 * NOTE(review): the loop always processes whole groups of 8, so w*h is
 * presumably expected to be a multiple of 8 - confirm against callers.
 *
 * @param  src            overlay, 4 bytes per pixel, alpha in byte 3
 * @param  dst            packed YUYV image, 2 bytes per pixel, updated in place
 * @param  w, h           image size in pixels
 * @param  isRgbaNotBgra  non-zero: byte 0 is R and byte 2 is B; zero: swapped
 */
void neon_Mix_Rgbx2Yuyv(uint8_t *src, uint8_t *dst, int32_t w, int32_t h, int32_t isRgbaNotBgra)
{
    const int32_t pixel_count = h * w;
    const int16x8_t chroma_bias = vmovq_n_s16(128);
    /* Which de-interleaved source channel carries red / blue. */
    const int red_lane  = isRgbaNotBgra ? 0 : 2;
    const int blue_lane = 2 - red_lane;

    for (int32_t px = 0; px < pixel_count; px += 8)
    {
        /* De-interleaving load: val[0..3] each hold one channel of 8 pixels. */
        const uint8x8x4_t overlay = vld4_u8(src + px * 4);
        const uint8x8_t alpha = overlay.val[3];

        /* View the 8 alpha bytes as one 64-bit lane: zero means the whole
         * group is fully transparent and there is nothing to blend. */
        if (0 == vget_lane_u64(vreinterpret_u64_u8(alpha), 0))
        {
            continue;
        }

        /* Widen the colour channels u8 -> u16 and reinterpret as s16. */
        const int16x8_t red   = vreinterpretq_s16_u16(vmovl_u8(overlay.val[red_lane]));
        const int16x8_t green = vreinterpretq_s16_u16(vmovl_u8(overlay.val[1]));
        const int16x8_t blue  = vreinterpretq_s16_u16(vmovl_u8(overlay.val[blue_lane]));

        /* Y = sum >> 8; U and V = 128 + (sum >> 8). */
        const int16x8_t luma16 = vshrq_n_s16(mix_rgbx_weighted_sum(red, green, blue, 77, 150, 29), 8);
        const int16x8_t cb16   = vsraq_n_s16(chroma_bias, mix_rgbx_weighted_sum(red, green, blue, -43, -84, 127), 8);
        const int16x8_t cr16   = vsraq_n_s16(chroma_bias, mix_rgbx_weighted_sum(red, green, blue, 127, -106, -21), 8);

        /* Narrow back to 8 bits (keeps the low byte of every lane). */
        const uint8x8_t new_y = vmovn_u16(vreinterpretq_u16_s16(luma16));
        const uint8x8_t new_u = vmovn_u16(vreinterpretq_u16_s16(cb16));
        const uint8x8_t new_v = vmovn_u16(vreinterpretq_u16_s16(cr16));

        /* Existing destination: val[0] = y1..y8, val[1] = u1 v1 u2 v2 u3 v3 u4 v4. */
        const uint8x8x2_t old_yuyv = vld2_u8(dst + px * 2);
        /* Expand 4:2:2 chroma to one sample per pixel:
         *   val[0] = u1 u1 u2 u2 u3 u3 u4 u4, val[1] = v1 v1 v2 v2 v3 v3 v4 v4 */
        const uint8x8x2_t old_uv = vtrn_u8(old_yuyv.val[1], old_yuyv.val[1]);

        /* alpha' = ~alpha, so that alpha + alpha' = 255 */
        const uint8x8_t inv_alpha = vmvn_u8(alpha);

        const uint8x8_t mixed_y = mix_rgbx_blend(alpha, inv_alpha, new_y, old_yuyv.val[0]);
        const uint8x8_t mixed_u = mix_rgbx_blend(alpha, inv_alpha, new_u, old_uv.val[0]);
        const uint8x8_t mixed_v = mix_rgbx_blend(alpha, inv_alpha, new_v, old_uv.val[1]);

        /* Back to 4:2:2. vtrn splits the per-pixel chroma into
         *   val[0] = u(even px) v(even px) ..., val[1] = u(odd px) v(odd px) ...
         * and the halving add averages each horizontal pixel pair. */
        const uint8x8x2_t uv_split = vtrn_u8(mixed_u, mixed_v);

        uint8x8x2_t result;
        result.val[0] = mixed_y;
        result.val[1] = vhadd_u8(uv_split.val[0], uv_split.val[1]);

        /* Interleaving store: Y0 U Y1 V ... */
        vst2_u8(dst + px * 2, result);
    }

    return;
}
|
|
|
|
/**
 * @brief  Blend an overlay whose byte order is R, G, B, A onto a packed YUYV
 *         image; forwards to neon_Mix_Rgbx2Yuyv() with the RGBA flag set.
 */
void neon_Mix_Rgba2Yuyv(uint8_t *src, uint8_t *dst, int32_t w, int32_t h)
{
    const int32_t red_comes_first = 1;

    neon_Mix_Rgbx2Yuyv(src, dst, w, h, red_comes_first);
}
|
|
|
|
/**
 * @brief  Blend an overlay whose byte order is B, G, R, A onto a packed YUYV
 *         image; forwards to neon_Mix_Rgbx2Yuyv() with the RGBA flag cleared.
 */
void neon_Mix_Bgra2Yuyv(uint8_t *src, uint8_t *dst, int32_t w, int32_t h)
{
    const int32_t red_comes_first = 0;

    neon_Mix_Rgbx2Yuyv(src, dst, w, h, red_comes_first);
}
|
|
|
|
/* Tile geometry: a tile is CELL_X x CELL_Y cells, a cell is ELEM_X x ELEM_Y elements. */
#define ELEM_X          (8)     /* elements per cell row                    */
#define ELEM_Y          (8)     /* element rows per cell                    */
#define CELL_X          (8)     /* cells per tile row                       */
#define CELL_Y          (8)     /* cell rows per tile                       */
#define CELL8_Y_OFFSET  (64)    /* ELEM_Y * CELL_Y: element rows per tile   */
#define CELL_SIZE       (8)     /* edge length of one cell, in elements     */
#define TILE_SIZE       (64)    /* edge length of one tile, in elements     */
#define VALID_CELL_X    (tinfo->dim.w>>3)   /* dim.w / ELEM_X, e.g. 40/8 = 5 */
#define VALID_CELL_Y    (tinfo->dim.h>>3)   /* dim.h / ELEM_Y, e.g. 24/8 = 3 */
#define CELL1_SIZE      (64)    /* ELEM_X * ELEM_Y: bytes in one cell       */
#define CELL8_SIZE      (512)   /* CELL1_SIZE * CELL_X: bytes in a cell row */
|
|
|
|
/* Convert one 8-byte cell row to float, divide by `divisor`, and store the
 * first `count` results (count <= ELEM_X) at dst. */
static void nchw_tiled_row_to_f32(const unsigned char *src, float *dst, unsigned int count,
                                  int is_signed, float32x4_t divisor)
{
    float32x4_t lo, hi;

    if (is_signed) {
        const int16x8_t wide = vmovl_s8(vld1_s8((const int8_t *)src));
        lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16(wide)));
        hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(wide)));
    } else {
        const uint16x8_t wide = vmovl_u8(vld1_u8(src));
        lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide)));
        hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide)));
    }
    lo = vdivq_f32(lo, divisor);    // (val/multiplier)
    hi = vdivq_f32(hi, divisor);

    if (count >= ELEM_X) {
        vst1q_f32(dst, lo);
        vst1q_f32(dst + 4, hi);
    } else {
        /* Partial cell: never write past the valid columns, otherwise the
         * padding values land on the first pixels of the next image row. */
        float row[ELEM_X];
        vst1q_f32(row, lo);
        vst1q_f32(row + 4, hi);
        memcpy(dst, row, count * sizeof(float));
    }
}

/**
 * @brief  Un-tile an 8-bit tensor (64x64 tiles of 8x8 cells of 8x8 elements,
 *         padded to whole tiles) into a plain NCHW float buffer.
 *
 * Every element is converted to float and divided by 2^|exponent|
 * (tinfo->exponent). Elements are read as signed bytes when tinfo->sign is
 * non-zero, otherwise as unsigned bytes. Padding cells, rows and columns of
 * the tiled input are skipped and not written to the output.
 *
 * @param  tinfo     tensor description (dim.w, dim.h, dim.ch, exponent, sign)
 * @param  tiled     tiled input bytes
 * @param  scanline  output, dim.ch planes of dim.w * dim.h floats
 * @return AIW_SUCCESS, or AIW_ERROR when any pointer is NULL.
 */
int nc_neon_get_data_NCHW_float(aiwTensorInfo *tinfo, unsigned char *tiled, float *scanline)
{
    unsigned int ch_offs = 0, in_buf_offs = 0;

    if(!tinfo || !tiled || !scanline)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx, 0x%lx\n", __func__, (uintptr_t)tinfo, (uintptr_t)tiled, (uintptr_t)scanline);
        return AIW_ERROR;
    }

    const unsigned int num_x_tiles = (tinfo->dim.w + TILE_SIZE - 1) / TILE_SIZE;
    const unsigned int num_y_tiles = (tinfo->dim.h + TILE_SIZE - 1) / TILE_SIZE;
    const unsigned int plane_size  = (tinfo->dim.w * tinfo->dim.h);         // 1 channel size
    const unsigned int tile_y_size = (tinfo->dim.w * CELL8_Y_OFFSET);       // tile y line size
    const unsigned int cell_y_size = (CELL1_SIZE * (tinfo->dim.w)/8);       // cell y line size
    unsigned int remain_valid_cell_x = ((tinfo->dim.w % TILE_SIZE)+7)/CELL_X;
    unsigned int remain_valid_cell_y = ((tinfo->dim.h % TILE_SIZE)+7)/CELL_Y;
    unsigned int remain_valid_elem_x = (tinfo->dim.w % ELEM_X);
    unsigned int remain_valid_elem_y = (tinfo->dim.h % ELEM_Y);
    const float multiplier = tinfo->exponent > 0 ? (float)(1 << tinfo->exponent) : (float)(1 << -tinfo->exponent);
    const float32x4_t divisor = vdupq_n_f32(multiplier);
    const int is_signed = tinfo->sign ? 1 : 0;

    /* A remainder of 0 means the last tile / cell is completely filled. */
    if(remain_valid_cell_x == 0) { remain_valid_cell_x = CELL_X; }
    if(remain_valid_cell_y == 0) { remain_valid_cell_y = CELL_Y; }
    if(remain_valid_elem_x == 0) { remain_valid_elem_x = ELEM_X; }
    if(remain_valid_elem_y == 0) { remain_valid_elem_y = ELEM_Y; }

    for (long ch = 0; ch < tinfo->dim.ch; ch++)
    {
        for (unsigned int tile_y = 0; tile_y < num_y_tiles; tile_y++)
        {
            /* Only the last tile row / column can be partially filled; these
             * are recomputed per tile so a partial count never leaks into a
             * full tile. */
            const int last_tile_y = (tile_y == (num_y_tiles - 1));
            const unsigned int valid_cell_y = last_tile_y ? remain_valid_cell_y : CELL_Y;
            const unsigned int offs_tile_y = (tile_y * tile_y_size);   // start of this tile row in the plane

            for (unsigned int tile_x = 0; tile_x < num_x_tiles; tile_x++)
            {
                const int last_tile_x = (tile_x == (num_x_tiles - 1));
                const unsigned int valid_cell_x = last_tile_x ? remain_valid_cell_x : CELL_X;
                // tile0: 0 8 16 24 32 40 48 56, tile1:64 72 80 88 96 104 112 120, 128 136 .......
                const unsigned int offs_tile_x = (tile_x * TILE_SIZE);  // start column of this tile

                for (unsigned int y_cell = 0; y_cell < CELL_Y; y_cell++)
                {
                    if(y_cell >= valid_cell_y) {    // Invalid ycells are skipped.
                        in_buf_offs += CELL8_SIZE;  // Skip the padding area.
                        continue;
                    }
                    /* Partial height only in the last valid cell row of the last tile row. */
                    const unsigned int valid_ele_y =
                        (last_tile_y && y_cell == (valid_cell_y-1)) ? remain_valid_elem_y : ELEM_Y;
                    const unsigned int offs_cell_y = (y_cell*cell_y_size);

                    for (unsigned int x_cell = 0; x_cell < CELL_X; x_cell++)
                    {
                        if(x_cell >= valid_cell_x) {    // Invalid xcells are skipped.
                            in_buf_offs+=CELL1_SIZE;    // Skip the padding area
                            continue;
                        }
                        /* Partial width only in the last valid cell of the last tile column. */
                        const unsigned int valid_ele_x =
                            (last_tile_x && x_cell == (valid_cell_x-1)) ? remain_valid_elem_x : ELEM_X;
                        const unsigned int offs_cell_x = (x_cell*CELL_X);

                        for (unsigned int y = 0; y < ELEM_Y; y++)
                        { // 8x8 element
                            if(y >= valid_ele_y) {  // Invalid elements are skipped.
                                in_buf_offs+=ELEM_X;
                                continue;
                            }
                            const unsigned int w_idx = ch_offs + offs_tile_y + offs_tile_x + offs_cell_y + offs_cell_x + (y*tinfo->dim.w);

                            nchw_tiled_row_to_f32(tiled + in_buf_offs, scanline + w_idx, valid_ele_x, is_signed, divisor);
                            in_buf_offs+=ELEM_X;    // one cell row is 8 input bytes
                        }
                    }
                }
            }
        }
        ch_offs += plane_size;
    }

    return AIW_SUCCESS;
}
|
|
|
|
#define SEG_COLOR (0x400000FFU)

/**
 * @brief  Walk a tiled 8-bit segmentation output (64x64 tiles of 8x8 cells of
 *         8x8 elements, padded to whole tiles) and paint SEG_COLOR on an
 *         up-scaled 32-bit canvas for every element whose value is 0.
 *
 * For each painted element, (unsigned)(canvas_h / npu_seg_out_h) canvas lines
 * are written, 4 pixels wide each (one 128-bit store).
 * NOTE(review): the 4-pixel width is hard-coded (the original comment refers
 * to a 320x192 network output); other horizontal ratios are not handled.
 * NOTE(review): elements in input row 0 are never drawn because of the
 * `ypos > 0` test - kept as in the original, confirm this is intended.
 *
 * @param  npu_seg_out_w, npu_seg_out_h  size of the network output in elements
 * @param  canvas_w, canvas_h            size of the canvas in pixels
 * @param  canvas                        canvas_w * canvas_h 32-bit pixels
 * @param  cnn_output                    tiled network output
 * @return NC_SUCCESS, or NC_INVALID when a pointer is NULL.
 */
int nc_neon_tiled_to_scanline_n_scale_up(unsigned int npu_seg_out_w, unsigned int npu_seg_out_h, unsigned int canvas_w, unsigned int canvas_h, unsigned int *canvas, unsigned char *cnn_output) {
    if(!canvas || !cnn_output)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx\n", __func__, (uintptr_t)canvas, (uintptr_t)cnn_output);
        return NC_INVALID;
    }

    const unsigned int num_x_tiles=(npu_seg_out_w+TILE_SIZE-1)/TILE_SIZE;   // calc the number of x-tiles
    const unsigned int num_y_tiles=(npu_seg_out_h+TILE_SIZE-1)/TILE_SIZE;   // calc the number of y-tiles
    /* Valid cells in the last (possibly partial) tile column / row. */
    const unsigned int valid_cell_x = (npu_seg_out_w % TILE_SIZE) ? (((npu_seg_out_w % TILE_SIZE)+7)>>3) : CELL_X;
    const unsigned int valid_cell_y = (npu_seg_out_h % TILE_SIZE) ? (((npu_seg_out_h % TILE_SIZE)+7)>>3) : CELL_Y;
    const float h_ratio = (float)canvas_w/(float)npu_seg_out_w;
    const float v_ratio = (float)canvas_h/(float)npu_seg_out_h;
    const size_t canvas_pixels = (size_t)canvas_w * (size_t)canvas_h;
    const uint32x4_t fill = vdupq_n_u32(SEG_COLOR);
    size_t outidx = 0;  /* read position inside cnn_output */

    for (unsigned int y_super = 0; y_super < num_y_tiles; y_super++) {
        const unsigned int cells_y = (y_super == (num_y_tiles-1)) ? valid_cell_y : CELL_Y;

        for (unsigned int x_super = 0; x_super < num_x_tiles; x_super++) {
            const unsigned int cells_x = (x_super == (num_x_tiles-1)) ? valid_cell_x : CELL_X;

            for (unsigned int y_sub = 0; y_sub < cells_y; y_sub++) {
                for (unsigned int x_sub = 0; x_sub < cells_x; x_sub++) {
                    for (unsigned int y = 0; y < ELEM_Y; y++) {
                        const unsigned int in_y_idx = (y_super * TILE_SIZE) + (y_sub * CELL_Y + y);
                        const unsigned int in_x_idx = (x_super * TILE_SIZE) + (x_sub * CELL_X);
                        const unsigned char *row = cnn_output + outidx;    // 8 bytes of this cell row
                        const float ypos = (float)in_y_idx * v_ratio;

                        for (unsigned int x = 0; x < ELEM_X; x++) {
                            // increase drawing speed (not drawing background)
                            if (row[x]) {
                                continue;
                            }
                            const float xpos = (float)(in_x_idx+x) * h_ratio;
                            if (ypos > 0) {
                                for (unsigned int y_up = 0; y_up < (unsigned int)v_ratio; y_up++) { // line
                                    const size_t pos = (unsigned int)((ypos + (float)y_up)* (float)canvas_w +(xpos));
                                    if (pos + 4 <= canvas_pixels) {
                                        vst1q_u32(canvas + pos, fill);  // ARGB x 4 pixels
                                    } else {
                                        /* Too close to the end for a 4-pixel store: paint what fits. */
                                        for (size_t p = pos; p < canvas_pixels; p++) {
                                            canvas[p] = SEG_COLOR;
                                        }
                                    }
                                }
                            }
                        }
                        outidx+=ELEM_X;
                    }
                }
                /* Skip the padding cells that complete this cell row of the tile. */
                outidx += (size_t)CELL1_SIZE * (CELL_X - cells_x);
            }
            /* Skip the padding cell rows that complete this tile. */
            outidx += (size_t)CELL8_SIZE * (CELL_Y - cells_y);
        }
    }
    return NC_SUCCESS;
}
|
|
|
|
/* Convert one 8-byte cell row to float, divide by `divisor`, and store the
 * first `count` results (count <= ELEM_X) at dst. */
static void nchw_cellrow_row_to_f32(const unsigned char *src, float *dst, unsigned int count,
                                    int is_signed, float32x4_t divisor)
{
    float32x4_t lo, hi;

    if (is_signed) {
        const int16x8_t wide = vmovl_s8(vld1_s8((const int8_t *)src));
        lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16(wide)));
        hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(wide)));
    } else {
        const uint16x8_t wide = vmovl_u8(vld1_u8(src));
        lo = vcvtq_f32_u32(vmovl_u16(vget_low_u16(wide)));
        hi = vcvtq_f32_u32(vmovl_u16(vget_high_u16(wide)));
    }
    lo = vdivq_f32(lo, divisor);    // (val/multiplier)
    hi = vdivq_f32(hi, divisor);

    if (count >= ELEM_X) {
        vst1q_f32(dst, lo);
        vst1q_f32(dst + 4, hi);
    } else {
        /* Partial cell: never write past the valid columns, otherwise the
         * padding values land on the first pixels of the next image row. */
        float row[ELEM_X];
        vst1q_f32(row, lo);
        vst1q_f32(row + 4, hi);
        memcpy(dst, row, count * sizeof(float));
    }
}

/**
 * @brief  Convert an 8-bit tensor stored as rows of 8x8 cells (row width
 *         padded to a multiple of 64 elements) into a plain NCHW float buffer.
 *
 * Every element is converted to float and divided by 2^|exponent|
 * (tinfo->exponent). Elements are read as signed bytes when tinfo->sign is
 * non-zero, otherwise as unsigned bytes. Padding cells, rows and columns of
 * the input are skipped and not written to the output.
 *
 * @param  tinfo     tensor description (dim.w, dim.h, dim.ch, exponent, sign)
 * @param  tiled     cell-row ordered input bytes
 * @param  scanline  output, dim.ch planes of dim.w * dim.h floats
 * @return AIW_SUCCESS, or AIW_ERROR when any pointer is NULL.
 */
int nc_neon_get_data_NCHW_float_cellrow(aiwTensorInfo *tinfo, unsigned char *tiled, float *scanline)
{
    unsigned int ch_offs = 0, in_buf_offs = 0;

    if(!tinfo || !tiled || !scanline)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx, 0x%lx\n", __func__, (uintptr_t)tinfo, (uintptr_t)tiled, (uintptr_t)scanline);
        return AIW_ERROR;
    }

    const unsigned int num_x_cells = (tinfo->dim.w + TILE_SIZE - 1) / TILE_SIZE * (TILE_SIZE / CELL_SIZE);
    const unsigned int num_y_cells = (tinfo->dim.h + CELL_SIZE - 1) / CELL_SIZE;
    const unsigned int plane_size  = (tinfo->dim.w * tinfo->dim.h);         // 1 channel size
    const unsigned int cell_y_size = (CELL1_SIZE * (tinfo->dim.w)/8);       // cell y line size
    const unsigned int remain_valid_cell_x = (tinfo->dim.w + 7) / CELL_X;   // cells that hold real data
    unsigned int remain_valid_elem_x = (tinfo->dim.w % ELEM_X);
    unsigned int remain_valid_elem_y = (tinfo->dim.h % ELEM_Y);
    const float multiplier = tinfo->exponent > 0 ? (float)(1 << tinfo->exponent) : (float)(1 << -tinfo->exponent);
    const float32x4_t divisor = vdupq_n_f32(multiplier);
    const int is_signed = tinfo->sign ? 1 : 0;

    /* A remainder of 0 means the last cell is completely filled. */
    if(remain_valid_elem_x == 0) { remain_valid_elem_x = ELEM_X; }
    if(remain_valid_elem_y == 0) { remain_valid_elem_y = ELEM_Y; }

    for (long ch = 0; ch < tinfo->dim.ch; ch++)
    {
        for (unsigned int y_cell = 0; y_cell < num_y_cells; y_cell++)
        {
            /* Only the last cell row can be partially filled. */
            const unsigned int valid_ele_y = (y_cell == (num_y_cells-1)) ? remain_valid_elem_y : ELEM_Y;
            const unsigned int offs_cell_y = (y_cell*cell_y_size);

            for (unsigned int x_cell = 0; x_cell < num_x_cells; x_cell++)
            {
                if(x_cell >= remain_valid_cell_x) { // Invalid xcells are skipped.
                    in_buf_offs+=CELL1_SIZE;        // Skip the padding area
                    continue;
                }
                /* Only the last valid cell of a row can be partially filled. */
                const unsigned int valid_ele_x = (x_cell == (remain_valid_cell_x-1)) ? remain_valid_elem_x : ELEM_X;
                const unsigned int offs_cell_x = (x_cell*CELL_X);

                for (unsigned int y = 0; y < ELEM_Y; y++)
                { // 8x8 element
                    if(y >= valid_ele_y) {  // Invalid elements are skipped.
                        in_buf_offs+=ELEM_X;
                        continue;
                    }
                    const unsigned int w_idx = ch_offs + offs_cell_y + offs_cell_x + (y*tinfo->dim.w);

                    nchw_cellrow_row_to_f32(tiled + in_buf_offs, scanline + w_idx, valid_ele_x, is_signed, divisor);
                    in_buf_offs+=ELEM_X;    // one cell row is 8 input bytes
                }
            }
        }
        ch_offs += plane_size;
    }

    return AIW_SUCCESS;
}
|
|
|
|
/**
 * @brief  Copy an 8-bit tensor stored as rows of 8x8 cells (row width padded
 *         to a multiple of 64 elements) into a plain NCHW byte buffer.
 *
 * The output is addressed byte-wise: element (ch, y, x) is written to byte
 * ch*w*h + y*w + x of `scanline`, regardless of its int64_t pointer type.
 * Padding cells, rows and columns of the input are skipped; for a partially
 * filled cell only the valid columns are copied, so neighbouring rows are
 * never overwritten.
 *
 * @param  tinfo     tensor description (dim.w, dim.h, dim.ch)
 * @param  tiled     cell-row ordered input bytes
 * @param  scanline  output buffer, written as bytes
 * @return AIW_SUCCESS, or AIW_ERROR when any pointer is NULL.
 */
int nc_neon_get_data_NCHW_uint8_cellrow(aiwTensorInfo *tinfo, unsigned char *tiled, int64_t *scanline)
{
    unsigned int ch_offs = 0, in_buf_offs = 0;

    if(!tinfo || !tiled || !scanline)
    {
        printf("null pointer (%s) 0x%lx, 0x%lx, 0x%lx\n", __func__, (uintptr_t)tinfo, (uintptr_t)tiled, (uintptr_t)scanline);
        return AIW_ERROR;
    }

    const unsigned int num_x_cells = (tinfo->dim.w + TILE_SIZE - 1) / TILE_SIZE * (TILE_SIZE / CELL_SIZE);
    const unsigned int num_y_cells = (tinfo->dim.h + CELL_SIZE - 1) / CELL_SIZE;
    const unsigned int plane_size  = (tinfo->dim.w * tinfo->dim.h);         // 1 channel size
    const unsigned int cell_y_size = (CELL1_SIZE * (tinfo->dim.w)/8);       // cell y line size
    const unsigned int remain_valid_cell_x = (tinfo->dim.w + 7) / CELL_X;   // cells that hold real data
    unsigned int remain_valid_elem_x = (tinfo->dim.w % ELEM_X);
    unsigned int remain_valid_elem_y = (tinfo->dim.h % ELEM_Y);
    int8_t *out_bytes = (int8_t *)scanline;

    /* A remainder of 0 means the last cell is completely filled. */
    if(remain_valid_elem_x == 0) { remain_valid_elem_x = ELEM_X; }
    if(remain_valid_elem_y == 0) { remain_valid_elem_y = ELEM_Y; }

    for (long ch = 0; ch < tinfo->dim.ch; ch++)
    {
        for (unsigned int y_cell = 0; y_cell < num_y_cells; y_cell++)
        {
            /* Only the last cell row can be partially filled. */
            const unsigned int valid_ele_y = (y_cell == (num_y_cells-1)) ? remain_valid_elem_y : ELEM_Y;
            const unsigned int offs_cell_y = (y_cell*cell_y_size);

            for (unsigned int x_cell = 0; x_cell < num_x_cells; x_cell++)
            {
                if(x_cell >= remain_valid_cell_x) { // padding cell: skip its 64 input bytes
                    in_buf_offs+=CELL1_SIZE;
                    continue;
                }
                /* Only the last valid cell of a row can be partially filled. */
                const unsigned int valid_ele_x = (x_cell == (remain_valid_cell_x-1)) ? remain_valid_elem_x : ELEM_X;
                const unsigned int offs_cell_x = (x_cell*CELL_X);

                for (unsigned int y = 0; y < ELEM_Y; y++)
                {
                    if(y >= valid_ele_y) {  // padding row: skip its 8 input bytes
                        in_buf_offs+=ELEM_X;
                        continue;
                    }
                    const unsigned int w_idx = ch_offs + offs_cell_y + offs_cell_x + (y*tinfo->dim.w);

                    /* Copy only the columns that belong to the image. */
                    memcpy(out_bytes + w_idx, tiled + in_buf_offs, valid_ele_x);
                    in_buf_offs+=ELEM_X;
                }
            }
        }
        ch_offs += plane_size;
    }
    return AIW_SUCCESS;
}
|
|
|
|
|
|
|
|
/**
 * @brief  Interleave three 8-bit planes into packed RGB (R G B R G B ...).
 *
 * Whole groups of 8 pixels are handled with one interleaving NEON store; any
 * remaining pixels (width*height not a multiple of 8) are copied one by one so
 * that no byte outside the buffers is read or written.
 *
 * @param  R, G, B      source planes, width*height bytes each
 * @param  interleaved  destination, 3*width*height bytes
 * @param  width, height image size in pixels
 */
void nc_rgb_planar_to_interleaved_neon(uint8_t* R, uint8_t* G, uint8_t* B, uint8_t* interleaved, int width, int height)
{
    const int n = width * height;
    int i = 0;

    for (; (n - i) >= 8; i += 8) {
        uint8x8x3_t v;

        // Load 8 elements from each plane, then store them interleaved
        v.val[0] = vld1_u8(&R[i]);
        v.val[1] = vld1_u8(&G[i]);
        v.val[2] = vld1_u8(&B[i]);

        vst3_u8(&interleaved[3 * i], v);
    }

    /* Scalar tail for the last (n % 8) pixels. */
    for (; i < n; i++) {
        interleaved[3 * i + 0] = R[i];
        interleaved[3 * i + 1] = G[i];
        interleaved[3 * i + 2] = B[i];
    }
}
|
|
|
|
/**
 * @brief  Split packed RGB (R G B R G B ...) into three 8-bit planes.
 *
 * Whole groups of 8 pixels are handled with one de-interleaving NEON load; any
 * remaining pixels (w*h not a multiple of 8) are copied one by one so that no
 * byte outside the buffers is read or written.
 *
 * @param  interleaved  source, 3*w*h bytes
 * @param  R, G, B      destination planes, w*h bytes each
 * @param  w, h         image size in pixels
 */
void nc_rgb_interleaved_to_planar_neon(unsigned char* interleaved, unsigned char* R, unsigned char* G, unsigned char* B, int w, int h)
{
    const int n = w * h;
    int i = 0;

    for (; (n - i) >= 8; i += 8) {
        const uint8x8x3_t rgb = vld3_u8(interleaved + i * 3);

        vst1_u8(&R[i], rgb.val[0]);
        vst1_u8(&G[i], rgb.val[1]);
        vst1_u8(&B[i], rgb.val[2]);
    }

    /* Scalar tail for the last (n % 8) pixels. */
    for (; i < n; i++) {
        R[i] = interleaved[3 * i + 0];
        G[i] = interleaved[3 * i + 1];
        B[i] = interleaved[3 * i + 2];
    }
}
|