[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 02/02: volk: adding arm_neon.h headers to remaining neon proto-kernels.
From: git
Subject: [Commit-gnuradio] [gnuradio] 02/02: volk: adding arm_neon.h headers to remaining neon proto-kernels.
Date: Wed, 30 Jul 2014 23:17:15 +0000 (UTC)
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit 265e729189ad55edf4f5447cb42589e980b92efa
Author: Tom Rondeau <address@hidden>
Date: Wed Jul 30 18:14:12 2014 -0400
volk: adding arm_neon.h headers to remaining neon proto-kernels.
---
volk/kernels/volk/volk_32f_sqrt_32f.h | 2 ++
volk/kernels/volk/volk_32f_x2_dot_prod_32f.h | 2 ++
volk/kernels/volk/volk_32f_x2_min_32f.h | 2 ++
volk/kernels/volk/volk_32f_x2_multiply_32f.h | 4 +++-
volk/kernels/volk/volk_32f_x2_subtract_32f.h | 2 ++
volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 1 +
volk/kernels/volk/volk_32fc_magnitude_32f.h | 16 +++++++++-------
volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 19 ++++++++++---------
volk/kernels/volk/volk_32fc_x2_multiply_32fc.h | 12 +++++++-----
volk/kernels/volk/volk_8i_convert_16i.h | 4 +++-
10 files changed, 41 insertions(+), 23 deletions(-)
diff --git a/volk/kernels/volk/volk_32f_sqrt_32f.h
b/volk/kernels/volk/volk_32f_sqrt_32f.h
index 2523abf..f8f8cbd 100644
--- a/volk/kernels/volk/volk_32f_sqrt_32f.h
+++ b/volk/kernels/volk/volk_32f_sqrt_32f.h
@@ -41,6 +41,8 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector,
const float* aVector,
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
/*!
\brief Sqrts the two input vectors and store their results in the third
vector
\param cVector The vector where the results will be stored
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
index e8fa8b5..ac6f569 100644
--- a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -578,6 +578,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_avx( float*
result, const float*
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const
float * input, const float * taps, unsigned int num_points) {
unsigned int quarter_points = num_points / 16;
diff --git a/volk/kernels/volk/volk_32f_x2_min_32f.h
b/volk/kernels/volk/volk_32f_x2_min_32f.h
index eef5e5d..f7598d6 100644
--- a/volk/kernels/volk/volk_32f_x2_min_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_min_32f.h
@@ -46,6 +46,8 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector,
const float* aVecto
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
/*!
\brief Selects minimum value from each entry between bVector and aVector and
store their results in the cVector
\param cVector The vector where the results will be stored
diff --git a/volk/kernels/volk/volk_32f_x2_multiply_32f.h
b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
index 8bbd81c..00b3185 100644
--- a/volk/kernels/volk/volk_32f_x2_multiply_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -189,6 +189,8 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float*
cVector, const float* a
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
/*!
\brief Multiplys the two input vectors and store their results in the third
vector
\param cVector The vector where the results will be stored
@@ -212,7 +214,7 @@ static inline void volk_32f_x2_multiply_32f_neon(float*
cVector, const float* aV
for(number=quarter_points*4; number < num_points; ++number) {
*cVector++ = *aVector++ * *bVector++;
}
-}
+}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
diff --git a/volk/kernels/volk/volk_32f_x2_subtract_32f.h
b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
index 6831d89..c725ef8 100644
--- a/volk/kernels/volk/volk_32f_x2_subtract_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -64,6 +64,8 @@ static inline void volk_32f_x2_subtract_32f_generic(float*
cVector, const float*
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
/*!
\brief Subtracts bVector form aVector and store their results in the cVector
\param cVector The vector where the results will be stored
diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index d566231..0d3c216 100644
--- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -294,6 +294,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float*
target, float* src0,
#endif // LV_HAVE_AVX
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
static inline void volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict
target, float* __restrict src0, float* __restrict center_point_array, float*
__restrict cutoff, unsigned int num_points) {
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h
b/volk/kernels/volk/volk_32fc_magnitude_32f.h
index cf3e849..b6da7f3 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -234,6 +234,8 @@ static inline void volk_32fc_magnitude_32f_a_generic(float*
magnitudeVector, con
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
/*!
\brief Calculates the magnitude of the complexVector and stores the
results in the magnitudeVector
\param complexVector The vector containing the complex input values
@@ -255,7 +257,7 @@ static inline void volk_32fc_magnitude_32f_neon(float*
magnitudeVector, const lv
magnitude_vec = vrsqrteq_f32(magnitude_vec);
magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt
vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-
+
complexVectorPtr += 8;
magnitudeVectorPtr += 4;
}
@@ -271,8 +273,8 @@ static inline void volk_32fc_magnitude_32f_neon(float*
magnitudeVector, const lv
#ifdef LV_HAVE_NEON
/*!
\brief Calculates the magnitude of the complexVector and stores the
results in the magnitudeVector
-
- This is an approximation from "Streamlining Digital Signal Processing" by
+
+ This is an approximation from "Streamlining Digital Signal Processing" by
Richard Lyons. Apparently max error is about 1% and mean error is about
0.6%.
The basic idea is to do a weighted sum of the abs. value of imag and real
parts
where weight A is always assigned to max(imag, real) and B is always
min(imag,real).
@@ -291,7 +293,7 @@ static inline void
volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVect
const float threshold = 0.4142135;
- float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
+ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
a_high = vdupq_n_f32( 0.84 );
b_high = vdupq_n_f32( 0.561);
a_low = vdupq_n_f32( 0.99 );
@@ -304,7 +306,7 @@ static inline void
volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVect
float32x4_t real_abs, imag_abs;
for(number = 0; number < quarter_points; number++){
complex_vec = vld2q_f32(complexVectorPtr);
-
+
real_abs = vabsq_f32(complex_vec.val[0]);
imag_abs = vabsq_f32(complex_vec.val[1]);
@@ -318,14 +320,14 @@ static inline void
volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVect
// and 0s or 1s with coefficients from previous effective branch
a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0,
(int32x4_t)a_high), vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0,
(int32x4_t)b_high), vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
-
+
// coefficients chosen, do the weighted sum
min_vec = vmulq_f32(min_vec, b_vec);
max_vec = vmulq_f32(max_vec, a_vec);
magnitude_vec = vaddq_f32(min_vec, max_vec);
vst1q_f32(magnitudeVectorPtr, magnitude_vec);
-
+
complexVectorPtr += 8;
magnitudeVectorPtr += 4;
}
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 5301c35..430b747 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -761,12 +761,13 @@ static inline void
volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const
lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
unsigned int quarter_points = num_points / 4;
unsigned int number;
-
+
lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
lv_32fc_t* b_ptr = (lv_32fc_t*) input;
// for 2-lane vectors, 1st lane holds the real part,
@@ -775,7 +776,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
float32x4x2_t tmp_real, tmp_imag;
accumulator.val[0] = vdupq_n_f32(0);
accumulator.val[1] = vdupq_n_f32(0);
-
+
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
@@ -796,7 +797,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_3
c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
-
+
accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
@@ -821,7 +822,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
unsigned int quarter_points = num_points / 4;
unsigned int number;
-
+
lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
lv_32fc_t* b_ptr = (lv_32fc_t*) input;
// for 2-lane vectors, 1st lane holds the real part,
@@ -830,7 +831,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, c
float32x4x2_t tmp_imag;
accumulator.val[0] = vdupq_n_f32(0);
accumulator.val[1] = vdupq_n_f32(0);
-
+
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
@@ -869,7 +870,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
unsigned int quarter_points = num_points / 4;
unsigned int number;
-
+
lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
lv_32fc_t* b_ptr = (lv_32fc_t*) input;
// for 2-lane vectors, 1st lane holds the real part,
@@ -879,7 +880,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, con
accumulator1.val[1] = vdupq_n_f32(0);
accumulator2.val[0] = vdupq_n_f32(0);
accumulator2.val[1] = vdupq_n_f32(0);
-
+
for(number = 0; number < quarter_points; ++number) {
a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
@@ -915,7 +916,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
unsigned int quarter_points = num_points / 8;
unsigned int number;
-
+
lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
lv_32fc_t* b_ptr = (lv_32fc_t*) input;
// for 2-lane vectors, 1st lane holds the real part,
@@ -930,7 +931,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
accumulator2.val[1] = vdupq_n_f32(0);
accumulator2.val[2] = vdupq_n_f32(0);
accumulator2.val[3] = vdupq_n_f32(0);
-
+
// 8 input regs, 8 accumulators -> 16/16 neon regs are used
for(number = 0; number < quarter_points; ++number) {
a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 8d2d48b..7c723bc 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -150,6 +150,8 @@ static inline void
volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
/*!
\brief Multiplies the two input complex vectors and stores their results
in the third vector
\param cVector The vector where the results will be stored
@@ -174,15 +176,15 @@ static inline void
volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
- tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
- tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
// Multiply cross terms to get the imaginary result
// a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
- tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
// store the results
c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
@@ -225,7 +227,7 @@ static inline void
volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
__builtin_prefetch(b_ptr+4);
// do the first multiply
- tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
// use multiply accumulate/subtract to get result
diff --git a/volk/kernels/volk/volk_8i_convert_16i.h
b/volk/kernels/volk/volk_8i_convert_16i.h
index 3b89a3f..9776dfd 100644
--- a/volk/kernels/volk/volk_8i_convert_16i.h
+++ b/volk/kernels/volk/volk_8i_convert_16i.h
@@ -139,6 +139,8 @@ static inline void volk_8i_convert_16i_a_generic(int16_t*
outputVector, const in
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
/*!
\brief Converts the input 8 bit integer data into 16 bit integer data
\param inputVector The 8 bit input data buffer
@@ -155,7 +157,7 @@ static inline void volk_8i_convert_16i_neon(int16_t*
outputVector, const int8_t*
int8x8_t input_vec ;
int16x8_t converted_vec;
- // NEON doesn't have a concept of 8 bit registers, so we are really
+ // NEON doesn't have a concept of 8 bit registers, so we are really
// dealing with the low half of 16-bit registers. Since this requires
// a move instruction we likely do better with ASM here.
for(number = 0; number < eighth_points; ++number) {