[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 01/06: volk: add avx u/a protokernel for 32f_x3sum_of_poly_32f
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 01/06: volk: add avx u/a protokernel for 32f_x3sum_of_poly_32f |
Date: |
Tue, 28 Jan 2014 20:10:58 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
jcorgan pushed a commit to branch master
in repository gnuradio.
commit 8d7d65d179b653089779ba2a52b6279d12d0c64f
Author: Nathan West <address@hidden>
Date: Sat Jan 25 16:22:55 2014 -0600
volk: add avx u/a protokernel for 32f_x3sum_of_poly_32f
The threshold differences have to change for volk_profile because
of rounding errors. It was passing previously because of a bug in
VOLK QA (bug 627) that has since been resolved.
---
volk/apps/volk_profile.cc | 2 +-
volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h | 147 ++++++++++++++++++++++++
volk/lib/testqa.cc | 2 +-
3 files changed, 149 insertions(+), 2 deletions(-)
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 037d630..35993e6 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -109,7 +109,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32f_s32f_stddev_32f, 1e-4, 100, 204602, 3000, &results,
benchmark_mode);
VOLK_PROFILE(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 204602, 3000,
&results, benchmark_mode);
VOLK_PROFILE(volk_32f_x2_subtract_32f, 1e-4, 0, 204602, 5000, &results,
benchmark_mode);
- VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 204602, 5000, &results,
benchmark_mode);
+ VOLK_PROFILE(volk_32f_x3_sum_of_poly_32f, 1e-2, 0, 204602, 5000, &results,
benchmark_mode);
VOLK_PROFILE(volk_32i_x2_and_32i, 0, 0, 204602, 10000, &results,
benchmark_mode);
VOLK_PROFILE(volk_32i_s32f_convert_32f, 1e-4, 100, 204602, 10000,
&results, benchmark_mode);
VOLK_PROFILE(volk_32i_x2_or_32i, 0, 0, 204602, 10000, &results,
benchmark_mode);
diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index e975f14..b6c3947 100644
--- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -99,6 +99,79 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float*
target, float* src0
#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_AVX
+#include<immintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target, float*
src0, float* center_point_array, float* cutoff, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for(i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_load_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 |
x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] +
temp_results[5];
+
+
+ for(i = eighth_points*8; i < num_points; ++i) {
+ fst = *(src0++);
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+
+ *target += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);
+ }
+
+ *target += ((float)(num_points)) * center_point_array[4];
+
+}
+#endif // LV_HAVE_AVX
+
+
#ifdef LV_HAVE_GENERIC
static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float*
src0, float* center_point_array, float* cutoff, unsigned int num_points) {
@@ -149,4 +222,78 @@ static inline void
volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src
#endif /*LV_HAVE_GENERIC*/
+#ifdef LV_HAVE_AVX
+#include<immintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float*
src0, float* center_point_array, float* cutoff, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for(i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_loadu_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(target_vec, target_vec); // x0+x1 | x2+x3 |
x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] +
temp_results[5];
+
+
+ for(i = eighth_points*8; i < num_points; ++i) {
+ fst = *(src0++);
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+
+ *target += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth);
+ }
+
+ *target += ((float)(num_points)) * center_point_array[4];
+
+}
+#endif // LV_HAVE_AVX
+
+
+
+
#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 6408e1e..e6a56ff 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -61,7 +61,7 @@ VOLK_RUN_TESTS(volk_32f_sqrt_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f, 1e-4, 100, 20462, 1);
VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_x2_subtract_32f, 1e-4, 0, 20462, 1);
-VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-4, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f, 1e-2, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32i_x2_and_32i, 0, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32i_s32f_convert_32f, 1e-4, 100, 20462, 1);
VOLK_RUN_TESTS(volk_32i_x2_or_32i, 0, 0, 20462, 1);
- [Commit-gnuradio] [gnuradio] branch master updated (8211786 -> 300f5fb), git, 2014/01/28
- [Commit-gnuradio] [gnuradio] 05/06: blocks: Add a new rotator_cc block to perform frequency shifting, git, 2014/01/28
- [Commit-gnuradio] [gnuradio] 06/06: Merge remote-tracking branch 'nwest/volk-sum_of_poly', git, 2014/01/28
- [Commit-gnuradio] [gnuradio] 04/06: blocks/rotator: Make the input data 'const', git, 2014/01/28
- [Commit-gnuradio] [gnuradio] 03/06: blocks: Add QA tests for the rotator rotateN function which uses VOLK, git, 2014/01/28
- [Commit-gnuradio] [gnuradio] 01/06: volk: add avx u/a protokernel for 32f_x3sum_of_poly_32f,
git <=
- [Commit-gnuradio] [gnuradio] 02/06: Merge branch 'volk-qa-fixes', git, 2014/01/28