[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 07/12: volk: add 32fc_32f_dot_prod_32fc u/a_avx protokernel
From: git
Subject: [Commit-gnuradio] [gnuradio] 07/12: volk: add 32fc_32f_dot_prod_32fc u/a_avx protokernel
Date: Thu, 16 Jan 2014 20:33:25 +0000 (UTC)
This is an automated email from the git hooks/post-receive script.
jcorgan pushed a commit to branch master
in repository gnuradio.
commit 5b3aa9cbc486bc70dd1014bcb8626d5b8fe25f9b
Author: Nathan West <address@hidden>
Date: Tue Nov 26 14:43:04 2013 -0600
volk: add 32fc_32f_dot_prod_32fc u/a_avx protokernel
---
volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 179 +++++++++++++++++++++++-
1 file changed, 178 insertions(+), 1 deletion(-)
diff --git a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index e0a8a59..8341129 100644
--- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -30,6 +30,95 @@ static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result, const
#endif /*LV_HAVE_GENERIC*/
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ a2Val = _mm256_load_ps(aPtr+16);
+ a3Val = _mm256_load_ps(aPtr+24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr+8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+
+
#ifdef LV_HAVE_SSE
@@ -108,4 +197,92 @@ static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const
#endif /*LV_HAVE_SSE*/
-#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ a2Val = _mm256_loadu_ps(aPtr+16);
+ a3Val = _mm256_loadu_ps(aPtr+24);
+
+ x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_loadu_ps(bPtr+8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+#endif /*LV_HAVE_AVX*/
+
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
- [Commit-gnuradio] [gnuradio] branch master updated (f7f28bb -> 67aa043), git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 09/12: Merge remote-tracking branch 'softerhardware/master', git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 06/12: volk: add 32f_invsqrt_32f avx proto kernel, git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 04/12: digital: Allow for different lengths of pilot carrier- and -symbol allocations, git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 12/12: Merge remote-tracking branch 'martin/ofdm-fixes', git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 01/12: Support for >1 jack port, various sizes of jack buffers, git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 03/12: digital: allow to change packet length tag name in GRC, git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 08/12: volk: add 32fc_32f_multiply_32fc_a_avx protokernel, git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 07/12: volk: add 32fc_32f_dot_prod_32fc u/a_avx protokernel, git <=
- [Commit-gnuradio] [gnuradio] 05/12: modtool: linkage info for OSX, git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 11/12: Merge remote-tracking branch 'martin/modtool_osx', git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 02/12: digital: propagate tags to packet_header, git, 2014/01/16
- [Commit-gnuradio] [gnuradio] 10/12: Merge remote-tracking branch 'nwest/volk_avx', git, 2014/01/16