[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 07/08: volk: adds unaligned sse 32fc_32f_do
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 07/08: volk: adds unaligned sse 32fc_32f_dot_prod_32fc, 16i_32fc_dot_prod_32fc, and 32fc_x2_dot_prod_16i proto-kernels. |
Date: |
Fri, 7 Mar 2014 00:24:07 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit 0b5950d3297212264777b6502a232d9e8458636e
Author: Tom Rondeau <address@hidden>
Date: Thu Mar 6 13:30:23 2014 -0500
volk: adds unaligned sse 32fc_32f_dot_prod_32fc, 16i_32fc_dot_prod_32fc,
and 32fc_x2_dot_prod_16i proto-kernels.
---
volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h | 90 ++++++++++++++++++++++++-
volk/kernels/volk/volk_32f_x2_dot_prod_16i.h | 75 +++++++++++++++++++--
volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h | 76 +++++++++++++++++++++
3 files changed, 234 insertions(+), 7 deletions(-)
diff --git a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
index 8bc1569..8c66892 100644
--- a/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_16i_32fc_dot_prod_32fc.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
-#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
#include <volk/volk_common.h>
#include<stdio.h>
@@ -37,6 +37,90 @@ static inline void
volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const
#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+/*!
+ * \brief Dot product of 16-bit integer samples with complex float taps
+ *        (SSE + MMX, unaligned tap loads).
+ *
+ * Every input sample is converted to float and multiplied into both floats
+ * of the matching tap; the products are accumulated into one complex sum.
+ *
+ * \param result      receives the accumulated complex value
+ * \param input       num_points 16-bit samples
+ * \param taps        num_points complex taps, accessed here as consecutive
+ *                    float pairs (first float -> real sum, second -> imag sum)
+ * \param num_points  number of samples / taps to process
+ */
+static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  /* The vector loop consumes 8 samples (and 16 tap floats) per iteration. */
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (const float*)taps;
+
+  __m64 m0, m1;
+  __m128 f0, f1;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+
+    /* _mm_set_pi16 takes its arguments highest element first. */
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m1);
+
+    /* Interleave each vector with itself so every sample appears twice
+       (s0,s0,s1,s1,...), matching the float-pair layout of the taps. */
+    a0Val = _mm_unpacklo_ps(f0, f0);
+    a1Val = _mm_unpackhi_ps(f0, f0);
+    a2Val = _mm_unpacklo_ps(f1, f1);
+    a3Val = _mm_unpackhi_ps(f1, f1);
+
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+
+  /* The MMX registers alias the x87 floating-point stack: clear the MMX
+     state before the scalar float arithmetic below, which may be compiled
+     to x87 instructions on some targets. */
+  _mm_empty();
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  /* Store the results back into the dot product vector */
+  _mm_store_ps(dotProductVector,dotProdVal0);
+
+  /* Lanes 0/2 hold real partial sums, lanes 1/3 hold imaginary ones. */
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  /* Scalar tail for the num_points % 8 samples the vector loop skipped. */
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr) * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+
+
#if LV_HAVE_SSE && LV_HAVE_MMX
@@ -119,4 +203,4 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_sse(
lv_32fc_t* result, const
#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
-#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
index 8fcc7de..b637f17 100644
--- a/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_16i.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
-#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_H
#include <volk/volk_common.h>
#include<stdio.h>
@@ -27,7 +27,6 @@ static inline void volk_32f_x2_dot_prod_16i_generic(int16_t*
result, const float
#ifdef LV_HAVE_SSE
-
static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const
float* input, const float* taps, unsigned int num_points) {
unsigned int number = 0;
@@ -90,9 +89,77 @@ static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t*
result, const float*
}
*result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_SSE
+
+/*!
+ * \brief Dot product of two float vectors, converted to a 16-bit result
+ *        (SSE, unaligned loads).
+ *
+ * \param result      receives the float sum cast to short
+ * \param input       num_points floats
+ * \param taps        num_points floats
+ * \param num_points  number of element pairs to multiply and accumulate
+ */
+static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+  /* The vector loop handles 16 floats from each array per iteration. */
+  const unsigned int blocks = num_points / 16;
+  const float* in = input;
+  const float* tp = taps;
+  unsigned int i;
+
+  /* Four independent accumulators, one per group of 4 floats. */
+  __m128 acc0 = _mm_setzero_ps();
+  __m128 acc1 = _mm_setzero_ps();
+  __m128 acc2 = _mm_setzero_ps();
+  __m128 acc3 = _mm_setzero_ps();
+
+  for(i = 0; i < blocks; i++){
+    acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in),      _mm_loadu_ps(tp)),      acc0);
+    acc1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 4),  _mm_loadu_ps(tp + 4)),  acc1);
+    acc2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 8),  _mm_loadu_ps(tp + 8)),  acc2);
+    acc3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 12), _mm_loadu_ps(tp + 12)), acc3);
+
+    in += 16;
+    tp += 16;
+  }
+
+  /* Fold the accumulators together: ((acc0 + acc1) + acc2) + acc3. */
+  acc0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(acc0, acc1), acc2), acc3);
+
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+  _mm_store_ps(lanes, acc0);
+
+  /* Horizontal sum of the four lanes, lowest lane first. */
+  float sum = lanes[0];
+  for(i = 1; i < 4; i++){
+    sum += lanes[i];
+  }
+
+  /* Scalar tail for the num_points % 16 elements left over. */
+  for(i = blocks * 16; i < num_points; i++){
+    sum += (input[i] * taps[i]);
+  }
+
+  *result = (short)sum;
}
#endif /*LV_HAVE_SSE*/
-#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
diff --git a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index 8341129..f567ede 100644
--- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -284,5 +284,81 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx(
lv_32fc_t* result, const l
#endif /*LV_HAVE_AVX*/
+#ifdef LV_HAVE_SSE
+
+/*!
+ * \brief Dot product of complex float samples with real float taps
+ *        (SSE, unaligned loads).
+ *
+ * \param result      receives the accumulated complex value
+ * \param input       num_points complex samples, accessed here as
+ *                    consecutive float pairs (first -> real sum, second -> imag sum)
+ * \param taps        num_points real taps
+ * \param num_points  number of samples / taps to process
+ */
+static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+
+  /* The vector loop handles 8 taps (16 input floats) per iteration. */
+  const unsigned int blocks = num_points / 8;
+  const float* in = (float*)input;
+  const float* tp = taps;
+  float acc[2];
+  unsigned int i;
+
+  __m128 tapLo, tapHi;
+  __m128 sum0 = _mm_setzero_ps();
+  __m128 sum1 = _mm_setzero_ps();
+  __m128 sum2 = _mm_setzero_ps();
+  __m128 sum3 = _mm_setzero_ps();
+
+  for(i = 0; i < blocks; i++){
+    tapLo = _mm_loadu_ps(tp);
+    tapHi = _mm_loadu_ps(tp + 4);
+
+    /* Unpacking a vector with itself duplicates each tap (t0,t0,t1,t1,...)
+       so one tap multiplies both floats of the matching input pair. */
+    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in),      _mm_unpacklo_ps(tapLo, tapLo)), sum0);
+    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 4),  _mm_unpackhi_ps(tapLo, tapLo)), sum1);
+    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 8),  _mm_unpacklo_ps(tapHi, tapHi)), sum2);
+    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(in + 12), _mm_unpackhi_ps(tapHi, tapHi)), sum3);
+
+    in += 16;
+    tp += 8;
+  }
+
+  /* Fold the accumulators together: ((sum0 + sum1) + sum2) + sum3. */
+  sum0 = _mm_add_ps(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), sum3);
+
+  __VOLK_ATTR_ALIGNED(16) float lanes[4];
+  _mm_store_ps(lanes, sum0);
+
+  /* Lanes 0/2 hold real partial sums, lanes 1/3 hold imaginary ones. */
+  acc[0] = lanes[0];
+  acc[1] = lanes[1];
+  acc[0] += lanes[2];
+  acc[1] += lanes[3];
+
+  /* Scalar tail for the num_points % 8 points left over. */
+  for(i = blocks * 8; i < num_points; i++){
+    acc[0] += (in[0] * (*tp));
+    acc[1] += (in[1] * (*tp));
+    in += 2;
+    tp++;
+  }
+
+  *result = *(lv_32fc_t*)(&acc[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
- [Commit-gnuradio] [gnuradio] branch master updated (78551a5 -> ca69ec5), git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 04/08: Merge branch 'maint', git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 03/08: volk: add implicit true to profile's benchmark mode, git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 02/08: volk: don't write to volk_config if using regex, git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 05/08: volk: add regex option to profile, git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 06/08: Merge remote-tracking branch 'nwest/volk_features', git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 01/08: volk: add/remove ORC protokernels in volk_modtool, git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 08/08: Merge branch 'maint', git, 2014/03/06
- [Commit-gnuradio] [gnuradio] 07/08: volk: adds unaligned sse 32fc_32f_dot_prod_32fc, 16i_32fc_dot_prod_32fc, and 32fc_x2_dot_prod_16i proto-kernels.,
git <=