[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 03/04: volk: added a binary slicer that outputs int8_t data.
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 03/04: volk: added a binary slicer that outputs int8_t data. |
Date: |
Wed, 30 Jul 2014 17:48:02 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit bf914b6ca9b64d12c510c92a3cc6f4762639c0f8
Author: Tom Rondeau <address@hidden>
Date: Tue Jul 29 15:23:12 2014 -0400
volk: added a binary slicer that outputs int8_t data.
Only SSE2 simd version implemented.
---
volk/apps/volk_profile.cc | 1 +
volk/kernels/volk/volk_32f_binary_slicer_32i.h | 10 +-
volk/kernels/volk/volk_32f_binary_slicer_8i.h | 187 +++++++++++++++++++++++++
volk/lib/testqa.cc | 1 +
4 files changed, 194 insertions(+), 5 deletions(-)
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index f816c4e..0b81c9b 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -177,6 +177,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000,
&results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204602, 10000,
&results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_binary_slicer_32i, 0, 1.0, 204602, 10000, &results,
benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_32f_binary_slicer_8i, 0, 1.0, 204602, 10000, &results,
benchmark_mode, kernel_regex);
// Until we can update the config on a kernel by kernel basis
// do not overwrite volk_config when using a regex.
diff --git a/volk/kernels/volk/volk_32f_binary_slicer_32i.h
b/volk/kernels/volk/volk_32f_binary_slicer_32i.h
index 911df85..f47d20f 100644
--- a/volk/kernels/volk/volk_32f_binary_slicer_32i.h
+++ b/volk/kernels/volk/volk_32f_binary_slicer_32i.h
@@ -1,5 +1,5 @@
-#ifndef INCLUDED_volk_32f_binary_slicer_32f_H
-#define INCLUDED_volk_32f_binary_slicer_32f_H
+#ifndef INCLUDED_volk_32f_binary_slicer_32i_H
+#define INCLUDED_volk_32f_binary_slicer_32i_H
#ifdef LV_HAVE_GENERIC
@@ -136,7 +136,7 @@ static inline void volk_32f_binary_slicer_32i_a_avx(int*
cVector, const float* a
}
}
}
-#endif /* LV_HAVE_SSE2 */
+#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
@@ -230,8 +230,8 @@ static inline void volk_32f_binary_slicer_32i_u_avx(int*
cVector, const float* a
}
}
}
-#endif /* LV_HAVE_SSE2 */
+#endif /* LV_HAVE_AVX */
-#endif /* INCLUDED_volk_32f_binary_slicer_32f_H */
+#endif /* INCLUDED_volk_32f_binary_slicer_32i_H */
diff --git a/volk/kernels/volk/volk_32f_binary_slicer_8i.h
b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
new file mode 100644
index 0000000..e24960c
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
@@ -0,0 +1,187 @@
+#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
+#define INCLUDED_volk_32f_binary_slicer_8i_H
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Returns integer 1 if float input is greater than or equal to 0, 0 otherwise
+ \param cVector The char (int8_t) output (either 0 or 1)
+ \param aVector The float input
+ \param num_points The number of values in aVector and stored into cVector
+*/
+static inline void
+volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++) {
+ if(*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Returns integer 1 if float input is greater than or equal to 0, 0 otherwise; same result as the generic version, but stores the value of the >= comparison directly instead of using if/else
+ \param cVector The char (int8_t) output (either 0 or 1)
+ \param aVector The float input
+ \param num_points The number of values in aVector and stored into cVector
+*/
+static inline void
+volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float*
aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = (*aPtr++ >= 0);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Returns integer 1 if float input is greater than or equal to 0, 0 otherwise; SSE2 version using the aligned load/store intrinsics, 16 points per loop iteration with a scalar loop for the remainder
+ \param cVector The char (int8_t) output (either 0 or 1); written with _mm_store_si128, so presumably must be 16-byte aligned (confirm with callers)
+ \param aVector The float input; read with _mm_load_ps, so presumably must be 16-byte aligned (confirm with callers)
+ \param num_points The number of values in aVector and stored into cVector
+*/
+static inline void
+volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for(number = 0; number < n16points; number++) {
+ a0_val = _mm_load_ps(aPtr);
+ a1_val = _mm_load_ps(aPtr+4);
+ a2_val = _mm_load_ps(aPtr+8);
+ a3_val = _mm_load_ps(aPtr+12);
+
+ // compare each lane >= 0; the result is a per-lane mask held in a float vector
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and logical >> 31 so each lane ends up 0 or 1; NOTE(review): relies on cvtps mapping the all-ones (NaN) mask to 0x80000000 -- confirm against the intrinsic docs
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack the four 32-bit vectors into two vectors of 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack the two 16-bit vectors into one vector of sixteen 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_store_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
+ }
+
+ for(number = n16points * 16; number < num_points; number++) {
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Returns integer 1 if float input is greater than or equal to 0, 0 otherwise; SSE2 version using the unaligned load/store intrinsics, 16 points per loop iteration with a scalar loop for the remainder
+ \param cVector The char (int8_t) output (either 0 or 1); written with _mm_storeu_si128
+ \param aVector The float input; read with _mm_loadu_ps
+ \param num_points The number of values in aVector and stored into cVector
+*/
+static inline void
+volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps (0.0f);
+
+ for(number = 0; number < n16points; number++) {
+ a0_val = _mm_loadu_ps(aPtr);
+ a1_val = _mm_loadu_ps(aPtr+4);
+ a2_val = _mm_loadu_ps(aPtr+8);
+ a3_val = _mm_loadu_ps(aPtr+12);
+
+ // compare each lane >= 0; the result is a per-lane mask held in a float vector
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and logical >> 31 so each lane ends up 0 or 1; NOTE(review): relies on cvtps mapping the all-ones (NaN) mask to 0x80000000 -- confirm against the intrinsic docs
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack the four 32-bit vectors into two vectors of 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack the two 16-bit vectors into one vector of sixteen 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_storeu_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
+ }
+
+ for(number = n16points * 16; number < num_points; number++) {
+ if( *aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index fc54b35..bc97ad1 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -113,3 +113,4 @@ VOLK_RUN_TESTS(volk_32fc_s32fc_rotatorpuppet_32fc, 1e-3,
(lv_32fc_t)lv_cmake(0.9
VOLK_RUN_TESTS(volk_8u_conv_k7_r2puppet_8u, 0, 0, 2060, 1);
VOLK_RUN_TESTS(volk_32f_invsqrt_32f, 1e-2, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_binary_slicer_32i, 0, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_binary_slicer_8i, 0, 0, 20462, 1);