volk: Added atan, asin, acos kernels.

author: Abhishek Bhowmick <abhowmick22@gmail.com> 2014-06-14 20:40:51 +0530
committer: Tom Rondeau <tom@trondeau.com> 2014-10-15 10:49:54 -0400
commit: ebeab0988c17b08bd1568a53a5a8713616024816 (patch)
tree: a2938c6ad1aa0773b621ba5e34522154748a6cc7
parent: 2fdd530e75c345f6a89e2368c1ccced2fc1ef487 (diff)
5 files changed, 546 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 46929b3df3..5030836d43 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -168,6 +168,9 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_cos_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_tan_32f, 1e-6, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_atan_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_asin_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_acos_32f, 1e-3, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204602, 50, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_32f_acos_32f.h b/volk/kernels/volk/volk_32f_acos_32f.h
new file mode 100644
index 0000000000..deba615bc9
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_acos_32f.h
@@ -0,0 +1,186 @@
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+#define TERMS 2
+
+#ifndef INCLUDED_volk_32f_acos_32f_a_H
+#define INCLUDED_volk_32f_acos_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arccosine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored (written with the aligned _mm_store_ps)
+  \param aVector The input vector of floats (read with the aligned _mm_load_ps)
+  \param num_points Number of points for which arccosine is to be computed
+*/
+static inline void volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+	/* Four points per pass using acos(d) = atan(sqrt((1+d)(1-d))/d), plus pi when d < 0; leftovers go to acos(). */
+	float* bPtr = bVector;
+	const float* aPtr = aVector;
+	unsigned int number = 0;
+	unsigned int quarterPoints = num_points / 4;
+	int i, j;
+	__m128 aVal, d, pi, pio2, x, y, z, arccosine;
+	__m128 fzeroes, fones, ftwos, ffours, signbit, condition, coeff[TERMS];
+
+	pi = _mm_set1_ps(3.14159265358979323846);
+	pio2 = _mm_set1_ps(3.14159265358979323846/2);
+	fzeroes = _mm_setzero_ps();
+	fones = _mm_set1_ps(1.0);
+	ftwos = _mm_set1_ps(2.0);
+	ffours = _mm_set1_ps(4.0);
+	signbit = _mm_set1_ps(-0.0f);
+	/* The series coefficients (-1)^j/(2j+1) do not depend on the data: build them once, not once per vector. */
+	for(j = 0; j < TERMS; j++)	coeff[j] = _mm_set1_ps(pow(-1,j)/(2*j+1));
+
+	for(;number < quarterPoints; number++){
+		aVal = _mm_load_ps(aPtr);
+		d = aVal;
+		aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
+		/* z = |aVal| by clearing the sign bit; negating with z - 2z gave NaN for -inf and let -0 (d == -1) through. */
+		z = _mm_andnot_ps(signbit, aVal);
+		x = z;
+		condition = _mm_cmplt_ps(z, fones);
+		x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x ~ max(z, 1/z) */
+
+		/* Two half-angle steps, cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the series below sees 1/x = tan(t/4). */
+		for(i = 0; i < 2; i++)	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+		x = _mm_div_ps(fones, x);
+		y = fzeroes;
+		for(j = TERMS - 1; j >=0 ; j--)	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), coeff[j]);
+		y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* series value times 4 undoes the two halvings */
+		condition = _mm_cmpgt_ps(z, fones);
+		y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* z > 1: pi/2 - y */
+		arccosine = y;
+		/* Negative inputs map to pi - y. Keyed on d only: for d == -1 the quotient is -0, which is not < 0. */
+		condition = _mm_cmplt_ps(d, fzeroes);
+		arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+		arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+		_mm_store_ps(bPtr, arccosine);
+		aPtr += 4;
+		bPtr += 4;
+	}
+
+	/* Fewer than four points remain: finish them with the scalar library routine. */
+	number = quarterPoints * 4;
+	for(;number < num_points; number++){
+	   *bPtr++ = acos(*aPtr++);
+	}
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar arccosine: each output element is acos() of the matching input element
+  \param bVector Destination buffer; num_points floats are written
+  \param aVector Source buffer; num_points floats are read
+  \param num_points Number of elements to process (zero leaves bVector untouched)
+*/
+static inline void volk_32f_acos_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){
+    const float* src = aVector;
+    float* dst = bVector;
+    unsigned int remaining;
+
+    /* acos() works in double: the float input is promoted and the result narrowed on store. */
+    for(remaining = num_points; remaining != 0; remaining--){
+      *dst++ = acos(*src++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_acos_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_acos_32f_u_H
+#define INCLUDED_volk_32f_acos_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arccosine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored (written with the unaligned _mm_storeu_ps)
+  \param aVector The input vector of floats (read with the unaligned _mm_loadu_ps)
+  \param num_points Number of points for which arccosine is to be computed
+*/
+static inline void volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+	/* Four points per pass using acos(d) = atan(sqrt((1+d)(1-d))/d), plus pi when d < 0; leftovers go to acos(). */
+	float* bPtr = bVector;
+	const float* aPtr = aVector;
+	unsigned int number = 0;
+	unsigned int quarterPoints = num_points / 4;
+	int i, j;
+	__m128 aVal, d, pi, pio2, x, y, z, arccosine;
+	__m128 fzeroes, fones, ftwos, ffours, signbit, condition, coeff[TERMS];
+
+	pi = _mm_set1_ps(3.14159265358979323846);
+	pio2 = _mm_set1_ps(3.14159265358979323846/2);
+	fzeroes = _mm_setzero_ps();
+	fones = _mm_set1_ps(1.0);
+	ftwos = _mm_set1_ps(2.0);
+	ffours = _mm_set1_ps(4.0);
+	signbit = _mm_set1_ps(-0.0f);
+	/* The series coefficients (-1)^j/(2j+1) do not depend on the data: build them once, not once per vector. */
+	for(j = 0; j < TERMS; j++)	coeff[j] = _mm_set1_ps(pow(-1,j)/(2*j+1));
+
+	for(;number < quarterPoints; number++){
+		aVal = _mm_loadu_ps(aPtr);
+		d = aVal;
+		aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
+		/* z = |aVal| by clearing the sign bit; negating with z - 2z gave NaN for -inf and let -0 (d == -1) through. */
+		z = _mm_andnot_ps(signbit, aVal);
+		x = z;
+		condition = _mm_cmplt_ps(z, fones);
+		x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x ~ max(z, 1/z) */
+
+		/* Two half-angle steps, cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the series below sees 1/x = tan(t/4). */
+		for(i = 0; i < 2; i++)	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+		x = _mm_div_ps(fones, x);
+		y = fzeroes;
+		for(j = TERMS - 1; j >=0 ; j--)	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), coeff[j]);
+		y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* series value times 4 undoes the two halvings */
+		condition = _mm_cmpgt_ps(z, fones);
+		y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* z > 1: pi/2 - y */
+		arccosine = y;
+		/* Negative inputs map to pi - y. Keyed on d only: for d == -1 the quotient is -0, which is not < 0. */
+		condition = _mm_cmplt_ps(d, fzeroes);
+		arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+		arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+		_mm_storeu_ps(bPtr, arccosine);
+		aPtr += 4;
+		bPtr += 4;
+	}
+
+	/* Fewer than four points remain: finish them with the scalar library routine. */
+	number = quarterPoints * 4;
+	for(;number < num_points; number++){
+	   *bPtr++ = acos(*aPtr++);
+	}
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar arccosine: each output element is acos() of the matching input element
+  \param bVector Destination buffer; num_points floats are written
+  \param aVector Source buffer; num_points floats are read
+  \param num_points Number of elements to process (zero leaves bVector untouched)
+*/
+static inline void volk_32f_acos_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+    const float* src = aVector;
+    float* dst = bVector;
+    unsigned int remaining;
+
+    /* acos() works in double: the float input is promoted and the result narrowed on store. */
+    for(remaining = num_points; remaining != 0; remaining--){
+      *dst++ = acos(*src++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_acos_32f_u_H */
diff --git a/volk/kernels/volk/volk_32f_asin_32f.h b/volk/kernels/volk/volk_32f_asin_32f.h
new file mode 100644
index 0000000000..976aabc258
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_asin_32f.h
@@ -0,0 +1,178 @@
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+#define TERMS 2
+
+#ifndef INCLUDED_volk_32f_asin_32f_a_H
+#define INCLUDED_volk_32f_asin_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arcsine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored (written with the aligned _mm_store_ps)
+  \param aVector The input vector of floats (read with the aligned _mm_load_ps)
+  \param num_points Number of points for which arcsine is to be computed
+*/
+static inline void volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+	/* Four points per pass using asin(a) = atan(a / sqrt((1+a)(1-a))); leftover points go to libm asin(). */
+	float* bPtr = bVector;
+	const float* aPtr = aVector;
+	unsigned int number = 0;
+	unsigned int quarterPoints = num_points / 4;
+	int i, j;
+	__m128 aVal, pio2, x, y, z, arcsine;
+	__m128 fzeroes, fones, ftwos, ffours, signbit, condition, coeff[TERMS];
+
+	pio2 = _mm_set1_ps(3.14159265358979323846/2);
+	fzeroes = _mm_setzero_ps();
+	fones = _mm_set1_ps(1.0);
+	ftwos = _mm_set1_ps(2.0);
+	ffours = _mm_set1_ps(4.0);
+	signbit = _mm_set1_ps(-0.0f);
+	/* The series coefficients (-1)^j/(2j+1) do not depend on the data: build them once, not once per vector. */
+	for(j = 0; j < TERMS; j++)	coeff[j] = _mm_set1_ps(pow(-1,j)/(2*j+1));
+
+	for(;number < quarterPoints; number++){
+		aVal = _mm_load_ps(aPtr);
+		aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+		/* z = |aVal| by clearing the sign bit; negating with z - 2z turned -inf (input -1) into inf - inf = NaN. */
+		z = _mm_andnot_ps(signbit, aVal);
+		x = z;
+		condition = _mm_cmplt_ps(z, fones);
+		x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x ~ max(z, 1/z) */
+		/* Two half-angle steps, cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the series below sees 1/x = tan(t/4). */
+		for(i = 0; i < 2; i++)	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+		x = _mm_div_ps(fones, x);
+		y = fzeroes;
+		for(j = TERMS - 1; j >=0 ; j--)	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), coeff[j]);
+		y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* series value times 4 undoes the two halvings */
+		condition = _mm_cmpgt_ps(z, fones);
+		y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* z > 1: pi/2 - y */
+		arcsine = y;
+		/* y was computed for |aVal|; flip it where the quotient is negative (y - 2y = -y). */
+		condition = _mm_cmplt_ps(aVal, fzeroes);
+		arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+		_mm_store_ps(bPtr, arcsine);
+		aPtr += 4;
+		bPtr += 4;
+	}
+
+	/* Fewer than four points remain: finish them with the scalar library routine. */
+	number = quarterPoints * 4;
+	for(;number < num_points; number++){
+	   *bPtr++ = asin(*aPtr++);
+	}
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar arcsine: each output element is asin() of the matching input element
+  \param bVector Destination buffer; num_points floats are written
+  \param aVector Source buffer; num_points floats are read
+  \param num_points Number of elements to process (zero leaves bVector untouched)
+*/
+static inline void volk_32f_asin_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){
+    const float* src = aVector;
+    float* dst = bVector;
+    unsigned int remaining;
+
+    /* asin() works in double: the float input is promoted and the result narrowed on store. */
+    for(remaining = num_points; remaining != 0; remaining--){
+      *dst++ = asin(*src++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_asin_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_asin_32f_u_H
+#define INCLUDED_volk_32f_asin_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arcsine of input vector and stores results in output vector
+  \param bVector The vector where results will be stored (written with the unaligned _mm_storeu_ps)
+  \param aVector The input vector of floats (read with the unaligned _mm_loadu_ps)
+  \param num_points Number of points for which arcsine is to be computed
+*/
+static inline void volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+	/* Four points per pass using asin(a) = atan(a / sqrt((1+a)(1-a))); leftover points go to libm asin(). */
+	float* bPtr = bVector;
+	const float* aPtr = aVector;
+	unsigned int number = 0;
+	unsigned int quarterPoints = num_points / 4;
+	int i, j;
+	__m128 aVal, pio2, x, y, z, arcsine;
+	__m128 fzeroes, fones, ftwos, ffours, signbit, condition, coeff[TERMS];
+
+	pio2 = _mm_set1_ps(3.14159265358979323846/2);
+	fzeroes = _mm_setzero_ps();
+	fones = _mm_set1_ps(1.0);
+	ftwos = _mm_set1_ps(2.0);
+	ffours = _mm_set1_ps(4.0);
+	signbit = _mm_set1_ps(-0.0f);
+	/* The series coefficients (-1)^j/(2j+1) do not depend on the data: build them once, not once per vector. */
+	for(j = 0; j < TERMS; j++)	coeff[j] = _mm_set1_ps(pow(-1,j)/(2*j+1));
+
+	for(;number < quarterPoints; number++){
+		aVal = _mm_loadu_ps(aPtr);
+		aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+		/* z = |aVal| by clearing the sign bit; negating with z - 2z turned -inf (input -1) into inf - inf = NaN. */
+		z = _mm_andnot_ps(signbit, aVal);
+		x = z;
+		condition = _mm_cmplt_ps(z, fones);
+		x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x ~ max(z, 1/z) */
+		/* Two half-angle steps, cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the series below sees 1/x = tan(t/4). */
+		for(i = 0; i < 2; i++)	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+		x = _mm_div_ps(fones, x);
+		y = fzeroes;
+		for(j = TERMS - 1; j >=0 ; j--)	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), coeff[j]);
+		y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* series value times 4 undoes the two halvings */
+		condition = _mm_cmpgt_ps(z, fones);
+		y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* z > 1: pi/2 - y */
+		arcsine = y;
+		/* y was computed for |aVal|; flip it where the quotient is negative (y - 2y = -y). */
+		condition = _mm_cmplt_ps(aVal, fzeroes);
+		arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+		_mm_storeu_ps(bPtr, arcsine);
+		aPtr += 4;
+		bPtr += 4;
+	}
+
+	/* Fewer than four points remain: finish them with the scalar library routine. */
+	number = quarterPoints * 4;
+	for(;number < num_points; number++){
+	   *bPtr++ = asin(*aPtr++);
+	}
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar arcsine: each output element is asin() of the matching input element
+  \param bVector Destination buffer; num_points floats are written
+  \param aVector Source buffer; num_points floats are read
+  \param num_points Number of elements to process (zero leaves bVector untouched)
+*/
+static inline void volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+    const float* src = aVector;
+    float* dst = bVector;
+    unsigned int remaining;
+
+    /* asin() works in double: the float input is promoted and the result narrowed on store. */
+    for(remaining = num_points; remaining != 0; remaining--){
+      *dst++ = asin(*src++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_asin_32f_u_H */
diff --git a/volk/kernels/volk/volk_32f_atan_32f.h b/volk/kernels/volk/volk_32f_atan_32f.h
new file mode 100644
index 0000000000..a60e2b8c5f
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_atan_32f.h
@@ -0,0 +1,176 @@
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
+#define TERMS 2
+
+#ifndef INCLUDED_volk_32f_atan_32f_a_H
+#define INCLUDED_volk_32f_atan_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arctangent of input vector and stores results in output vector
+  \param bVector The vector where results will be stored (written with the aligned _mm_store_ps)
+  \param aVector The input vector of floats (read with the aligned _mm_load_ps)
+  \param num_points Number of points for which arctangent is to be computed
+*/
+static inline void volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+	/* Four points per pass: range-reduce |a|, evaluate a TERMS-term series, restore the sign; leftovers go to atan(). */
+	float* bPtr = bVector;
+	const float* aPtr = aVector;
+	unsigned int number = 0;
+	unsigned int quarterPoints = num_points / 4;
+	int i, j;
+	__m128 aVal, pio2, x, y, z, arctangent;
+	__m128 fzeroes, fones, ftwos, ffours, signbit, condition, coeff[TERMS];
+
+	pio2 = _mm_set1_ps(3.14159265358979323846/2);
+	fzeroes = _mm_setzero_ps();
+	fones = _mm_set1_ps(1.0);
+	ftwos = _mm_set1_ps(2.0);
+	ffours = _mm_set1_ps(4.0);
+	signbit = _mm_set1_ps(-0.0f);
+	/* The series coefficients (-1)^j/(2j+1) do not depend on the data: build them once, not once per vector. */
+	for(j = 0; j < TERMS; j++)	coeff[j] = _mm_set1_ps(pow(-1,j)/(2*j+1));
+
+	for(;number < quarterPoints; number++){
+		aVal = _mm_load_ps(aPtr);
+		/* z = |aVal| by clearing the sign bit; negating with z - 2z turned a -inf input into inf - inf = NaN. */
+		z = _mm_andnot_ps(signbit, aVal);
+		x = z;
+		condition = _mm_cmplt_ps(z, fones);
+		x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x ~ max(z, 1/z) */
+		/* Two half-angle steps, cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the series below sees 1/x = tan(t/4). */
+		for(i = 0; i < 2; i++)	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+		x = _mm_div_ps(fones, x);
+		y = fzeroes;
+		for(j = TERMS - 1; j >=0 ; j--)	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), coeff[j]);
+		y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* series value times 4 undoes the two halvings */
+		condition = _mm_cmpgt_ps(z, fones);
+		y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* z > 1: pi/2 - y */
+		arctangent = y;
+		/* y was computed for |aVal|; flip it where the input is negative (y - 2y = -y). */
+		condition = _mm_cmplt_ps(aVal, fzeroes);
+		arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+		_mm_store_ps(bPtr, arctangent);
+		aPtr += 4;
+		bPtr += 4;
+	}
+
+	/* Fewer than four points remain: finish them with the scalar library routine. */
+	number = quarterPoints * 4;
+	for(;number < num_points; number++){
+	   *bPtr++ = atan(*aPtr++);
+	}
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar arctangent: each output element is atan() of the matching input element
+  \param bVector Destination buffer; num_points floats are written
+  \param aVector Source buffer; num_points floats are read
+  \param num_points Number of elements to process (zero leaves bVector untouched)
+*/
+static inline void volk_32f_atan_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points){
+    const float* src = aVector;
+    float* dst = bVector;
+    unsigned int remaining;
+
+    /* atan() works in double: the float input is promoted and the result narrowed on store. */
+    for(remaining = num_points; remaining != 0; remaining--){
+      *dst++ = atan(*src++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_atan_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_atan_32f_u_H
+#define INCLUDED_volk_32f_atan_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes arctangent of input vector and stores results in output vector
+  \param bVector The vector where results will be stored (written with the unaligned _mm_storeu_ps)
+  \param aVector The input vector of floats (read with the unaligned _mm_loadu_ps)
+  \param num_points Number of points for which arctangent is to be computed
+*/
+static inline void volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points){
+	/* Four points per pass: range-reduce |a|, evaluate a TERMS-term series, restore the sign; leftovers go to atan(). */
+	float* bPtr = bVector;
+	const float* aPtr = aVector;
+	unsigned int number = 0;
+	unsigned int quarterPoints = num_points / 4;
+	int i, j;
+	__m128 aVal, pio2, x, y, z, arctangent;
+	__m128 fzeroes, fones, ftwos, ffours, signbit, condition, coeff[TERMS];
+
+	pio2 = _mm_set1_ps(3.14159265358979323846/2);
+	fzeroes = _mm_setzero_ps();
+	fones = _mm_set1_ps(1.0);
+	ftwos = _mm_set1_ps(2.0);
+	ffours = _mm_set1_ps(4.0);
+	signbit = _mm_set1_ps(-0.0f);
+	/* The series coefficients (-1)^j/(2j+1) do not depend on the data: build them once, not once per vector. */
+	for(j = 0; j < TERMS; j++)	coeff[j] = _mm_set1_ps(pow(-1,j)/(2*j+1));
+
+	for(;number < quarterPoints; number++){
+		aVal = _mm_loadu_ps(aPtr);
+		/* z = |aVal| by clearing the sign bit; negating with z - 2z turned a -inf input into inf - inf = NaN. */
+		z = _mm_andnot_ps(signbit, aVal);
+		x = z;
+		condition = _mm_cmplt_ps(z, fones);
+		x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition)); /* x ~ max(z, 1/z) */
+		/* Two half-angle steps, cot(t/2) = cot(t) + sqrt(1 + cot(t)^2), so the series below sees 1/x = tan(t/4). */
+		for(i = 0; i < 2; i++)	x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+		x = _mm_div_ps(fones, x);
+		y = fzeroes;
+		for(j = TERMS - 1; j >= 0; j--)	y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), coeff[j]);
+		y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* series value times 4 undoes the two halvings */
+		condition = _mm_cmpgt_ps(z, fones);
+		y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition)); /* z > 1: pi/2 - y */
+		arctangent = y;
+		/* y was computed for |aVal|; flip it where the input is negative (y - 2y = -y). */
+		condition = _mm_cmplt_ps(aVal, fzeroes);
+		arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+		_mm_storeu_ps(bPtr, arctangent);
+		aPtr += 4;
+		bPtr += 4;
+	}
+
+	/* Fewer than four points remain: finish them with the scalar library routine. */
+	number = quarterPoints * 4;
+	for(;number < num_points; number++){
+	   *bPtr++ = atan(*aPtr++);
+	}
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar arctangent: each output element is atan() of the matching input element
+  \param bVector Destination buffer; num_points floats are written
+  \param aVector Source buffer; num_points floats are read
+  \param num_points Number of elements to process (zero leaves bVector untouched)
+*/
+static inline void volk_32f_atan_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points){
+    const float* src = aVector;
+    float* dst = bVector;
+    unsigned int remaining;
+
+    /* atan() works in double: the float input is promoted and the result narrowed on store. */
+    for(remaining = num_points; remaining != 0; remaining--){
+      *dst++ = atan(*src++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_atan_32f_u_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 971c82fbb5..7d1826b719 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -50,6 +50,9 @@ VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_sin_32f, 1e-6, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_cos_32f, 1e-6, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_tan_32f, 1e-6, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_atan_32f, 1e-3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_asin_32f, 1e-3, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_acos_32f, 1e-3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20462, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20462, 1);
author	Abhishek Bhowmick <abhowmick22@gmail.com>	2014-06-14 20:40:51 +0530
committer	Tom Rondeau <tom@trondeau.com>	2014-10-15 10:49:54 -0400
commit	ebeab0988c17b08bd1568a53a5a8713616024816 (patch)
tree	a2938c6ad1aa0773b621ba5e34522154748a6cc7
parent	2fdd530e75c345f6a89e2368c1ccced2fc1ef487 (diff)