diff options
author | Abhishek Bhowmick <abhowmick22@gmail.com> | 2014-06-05 11:16:41 +0530 |
---|---|---|
committer | Tom Rondeau <tom@trondeau.com> | 2014-10-15 10:29:11 -0400 |
commit | d0c0e856c0618d87ff88ca53fd63584679e2fb56 (patch) | |
tree | 93af7ee1be40df55c7625c6cb6c3e86dff20cd26 | |
parent | 655b2ca7d0ce014ecf5a2f4c8184c09c0fc5a946 (diff) |
volk: temp log kernels.
-rw-r--r-- | volk/kernels/volk/volk_32f_log2_32f_mine.cc | 169 | ||||
-rw-r--r-- | volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc | 115 |
2 files changed, 284 insertions, 0 deletions
diff --git a/volk/kernels/volk/volk_32f_log2_32f_mine.cc b/volk/kernels/volk/volk_32f_log2_32f_mine.cc new file mode 100644 index 0000000000..94d3daafa3 --- /dev/null +++ b/volk/kernels/volk/volk_32f_log2_32f_mine.cc @@ -0,0 +1,169 @@ + +#include <stdio.h> +#include <math.h> +#include <inttypes.h> +#include <iostream> +#include <immintrin.h> + +void print128_num(__m128i var) +{ + uint32_t *val = (uint32_t*) &var;//can also use uint32_t instead of 16_t + printf("Vector of Ints: %x %x %d %d \n", + val[3], val[2], val[1], val[0]); +} + +void print128f_num(__m128 var) +{ + float *val = (float*) &var;//can also use uint32_t instead of 16_t + printf("Vector of Floats: %f %f %f %f \n", + val[3], val[2], val[1], val[0]); +} + +static inline void volk_32f_log2_32f_a_avx(float* bVector, float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + float* aPtr = aVector; + int exp1 = 0x7f800000; + + __m128 aVal, bVal, charac, mantissa; + __m128i bias = _mm_set1_epi32(127); + __m128i aVali = _mm_castps_si128(aVal); + __m128i exp = _mm_set1_epi32(exp1); + aVal = _mm_load_ps(aPtr); + exp = _mm_and_si128(aVali, exp); + exp = _mm_srli_epi32(exp, 23); + exp = _mm_sub_epi32(exp, bias); + charac = _mm_cvtepi32_ps(exp); + + // Now to extract mantissa + int frac1 = 0x007fffff; + __m128i fracMask = _mm_set1_epi32(frac1); + __m128 frac = _mm_castsi128_ps(fracMask); + frac = _mm_and_ps(aVal, frac); + __m128 leadingOne = _mm_set1_ps(1.0f); + frac = _mm_or_ps(leadingOne, frac); + + + // Add the decimal bits one by one, can use LUT to speed up + __m128 compareResults, compareResultsNot, compareTo, fracByTwo, allOnes, a, b, c; + __m128i allOnesi; + allOnesi = _mm_set1_epi32(0xffffffff); + allOnes = _mm_castsi128_ps(allOnesi); + compareTo = _mm_set1_ps(2.0); + + mantissa = _mm_set1_ps(0.5); + frac = _mm_mul_ps(frac, frac); + compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare + compareResultsNot = _mm_andnot_ps(compareResults, allOnes); + fracByTwo = _mm_div_ps(frac, compareTo); + a = _mm_and_ps(compareResults, fracByTwo); + //b = _mm_and_ps(compareResultsNot, frac); + frac = _mm_sub_ps(frac, a); + c = _mm_and_ps(compareResults, mantissa); + charac = _mm_add_ps(charac, c); + + + mantissa = _mm_set1_ps(0.25); + frac = _mm_mul_ps(frac, frac); + compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare + compareResultsNot = _mm_andnot_ps(compareResults, allOnes); + fracByTwo = _mm_div_ps(frac, compareTo); + a = _mm_and_ps(compareResults, fracByTwo); + //b = _mm_and_ps(compareResultsNot, frac); + frac = _mm_sub_ps(frac, a); + charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); + + + mantissa = _mm_set1_ps(0.125); + frac = _mm_mul_ps(frac, frac); + compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare + compareResultsNot = _mm_andnot_ps(compareResults, allOnes); + fracByTwo = _mm_div_ps(frac, compareTo); + a = _mm_and_ps(compareResults, fracByTwo); + b = _mm_and_ps(compareResultsNot, frac); + frac = _mm_add_ps(a, b); + charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); + +/* + mantissa = _mm_set1_ps(0.0625); + frac = _mm_mul_ps(frac, frac); + compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare + compareResultsNot = _mm_andnot_ps(compareResults, allOnes); + fracByTwo = _mm_div_ps(frac, compareTo); + a = _mm_and_ps(compareResults, fracByTwo); + b = _mm_and_ps(compareResultsNot, frac); + frac = _mm_add_ps(a, b); + charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); + + mantissa = _mm_set1_ps(0.03125); + frac = _mm_mul_ps(frac, frac); + compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare + compareResultsNot = _mm_andnot_ps(compareResults, allOnes); + fracByTwo = _mm_div_ps(frac, compareTo); + a = _mm_and_ps(compareResults, fracByTwo); + b = _mm_and_ps(compareResultsNot, frac); + frac = _mm_add_ps(a, b); + charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); + + mantissa = _mm_set1_ps(0.015625); + frac = _mm_mul_ps(frac, frac); + compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare + compareResultsNot = _mm_andnot_ps(compareResults, allOnes); + fracByTwo = _mm_div_ps(frac, compareTo); + a = _mm_and_ps(compareResults, fracByTwo); + b = _mm_and_ps(compareResultsNot, frac); + frac = _mm_add_ps(a, b); + charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); + + mantissa = _mm_set1_ps(0.0078125); + frac = _mm_mul_ps(frac, frac); + compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare + compareResultsNot = _mm_andnot_ps(compareResults, allOnes); + fracByTwo = _mm_div_ps(frac, compareTo); + a = _mm_and_ps(compareResults, fracByTwo); + b = _mm_and_ps(compareResultsNot, frac); + frac = _mm_add_ps(a, b); + charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa)); +*/ + _mm_store_ps(bPtr, charac); +} + + +int main(){ + +clock_t start, end; +double time; +float dummy; +//float input[] = {0.5, 1.0, 4.0, 9.5}; +float* input = new float(4*sizeof(float)); +input[0] = 0.125; +input[1] = 2.4; +input[2] = 8.0; +input[3] = 16.0; +float* output = new float(4*sizeof(float)); + +start = clock(); +for(int i = 0; i < 10000000; i++) + volk_32f_log2_32f_a_avx(output, input, 4); +end = clock(); +time = 1000 * (double)(end-start); +std::cout << "my time is " << time << std::endl; + +start = clock(); +for(int j = 0; j < 40000000; j++) + dummy = log2(input[0]); +end = clock(); +time = 1000 * (double)(end-start); +std::cout << "cmath time is " << time << std::endl; + + +for(int i =0; i < 4; i ++){ + std::cout << input[i] << "\t" << output[i]; + std::cout << " cmath : \t " << log2(input[i]) << std::endl; +} + +delete input; +delete output; +return 0; + +} diff --git a/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc b/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc new file mode 100644 index 0000000000..7ee18cc46b --- /dev/null +++ b/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc @@ -0,0 +1,115 @@ + +#include <stdio.h> +#include <math.h> +#include <inttypes.h> +#include <iostream> +#include <immintrin.h> + + +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define LOG_POLY_DEGREE 3 + +void print128_num(__m128i var) +{ + uint32_t *val = (uint32_t*) &var;//can also use uint32_t instead of 16_t + printf("Vector of Ints: %x %x %d %d \n", + val[3], val[2], val[1], val[0]); +} + +void print128f_num(__m128 var) +{ + float *val = (float*) &var;//can also use uint32_t instead of 16_t + printf("Vector of Floats: %f %f %f %f \n", + val[3], val[2], val[1], val[0]); +} + +static inline void volk_32f_log2_32f_a_avx(float* bVector, float* aVector, unsigned int num_points){ + + float* bPtr = bVector; + float* aPtr = aVector; + int exp1 = 0x7f800000; + + __m128 aVal, bVal, charac, mantissa, frac; + __m128i bias = _mm_set1_epi32(127); + __m128i aVali = _mm_castps_si128(aVal); + __m128i exp = _mm_set1_epi32(exp1); + aVal = _mm_load_ps(aPtr); + exp = _mm_and_si128(aVali, exp); + exp = _mm_srli_epi32(exp, 23); + exp = _mm_sub_epi32(exp, bias); + charac = _mm_cvtepi32_ps(exp); + + // Now to extract mantissa + int frac1 = 0x007fffff; + __m128i fracMask = _mm_set1_epi32(frac1); + frac = _mm_castsi128_ps(fracMask); + frac = _mm_and_ps(aVal, frac); + __m128 leadingOne = _mm_set1_ps(1.0f); + frac = _mm_or_ps(leadingOne, frac); + + + // This is where MIT Licensed code starts + #if LOG_POLY_DEGREE == 6 + mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); +#elif LOG_POLY_DEGREE == 5 + mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +#elif LOG_POLY_DEGREE == 4 + mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +#elif LOG_POLY_DEGREE == 3 + mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +#else +#error +#endif + + mantissa = _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)); + + bVal = _mm_add_ps(charac, mantissa); + + _mm_store_ps(bPtr, bVal); +} + + +int main(){ + +clock_t start, end; +double time; +float dummy; +//float input[] = {0.5, 1.0, 4.0, 9.5}; +float* input = new float(4*sizeof(float)); +input[0] = 0.125; +input[1] = 2.4; +input[2] = 8.0; +input[3] = 16.0; +float* output = new float(4*sizeof(float)); + +start = clock(); +for(int i = 0; i < 10000000; i++) + volk_32f_log2_32f_a_avx(output, input, 4); +end = clock(); +time = 1000 * (double)(end-start); +std::cout << "my time is " << time << std::endl; + +start = clock(); +for(int j = 0; j < 40000000; j++) + dummy = log2(input[0]); +end = clock(); +time = 1000 * (double)(end-start); +std::cout << "cmath time is " << time << std::endl; + + +for(int i =0; i < 4; i ++){ + std::cout << input[i] << "\t" << output[i]; + std::cout << " cmath : \t " << log2(input[i]) << std::endl; +} + +delete input; +delete output; +return 0; + +} |