doc/doxygen-3.6/volk__32fc__s32fc__x2__rotator__32fc__a_8h_source.html

00001 #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
00002 #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
00003
00004
#include <volk/volk_complex.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
00008 #define ROTATOR_RELOAD 512
00009
00010
00011 #ifdef LV_HAVE_GENERIC
00012
00013 /*!
00014   \brief rotate input vector at fixed rate per sample from initial phase offset
00015   \param outVector The vector where the results will be stored
00016   \param inVector Vector to be rotated
00017   \param phase_inc rotational velocity
00018   \param phase initial phase offset
00019   \param num_points The number of values in inVector to be rotated and stored into cVector
00020 */
00021
00022
00023 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
00024     unsigned int i = 0;
00025     int j = 0;
00026     for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
00027         for(j = 0; j < ROTATOR_RELOAD; ++j) {
00028             *outVector++ = *inVector++ * (*phase);
00029             (*phase) *= phase_inc;
00030         }
00031         (*phase) /= abs((*phase));
00032     }
00033     for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
00034         *outVector++ = *inVector++ * (*phase);
00035         (*phase) *= phase_inc;
00036     }
00037
00038 }
00039 #endif /* LV_HAVE_GENERIC */
00040
00041
00042 #ifdef LV_HAVE_SSE4_1
00043 #include <smmintrin.h>
00044
00045 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
00046     lv_32fc_t* cPtr = outVector;
00047     const lv_32fc_t* aPtr = inVector;
00048     lv_32fc_t incr = 1;
00049     lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
00050
00051     unsigned int i, j = 0;
00052
00053     for(i = 0; i < 2; ++i) {
00054         phase_Ptr[i] *= incr;
00055         incr *= (phase_inc);
00056     }
00057
00058     /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
00059     printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
00060     printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
00061     printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
00062     printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
00063     __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
00064
00065     phase_Val = _mm_loadu_ps((float*)phase_Ptr);
00066     inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
00067
00068     const unsigned int halfPoints = num_points / 2;
00069
00070
00071     for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
00072         for(j = 0; j < ROTATOR_RELOAD; ++j) {
00073
00074             aVal = _mm_load_ps((float*)aPtr);
00075
00076             yl = _mm_moveldup_ps(phase_Val);
00077             yh = _mm_movehdup_ps(phase_Val);
00078             ylp = _mm_moveldup_ps(inc_Val);
00079             yhp = _mm_movehdup_ps(inc_Val);
00080
00081             tmp1 = _mm_mul_ps(aVal, yl);
00082             tmp1p = _mm_mul_ps(phase_Val, ylp);
00083
00084             aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
00085             phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
00086             tmp2 = _mm_mul_ps(aVal, yh);
00087             tmp2p = _mm_mul_ps(phase_Val, yhp);
00088
00089             z = _mm_addsub_ps(tmp1, tmp2);
00090             phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
00091
00092             _mm_store_ps((float*)cPtr, z);
00093
00094             aPtr += 2;
00095             cPtr += 2;
00096         }
00097         tmp1 = _mm_mul_ps(phase_Val, phase_Val);
00098         tmp2 = _mm_hadd_ps(tmp1, tmp1);
00099         tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
00100         phase_Val = _mm_div_ps(phase_Val, tmp1);
00101     }
00102     for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
00103         aVal = _mm_load_ps((float*)aPtr);
00104
00105         yl = _mm_moveldup_ps(phase_Val);
00106         yh = _mm_movehdup_ps(phase_Val);
00107         ylp = _mm_moveldup_ps(inc_Val);
00108         yhp = _mm_movehdup_ps(inc_Val);
00109
00110         tmp1 = _mm_mul_ps(aVal, yl);
00111
00112         tmp1p = _mm_mul_ps(phase_Val, ylp);
00113
00114         aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
00115         phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
00116         tmp2 = _mm_mul_ps(aVal, yh);
00117         tmp2p = _mm_mul_ps(phase_Val, yhp);
00118
00119         z = _mm_addsub_ps(tmp1, tmp2);
00120         phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
00121
00122         _mm_store_ps((float*)cPtr, z);
00123
00124         aPtr += 2;
00125         cPtr += 2;
00126     }
00127
00128     _mm_storeu_ps((float*)phase_Ptr, phase_Val);
00129     for(i = 0; i < num_points%2; ++i) {
00130         *cPtr++ = *aPtr++ * phase_Ptr[0];
00131         phase_Ptr[0] *= (phase_inc);
00132     }
00133
00134     (*phase) = phase_Ptr[0];
00135
00136 }
00137
00138 #endif /* LV_HAVE_SSE4_1 */
00139
00140
00141 #ifdef LV_HAVE_AVX
00142 #include <immintrin.h>
00143
00144 /*!
00145   \brief rotate input vector at fixed rate per sample from initial phase offset
00146   \param outVector The vector where the results will be stored
00147   \param inVector Vector to be rotated
00148   \param phase_inc rotational velocity
00149   \param phase initial phase offset
00150   \param num_points The number of values in inVector to be rotated and stored into cVector
00151 */
00152
00153
00154
00155
00156 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
00157     lv_32fc_t* cPtr = outVector;
00158     const lv_32fc_t* aPtr = inVector;
00159     lv_32fc_t incr = 1;
00160     lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
00161
00162     unsigned int i, j = 0;
00163
00164     for(i = 0; i < 4; ++i) {
00165         phase_Ptr[i] *= incr;
00166         incr *= (phase_inc);
00167     }
00168
00169     /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
00170     printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
00171     printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
00172     printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
00173     printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
00174     __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
00175
00176     phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
00177     inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
00178     const unsigned int fourthPoints = num_points / 4;
00179
00180
00181     for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
00182         for(j = 0; j < ROTATOR_RELOAD; ++j) {
00183
00184             aVal = _mm256_load_ps((float*)aPtr);
00185
00186             yl = _mm256_moveldup_ps(phase_Val);
00187             yh = _mm256_movehdup_ps(phase_Val);
00188             ylp = _mm256_moveldup_ps(inc_Val);
00189             yhp = _mm256_movehdup_ps(inc_Val);
00190
00191             tmp1 = _mm256_mul_ps(aVal, yl);
00192             tmp1p = _mm256_mul_ps(phase_Val, ylp);
00193
00194             aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
00195             phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
00196             tmp2 = _mm256_mul_ps(aVal, yh);
00197             tmp2p = _mm256_mul_ps(phase_Val, yhp);
00198
00199             z = _mm256_addsub_ps(tmp1, tmp2);
00200             phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
00201
00202             _mm256_store_ps((float*)cPtr, z);
00203
00204             aPtr += 4;
00205             cPtr += 4;
00206         }
00207         tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
00208         tmp2 = _mm256_hadd_ps(tmp1, tmp1);
00209         tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
00210         phase_Val = _mm256_div_ps(phase_Val, tmp1);
00211     }
00212     for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
00213         aVal = _mm256_load_ps((float*)aPtr);
00214
00215         yl = _mm256_moveldup_ps(phase_Val);
00216         yh = _mm256_movehdup_ps(phase_Val);
00217         ylp = _mm256_moveldup_ps(inc_Val);
00218         yhp = _mm256_movehdup_ps(inc_Val);
00219
00220         tmp1 = _mm256_mul_ps(aVal, yl);
00221
00222         tmp1p = _mm256_mul_ps(phase_Val, ylp);
00223
00224         aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
00225         phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
00226         tmp2 = _mm256_mul_ps(aVal, yh);
00227         tmp2p = _mm256_mul_ps(phase_Val, yhp);
00228
00229         z = _mm256_addsub_ps(tmp1, tmp2);
00230         phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
00231
00232         _mm256_store_ps((float*)cPtr, z);
00233
00234         aPtr += 4;
00235         cPtr += 4;
00236     }
00237
00238     _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
00239     for(i = 0; i < num_points%4; ++i) {
00240         *cPtr++ = *aPtr++ * phase_Ptr[0];
00241         phase_Ptr[0] *= (phase_inc);
00242     }
00243
00244     (*phase) = phase_Ptr[0];
00245
00246 }
00247
00248 #endif /* LV_HAVE_AVX */
00249
00250
00251
00252
00253
00254
00255
00256
00257 #endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */