Statistics
| Branch: | Tag: | Revision:

root / volk / include / volk / volk_32fc_s32fc_x2_rotator_32fc_a.h @ ff3d467b

History | View | Annotate | Download (8.4 kB)

1
#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
2
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
3
4
5
#include <volk/volk_complex.h>
6
#include <stdio.h>
7
#include <stdlib.h>
8
#define ROTATOR_RELOAD 512
9
10
11
#ifdef LV_HAVE_GENERIC
12
13
/*!
14
  \brief rotate input vector at fixed rate per sample from initial phase offset
15
  \param outVector The vector where the results will be stored
16
  \param inVector Vector to be rotated
17
  \param phase_inc rotational velocity
18
  \param phase initial phase offset
19
  \param num_points The number of values in inVector to be rotated and stored into cVector
20
*/
21
22
23
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){    
24
    *phase = lv_cmake(1.0, 0.0);
25
    unsigned int i = 0; 
26
    int j = 0;    
27
    for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
28
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
29
            *outVector++ = *inVector++ * (*phase);
30
            (*phase) *= phase_inc;
31
        }
32
        (*phase) /= abs((*phase));
33
    }
34
    for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
35
        *outVector++ = *inVector++ * (*phase);
36
        (*phase) *= phase_inc;
37
    }
38
    
39
}
40
#endif /* LV_HAVE_GENERIC */
41
42
43
#ifdef LV_HAVE_SSE4_1
44
#include <smmintrin.h>
45
46
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
47
    *phase = lv_cmake(1.0, 0.0);
48
    lv_32fc_t* cPtr = outVector;
49
    const lv_32fc_t* aPtr = inVector;
50
    lv_32fc_t incr = 1;
51
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
52
    
53
    unsigned int i, j = 0;
54
55
    for(i = 0; i < 2; ++i) {
56
        phase_Ptr[i] *= incr;
57
        incr *= (phase_inc);
58
    }
59
60
    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
61
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
62
    printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
63
    printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
64
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
65
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
66
    
67
    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
68
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
69
    
70
    const unsigned int halfPoints = num_points / 2;
71
72
    
73
    for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
74
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
75
            
76
            aVal = _mm_load_ps((float*)aPtr);
77
            
78
            yl = _mm_moveldup_ps(phase_Val);
79
            yh = _mm_movehdup_ps(phase_Val);
80
            ylp = _mm_moveldup_ps(inc_Val);
81
            yhp = _mm_movehdup_ps(inc_Val);
82
            
83
            tmp1 = _mm_mul_ps(aVal, yl);
84
            tmp1p = _mm_mul_ps(phase_Val, ylp);
85
            
86
            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
87
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
88
            tmp2 = _mm_mul_ps(aVal, yh);
89
            tmp2p = _mm_mul_ps(phase_Val, yhp);
90
            
91
            z = _mm_addsub_ps(tmp1, tmp2);
92
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
93
            
94
            _mm_store_ps((float*)cPtr, z);
95
            
96
            aPtr += 2;
97
            cPtr += 2;
98
        }
99
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
100
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
101
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
102
        phase_Val = _mm_div_ps(phase_Val, tmp1);
103
    }
104
    for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
105
        aVal = _mm_load_ps((float*)aPtr);
106
        
107
        yl = _mm_moveldup_ps(phase_Val);
108
        yh = _mm_movehdup_ps(phase_Val);
109
        ylp = _mm_moveldup_ps(inc_Val);
110
        yhp = _mm_movehdup_ps(inc_Val);
111
        
112
        tmp1 = _mm_mul_ps(aVal, yl);
113
114
        tmp1p = _mm_mul_ps(phase_Val, ylp);
115
        
116
        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
117
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
118
        tmp2 = _mm_mul_ps(aVal, yh);
119
        tmp2p = _mm_mul_ps(phase_Val, yhp);
120
        
121
        z = _mm_addsub_ps(tmp1, tmp2);
122
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
123
        
124
        _mm_store_ps((float*)cPtr, z);
125
        
126
        aPtr += 2;
127
        cPtr += 2;
128
    }
129
130
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
131
    for(i = 0; i < num_points%2; ++i) {
132
        *cPtr++ = *aPtr++ * phase_Ptr[0];
133
        phase_Ptr[0] *= (phase_inc);
134
    }
135
     
136
    (*phase) = phase_Ptr[0];
137
138
}
139
    
140
#endif /* LV_HAVE_SSE4_1 */
141
142
143
#ifdef LV_HAVE_AVX
144
#include <immintrin.h>
145
146
/*!
147
  \brief rotate input vector at fixed rate per sample from initial phase offset
148
  \param outVector The vector where the results will be stored
149
  \param inVector Vector to be rotated
150
  \param phase_inc rotational velocity
151
  \param phase initial phase offset
152
  \param num_points The number of values in inVector to be rotated and stored into cVector
153
*/
154
155
156
157
158
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
159
    *phase = lv_cmake(1.0, 0.0);
160
    lv_32fc_t* cPtr = outVector;
161
    const lv_32fc_t* aPtr = inVector;
162
    lv_32fc_t incr = 1;
163
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
164
    
165
    unsigned int i, j = 0;
166
167
    for(i = 0; i < 4; ++i) {
168
        phase_Ptr[i] *= incr;
169
        incr *= (phase_inc);
170
    }
171
172
    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
173
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
174
    printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
175
    printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
176
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
177
    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros;
178
    
179
    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
180
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
181
    zeros = _mm256_set1_ps(0.0);
182
    negated = _mm256_set1_ps(-1.0);
183
    const unsigned int fourthPoints = num_points / 4;
184
185
    
186
    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
187
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
188
            
189
            aVal = _mm256_load_ps((float*)aPtr);
190
            
191
            yl = _mm256_moveldup_ps(phase_Val);
192
            yh = _mm256_movehdup_ps(phase_Val);
193
            ylp = _mm256_moveldup_ps(inc_Val);
194
            yhp = _mm256_movehdup_ps(inc_Val);
195
            
196
            tmp1 = _mm256_mul_ps(aVal, yl);
197
            tmp1p = _mm256_mul_ps(phase_Val, ylp);
198
            
199
            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
200
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
201
            tmp2 = _mm256_mul_ps(aVal, yh);
202
            tmp2p = _mm256_mul_ps(phase_Val, yhp);
203
            
204
            z = _mm256_addsub_ps(tmp1, tmp2);
205
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
206
            
207
            _mm256_store_ps((float*)cPtr, z);
208
            
209
            aPtr += 4;
210
            cPtr += 4;
211
        }
212
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
213
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
214
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
215
        phase_Val = _mm256_div_ps(phase_Val, tmp1);
216
    }
217
    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
218
        aVal = _mm256_load_ps((float*)aPtr);
219
        
220
        yl = _mm256_moveldup_ps(phase_Val);
221
        yh = _mm256_movehdup_ps(phase_Val);
222
        ylp = _mm256_moveldup_ps(inc_Val);
223
        yhp = _mm256_movehdup_ps(inc_Val);
224
        
225
        tmp1 = _mm256_mul_ps(aVal, yl);
226
227
        tmp1p = _mm256_mul_ps(phase_Val, ylp);
228
        
229
        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
230
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
231
        tmp2 = _mm256_mul_ps(aVal, yh);
232
        tmp2p = _mm256_mul_ps(phase_Val, yhp);
233
        
234
        z = _mm256_addsub_ps(tmp1, tmp2);
235
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
236
        
237
        _mm256_store_ps((float*)cPtr, z);
238
        
239
        aPtr += 4;
240
        cPtr += 4;
241
    }
242
243
    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
244
    for(i = 0; i < num_points%4; ++i) {
245
        *cPtr++ = *aPtr++ * phase_Ptr[0];
246
        phase_Ptr[0] *= (phase_inc);
247
    }
248
     
249
    (*phase) = phase_Ptr[0];
250
251
}
252
    
253
#endif /* LV_HAVE_AVX */
254
255
256
257
258
259
260
261
262
#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */