1 #ifndef INCLUDED_volk_64u_byteswap_u_H
2 #define INCLUDED_volk_64u_byteswap_u_H
/*!
  \brief Byteswaps (in-place) a vector of 64-bit unsigned integers with SSE2.

  Unaligned loads and stores are used, so intsToSwap does not need any
  particular alignment.

  \param intsToSwap The vector of data to byte swap; overwritten with the result
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_u_sse2(
    uint64_t* intsToSwap,
    unsigned int num_points){
  uint64_t* inputPtr = intsToSwap;
  __m128i input, byte1, byte2, byte3, byte4, output;
  const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
  const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  for(; number < halfPoints; number++){
    /* Load two 64-bit values; the pointer is advanced after the in-place store. */
    input = _mm_loadu_si128((__m128i*)inputPtr);

    /* Move every byte of each 32-bit lane to its mirrored position. */
    byte1 = _mm_slli_epi32(input, 24);
    byte2 = _mm_slli_epi32(input, 8);
    byte3 = _mm_srli_epi32(input, 8);
    byte4 = _mm_srli_epi32(input, 24);

    /* byte1/byte4 are already isolated by the 24-bit shifts; byte2/byte3 need masking. */
    output = _mm_or_si128(byte1, byte4);
    byte2 = _mm_and_si128(byte2, byte2mask);
    output = _mm_or_si128(output, byte2);
    byte3 = _mm_and_si128(byte3, byte3mask);
    output = _mm_or_si128(output, byte3);

    /* Exchange the two 32-bit lanes inside each 64-bit value to finish the swap. */
    output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

    _mm_storeu_si128((__m128i*)inputPtr, output);
    inputPtr += 2;
  }

  /* Byteswap the remaining point (if num_points is odd) with scalar code. */
  number = halfPoints*2;
  for(; number < num_points; number++){
    const uint64_t value = *inputPtr;
    uint32_t output1 = (uint32_t)value;          /* low 32 bits  */
    uint32_t output2 = (uint32_t)(value >> 32);  /* high 32 bits */

    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

    /* The swapped low half becomes the high half and vice versa. */
    *inputPtr++ = ((uint64_t)output1 << 32) | output2;
  }
}
62 #ifdef LV_HAVE_GENERIC
/*!
  \brief Byteswaps (in-place) a vector of 64-bit unsigned integers with portable scalar code.
  \param intsToSwap The vector of data to byte swap; overwritten with the result
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_generic(
    uint64_t* intsToSwap,
    unsigned int num_points){
  uint64_t* inputPtr = intsToSwap;
  unsigned int point;

  for(point = 0; point < num_points; point++){
    /* Split with shifts rather than a uint32_t* alias, so the result does
       not depend on host byte order or aliasing rules. */
    const uint64_t value = *inputPtr;
    uint32_t output1 = (uint32_t)value;          /* low 32 bits  */
    uint32_t output2 = (uint32_t)(value >> 32);  /* high 32 bits */

    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

    /* The swapped low half becomes the high half and vice versa. */
    *inputPtr++ = ((uint64_t)output1 << 32) | output2;
  }
}
89 #ifndef INCLUDED_volk_64u_byteswap_a_H
90 #define INCLUDED_volk_64u_byteswap_a_H
96 #include <emmintrin.h>
/*!
  \brief Byteswaps (in-place) an aligned vector of 64-bit unsigned integers with SSE2.

  Aligned loads and stores (_mm_load_si128 / _mm_store_si128) are used, so
  intsToSwap must be 16-byte aligned.

  \param intsToSwap The 16-byte aligned vector of data to byte swap; overwritten with the result
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_a_sse2(
    uint64_t* intsToSwap,
    unsigned int num_points){
  uint64_t* inputPtr = intsToSwap;
  __m128i input, byte1, byte2, byte3, byte4, output;
  const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
  const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  for(; number < halfPoints; number++){
    /* Load two 64-bit values; the pointer is advanced after the in-place store. */
    input = _mm_load_si128((__m128i*)inputPtr);

    /* Move every byte of each 32-bit lane to its mirrored position. */
    byte1 = _mm_slli_epi32(input, 24);
    byte2 = _mm_slli_epi32(input, 8);
    byte3 = _mm_srli_epi32(input, 8);
    byte4 = _mm_srli_epi32(input, 24);

    /* byte1/byte4 are already isolated by the 24-bit shifts; byte2/byte3 need masking. */
    output = _mm_or_si128(byte1, byte4);
    byte2 = _mm_and_si128(byte2, byte2mask);
    output = _mm_or_si128(output, byte2);
    byte3 = _mm_and_si128(byte3, byte3mask);
    output = _mm_or_si128(output, byte3);

    /* Exchange the two 32-bit lanes inside each 64-bit value to finish the swap. */
    output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

    _mm_store_si128((__m128i*)inputPtr, output);
    inputPtr += 2;
  }

  /* Byteswap the remaining point (if num_points is odd) with scalar code. */
  number = halfPoints*2;
  for(; number < num_points; number++){
    const uint64_t value = *inputPtr;
    uint32_t output1 = (uint32_t)value;          /* low 32 bits  */
    uint32_t output2 = (uint32_t)(value >> 32);  /* high 32 bits */

    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

    /* The swapped low half becomes the high half and vice versa. */
    *inputPtr++ = ((uint64_t)output1 << 32) | output2;
  }
}
150 #ifdef LV_HAVE_GENERIC
/*!
  \brief Byteswaps (in-place) a vector of 64-bit unsigned integers with portable scalar code.

  This scalar implementation does not itself depend on any particular
  alignment of intsToSwap.

  \param intsToSwap The vector of data to byte swap; overwritten with the result
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_a_generic(
    uint64_t* intsToSwap,
    unsigned int num_points){
  uint64_t* inputPtr = intsToSwap;
  unsigned int point;

  for(point = 0; point < num_points; point++){
    /* Split with shifts rather than a uint32_t* alias, so the result does
       not depend on host byte order or aliasing rules. */
    const uint64_t value = *inputPtr;
    uint32_t output1 = (uint32_t)value;          /* low 32 bits  */
    uint32_t output2 = (uint32_t)(value >> 32);  /* high 32 bits */

    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

    /* The swapped low half becomes the high half and vice versa. */
    *inputPtr++ = ((uint64_t)output1 << 32) | output2;
  }
}
unsigned int uint32_t
Definition: stdint.h:80
unsigned __int64 uint64_t
Definition: stdint.h:90