1 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
2 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
/*!
 * \brief Multiplies each input float by \p scalar and converts it to an
 *        8-bit signed integer (SSE2 kernel using unaligned loads and stores).
 *
 * Sixteen samples are processed per vector iteration; the remaining
 * num_points % 16 samples are handled by a scalar tail loop.
 *
 * NOTE(review): this listing is an excerpt. The declarations of min_val,
 * max_val and r, any clamping inside the tail loop, and the closing braces
 * are not visible here -- confirm against the full file.
 *
 * \param outputVector destination buffer for the converted int8_t samples
 * \param inputVector  source buffer of float samples
 * \param scalar       factor each sample is multiplied by before conversion
 * \param num_points   number of samples to convert
 */
17 static inline void volk_32f_s32f_convert_8i_u_sse2(
int8_t* outputVector,
const float* inputVector,
const float scalar,
unsigned int num_points){
18 unsigned int number = 0;
/* Number of complete 16-sample groups handled by the vector loop. */
20 const unsigned int sixteenthPoints = num_points / 16;
22 const float* inputVectorPtr = (
const float*)inputVector;
23 int8_t* outputVectorPtr = outputVector;
/* Broadcast the scale factor to all four lanes. */
29 __m128 vScalar = _mm_set_ps1(scalar);
30 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
31 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
/* Clamp bounds broadcast to all lanes (min_val / max_val are declared outside this excerpt). */
32 __m128 vmin_val = _mm_set_ps1(min_val);
33 __m128 vmax_val = _mm_set_ps1(max_val);
35 for(;number < sixteenthPoints; number++){
/* Load 16 floats as four 4-lane vectors; no alignment is required by _mm_loadu_ps. */
36 inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
37 inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
38 inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
39 inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
/* Scale, then limit each lane to at most vmax_val and at least vmin_val. */
41 inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
42 inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
43 inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
44 inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* Convert every lane to a 32-bit integer. */
46 intInputVal1 = _mm_cvtps_epi32(inputVal1);
47 intInputVal2 = _mm_cvtps_epi32(inputVal2);
48 intInputVal3 = _mm_cvtps_epi32(inputVal3);
49 intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bits: (1,2) and (3,4) are each packed into one register. */
51 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
52 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* Narrow 16 -> 8 bits: intInputVal1 now holds all 16 results in input order. */
54 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
/* Write the 16 packed bytes with an unaligned store. */
56 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
57 outputVectorPtr += 16;
/* Scalar tail: continue at the first sample the vector loop did not cover. */
60 number = sixteenthPoints * 16;
61 for(; number < num_points; number++){
62 r = inputVector[number] * scalar;
/* NOTE(review): the cast goes through int16_t although outputVector is int8_t;
 * volk_32f_s32f_convert_8i_a_sse2 in this file casts to int8_t here. Clamping of r
 * is not visible in this excerpt -- confirm it exists and consider matching the cast. */
67 outputVector[number] = (
int16_t)(r);
73 #include <xmmintrin.h>
/*!
 * \brief Multiplies each input float by \p scalar and converts it to an
 *        8-bit signed integer (SSE kernel using unaligned loads).
 *
 * Four samples are scaled and clamped per vector iteration, spilled to a
 * float buffer and cast to int8_t one by one; the remaining
 * num_points % 4 samples are handled by a scalar tail loop.
 *
 * NOTE(review): this listing is an excerpt. The declarations of min_val,
 * max_val, r, ret and outputFloatBuffer, any clamping inside the tail loop,
 * and the closing braces are not visible here -- confirm against the full file.
 *
 * \param outputVector destination buffer for the converted int8_t samples
 * \param inputVector  source buffer of float samples
 * \param scalar       factor each sample is multiplied by before conversion
 * \param num_points   number of samples to convert
 */
82 static inline void volk_32f_s32f_convert_8i_u_sse(
int8_t* outputVector,
const float* inputVector,
const float scalar,
unsigned int num_points){
83 unsigned int number = 0;
/* Number of complete 4-sample groups handled by the vector loop. */
85 const unsigned int quarterPoints = num_points / 4;
87 const float* inputVectorPtr = (
const float*)inputVector;
88 int8_t* outputVectorPtr = outputVector;
/* Broadcast the scale factor to all four lanes. */
94 __m128 vScalar = _mm_set_ps1(scalar);
/* Clamp bounds broadcast to all lanes (min_val / max_val are declared outside this excerpt). */
96 __m128 vmin_val = _mm_set_ps1(min_val);
97 __m128 vmax_val = _mm_set_ps1(max_val);
101 for(;number < quarterPoints; number++){
/* Unaligned load of four floats.
 * NOTE(review): no advance of inputVectorPtr is visible in this excerpt -- confirm it exists. */
102 ret = _mm_loadu_ps(inputVectorPtr);
/* Scale, then limit each lane to at most vmax_val and at least vmin_val. */
105 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* Spill the four lanes to memory. _mm_store_ps is the aligned store, so
 * outputFloatBuffer is presumably declared 16-byte aligned -- TODO confirm. */
107 _mm_store_ps(outputFloatBuffer, ret);
/* NOTE(review): a C float-to-integer cast truncates toward zero, whereas the SSE2
 * kernel converts with _mm_cvtps_epi32; the two kernels may round differently. */
108 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[0]);
109 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[1]);
110 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[2]);
111 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[3]);
/* Scalar tail: continue at the first sample the vector loop did not cover. */
114 number = quarterPoints * 4;
115 for(; number < num_points; number++){
116 r = inputVector[number] * scalar;
/* NOTE(review): the cast goes through int16_t although outputVector is int8_t;
 * volk_32f_s32f_convert_8i_a_sse in this file casts to int8_t here. Clamping of r
 * is not visible in this excerpt -- confirm it exists and consider matching the cast. */
121 outputVector[number] = (
int16_t)(r);
126 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Multiplies each input float by \p scalar and converts it to an
 *        8-bit signed integer (portable scalar kernel).
 *
 * NOTE(review): this listing is an excerpt. The declaration of r, whatever
 * uses min_val between the multiply and the store, and the closing braces
 * are not visible here -- confirm against the full file.
 *
 * \param outputVector destination buffer for the converted int8_t samples
 * \param inputVector  source buffer of float samples
 * \param scalar       factor each sample is multiplied by before conversion
 * \param num_points   number of samples to convert
 */
135 static inline void volk_32f_s32f_convert_8i_generic(
int8_t* outputVector,
const float* inputVector,
const float scalar,
unsigned int num_points){
136 int8_t* outputVectorPtr = outputVector;
137 const float* inputVectorPtr = inputVector;
138 unsigned int number = 0;
/* Lower bound of the int8_t range; presumably used to clamp r in lines not shown here. */
139 float min_val = -128;
143 for(number = 0; number < num_points; number++){
/* Scale the next sample and advance the read pointer. */
144 r = *inputVectorPtr++ * scalar;
/* NOTE(review): the cast goes through int16_t although the destination is int8_t;
 * volk_32f_s32f_convert_8i_a_generic in this file casts to int8_t here. */
149 *outputVectorPtr++ = (
int16_t)(r);
158 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
159 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
166 #include <emmintrin.h>
/*!
 * \brief Multiplies each input float by \p scalar and converts it to an
 *        8-bit signed integer (SSE2 kernel using aligned loads and stores).
 *
 * Sixteen samples are processed per vector iteration; the remaining
 * num_points % 16 samples are handled by a scalar tail loop. The aligned
 * intrinsics _mm_load_ps and _mm_store_si128 are used, so both buffers must
 * be 16-byte aligned.
 *
 * NOTE(review): this listing is an excerpt. The declarations of max_val and
 * r, any clamping inside the tail loop, and the closing braces are not
 * visible here -- confirm against the full file.
 *
 * \param outputVector destination buffer for the converted int8_t samples (16-byte aligned)
 * \param inputVector  source buffer of float samples (16-byte aligned)
 * \param scalar       factor each sample is multiplied by before conversion
 * \param num_points   number of samples to convert
 */
174 static inline void volk_32f_s32f_convert_8i_a_sse2(
int8_t* outputVector,
const float* inputVector,
const float scalar,
unsigned int num_points){
175 unsigned int number = 0;
/* Number of complete 16-sample groups handled by the vector loop. */
177 const unsigned int sixteenthPoints = num_points / 16;
179 const float* inputVectorPtr = (
const float*)inputVector;
180 int8_t* outputVectorPtr = outputVector;
/* Lower clamp bound: the smallest value an int8_t can hold. */
182 float min_val = -128;
/* Broadcast the scale factor to all four lanes. */
186 __m128 vScalar = _mm_set_ps1(scalar);
187 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
188 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
/* Clamp bounds broadcast to all lanes (max_val is declared outside this excerpt). */
189 __m128 vmin_val = _mm_set_ps1(min_val);
190 __m128 vmax_val = _mm_set_ps1(max_val);
192 for(;number < sixteenthPoints; number++){
/* Load 16 floats as four 4-lane vectors using aligned loads. */
193 inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
194 inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
195 inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
196 inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
/* Scale, then limit each lane to at most vmax_val and at least vmin_val. */
198 inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
199 inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
200 inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
201 inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* Convert every lane to a 32-bit integer. */
203 intInputVal1 = _mm_cvtps_epi32(inputVal1);
204 intInputVal2 = _mm_cvtps_epi32(inputVal2);
205 intInputVal3 = _mm_cvtps_epi32(inputVal3);
206 intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* Narrow 32 -> 16 bits: (1,2) and (3,4) are each packed into one register. */
208 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
209 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* Narrow 16 -> 8 bits: intInputVal1 now holds all 16 results in input order. */
211 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
/* Write the 16 packed bytes with an aligned store. */
213 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
214 outputVectorPtr += 16;
/* Scalar tail: continue at the first sample the vector loop did not cover. */
217 number = sixteenthPoints * 16;
218 for(; number < num_points; number++){
219 r = inputVector[number] * scalar;
/* NOTE(review): clamping of r before this cast is not visible in this excerpt -- confirm. */
224 outputVector[number] = (
int8_t)(r);
230 #include <xmmintrin.h>
/*!
 * \brief Multiplies each input float by \p scalar and converts it to an
 *        8-bit signed integer (SSE kernel using aligned loads).
 *
 * Four samples are scaled and clamped per vector iteration, spilled to a
 * float buffer and cast to int8_t one by one; the remaining
 * num_points % 4 samples are handled by a scalar tail loop. The aligned
 * intrinsic _mm_load_ps is used, so inputVector must be 16-byte aligned.
 *
 * NOTE(review): this listing is an excerpt. The declarations of max_val, r,
 * ret and outputFloatBuffer, any clamping inside the tail loop, and the
 * closing braces are not visible here -- confirm against the full file.
 *
 * \param outputVector destination buffer for the converted int8_t samples
 * \param inputVector  source buffer of float samples (16-byte aligned)
 * \param scalar       factor each sample is multiplied by before conversion
 * \param num_points   number of samples to convert
 */
238 static inline void volk_32f_s32f_convert_8i_a_sse(
int8_t* outputVector,
const float* inputVector,
const float scalar,
unsigned int num_points){
239 unsigned int number = 0;
/* Number of complete 4-sample groups handled by the vector loop. */
241 const unsigned int quarterPoints = num_points / 4;
243 const float* inputVectorPtr = (
const float*)inputVector;
/* Lower clamp bound: the smallest value an int8_t can hold. */
245 float min_val = -128;
249 int8_t* outputVectorPtr = outputVector;
/* Broadcast the scale factor to all four lanes. */
250 __m128 vScalar = _mm_set_ps1(scalar);
/* Clamp bounds broadcast to all lanes (max_val is declared outside this excerpt). */
252 __m128 vmin_val = _mm_set_ps1(min_val);
253 __m128 vmax_val = _mm_set_ps1(max_val);
257 for(;number < quarterPoints; number++){
/* Aligned load of four floats.
 * NOTE(review): no advance of inputVectorPtr is visible in this excerpt -- confirm it exists. */
258 ret = _mm_load_ps(inputVectorPtr);
/* Scale, then limit each lane to at most vmax_val and at least vmin_val. */
261 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* Spill the four lanes to memory. _mm_store_ps is the aligned store, so
 * outputFloatBuffer is presumably declared 16-byte aligned -- TODO confirm. */
263 _mm_store_ps(outputFloatBuffer, ret);
/* NOTE(review): a C float-to-integer cast truncates toward zero, whereas the SSE2
 * kernel converts with _mm_cvtps_epi32; the two kernels may round differently. */
264 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[0]);
265 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[1]);
266 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[2]);
267 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[3]);
/* Scalar tail: continue at the first sample the vector loop did not cover. */
270 number = quarterPoints * 4;
271 for(; number < num_points; number++){
272 r = inputVector[number] * scalar;
/* NOTE(review): clamping of r before this cast is not visible in this excerpt -- confirm. */
277 outputVector[number] = (
int8_t)(r);
282 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Multiplies each input float by \p scalar and converts it to an
 *        8-bit signed integer (portable scalar kernel, "_a" variant).
 *
 * No alignment-dependent operation appears in the visible code.
 *
 * NOTE(review): this listing is an excerpt. The declaration of r, whatever
 * uses min_val between the multiply and the store, and the closing braces
 * are not visible here -- confirm against the full file.
 *
 * \param outputVector destination buffer for the converted int8_t samples
 * \param inputVector  source buffer of float samples
 * \param scalar       factor each sample is multiplied by before conversion
 * \param num_points   number of samples to convert
 */
290 static inline void volk_32f_s32f_convert_8i_a_generic(
int8_t* outputVector,
const float* inputVector,
const float scalar,
unsigned int num_points){
291 int8_t* outputVectorPtr = outputVector;
292 const float* inputVectorPtr = inputVector;
293 unsigned int number = 0;
/* Lower bound of the int8_t range; presumably used to clamp r in lines not shown here. */
294 float min_val = -128;
298 for(number = 0; number < num_points; number++){
/* Scale the next sample and advance the read pointer. */
299 r = *inputVectorPtr++ * scalar;
/* Store the result as int8_t and advance the write pointer. */
304 *outputVectorPtr++ = (
int8_t)(r);
signed short int16_t
Definition: stdint.h:76
signed char int8_t
Definition: stdint.h:75
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27