summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNathan West <nathan.west@okstate.edu>2014-06-18 13:10:02 -0500
committerNathan West <nathan.west@okstate.edu>2014-07-18 20:41:28 -0400
commit6e17772f423dc260051e37ceb25f9384ca8151ed (patch)
tree1c74d276ccaba2db6f35a1942cd4193b0943648e
parent93db96faa81b260367908e977f15c0d7a45358db (diff)
volk: add NEON protokernels
-rw-r--r--volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s52
-rw-r--r--volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s57
-rw-r--r--volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s54
-rw-r--r--volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s68
-rw-r--r--volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s58
-rw-r--r--volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s116
-rw-r--r--volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s79
-rw-r--r--volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s86
-rw-r--r--volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s74
-rw-r--r--volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s146
-rw-r--r--volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s98
-rw-r--r--volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s96
-rw-r--r--volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s45
-rw-r--r--volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s92
-rw-r--r--volk/kernels/volk/volk_16i_max_star_16i.h38
-rw-r--r--volk/kernels/volk/volk_16i_max_star_horizontal_16i.h34
-rw-r--r--volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h60
-rw-r--r--volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h46
-rw-r--r--volk/kernels/volk/volk_16u_byteswap.h30
-rw-r--r--volk/kernels/volk/volk_32f_invsqrt_32f.h31
-rw-r--r--volk/kernels/volk/volk_32f_s32f_multiply_32f.h31
-rw-r--r--volk/kernels/volk/volk_32f_sqrt_32f.h29
-rw-r--r--volk/kernels/volk/volk_32f_x2_add_32f.h43
-rw-r--r--volk/kernels/volk/volk_32f_x2_dot_prod_32f.h88
-rw-r--r--volk/kernels/volk/volk_32f_x2_interleave_32fc.h33
-rw-r--r--volk/kernels/volk/volk_32f_x2_max_32f.h36
-rw-r--r--volk/kernels/volk/volk_32f_x2_min_32f.h36
-rw-r--r--volk/kernels/volk/volk_32f_x2_multiply_32f.h27
-rw-r--r--volk/kernels/volk/volk_32f_x2_subtract_32f.h34
-rw-r--r--volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h137
-rw-r--r--volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h160
-rw-r--r--volk/kernels/volk/volk_32fc_32f_multiply_32fc.h37
-rw-r--r--volk/kernels/volk/volk_32fc_conjugate_32fc.h40
-rw-r--r--volk/kernels/volk/volk_32fc_magnitude_32f.h106
-rw-r--r--volk/kernels/volk/volk_32fc_magnitude_squared_32f.h42
-rw-r--r--volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h216
-rw-r--r--volk/kernels/volk/volk_32fc_x2_multiply_32fc.h111
-rw-r--r--volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h53
-rw-r--r--volk/kernels/volk/volk_32fc_x2_square_dist_32f.h30
-rw-r--r--volk/kernels/volk/volk_8i_convert_16i.h38
-rw-r--r--volk/kernels/volk/volk_8ic_deinterleave_real_8i.h28
-rw-r--r--volk/lib/CMakeLists.txt26
42 files changed, 2738 insertions, 3 deletions
diff --git a/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s b/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s
new file mode 100644
index 0000000000..2099355e7b
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s
@@ -0,0 +1,52 @@
+@ static inline void volk_16i_max_star_horizontal_16i_neonasm(int16_t* cVector, const int16_t* aVector, unsigned int num_points);
+@ For each adjacent pair of 16-bit inputs, selects one element based on the
+@ sign of a[2i] - a[2i+1] ("max*" selection) -- matches the C protokernel.
+ .global volk_16i_max_star_horizontal_16i_neonasm
+volk_16i_max_star_horizontal_16i_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - num_points: number of items to process
+ @ (label defined exactly once above -- a second definition of the same
+ @ symbol is an assembler error: "symbol already defined")
+ pld [r1:128]
+ push {r4, r5, r6} @ preserve register states
+ lsrs r5, r2, #4 @ 1/16th points = num_points/16
+ vmov.i32 q12, #0 @ q12 = [0,0,0,0]
+ beq .smallvector @ less than 16 elements in vector
+ mov r4, r1 @ r4 = aVector
+ mov r12, r0 @ gcc calls this ip
+ mov r3, #0 @ number = 0
+
+.loop1:
+ vld2.16 {d16-d19}, [r4]! @ aVector, interleaved load (q8 = even, q9 = odd elements)
+ pld [r4:128]
+ add r3, r3, #1 @ number += 1
+ cmp r3, r5 @ number < 1/16th points
+ vsub.i16 q10, q8, q9 @ difference of each pair
+ vcge.s16 q11, q10, #0 @ difference >= 0? (all-ones mask where true)
+ vcgt.s16 q10, q12, q10 @ difference < 0?
+ vand.i16 q11, q8, q11 @ mask even elements by the >= 0 comparison
+ vand.i16 q10, q9, q10 @ mask odd elements by the < 0 comparison
+ vadd.i16 q10, q11, q10 @ combine masked halves to get the selection
+ vst1.16 {d20-d21}, [r12]! @ store the results
+ bne .loop1 @ at least 16 items left
+ add r1, r1, r3, lsl #5 @ advance aVector: 32 bytes consumed per iteration
+ add r0, r0, r3, lsl #4 @ advance cVector: 16 bytes produced per iteration
+.smallvector:
+ ands r2, r2, #15 @ remaining points = num_points mod 16
+ beq .end
+ mov r3, #0
+.loop3: @ scalar tail: one input pair per iteration
+ ldrh r4, [r1]
+ bic r5, r3, #1 @ output byte offset (half the input offset)
+ ldrh ip, [r1, #2]
+ add r3, r3, #2
+ add r1, r1, #4
+ rsb r6, ip, r4 @ r6 = first - second
+ sxth r6, r6 @ sign-extend the 16-bit difference
+ cmp r6, #0
+ movgt ip, r4 @ keep the first element when difference > 0
+ cmp r3, r2
+ strh ip, [r0, r5]
+ bcc .loop3
+.end:
+ pop {r4, r5, r6} @ restore and return
+ bx lr
diff --git a/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s
new file mode 100644
index 0000000000..8262e4cd29
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s
@@ -0,0 +1,57 @@
+@ static inline void volk_32f_s32f_multiply_32f_neonasm(float* cVector, const float* aVector, const float scalar, unsigned int num_points);
+ .global volk_32f_s32f_multiply_32f_neonasm
+volk_32f_s32f_multiply_32f_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array
+ @ r3 - num_points: number of items to process
+ @ scalar: presumably in s0 under the hard-float (AAPCS-VFP) ABI -- TODO confirm
+ @ NOTE(review): work-in-progress kernel -- both processing loops below are
+ @ commented out, so this function currently returns without writing output.
+
+ @ AAPCS requires only r4-r11 to be preserved; r12 (ip) is caller-saved, and
+ @ listing both r10 and its alias sl in one register list is a
+ @ duplicate-register error, so the list is trimmed to r4-r11.
+ stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11} @ prologue - save register states
+
+
+ @ quarter_points = num_points / 4
+ movs r11, r3, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ @ number = quarter_points
+ mov r10, r3
+ @ copy address of input vector
+ mov r4, r1
+ @ copy address of output vector
+ mov r5, r0
+
+ @ load the scalar to a quad register
+ @ vmov.32 d2[0], r2
+ @ The scalar might be in s0, not totally sure
+ vdup.32 q2, d0[0]
+
+ @ this is giving fits. Current theory is hf has something to do with it
+ .loop1:
+ @ vld1.32 {q1}, [r4:128]! @ aVal
+ @ vmul.f32 q3, q1, q2
+ @ vst1.32 {q3}, [r5:128]! @ cVal
+ @
+ @ subs r10, r10, #1
+ @ bne .loop1 @ first loop
+
+ @ number = quarter_points * 4
+ mov r10, r11, asl #2
+
+ .loop2:
+ @ cmp num_points, number
+ @ bls .done
+ @
+ @ vld1.32 {d0[0]}, [aVector]!
+ @ vmul.f32 s2, s0, s4
+ @ vst1.32 {d1[0]}, [cVector]!
+ @ add number, number, #1
+ @ b .loop2
+
+.done:
+ ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11} @ epilogue - restore register states
+ bx lr
+
+
+
+
+
diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s
new file mode 100644
index 0000000000..09e8638423
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s
@@ -0,0 +1,54 @@
+@ static inline void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_add_32f_a_neonasm
+volk_32f_x2_add_32f_a_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ @ AAPCS Section 5.1.1
+ @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, #0 @ number, 0
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ add number, number, #1
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ vadd.f32 cVal, bVal, aVal
+ cmp number, quarterPoints
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ blt .loop1 @ loop while number < quarterPoints; `ble` here would run
+ @ one extra iteration, reading and writing 4 floats past
+ @ the quarter-point region
+
+ mov number, quarterPoints, asl #2
+
+.loop2: @ scalar tail: one float per iteration
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d0[0]}, [aVector]! @ s0
+ vld1.32 {d0[1]}, [bVector]! @ s1
+ vadd.f32 s2, s1, s0
+ vst1.32 {d1[0]}, [cVector]! @ s2
+ add number, number, #1
+ b .loop2
+
+.done:
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s
new file mode 100644
index 0000000000..4c8af8b51c
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s
@@ -0,0 +1,68 @@
+@ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_add_32f_a_neonpipeline
+volk_32f_x2_add_32f_a_neonpipeline:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+ @ Software-pipelined: the first load is peeled so each loop iteration can
+ @ overlap the add/store of chunk N with the loads of chunk N+1.
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ subs number, number, #1
+ beq .loop1done @ quarterPoints == 1: go straight to the drain step;
+ @ otherwise the loop below over-reads one chunk and its
+ @ counter wraps past zero (2^32 iterations)
+
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+ vadd.f32 cVal, bVal, aVal
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+.loop1done:
+ @ Drain the pipeline: the last chunk was loaded but not yet added/stored
+ vadd.f32 cVal, bVal, aVal
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ mov number, quarterPoints, asl #2
+
+.loop2: @ scalar tail: one float per iteration
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d0[0]}, [aVector]! @ s0
+ vld1.32 {d0[1]}, [bVector]! @ s1
+ vadd.f32 s2, s1, s0
+ vst1.32 {d1[0]}, [cVector]! @ s2
+ add number, number, #1
+ b .loop2
+
+.done:
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
+
+
+
+
+
diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s
new file mode 100644
index 0000000000..64579579e5
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s
@@ -0,0 +1,58 @@
+@ static inline void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_dot_prod_32f_neonasm
+volk_32f_x2_dot_prod_32f_neonasm:
+ @ r0 - cVector: pointer to output (single float result)
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ @ AAPCS Section 5.1.1
+ @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor.32 q0, q0, q0 @ zero the 4-lane accumulator
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, #0 @ number, 0
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ vld1.32 {q1}, [aVector:128]! @ aVal
+ vld1.32 {q2}, [bVector:128]! @ bVal
+ vmla.f32 q0, q1, q2 @ accumulator += aVal * bVal (per lane)
+
+ add number, number, #1
+ cmp number, quarterPoints
+ blt .loop1 @ loop while number < quarterPoints; `ble` here would run
+ @ one extra iteration, reading 4 floats past the quarter
+ @ region and corrupting the accumulator
+
+ @ strange order comes from trying to schedule instructions
+ vadd.f32 s0, s0, s1 @ horizontal reduction of q0 (s0-s3) into s0
+ vadd.f32 s2, s2, s3
+ mov number, quarterPoints, asl #2
+ vadd.f32 s0, s0, s2
+
+.loop2: @ scalar tail: one multiply-accumulate per iteration
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d1[0]}, [aVector]! @ s2
+ vld1.32 {d1[1]}, [bVector]! @ s3
+ vmla.f32 s0, s2, s3
+ add number, number, #1
+ b .loop2
+
+.done:
+ vstr s0, [cVector] @ write the scalar dot product
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s
new file mode 100644
index 0000000000..3093edc121
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s
@@ -0,0 +1,116 @@
+@ static inline void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ @ r0 = cVector
+ @ r1 = aVector
+ @ r2 = bVector
+ @ r3 = num_points
+ @ Dot product unrolled by 16 with four parallel NEON accumulators (q0-q3),
+ @ followed by a compiler-style (objdump-derived) alignment/remainder tail.
+ .global volk_32f_x2_dot_prod_32f_neonasm_opts
+volk_32f_x2_dot_prod_32f_neonasm_opts:
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ @ sixteenth_points = num_points / 16
+ lsrs r8, r3, #4
+ sub r13, r13, #16 @ reserve 16 bytes of stack scratch (kept balanced at .done)
+ @ 0 out neon accumulators (x ^ x == 0 regardless of q3's contents)
+ veor q0, q3, q3
+ veor q1, q3, q3
+ veor q2, q3, q3
+ veor q3, q3, q3
+ beq .smallvector @ if less than 16 points skip main loop
+ mov r7, r2 @ copy input ptrs
+ mov r6, r1 @ copy input ptrs
+ mov r5, #0 @ loop counter
+.mainloop:
+ vld4.32 {d16,d18,d20,d22}, [r6]! @ 16 floats from aVector into q8-q11 (even d regs)
+ add r5, r5, #1 @ inc loop counter
+ cmp r5, r8 @ loop counter < sixteenth_points?
+ vld4.32 {d24,d26,d28,d30}, [r7]! @ 16 floats from bVector into q12-q15
+ vld4.32 {d17,d19,d21,d23}, [r6]!
+ vld4.32 {d25,d27,d29,d31}, [r7]!
+ vmla.f32 q3, q8, q12
+ vmla.f32 q0, q13, q9
+ vmla.f32 q1, q14, q10
+ vmla.f32 q2, q15, q11
+ bne .mainloop
+ lsl r12, r8, #6 @ r12 = r8 * 64 = bytes consumed (16 floats * 4 bytes per iter)
+ add r1, r1, r12
+ add r2, r2, r12
+.smallvector: @ actually this can be skipped for small vectors
+ vadd.f32 q3, q3, q0
+ lsl r8, r8, #4 @ sixteenth_points * 16
+ cmp r3, r8 @ num_points < sixteenth_points*16?
+ vadd.f32 q2, q1, q2
+ vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3
+ vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float
+ vadd.f32 s15, s15, s13
+ vadd.f32 s15, s15, s14
+ bls .done @ if vector is multiple of 16 then finish
+ sbfx r11, r1, #2, #1 @ check alignment
+ rsb r9, r8, r3 @ r9 = remaining points
+ and r11, r11, #3
+ mov r6, r1
+ cmp r11, r9
+ movcs r11, r9
+ cmp r9, #3
+ movls r11, r9
+ cmp r11, #0
+ beq .nothingtodo
+ mov r5, r2
+ mov r12, r8
+.dlabel5: @ scalar prologue: consume elements until the pointer is aligned
+ add r12, r12, #1
+ vldmia r6!, {s14}
+ rsb r4, r8, r12
+ vldmia r5!, {s13}
+ cmp r4, r11
+ vmla.f32 s15, s13, s14
+ mov r7, r6
+ mov r4, r5
+ bcc .dlabel5
+ cmp r9, r11
+ beq .done
+.dlabel8: @ vectorize the aligned remainder 4-at-a-time into q8
+ rsb r9, r11, r9
+ lsr r8, r9, #2
+ lsls r10, r8, #2
+ beq .dlabel6
+ lsl r6, r11, #2
+ veor q8, q8, q8
+ add r1, r1, r6
+ add r6, r2, r6
+ mov r5, #0
+.dlabel9:
+ add r5, r5, #1
+ vld1.32 {d20-d21}, [r6]!
+ cmp r5, r8
+ vld1.64 {d18-d19}, [r1 :64]!
+ vmla.f32 q8, q10, q9
+ bcc .dlabel9
+ vadd.f32 d16, d16, d17 @ reduce q8 and fold into the running sum s15
+ lsl r2, r10, #2
+ veor q9, q9, q9
+ add r7, r7, r2
+ vpadd.f32 d6, d16, d16
+ add r4, r4, r2
+ cmp r9, r10
+ add r12, r12, r10
+ vadd.f32 s15, s15, s12
+ beq .done
+.dlabel6:
+ mov r2, r7
+.dlabel7: @ final scalar cleanup loop (fewer than 4 points left)
+ add r12, r12, #1
+ vldmia r2!, {s13}
+ cmp r3, r12
+ vldmia r4!, {s14}
+ vmla.f32 s15, s13, s14
+ bhi .dlabel7
+.done:
+ vstr s15, [r0] @ store the scalar result
+ add r13, r13, #16 @ release the stack scratch
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr @ lr is the return address
+.nothingtodo:
+ mov r12, r8
+ mov r4, r2
+ mov r7, r1
+ b .dlabel8
+
diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s
new file mode 100644
index 0000000000..481cadee2c
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s
@@ -0,0 +1,79 @@
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+ .global volk_32fc_32f_dot_prod_32fc_a_neonasm
+ volk_32fc_32f_dot_prod_32fc_a_neonasm:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+.loop1:
+ @ do work here
+ @pld [taps, #128] @ pre-load hint - this is implementation specific!
+ @pld [input, #128] @ pre-load hint - this is implementation specific!
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal
+ vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal (de-interleaved)
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Sum up across realAccQ and compAccQ
+ vpadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vpadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+
+.loop2: @ scalar tail: one complex sample per iteration
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmul.f32 s5, s8, s10 @ s5/s6 are free scratch: only s0 and s4 remain live
+ vmul.f32 s6, s8, s12
+ vadd.f32 realAccS, realAccS, s5
+ vadd.f32 compAccS, compAccS, s6
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s
new file mode 100644
index 0000000000..aaf70e2cbc
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s
@@ -0,0 +1,86 @@
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+ .global volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline
+volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+ @ Optimizing for pipeline
+ @ NOTE(review): these loads fill d18-d21 (q9, q10), but the multiplies below
+ @ read inRealVal (q10) and inCompVal (q11); q11 is never loaded. The
+ @ non-pipelined a_neonasm variant loads {d20-d23} instead -- verify.
+ @ NOTE(review): with quarterPoints == 1 the subs below reaches 0 yet control
+ @ still falls into .loop1 (counter wraps); and the last chunk loaded inside
+ @ .loop1 is never multiplied into the accumulators (no drain step) -- verify.
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal
+ vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal
+ subs number, number, #1
+
+.loop1:
+ @ do work here
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal
+ vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal
+
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Sum up across realAccQ and compAccQ
+ vpadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vpadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+.loop2: @ scalar tail: one complex sample per iteration
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmul.f32 s5, s8, s12 @ NOTE(review): s10/s12 are swapped relative to the
+ vmul.f32 s6, s8, s10 @ a_neonasm variant (real/imag interchange) -- verify
+ vadd.f32 realAccS, realAccS, s5
+ vadd.f32 compAccS, compAccS, s6
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s
new file mode 100644
index 0000000000..cb50e4bced
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s
@@ -0,0 +1,74 @@
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points)
+ .global volk_32fc_32f_dot_prod_32fc_a_neonasmvmla
+volk_32fc_32f_dot_prod_32fc_a_neonasmvmla:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+.loop1:
+ @ do work here
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal
+ @ NOTE(review): this load fills d18-d21 (q9, q10), but the vmla below reads
+ @ inRealVal (q10) and inCompVal (q11); q11 is never loaded. The a_neonasm
+ @ variant loads {d20-d23} to match the q10/q11 aliases -- verify.
+ vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal
+ vmla.f32 realAccQ, tapsVal, inRealVal
+ vmla.f32 compAccQ, tapsVal, inCompVal
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Sum up across realAccQ and compAccQ
+ vpadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vpadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+.loop2: @ scalar tail: one complex sample per iteration
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmla.f32 realAccS, s8, s10 @ d0[0]
+ vmla.f32 compAccS, s8, s12 @ d2[0]
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s
new file mode 100644
index 0000000000..7185ab9d17
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s
@@ -0,0 +1,146 @@
+@ static inline void volk_32fc_32f_dot_prod_32fc_unrollasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points)
+@ Complex-by-real dot product, unrolled by 8 with two interleaved accumulator
+@ pairs (q2/q3 and q4/q5); compiler-derived (objdump) scalar tail below.
+.global volk_32fc_32f_dot_prod_32fc_unrollasm
+volk_32fc_32f_dot_prod_32fc_unrollasm:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {q4-q7} @ q4-q7 are callee-saved per AAPCS, so they must be preserved
+ sub r13, r13, #56 @ 0x38 -- stack scratch; result is staged at [r13]/[r13,#4]
+ add r12, r13, #8
+ lsrs r8, r3, #3 @ r8 = num_points / 8
+ veor.32 q2, q5, q5 @ zero the four accumulators (x ^ x == 0)
+ veor.32 q3, q5, q5
+ veor.32 q4, q5, q5
+ veor.32 q5, q5, q5
+ beq .smallvector
+ vld2.32 {d20-d23}, [r1]! @ peel the first chunk: q10 = reals, q11 = imags
+ vld1.32 {d24-d25}, [r2]! @ q12 = taps
+ mov r5, #1
+
+
+
+.mainloop:
+ vld2.32 {d14-d17}, [r1]! @ q7,q8
+ vld1.32 {d18-d19}, [r2]! @ q9
+
+ vmul.f32 q0, q12, q10 @ real mult
+ vmul.f32 q1, q12, q11 @ imag mult
+
+ add r5, r5, #1
+ cmp r5, r8
+
+ vadd.f32 q4, q4, q0@ q4 accumulates real
+ vadd.f32 q5, q5, q1@ q5 accumulates imag
+
+ vld2.32 {d20-d23}, [r1]! @ q10-q11
+ vld1.32 {d24-d25}, [r2]! @ q12
+
+ vmul.f32 q13, q9, q7
+ vmul.f32 q14, q9, q8
+ vadd.f32 q2, q2, q13 @ q2 accumulates real
+ vadd.f32 q3, q3, q14 @ q3 accumulates imag
+
+
+
+ bne .mainloop
+
+ @ drain: the chunk loaded on the final iteration has not been multiplied yet
+ vmul.f32 q0, q12, q10 @ real mult
+ vmul.f32 q1, q12, q11 @ imag mult
+
+ vadd.f32 q4, q4, q0@ q4 accumulates real
+ vadd.f32 q5, q5, q1@ q5 accumulates imag
+
+
+.smallvector:
+ @ combine both accumulator pairs and reduce to one real (s14) / imag (s15)
+ vadd.f32 q0, q2, q4
+ add r12, r13, #24
+ lsl r8, r8, #3 @ r8 = points handled by the main loop
+ vadd.f32 q1, q3, q5
+ cmp r3, r8
+
+ vadd.f32 d0, d0, d1
+ vadd.f32 d1, d2, d3
+ vadd.f32 s14, s0, s1
+ vadd.f32 s15, s2, s3
+
+ vstr s14, [r13] @ stage partial result in the stack scratch area
+ vstr s15, [r13, #4]
+ bls .D1
+ rsb r12, r8, r3 @ r12 = remaining points
+ lsr r4, r12, #2
+ cmp r4, #0
+ cmpne r12, #3
+ lsl r5, r4, #2
+ movhi r6, #0
+ movls r6, #1
+ bls .L1
+ vmov.i32 q10, #0 @ 0x00000000
+ mov r9, r1
+ mov r7, r2
+ vorr q11, q10, q10
+
+.smallloop: @ vectorized remainder, 4 complex samples at a time
+ add r6, r6, #1
+ vld2.32 {d16-d19}, [r9]!
+ cmp r4, r6
+ vld1.32 {d24-d25}, [r7]!
+ vmla.f32 q11, q12, q8
+ vmla.f32 q10, q12, q9
+ bhi .smallloop
+ vmov.i32 q9, #0 @ 0x00000000
+ cmp r12, r5
+ vadd.f32 d20, d20, d21
+ add r8, r8, r5
+ vorr q8, q9, q9
+ add r1, r1, r5, lsl #3 @ input advances 8 bytes per complex sample
+ vadd.f32 d22, d22, d23
+ add r2, r2, r5, lsl #2 @ taps advance 4 bytes per sample
+ vpadd.f32 d18, d20, d20
+ vpadd.f32 d16, d22, d22
+ vmov.32 r4, d18[0]
+ vmov.32 r12, d16[0]
+ vmov s13, r4
+ vadd.f32 s15, s13, s15
+ vmov s13, r12
+ vadd.f32 s14, s13, s14
+ beq .finishreduction
+ .L1: @ scalar tail: up to 3 remaining complex samples, fully unrolled
+ add r12, r8, #1
+ vldr s13, [r2]
+ cmp r3, r12
+ vldr s11, [r1]
+ vldr s12, [r1, #4]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+ bls .finishreduction
+ add r8, r8, #2
+ vldr s13, [r2, #4]
+ cmp r3, r8
+ vldr s11, [r1, #8]
+ vldr s12, [r1, #12]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+ bls .finishreduction
+ vldr s13, [r2, #8]
+ vldr s11, [r1, #16]
+ vldr s12, [r1, #20]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+
+.finishreduction:
+ vstr s14, [r13]
+ vstr s15, [r13, #4]
+ .D1: @ copy the staged complex result from stack scratch to *result
+ ldr r3, [r13]
+ str r3, [r0]
+ ldr r3, [r13, #4]
+ str r3, [r0, #4]
+ add r13, r13, #56 @ 0x38 -- release stack scratch
+ vpop {q4-q7}
+ pop {r4, r5, r6, r7, r8, r9}
+ bx r14 @ r14 is lr: return
+
+
diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s
new file mode 100644
index 0000000000..a1c5b7f184
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s
@@ -0,0 +1,98 @@
+@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+@ Complex dot product, unrolled by 8 with eight NEON accumulators (q8-q15);
+@ scalar tail delegates the complex multiply to __mulsc3.
+ .global volk_32fc_x2_dot_prod_32fc_neonasm
+volk_32fc_x2_dot_prod_32fc_neonasm:
+ push {r4, r5, r6, r7, r8, lr}
+ vpush {q0-q7} @ only q4-q7 are callee-saved; q0-q3 push is extra caution
+ vpush {q8-q15} @ q8-q15 are caller-saved; this push also provides the
+ @ [sp] scratch that .smallvector stores q8/q9 into below
+ mov r8, r3 @ hold on to num_points (r8)
+ @ zero out accumulators -- leave 1 reg in alu
+ veor q8, q15, q15
+ mov r7, r0 @ (r7) is cVec
+ veor q9, q15, q15
+ mov r5, r1 @ (r5) is aVec
+ veor q10, q15, q15
+ mov r6, r2 @ (r6) is bVec
+ veor q11, q15, q15
+ lsrs r3, r3, #3 @ eighth_points (r3) = num_points/8
+ veor q12, q15, q15
+ mov r12, r2 @ (r12) is bVec
+ veor q13, q15, q15
+ mov r4, r1 @ (r4) is aVec
+ veor q14, q15, q15
+ veor q15, q15, q15
+ beq .smallvector @ nathan optimized this file based on an objdump
+ @ but I don't understand this jump. Seems like it should go to loop2
+ @ and smallvector (really vector reduction) shouldn't need to be a label
+ @ (reducing the just-zeroed accumulators yields 0, so the jump is safe)
+ mov r2, #0 @ 0 out r2 (now number)
+.loop1:
+ add r2, r2, #1 @ increment number
+ vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3
+ cmp r2, r3 @ is number < eighth_points
+ @pld [r12, #64]
+ vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7
+ @pld [r4, #64]
+ vmla.f32 q12, q4, q0 @ real (re*re)
+ vmla.f32 q14, q4, q1 @ imag (re*im)
+ vmls.f32 q15, q5, q1 @ real (im*im)
+ vmla.f32 q13, q5, q0 @ imag (im*re)
+
+ vmla.f32 q8, q2, q6 @ real (re*re)
+ vmla.f32 q9, q2, q7 @ imag (re*im)
+ vmls.f32 q10, q3, q7 @ real (im*im)
+ vmla.f32 q11, q3, q6 @ imag (im*re)
+ bne .loop1
+ lsl r2, r3, #3 @ r2 = eighth_points * 8
+ add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!?
+ @ NOTE(review): the loop consumes 64 bytes/iteration from r12/r4, but r6/r5
+ @ advance only eighth_points*8 bytes here (lsl #6 would match) -- verify
+ add r5, r5, r2 @ aVec = aVec + eighth_points
+ @ q12-q13 were original real accumulators
+ @ q14-q15 were original imag accumulators
+ @ reduce 8 accumulators down to 2 (1 real, 1 imag)
+ vadd.f32 q8, q10, q8 @ real + real
+ vadd.f32 q11, q11, q9 @ imag + imag
+ vadd.f32 q12, q12, q15 @ real + real
+ vadd.f32 q14, q14, q13 @ imag + imag
+ vadd.f32 q8, q8, q12
+ vadd.f32 q9, q9, q14
+.smallvector:
+ lsl r4, r3, #3 @ r4 = points already handled by the main loop
+ cmp r8, r4
+ vst2.32 {d16-d19}, [sp :64] @ stage q8/q9 on the stack so the final reduction
+ @ can use VFP loads (overwrites the vpush'd q8/q9
+ @ save slots, which is harmless: q8-q15 are
+ @ caller-saved anyway)
+ vldr s15, [sp, #8]
+ vldr s17, [sp] @ s16/s17 are callee-saved VFP regs, so they
+ vldr s16, [sp, #4] @ survive the __mulsc3 call in .loop2
+ vadd.f32 s17, s17, s15
+ vldr s11, [sp, #12]
+ vldr s12, [sp, #24]
+ vldr s13, [sp, #28]
+ vldr s14, [sp, #16]
+ vldr s15, [sp, #20]
+ vadd.f32 s16, s16, s11
+ vadd.f32 s17, s17, s12
+ vadd.f32 s16, s16, s13
+ vadd.f32 s17, s17, s14
+ vadd.f32 s16, s16, s15
+ vstr s17, [r7] @ store the (partial) result: real, imag
+ vstr s16, [r7, #4]
+ bls .done
+.loop2: @ scalar tail: one complex MAC per iteration via __mulsc3
+ mov r3, r6
+ add r6, r6, #8
+ vldr s0, [r3] @ s0/s1 = b sample, s2/s3 = a sample (AAPCS-VFP args)
+ vldr s1, [r6, #-4]
+ mov r3, r5
+ add r5, r5, #8
+ vldr s2, [r3]
+ vldr s3, [r5, #-4]
+ bl __mulsc3 @ GCC/Clang complex-multiply helper from libgcc /
+ @ compiler-rt; returns the product in s0/s1. Portability?
+ add r4, r4, #1
+ cmp r4, r8
+ vadd.f32 s17, s17, s0
+ vadd.f32 s16, s16, s1
+ vstr s17, [r7]
+ vstr s16, [r7, #4]
+ bne .loop2
+.done:
+ vpop {q8-q15}
+ vpop {q0-q7}
+ pop {r4, r5, r6, r7, r8, pc}
+
diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s
new file mode 100644
index 0000000000..77f026e1b0
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s
@@ -0,0 +1,96 @@
+@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@
+@ Complex dot product, 8 complex samples per main-loop iteration, eight
+@ partial accumulators (q0-q7), __mulsc3 for the scalar tail.
+@ Register map: r0=cVector(out), r1=aVector, r2=bVector, r3=num_points.
+.global volk_32fc_x2_dot_prod_32fc_neonasm_opttests
+volk_32fc_x2_dot_prod_32fc_neonasm_opttests:
+ push {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+ vpush {d8-d15} @ AAPCS: d8-d15 (q4-q7) are callee-saved
+ lsrs fp, r3, #3 @ fp = num_points / 8; sets Z for the beq below
+ sub sp, sp, #52 @ 0x34 - scratch for reduction and __mulsc3 args
+ mov r9, r3 @ r9 = num_points
+ mov sl, r0 @ sl = cVector
+ mov r7, r1 @ r7 = aVector
+ mov r8, r2 @ r8 = bVector
+ @ NOTE(review): only q6 is actually zeroed below (veor); q0-q5 and q7
+ @ receive a copy of UNINITIALIZED q7, so seven of the eight accumulators
+ @ start with garbage unless q7 happens to be zero on entry. Presumably
+ @ each line was meant to be a veor -- confirm before shipping.
+ vorr q0, q7, q7
+ vorr q1, q7, q7
+ vorr q2, q7, q7
+ vorr q3, q7, q7
+ vorr q4, q7, q7
+ vorr q5, q7, q7
+ veor q6, q7, q7
+ vorr q7, q7, q7
+ beq .smallvector @ num_points < 8: skip vector loop
+ mov r4, r1
+ mov ip, r2
+ mov r3, #0 @ r3 = loop counter
+.mainloop:
+ @mov r6, ip
+ @mov r5, r4
+ @ NOTE(review): with the two movs above commented out, r5/r6 are used as
+ @ load pointers without ever being initialized (the r4/ip copies above are
+ @ dead). Looks like post-increment addressing was adopted without seeding
+ @ r5 = aVector, r6 = bVector first -- confirm; likely a real bug.
+ vld4.32 {d24,d26,d28,d30}, [r6]! @ b: even-half deinterleave into q12-q15
+ @add ip, ip, #64 @ 0x40
+ @add r4, r4, #64 @ 0x40
+ vld4.32 {d16,d18,d20,d22}, [r5]! @ a: even-half deinterleave into q8-q11
+ add r3, r3, #1
+ vld4.32 {d25,d27,d29,d31}, [r6]! @ b: odd halves
+ vld4.32 {d17,d19,d21,d23}, [r5]! @ a: odd halves
+ vmla.f32 q6, q8, q12 @ accumulate re*re
+ vmla.f32 q0, q9, q12 @ accumulate im*re
+ cmp r3, fp @ NEON ops below do not touch the flags
+ vmls.f32 q5, q13, q9 @ subtract im*im (real part)
+ vmla.f32 q2, q13, q8 @ accumulate re*im
+ vmla.f32 q7, q10, q14
+ vmla.f32 q1, q11, q14
+ vmls.f32 q4, q15, q11
+ vmla.f32 q3, q15, q10
+ bne .mainloop
+ lsl r3, fp, #6 @ r3 = eighth_points * 64 bytes consumed
+ add r8, r8, r3 @ advance bVector to the tail
+ add r7, r7, r3 @ advance aVector to the tail
+.smallvector:
+ @ Reduce the eight q accumulators to one real/imag pair (q8/q9), spill
+ @ them to the stack, and finish the horizontal sum with scalar vadds.
+ vadd.f32 q3, q2, q3
+ add r3, sp, #16
+ lsl r4, fp, #3 @ r4 = eighth_points * 8 = samples already done
+ vadd.f32 q4, q5, q4
+ cmp r9, r4 @ flags consumed by bls below
+ vadd.f32 q6, q6, q7
+ vadd.f32 q1, q0, q1
+ vadd.f32 q8, q6, q4
+ vadd.f32 q9, q1, q3
+ vst2.32 {d16-d19}, [r3 :64] @ spill interleaved partials to [sp+16..47]
+ vldr s15, [sp, #24]
+ vldr s16, [sp, #16]
+ vldr s17, [sp, #20]
+ vadd.f32 s16, s16, s15
+ vldr s11, [sp, #28]
+ vldr s12, [sp, #40] @ 0x28
+ vldr s13, [sp, #44] @ 0x2c
+ vldr s14, [sp, #32]
+ vldr s15, [sp, #36] @ 0x24
+ vadd.f32 s17, s17, s11
+ vadd.f32 s16, s16, s12
+ vadd.f32 s17, s17, s13
+ vadd.f32 s16, s16, s14
+ vadd.f32 s17, s17, s15
+ vstr s16, [sl] @ write running real part
+ vstr s17, [sl, #4] @ write running imag part
+ bls .epilog @ no tail samples remain
+ add r5, sp, #8 @ r5 = sret pointer handed to __mulsc3
+.tailcase:
+ @ One complex multiply per iteration via the compiler runtime helper.
+ ldr r3, [r7], #8 @ a.real bits
+ mov r0, r5
+ ldr r1, [r8], #8 @ b.real bits
+ add r4, r4, #1
+ ldr ip, [r7, #-4] @ a.imag bits
+ ldr r2, [r8, #-4] @ b.imag bits
+ str ip, [sp]
+ bl __mulsc3 @ GCC/Clang built-in. Portability?
+ vldr s14, [sp, #8] @ product.real from the sret buffer
+ vldr s15, [sp, #12] @ product.imag
+ vadd.f32 s16, s16, s14
+ cmp r4, r9
+ vadd.f32 s17, s17, s15
+ vstr s16, [sl] @ result is re-stored every tail iteration
+ vstr s17, [sl, #4]
+ bne .tailcase
+.epilog:
+ add sp, sp, #52 @ 0x34
+ vpop {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, sl, fp, pc}
diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s
new file mode 100644
index 0000000000..5d79b466ac
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s
@@ -0,0 +1,45 @@
+@ static inline void volk_32fc_x2_multiply_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+@ Element-wise complex multiply: c[i] = a[i] * b[i], 4 complex samples per
+@ main-loop iteration, VFP scalar tail for the remainder.
+ .global volk_32fc_x2_multiply_32fc_neonasm
+volk_32fc_x2_multiply_32fc_neonasm:
+ push {r4, r5, r6, r7, r8, r9, r14}
+ lsrs r7, r3, #2
+ @ r0 is c vector
+ @ r1 is a vector
+ @ r2 is b vector
+ @ r3 is num_points (complex samples)
+ @ r7 is quarter_points
+ beq .smallvector
+ mov r5, #0
+.mainloop:
+ vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13
+ add r5, r5, #1
+ cmp r5, r7 @ NEON ops below leave the flags intact
+ vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11
+ pld [r1]
+ pld [r2]
+ vmul.f32 q0, q12, q10 @ q0 = ar*br
+ vmul.f32 q1, q13, q11 @ q1 = ai*bi
+ vmul.f32 q2, q12, q11 @ q2 = ar*bi
+ vmul.f32 q3, q13, q10 @ q3 = ai*br
+ vsub.f32 q8, q0, q1 @ real
+ vadd.f32 q9, q2, q3 @ imag
+ vst2.32 {d16-d19}, [r0]!
+ bne .mainloop
+
+.smallvector:
+ lsl r5, r7, #2 @ r5 = quarter_points*4 = samples already processed
+ cmp r3, r5 @ BUGFIX: compare num_points against processed count;
+ @ was "cmp r3, r7" (num_points vs quarter_points), which made the
+ @ tail loop run past the end of the buffers for any num_points >= 4.
+ bls .done
+.tailcase:
+ add r5, r5, #1
+ vld1.32 d1, [r1]! @ s2, s3 = ar, ai
+ vld1.32 d0, [r2]! @ s0, s1 = br, bi
+ vmul.f32 s4, s0, s2 @ s4 = ar*br
+ vmul.f32 s5, s0, s3 @ s5 = ar*bi
+ vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi
+ vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br
+ vst1.32 d2, [r0]! @ store (s4, s5) = one complex result
+ cmp r3, r5
+ bne .tailcase
+.done:
+ pop {r4, r5, r6, r7, r8, r9, r15}
diff --git a/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s b/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s
new file mode 100644
index 0000000000..758e7436cd
--- /dev/null
+++ b/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s
@@ -0,0 +1,92 @@
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
+@ Complex-by-real dot product, software-pipelined: each loop iteration
+@ multiplies the previously loaded chunk while fetching the next one.
+ .global volk_32fc_32f_dot_prod_32fc_a_neonpipeline
+volk_32fc_32f_dot_prod_32fc_a_neonpipeline:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+ @ NOTE(review): on the branch above, `number` (r8) is read by .loop2
+ @ before it has ever been written -- undefined tail count whenever
+ @ num_points < 4. Confirm; looks like a real bug.
+
+ mov number, quarterPoints
+ @ Optimizing for pipeline
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal
+ vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal
+ subs number, number, #1
+ @ NOTE(review): there is no beq after the subs above, so for
+ @ quarterPoints == 1 the loop is entered with number == 0 and the
+ @ subs/bne below wraps through the whole 32-bit range. Confirm.
+
+.loop1:
+ @ do work here
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal (next chunk)
+ vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal (next chunk)
+
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Drain the pipeline: multiply the final pre-loaded chunk.
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+
+ @ Sum up across realAccQ and compAccQ
+ vpadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vpadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (realAccQ)
+ mov number, quarterPoints, asl #2
+ sub number, number, #5
+ @ NOTE(review): the sub #5 above makes the tail loop run
+ @ (num_points - quarterPoints*4 + 5) times, reading 5 elements past the
+ @ buffers; the pointers were already advanced by quarterPoints*4, so no
+ @ subtraction at all appears correct. Confirm against the generic kernel.
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 (d5/d6 alias q2/q3, now dead)
+ vmul.f32 s5, s8, s10 @ s5/s6 alias compAccQ lanes 1-2; harmless
+ vmul.f32 s6, s8, s12 @ after the reduction, only s0/s4 are live
+ vadd.f32 realAccS, realAccS, s5
+ vadd.f32 compAccS, compAccS, s6
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
diff --git a/volk/kernels/volk/volk_16i_max_star_16i.h b/volk/kernels/volk/volk_16i_max_star_16i.h
index c67351c5fa..5366a2e325 100644
--- a/volk/kernels/volk/volk_16i_max_star_16i.h
+++ b/volk/kernels/volk/volk_16i_max_star_16i.h
@@ -85,6 +85,44 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un
#endif /*LV_HAVE_SSSE3*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) {
+  /* Reduces src0 with the max* operation (a plain signed max here) and
+     writes the winner to target[0].  The first element seeds the candidate
+     vector; 8 elements are compared per iteration, then the 8 lanes are
+     reduced and the scalar tail is folded in. */
+  const unsigned int eighth_points = num_points / 8;
+  unsigned number;
+  int16x8_t input_vec;
+  int16x8_t diff, zeros;
+  uint16x8_t comp1, comp2;
+  zeros = vdupq_n_s16(0); /* was veorq of an uninitialized vector (UB in C) */
+
+  int16x8x2_t tmpvec;
+
+  int16x8_t candidate_vec = vld1q_dup_s16(src0 );
+  short candidate;
+  ++src0;
+
+  for(number=0; number < eighth_points; ++number) {
+    input_vec = vld1q_s16(src0);
+    __builtin_prefetch(src0+16);
+    diff = vsubq_s16(candidate_vec, input_vec);
+    comp1 = vcgeq_s16(diff, zeros);
+    comp2 = vcltq_s16(diff, zeros);
+
+    /* Disjoint masks select candidate where diff >= 0, input otherwise;
+       the add merges the two selections. */
+    tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
+    tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
+
+    candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
+    src0 += 8;
+  }
+
+  /* Horizontal reduction of the 8 lanes.  The original stored the whole
+     vector through &candidate, smashing 14 bytes past a 2-byte scalar. */
+  short candidates[8];
+  vst1q_s16(candidates, candidate_vec);
+  candidate = candidates[0];
+  for(number = 1; number < 8; ++number) {
+    candidate = ((int16_t)(candidate - candidates[number]) > 0) ? candidate : candidates[number];
+  }
+
+  /* Scalar tail: src0 already points past the vectorized portion. */
+  for(number=0; number < num_points%8; number++) {
+    candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
+  }
+  target[0] = candidate;
+}
+#endif /*LV_HAVE_NEON*/
+
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) {
diff --git a/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
index ef88ec094f..1915522947 100644
--- a/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
+++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h
@@ -110,6 +110,40 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in
#endif /*LV_HAVE_SSSE3*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) {
+  /* For each adjacent pair (src0[2i], src0[2i+1]) writes the max* winner
+     (a plain signed max here) to target[i].  vld2q deinterleaves the
+     pairs, so 8 pairs (16 inputs) are handled per iteration. */
+  const unsigned int eighth_points = num_points / 16; /* 8 outputs per pass */
+  unsigned number;
+  int16x8x2_t input_vec;
+  int16x8_t diff, max_vec, zeros;
+  uint16x8_t comp1, comp2;
+  zeros = vdupq_n_s16(0); /* was veorq of an uninitialized vector (UB in C) */
+  for(number=0; number < eighth_points; ++number) {
+    input_vec = vld2q_s16(src0);
+    //__builtin_prefetch(src0+16);
+    diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
+    comp1 = vcgeq_s16(diff, zeros);
+    comp2 = vcltq_s16(diff, zeros);
+
+    /* Disjoint masks select val[0] where diff >= 0, val[1] otherwise;
+       the add merges the two selections. */
+    input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
+    input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
+
+    max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
+    vst1q_s16(target, max_vec);
+    src0 += 16;
+    target += 8;
+  }
+  /* Scalar tail over the remaining pairs.  NOTE(review): if num_points is
+     odd, the last iteration reads src0[num_points%16] one element past the
+     tail -- confirm callers always pass an even num_points. */
+  for(number=0; number < num_points%16; number+=2) {
+    target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? src0[number] : src0[number+1];
+  }
+
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target, int16_t* src0, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) {
diff --git a/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
index 56b2cc07ab..8e84b6ea17 100644
--- a/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
+++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h
@@ -165,6 +165,66 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s
#endif /*LV_HAVE_SSE2*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
+  /* target[i] = max*(max*(src0[i], src1[i]), max*(src2[i], src3[i])),
+     where max* is a plain signed max here; 8 elements per iteration. */
+  const unsigned int eighth_points = num_points / 8;
+  unsigned i;
+
+  int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
+  int16x8_t diff12, diff34;
+  int16x8_t comp0, comp1, comp2, comp3;
+  int16x8_t result1_vec, result2_vec;
+  int16x8_t zeros;
+  zeros = vdupq_n_s16(0); /* was veorq of an uninitialized vector (UB in C) */
+  for(i=0; i < eighth_points; ++i) {
+    src0_vec = vld1q_s16(src0);
+    src1_vec = vld1q_s16(src1);
+    src2_vec = vld1q_s16(src2);
+    src3_vec = vld1q_s16(src3);
+    /* First round: winner of (src0,src1) and winner of (src2,src3).
+       The ge/lt masks are disjoint, so and+add performs the select. */
+    diff12 = vsubq_s16(src0_vec, src1_vec);
+    diff34 = vsubq_s16(src2_vec, src3_vec);
+    comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+    comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+    comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
+    comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
+    comp0 = vandq_s16(src0_vec, comp0);
+    comp1 = vandq_s16(src1_vec, comp1);
+    comp2 = vandq_s16(src2_vec, comp2);
+    comp3 = vandq_s16(src3_vec, comp3);
+
+    result1_vec = vaddq_s16(comp0, comp1);
+    result2_vec = vaddq_s16(comp2, comp3);
+
+    /* Second round: winner of the two winners. */
+    diff12 = vsubq_s16(result1_vec, result2_vec);
+    comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+    comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+    comp0 = vandq_s16(result1_vec, comp0);
+    comp1 = vandq_s16(result2_vec, comp1);
+    result1_vec = vaddq_s16(comp0, comp1);
+    vst1q_s16(target, result1_vec);
+    src0 += 8;
+    src1 += 8;
+    src2 += 8;
+    src3 += 8;
+    target += 8;
+  }
+
+  /* Scalar tail: same tournament on the leftover elements. */
+  short temp0 = 0;
+  short temp1 = 0;
+  for(i=eighth_points*8; i < num_points; ++i) {
+    temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
+    temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
+    *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+    src0++;
+    src1++;
+    src2++;
+    src3++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {
diff --git a/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
index 9b6d19fd66..28575b6282 100644
--- a/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
+++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h
@@ -112,6 +112,52 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta
}
#endif /*LV_HAVE_SSE2*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
+
+  /* target_k[i] = src0[i] + src_{k+1}[i] for k = 0..3, 8 shorts per pass. */
+  const unsigned int eighth_points = num_points / 8;
+  unsigned int number = 0; /* was int: signed/unsigned comparison against the counts */
+
+  int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
+  int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
+  for(number = 0; number < eighth_points; ++number) {
+    src0_vec = vld1q_s16(src0);
+    src1_vec = vld1q_s16(src1);
+    src2_vec = vld1q_s16(src2);
+    src3_vec = vld1q_s16(src3);
+    src4_vec = vld1q_s16(src4);
+
+    target0_vec = vaddq_s16(src0_vec , src1_vec);
+    target1_vec = vaddq_s16(src0_vec , src2_vec);
+    target2_vec = vaddq_s16(src0_vec , src3_vec);
+    target3_vec = vaddq_s16(src0_vec , src4_vec);
+
+    vst1q_s16(target0, target0_vec);
+    vst1q_s16(target1, target1_vec);
+    vst1q_s16(target2, target2_vec);
+    vst1q_s16(target3, target3_vec);
+    src0 += 8;
+    src1 += 8;
+    src2 += 8;
+    src3 += 8;
+    src4 += 8;
+    target0 += 8;
+    target1 += 8;
+    target2 += 8;
+    target3 += 8;
+  }
+
+  /* Scalar tail; src0 is only advanced once all four sums used it. */
+  for(number = eighth_points * 8; number < num_points; ++number) {
+    *target0++ = *src0 + *src1++;
+    *target1++ = *src0 + *src2++;
+    *target2++ = *src0 + *src3++;
+    *target3++ = *src0++ + *src4++;
+  }
+}
+
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
diff --git a/volk/kernels/volk/volk_16u_byteswap.h b/volk/kernels/volk/volk_16u_byteswap.h
index 57f2008991..436caf0474 100644
--- a/volk/kernels/volk/volk_16u_byteswap.h
+++ b/volk/kernels/volk/volk_16u_byteswap.h
@@ -106,6 +106,36 @@ static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int n
}
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Byteswaps (in-place) an unaligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int number;
+  unsigned int eighth_points = num_points / 8;
+  uint16x8_t input, output;
+  uint16_t* inputPtr = intsToSwap;
+
+  for(number = 0; number < eighth_points; number++) {
+    input = vld1q_u16(inputPtr);
+    /* Seed the shift-insert chain from `input` itself: the original
+       inserted into an uninitialized `output` (UB in C), even though the
+       junk bits were later overwritten by the vsli. */
+    output = vsriq_n_u16(input, input, 8);  /* low byte  <- high byte */
+    output = vsliq_n_u16(output, input, 8); /* high byte <- low byte  */
+    vst1q_u16(inputPtr, output);
+    inputPtr += 8;
+  }
+
+  /* Scalar tail. */
+  for(number = eighth_points * 8; number < num_points; number++){
+    uint16_t output = *inputPtr;
+    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Byteswaps (in-place) an aligned vector of int16_t's.
diff --git a/volk/kernels/volk/volk_32f_invsqrt_32f.h b/volk/kernels/volk/volk_32f_invsqrt_32f.h
index 055370661a..8ea12a73c4 100644
--- a/volk/kernels/volk/volk_32f_invsqrt_32f.h
+++ b/volk/kernels/volk/volk_32f_invsqrt_32f.h
@@ -90,6 +90,37 @@ static inline void volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVect
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+\brief Sqrts the two input vectors and store their results in the third vector
+\param cVector The vector where the results will be stored
+\param aVector One of the vectors to be invsqrted
+\param num_points The number of values in aVector and bVector to be invsqrted together and stored into cVector
+*/
+static inline void volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points){
+  /* Approximate 1/sqrt(x) for each element: vrsqrteq_f32 estimate on the
+     four-wide portion, Q_rsqrt (fast inverse square root) on the tail.
+     Both paths are estimates, not IEEE-exact results. */
+  const unsigned int quarter_points = num_points / 4;
+  const float* in = aVector;
+  float* out = cVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < quarter_points; ++idx)
+  {
+    float32x4_t x = vld1q_f32(in);
+    vst1q_f32(out, vrsqrteq_f32(x));
+    in += 4;
+    out += 4;
+  }
+
+  for(idx = quarter_points * 4; idx < num_points; idx++)
+    *out++ = Q_rsqrt(*in++);
+
+}
+#endif /* LV_HAVE_NEON */
+
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Sqrts the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32f_s32f_multiply_32f.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
index 2dd86a17c2..8665d4e90e 100644
--- a/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
+++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h
@@ -180,6 +180,35 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float*
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Scalar float multiply
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param scalar the scalar value
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  /* c[i] = a[i] * scalar.  vmulq_n_f32 handles four floats per pass;
+     a plain scalar loop mops up the remainder. */
+  const unsigned int quarterPoints = num_points / 4;
+  const float* in = aVector;
+  float* out = cVector;
+  unsigned int n;
+
+  for(n = 0; n < quarterPoints; n++){
+    float32x4_t v = vld1q_f32(in);          // load four inputs
+    vst1q_f32(out, vmulq_n_f32(v, scalar)); // scale and store
+    in += 4;
+    out += 4;
+  }
+  for(n = quarterPoints * 4; n < num_points; n++){
+    *out++ = (*in++) * scalar;
+  }
+}
+#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC
/*!
@@ -216,6 +245,4 @@ static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float*
#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
diff --git a/volk/kernels/volk/volk_32f_sqrt_32f.h b/volk/kernels/volk/volk_32f_sqrt_32f.h
index ab9fffd7dc..2523abf0da 100644
--- a/volk/kernels/volk/volk_32f_sqrt_32f.h
+++ b/volk/kernels/volk/volk_32f_sqrt_32f.h
@@ -40,6 +40,35 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector,
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+/*!
+ \brief Sqrts the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be sqrted
+ \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points){
+  /* Approximate sqrt(x) as reciprocal(rsqrt(x)) using the NEON estimate
+     instructions; the tail uses exact sqrtf.  Low precision by design --
+     armv8's vsqrt would be the exact alternative. */
+  const unsigned int quarter_points = num_points / 4;
+  const float* in = aVector;
+  float* out = cVector;
+  unsigned int n;
+
+  for(n = 0; n < quarter_points; n++){
+    float32x4_t x = vld1q_f32(in);
+    float32x4_t root = vrecpeq_f32(vrsqrteq_f32(x)); // sqrt(x) = 1/rsqrt(x)
+    vst1q_f32(out, root);
+    in += 4;
+    out += 4;
+  }
+
+  for(n = quarter_points * 4; n < num_points; n++){
+    *out++ = sqrtf(*in++);
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Sqrts the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32f_x2_add_32f.h b/volk/kernels/volk/volk_32f_x2_add_32f.h
index 42278f6068..a9a1d4fbf0 100644
--- a/volk/kernels/volk/volk_32f_x2_add_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_add_32f.h
@@ -109,6 +109,49 @@ static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVecto
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*
+ \brief Adds the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) {
+  /* c[i] = a[i] + b[i].  Four floats per NEON iteration with prefetch
+     hints for the next chunk; scalar loop for the remainder. */
+  const unsigned int quarterPoints = num_points / 4;
+  const float* a = aVector;
+  const float* b = bVector;
+  float* c = cVector;
+  unsigned int n;
+
+  for(n = 0; n < quarterPoints; n++){
+    float32x4_t av = vld1q_f32(a);
+    float32x4_t bv = vld1q_f32(b);
+    __builtin_prefetch(a+4);
+    __builtin_prefetch(b+4);
+
+    vst1q_f32(c, vaddq_f32(av, bv));
+
+    a += 4; // q registers hold 4 floats per vadd
+    b += 4;
+    c += 4;
+  }
+
+  for(n = quarterPoints * 4; n < num_points; n++){
+    *c++ = (*a++) + (*b++);
+  }
+
+}
+
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Adds the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
index b91252e36f..ed16d9a49e 100644
--- a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h
@@ -577,4 +577,92 @@ static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float*
#endif /*LV_HAVE_AVX*/
+#ifdef LV_HAVE_NEON
+static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  /* Dot product with a factor-of-4 unroll: four independent NEON
+     accumulators (12 of the 16 q registers) hide the vmla latency.
+     vld4q de-striping does not affect the sum. */
+  unsigned int sixteenth_points = num_points / 16; /* was misnamed quarter_points */
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr= taps;
+  unsigned int number = 0;
+
+  float32x4x4_t a_val, b_val, accumulator0; /* unused accumulator1 removed */
+  accumulator0.val[0] = vdupq_n_f32(0);
+  accumulator0.val[1] = vdupq_n_f32(0);
+  accumulator0.val[2] = vdupq_n_f32(0);
+  accumulator0.val[3] = vdupq_n_f32(0);
+  // factor of 4 loop unroll with independent accumulators
+  for( number = 0; number < sixteenth_points; ++number) {
+    a_val = vld4q_f32(aPtr);
+    b_val = vld4q_f32(bPtr);
+    accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
+    accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
+    accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
+    accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Combine the four accumulators, then finish the horizontal sum. */
+  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
+  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
+  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
+  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
+  vst1q_f32(accumulator, accumulator0.val[0]);
+  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
+
+  /* Scalar tail for the last num_points % 16 elements. */
+  for(number = sixteenth_points*16; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}
+
+#endif
+
+
+
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) {
+  /* Dot product with a factor-of-2 unroll: two independent NEON
+     accumulators hide the vmla latency; a scalar loop sums the tail. */
+  const unsigned int eighth_points = num_points / 8;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+  unsigned int n;
+
+  float32x4x2_t acc;
+  acc.val[0] = vdupq_n_f32(0);
+  acc.val[1] = vdupq_n_f32(0);
+
+  for(n = 0; n < eighth_points; ++n) {
+    float32x4x2_t a = vld2q_f32(aPtr);
+    float32x4x2_t b = vld2q_f32(bPtr);
+    acc.val[0] = vmlaq_f32(acc.val[0], a.val[0], b.val[0]);
+    acc.val[1] = vmlaq_f32(acc.val[1], a.val[1], b.val[1]);
+    aPtr += 8;
+    bPtr += 8;
+  }
+
+  /* Merge the accumulators and finish the horizontal sum via memory. */
+  acc.val[0] = vaddq_f32(acc.val[0], acc.val[1]);
+  __VOLK_ATTR_ALIGNED(32) float partial[4];
+  vst1q_f32(partial, acc.val[0]);
+  float dotProduct = partial[0] + partial[1] + partial[2] + partial[3];
+
+  for(n = eighth_points * 8; n < num_points; n++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/kernels/volk/volk_32f_x2_interleave_32fc.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
index 0935cb32bd..3591b24d69 100644
--- a/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
+++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h
@@ -48,6 +48,39 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Interleaves the I & Q vector data into the complex vector.
+ \param iBuffer The I buffer data to be interleaved
+ \param qBuffer The Q buffer data to be interleaved
+ \param complexVector The complex output vector
+ \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  /* Packs separate I and Q streams into interleaved complex output.
+     vst2q_f32 performs the interleave four samples at a time. */
+  const unsigned int quarter_points = num_points / 4;
+  float* out = (float*) complexVector;
+  unsigned int n;
+
+  for(n = 0; n < quarter_points; ++n) {
+    float32x4x2_t iq;
+    iq.val[0] = vld1q_f32(iBuffer);
+    iq.val[1] = vld1q_f32(qBuffer);
+    vst2q_f32(out, iq);
+    iBuffer += 4;
+    qBuffer += 4;
+    out += 8; // 4 complex samples = 8 floats
+  }
+
+  for(n = quarter_points * 4; n < num_points; ++n) {
+    *out++ = *iBuffer++;
+    *out++ = *qBuffer++;
+  }
+
+}
+#endif /* LV_HAVE_NEON */
+
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Interleaves the I & Q vector data into the complex vector.
diff --git a/volk/kernels/volk/volk_32f_x2_max_32f.h b/volk/kernels/volk/volk_32f_x2_max_32f.h
index 27633acae8..a1403fba18 100644
--- a/volk/kernels/volk/volk_32f_x2_max_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_max_32f.h
@@ -45,6 +45,42 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_max_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* c[i] = max(a[i], b[i]).  vmaxq_f32 four-wide, scalar tail after. */
+  const unsigned int quarter_points = num_points / 4;
+  const float* inA = aVector;
+  const float* inB = bVector;
+  float* out = cVector;
+  unsigned int n;
+
+  for(n = 0; n < quarter_points; n++){
+    float32x4_t av = vld1q_f32(inA);
+    float32x4_t bv = vld1q_f32(inB);
+    vst1q_f32(out, vmaxq_f32(av, bv));
+    inA += 4;
+    inB += 4;
+    out += 4;
+  }
+
+  for(n = quarter_points * 4; n < num_points; n++){
+    const float x = *inA++;
+    const float y = *inB++;
+    *out++ = (x > y) ? x : y;
+  }
+
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
diff --git a/volk/kernels/volk/volk_32f_x2_min_32f.h b/volk/kernels/volk/volk_32f_x2_min_32f.h
index 4773d13211..eef5e5da2e 100644
--- a/volk/kernels/volk/volk_32f_x2_min_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_min_32f.h
@@ -45,6 +45,42 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+/*!
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be checked
+ \param bVector The vector to be checked
+ \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_x2_min_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* c[i] = min(a[i], b[i]).  vminq_f32 four-wide, scalar tail after. */
+  const unsigned int quarter_points = num_points / 4;
+  const float* inA = aVector;
+  const float* inB = bVector;
+  float* out = cVector;
+  unsigned int n;
+
+  for(n = 0; n < quarter_points; n++){
+    float32x4_t av = vld1q_f32(inA);
+    float32x4_t bv = vld1q_f32(inB);
+    vst1q_f32(out, vminq_f32(av, bv));
+    inA += 4;
+    inB += 4;
+    out += 4;
+  }
+
+  for(n = quarter_points * 4; n < num_points; n++){
+    const float x = *inA++;
+    const float y = *inB++;
+    *out++ = (x < y) ? x : y;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
diff --git a/volk/kernels/volk/volk_32f_x2_multiply_32f.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
index 9fdbec0a2c..8bbd81c8a6 100644
--- a/volk/kernels/volk/volk_32f_x2_multiply_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h
@@ -188,6 +188,33 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* a
}
#endif /* LV_HAVE_AVX */
+#ifdef LV_HAVE_NEON
+/*!
+ \brief Multiplys the two input vectors and store their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+  /* c[i] = a[i] * b[i].  Four-wide NEON multiply, scalar tail after. */
+  const unsigned int quarter_points = num_points / 4;
+  unsigned int n;
+  for(n = 0; n < quarter_points; ++n) {
+    float32x4_t av = vld1q_f32(aVector);
+    float32x4_t bv = vld1q_f32(bVector);
+    vst1q_f32(cVector, vmulq_f32(av, bv));
+    aVector += 4;
+    bVector += 4;
+    cVector += 4;
+  }
+  for(n = quarter_points * 4; n < num_points; ++n) {
+    *cVector++ = *aVector++ * *bVector++;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplys the two input vectors and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32f_x2_subtract_32f.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
index 8ea491f988..6831d89a10 100644
--- a/volk/kernels/volk/volk_32f_x2_subtract_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h
@@ -63,6 +63,40 @@ static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float*
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+/*!
+ \brief Subtracts bVector from aVector and stores their results in the cVector
+ \param cVector The vector where the results will be stored
+ \param aVector The initial vector
+ \param bVector The vector to be subtracted
+ \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4_t a_vec, b_vec, c_vec;
+
+ // vectorized loop: c = a - b, 4 floats per iteration
+ for(number = 0; number < quarter_points; number++){
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+ c_vec = vsubq_f32(a_vec, b_vec);
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ // scalar tail for the remaining num_points % 4 elements
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
#ifdef LV_HAVE_ORC
/*!
\brief Subtracts bVector form aVector and store their results in the cVector
diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
index fdef68209e..c555bbb696 100644
--- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
+++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h
@@ -293,7 +293,144 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0,
}
#endif // LV_HAVE_AVX
+#ifdef LV_HAVE_NEON
+
+// Evaluates sum over all points of the polynomial
+// c1*x + c2*x^2 + c3*x^3 + c4*x^4 (coefficients from center_point_array[0..3]),
+// clamping each x to at least *cutoff, then adds center_point_array[4] once per point.
+// NOTE(review): `i` is signed int but compared against unsigned num_points (compiler warning);
+// the dbg_* locals and `final_result` below are unused leftovers from debugging.
+static inline void volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) {
+
+
+ int i;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+
+ float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x2_t cutoff_vector;
+ float32x2x2_t x_low, x_high;
+ float32x4_t x_qvector, c_qvector, cpa_qvector;
+ float accumulator, final_result;
+ float res_accumulators[4];
+
+ float dbg_cpa[4], dbg_x[4], dbg_c[4];
+ float dbg_max[4];
+ float dbg_x_to_1[2], dbg_x_to_2[2], dbg_x_to_3[2], dbg_x_to_4[2];
+ float dbg_x_high[2], dbg_x_low[2];
+ float dbg_foo;
+
+ c_qvector = vld1q_f32( zero );
+ // load the cutoff in to a vector
+ cutoff_vector = vdup_n_f32( *cutoff );
+ // ... center point array
+ cpa_qvector = vld1q_f32( center_point_array );
+
+ // one scalar point per iteration: build [x | x^2 | x^3 | x^4] and
+ // multiply-accumulate against the coefficient vector
+ for(i=0; i < num_points; ++i) {
+ // load x (src0)
+ x_to_1 = vdup_n_f32( *src0++ );
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1
+ x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
+ // zip up doubles to interleave
+ x_low = vzip_f32(x_to_1, x_to_2); // val[0] = [x^1 | x^2]
+ x_high = vzip_f32(x_to_3, x_to_4); // val[0] = [x^3 | x^4]
+ // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+ x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
+ // now we finally have [x^4 | x^3 | x^2 | x] !
+
+ c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
+ }
+ // there should be better vector reduction techniques
+ vst1q_f32(res_accumulators, c_qvector );
+ accumulator = res_accumulators[0] + res_accumulators[1] +
+ res_accumulators[2] + res_accumulators[3];
+
+ // center_point_array[4] is a per-point constant term added num_points times
+ *target = accumulator + center_point_array[4] * (float)num_points;
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+
+// Same polynomial sum as the _a_neon variant, but fully vertical: four input
+// points per iteration, one quad accumulator per polynomial power.
+// NOTE(review): `i` is signed but compared against unsigned num_points.
+static inline void volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) {
+
+
+ int i;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+
+ float accumulator, final_result;
+
+
+ float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
+ accumulator1_vec = vld1q_f32(zero);
+ accumulator2_vec = vld1q_f32(zero);
+ accumulator3_vec = vld1q_f32(zero);
+ accumulator4_vec = vld1q_f32(zero);
+ float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
+
+ // load the cutoff in to a vector
+ cutoff_vector = vdupq_n_f32( *cutoff );
+ // ... center point array: broadcast one coefficient per power
+ cpa_0 = vdupq_n_f32(center_point_array[0]);
+ cpa_1 = vdupq_n_f32(center_point_array[1]);
+ cpa_2 = vdupq_n_f32(center_point_array[2]);
+ cpa_3 = vdupq_n_f32(center_point_array[3]);
+
+
+ // nathan is not sure why this is slower *and* wrong compared to neonvertfma
+ for(i=0; i < num_points/4; ++i) {
+ // load x
+ x_to_1 = vld1q_f32( src0 );
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1
+ x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
+ x_to_1 = vmulq_f32(x_to_1, cpa_0);
+ x_to_2 = vmulq_f32(x_to_2, cpa_1);
+ x_to_3 = vmulq_f32(x_to_3, cpa_2);
+ x_to_4 = vmulq_f32(x_to_4, cpa_3);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
+ accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
+ accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
+
+ src0 += 4;
+ }
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
+
+ __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
+ vst1q_f32(res_accumulators, accumulator1_vec );
+ accumulator = res_accumulators[0] + res_accumulators[1] +
+ res_accumulators[2] + res_accumulators[3];
+
+ float result = 0.0;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ // BUG(review): `4*num_points/4` parses as (4*num_points)/4 == num_points, so this
+ // tail loop never runs and the last num_points % 4 elements are silently dropped;
+ // intended start is (num_points/4)*4. Even if it ran, src0 has already been advanced
+ // by the vector loop, so src0[i] would index past the end of the input.
+ for(i = 4*num_points/4; i < num_points; ++i) {
+ fst = src0[i];
+ fst = MAX(fst, *cutoff);
+
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ //fith = sq * thrd;
+
+ accumulator += (center_point_array[0] * fst +
+ center_point_array[1] * sq +
+ center_point_array[2] * thrd +
+ center_point_array[3] * frth); //+
+ }
+
+ *target = accumulator + center_point_array[4] * (float)num_points;
+}
+#endif /* LV_HAVE_NEON */
#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
index 44535da6d8..cf67c134fd 100644
--- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h
@@ -283,6 +283,166 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const l
}
#endif /*LV_HAVE_AVX*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+// Complex-by-real dot product, unrolled 2x (8 complex points per iteration).
+// NOTE(review): despite the name, quarterPoints here is num_points/8;
+// real_accum, current_accum and dbgVec are unused leftovers.
+static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
+
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+ float* real_accum;
+ float current_accum = 0.0f ;
+ float accVector_real[4];
+ float accVector_imag[4];
+
+ float32x4x2_t inputVector0, inputVector1;
+ float32x4_t tapsVector0, tapsVector1;
+ float32x4_t tmp_real0, tmp_imag0;
+ float32x4_t tmp_real1, tmp_imag1;
+ float32x4_t real_accumulator0, imag_accumulator0;
+ float32x4_t real_accumulator1, imag_accumulator1;
+
+
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator0 = vld1q_f32( zero );
+ imag_accumulator0 = vld1q_f32( zero );
+ real_accumulator1 = vld1q_f32( zero );
+ imag_accumulator1 = vld1q_f32( zero );
+ float dbgVec[8];
+
+ for(number=0 ;number < quarterPoints; number++){
+ // load two quadwords of taps
+ tapsVector0 = vld1q_f32(tapsPtr );
+ tapsVector1 = vld1q_f32(tapsPtr+4 );
+
+ // vld2q deinterleaves 4 complex numbers: 1st lane is real, 2nd imag
+ inputVector0 = vld2q_f32(inputPtr );
+ inputVector1 = vld2q_f32(inputPtr+8 );
+ // inputVector is now a struct of two vectors, 0th is real, 1st is imag
+
+ tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
+ tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
+
+ tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
+ tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
+
+ real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
+ imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
+
+ real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
+ imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
+
+ tapsPtr += 8;
+ inputPtr += 16;
+ }
+
+ // fold the two unrolled accumulators together
+ real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
+ imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
+ // void vst1q_f32( float32_t * ptr, float32x4_t val);
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator0);
+ vst1q_f32(accVector_imag, imag_accumulator0);
+ *realpt = accVector_real[0] + accVector_real[1] +
+ accVector_real[2] + accVector_real[3] ;
+
+ *imagpt = accVector_imag[0] + accVector_imag[1] +
+ accVector_imag[2] + accVector_imag[3] ;
+
+ // clean up the remainder (interleaved floats: real then imag per point)
+ for(number=quarterPoints*8; number < num_points; number++){
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+// Complex-by-real dot product, 4 complex points per iteration.
+// NOTE(review): real_accum and current_accum are unused leftovers.
+static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) {
+
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
+ float* real_accum;
+ float current_accum = 0.0f ;
+ float accVector_real[4];
+ float accVector_imag[4];
+
+ float32x4x2_t inputVector;
+ float32x4_t tapsVector;
+ float32x4_t tmp_real, tmp_imag;
+ float32x4_t real_accumulator, imag_accumulator;
+
+
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator = vld1q_f32( zero );
+ imag_accumulator = vld1q_f32( zero );
+
+ for(number=0 ;number < quarterPoints; number++){
+ // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
+ // one quadword of real taps
+ tapsVector = vld1q_f32(tapsPtr );
+
+ // vld2q deinterleaves 4 complex numbers: 1st lane is real, 2nd imag
+ inputVector = vld2q_f32(inputPtr );
+
+ tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
+ tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
+
+ real_accumulator = vaddq_f32(real_accumulator, tmp_real);
+ imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
+
+
+ tapsPtr += 4;
+ inputPtr += 8;
+
+ }
+
+ // void vst1q_f32( float32_t * ptr, float32x4_t val);
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator);
+ vst1q_f32(accVector_imag, imag_accumulator);
+ *realpt = accVector_real[0] + accVector_real[1] +
+ accVector_real[2] + accVector_real[3] ;
+
+ *imagpt = accVector_imag[0] + accVector_imag[1] +
+ accVector_imag[2] + accVector_imag[3] ;
+
+ // clean up the remainder (interleaved floats: real then imag per point)
+ for(number=quarterPoints*4; number < num_points; number++){
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_SSE
diff --git a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
index a12d078c68..21b71998c2 100644
--- a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h
@@ -135,6 +135,43 @@ static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+ /*!
+ \brief Multiplies the input complex vector with the input float vector and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The complex vector to be multiplied
+ \param bVector The vector containing the float values to be multiplied against each complex value in aVector
+ \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr= bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t inputVector, outputVector;
+ float32x4_t tapsVector;
+ // 4 complex points per iteration; vld2q splits into real/imag lanes
+ for(number = 0; number < quarter_points; number++){
+ inputVector = vld2q_f32((float*)aPtr);
+ tapsVector = vld1q_f32(bPtr);
+
+ // scale real and imaginary parts by the same real tap
+ outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
+ outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
+
+ vst2q_f32((float*)cPtr, outputVector);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ // scalar tail for the remaining num_points % 4 elements
+ for(number = quarter_points * 4; number < num_points; number++){
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_ORC
/*!
\brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
diff --git a/volk/kernels/volk/volk_32fc_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
index dce897ff57..480fa36994 100644
--- a/volk/kernels/volk/volk_32fc_conjugate_32fc.h
+++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -106,6 +106,46 @@ static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_
}
#endif /* LV_HAVE_SSE3 */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4x2_t x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // leftover from an alternate sign-bit-XOR approach; currently unused
+ float conj[4] = {-0.f, -0.f, -0.f, -0.f};
+ //uint32x4_t conjugator;
+
+ //conjugator = vld1q_u32( (uint32_t *)conj );
+
+ for(number=0; number < quarterPoints; number++){
+ __builtin_prefetch(a+4);
+ x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
+
+ // negate the imaginary lane (vneg, not an xor of the sign bit)
+ x.val[1] = vnegq_f32( x.val[1]);
+
+ vst2q_f32((float*)c,x); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ // scalar tail for the remaining num_points % 4 elements
+ for(number=quarterPoints*4; number < num_points; number++){
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Takes the conjugate of a complex vector.
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h
index 64e99cc1be..cf3e8490eb 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -233,6 +233,112 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, con
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+ /*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+ */
+static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t complex_vec;
+ float32x4_t magnitude_vec;
+ for(number = 0; number < quarter_points; number++){
+ complex_vec = vld2q_f32(complexVectorPtr);
+ complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
+ // magnitude^2 = re^2 + im^2 via multiply-accumulate
+ magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
+ // sqrt(x) approximated as 1/(1/sqrt(x)) using the NEON estimate instructions;
+ // NOTE(review): vrsqrte/vrecpe are low-precision estimates with no Newton
+ // refinement, so these results differ slightly from the sqrtf tail below.
+ magnitude_vec = vrsqrteq_f32(magnitude_vec);
+ magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ // scalar tail: exact magnitude for the remaining num_points % 4 elements
+ for(number = quarter_points*4; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+ /*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+
+ This is an approximation from "Streamlining Digital Signal Processing" by
+ Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%.
+ The basic idea is to do a weighted sum of the abs. value of imag and real parts
+ where weight A is always assigned to max(imag, real) and B is always min(imag,real).
+ There are two pairs of coefficients chosen based on whether min <= 0.4142 max.
+ This method is called equiripple-error magnitude estimation proposed by Filip in '73
+
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+ */
+static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ // threshold is sqrt(2)-1, the branch point between the two coefficient pairs
+ const float threshold = 0.4142135;
+
+ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
+ a_high = vdupq_n_f32( 0.84 );
+ b_high = vdupq_n_f32( 0.561);
+ a_low = vdupq_n_f32( 0.99 );
+ b_low = vdupq_n_f32( 0.197);
+
+ uint32x4_t comp0, comp1;
+
+ float32x4x2_t complex_vec;
+ float32x4_t min_vec, max_vec, magnitude_vec;
+ float32x4_t real_abs, imag_abs;
+ for(number = 0; number < quarter_points; number++){
+ complex_vec = vld2q_f32(complexVectorPtr);
+
+ real_abs = vabsq_f32(complex_vec.val[0]);
+ imag_abs = vabsq_f32(complex_vec.val[1]);
+
+ min_vec = vminq_f32(real_abs, imag_abs);
+ max_vec = vmaxq_f32(real_abs, imag_abs);
+
+ // effective branch to choose coefficient pair.
+ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+ comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+
+ // comp0/comp1 are all-ones/all-zeros masks; AND them with the coefficient
+ // bit patterns and add to select one pair per lane without branching
+ a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
+ b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
+
+ // coefficients chosen, do the weighted sum
+ min_vec = vmulq_f32(min_vec, b_vec);
+ max_vec = vmulq_f32(max_vec, a_vec);
+
+ magnitude_vec = vaddq_f32(min_vec, max_vec);
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ // scalar tail: exact magnitude for the remaining num_points % 4 elements
+ for(number = quarter_points*4; number < num_points; number++){
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
#ifdef LV_HAVE_ORC
/*!
\brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
index 0af81401a8..878794ba79 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -206,6 +206,48 @@ static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector,
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+//
+
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t cmplx_val;
+ float32x4_t result;
+ // 4 complex points per iteration; vld2q splits into real/imag lanes
+ for(;number < quarterPoints; number++){
+ cmplx_val = vld2q_f32(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
+ cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
+
+ result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
+
+ vst1q_f32(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ // scalar tail for the remaining num_points % 4 elements
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 657fa3158f..fb79d6613c 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -760,4 +760,220 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
#endif /*LV_HAVE_SSE4_1*/
+#ifdef LV_HAVE_NEON
+
+// Complex dot product, 4 complex points per iteration; straightforward
+// mul/sub/add formulation (see the _opttests/_optfma variants for fused versions).
+static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, c_val, accumulator;
+ float32x4x2_t tmp_real, tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __builtin_prefetch(a_ptr+8);
+ __builtin_prefetch(b_ptr+8);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // complex multiply: re = rr - ii, im = ri + ir
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ // vst2q re-interleaves the lanes, yielding 4 complex partial sums
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+
+// Complex dot product variant using fused multiply-accumulate/subtract
+// (vmlaq/vmlsq) to compute each product in two instructions.
+static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, c_val, accumulator;
+ float32x4x2_t tmp_real, tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __builtin_prefetch(a_ptr+8);
+ __builtin_prefetch(b_ptr+8);
+
+ // do the first multiply: val[1] = ai*br, val[0] = ar*br
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result:
+ // im = ai*br + ar*bi, re = ar*br - ai*bi
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
+
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ // vst2q re-interleaves the lanes, yielding 4 complex partial sums
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+// Complex dot product variant that fuses every product directly into the
+// accumulators (4 vmla/vmls per iteration, no separate add step).
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator1, accumulator2;
+ float32x4x2_t tmp_real, tmp_imag;
+ accumulator1.val[0] = vdupq_n_f32(0);
+ accumulator1.val[1] = vdupq_n_f32(0);
+ accumulator2.val[0] = vdupq_n_f32(0);
+ accumulator2.val[1] = vdupq_n_f32(0);
+
+ for(number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __builtin_prefetch(a_ptr+8);
+ __builtin_prefetch(b_ptr+8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies:
+ // acc1 gets ar*br (re) and ar*bi (im); acc2 gets -ai*bi (re) and ai*br (im)
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+ accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ // fold the two accumulators, then re-interleave into 4 complex partial sums
+ accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
+ accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator1);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+// Complex dot product, fused multiply-accumulate and unrolled 2x
+// (8 complex points per iteration, 8 accumulator lanes).
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+// NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very fast
+
+ // despite the name, this counts groups of 8 complex points
+ unsigned int quarter_points = num_points / 8;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*) input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x4_t a_val, b_val, accumulator1, accumulator2;
+ float32x4x2_t reduced_accumulator;
+ accumulator1.val[0] = vdupq_n_f32(0);
+ accumulator1.val[1] = vdupq_n_f32(0);
+ accumulator1.val[2] = vdupq_n_f32(0);
+ accumulator1.val[3] = vdupq_n_f32(0);
+ accumulator2.val[0] = vdupq_n_f32(0);
+ accumulator2.val[1] = vdupq_n_f32(0);
+ accumulator2.val[2] = vdupq_n_f32(0);
+ accumulator2.val[3] = vdupq_n_f32(0);
+
+ // 8 input regs, 8 accumulators -> 16/16 neon regs are used
+ for(number = 0; number < quarter_points; ++number) {
+ // vld4q deinterleaves by 4: val[0]/val[1] hold re/im of even-indexed
+ // points, val[2]/val[3] hold re/im of odd-indexed points
+ a_val = vld4q_f32((float*)a_ptr);
+ b_val = vld4q_f32((float*)b_ptr);
+ __builtin_prefetch(a_ptr+8);
+ __builtin_prefetch(b_ptr+8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+
+ accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
+ accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);
+
+ accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+
+ accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
+ accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
+ // increment pointers
+ a_ptr += 8;
+ b_ptr += 8;
+ }
+ // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
+ accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
+ accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
+ accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
+ accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
+ reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
+ reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
+ // now reduce accumulators to scalars
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, reduced_accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for(number = quarter_points*8; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+
+}
+#endif /*LV_HAVE_NEON*/
+
+
#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 7db68c1bd8..0993a16ceb 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -149,6 +149,117 @@ static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+
+  // NOTE(review): these casts drop const; the pointers are only read below,
+  // so const lv_32fc_t* would be safer and equivalent.
+  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+  unsigned int quarter_points = num_points / 4;
+  float32x4x2_t a_val, b_val, c_val;
+  float32x4x2_t tmp_real, tmp_imag;
+  unsigned int number = 0;
+
+  // Process 4 complex values per iteration. vld2q_f32 deinterleaves, so
+  // val[0] holds 4 real parts and val[1] holds 4 imaginary parts.
+  for(number = 0; number < quarter_points; ++number) {
+    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+    // hint the next iteration's loads into cache
+    __builtin_prefetch(a_ptr+4);
+    __builtin_prefetch(b_ptr+4);
+
+    // multiply the real*real and imag*imag to get real result
+    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+    // Multiply cross terms to get the imaginary result
+    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+    // store the results
+    // real = rr - ii, imag = ri + ir; vst2q re-interleaves into cVector
+    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+    vst2q_f32((float*)cVector, c_val);
+
+    a_ptr += 4;
+    b_ptr += 4;
+    cVector += 4;
+  }
+
+  // scalar tail: handle the (num_points % 4) leftover complex values
+  for(number = quarter_points*4; number < num_points; number++){
+    *cVector++ = (*a_ptr++) * (*b_ptr++);
+  }
+
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+
+  // NOTE(review): casts drop const; pointers are read-only below.
+  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+  unsigned int quarter_points = num_points / 4;
+  // NOTE(review): c_val and tmp_real are declared but never used in this
+  // variant (only tmp_imag is); they can be removed.
+  float32x4x2_t a_val, b_val, c_val;
+  float32x4x2_t tmp_real, tmp_imag;
+  unsigned int number = 0;
+
+  // TODO: I suspect the compiler is doing a poor job scheduling this. This seems
+  // highly optimal, but is barely better than generic
+  // Variant of the kernel above using fused multiply-accumulate/subtract
+  // (vmla/vmls) instead of separate mul + add/sub.
+  for(number = 0; number < quarter_points; ++number) {
+    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+    __builtin_prefetch(a_ptr+4);
+    __builtin_prefetch(b_ptr+4);
+
+    // do the first multiply
+    // tmp_imag.val[1] = ai*br, tmp_imag.val[0] = ar*br
+    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+    // use multiply accumulate/subtract to get result
+    // imag = ai*br + ar*bi; real = ar*br - ai*bi
+    tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+    tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+    // store
+    vst2q_f32((float*)cVector, tmp_imag);
+    // increment pointers
+    a_ptr += 4;
+    b_ptr += 4;
+    cVector += 4;
+  }
+
+  // scalar tail for the leftover (num_points % 4) values
+  for(number = quarter_points*4; number < num_points; number++){
+    *cVector++ = (*a_ptr++) * (*b_ptr++);
+  }
+
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+// Implemented in hand-written assembly:
+// volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s
+// (added by this same commit); this header only declares it.
+extern void volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_ORC
/*!
\brief Multiplies the two input complex vectors and stores their results in the third vector
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
index cfd6c007f1..dbc123ff25 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -138,6 +138,59 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto
}
#endif /* LV_HAVE_SSE */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+
+  // NOTE(review): casts drop const; pointers are read-only below.
+  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
+  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
+  unsigned int quarter_points = num_points / 4;
+  float32x4x2_t a_val, b_val, c_val;
+  float32x4x2_t tmp_real, tmp_imag;
+  unsigned int number = 0;
+
+  for(number = 0; number < quarter_points; ++number) {
+    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+    // conjugate b up front by negating its imaginary lane, then do a
+    // plain complex multiply below
+    b_val.val[1] = vnegq_f32(b_val.val[1]);
+    __builtin_prefetch(a_ptr+4);
+    __builtin_prefetch(b_ptr+4);
+
+    // multiply the real*real and imag*imag to get real result
+    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+    // Multiply cross terms to get the imaginary result
+    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+    // store the results
+    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+    vst2q_f32((float*)cVector, c_val);
+
+    a_ptr += 4;
+    b_ptr += 4;
+    cVector += 4;
+  }
+
+  // NOTE(review): the tail uses conj() rather than volk's lv_conj();
+  // conj() needs <complex.h> in C — confirm it resolves in all build modes.
+  for(number = quarter_points*4; number < num_points; number++){
+    *cVector++ = (*a_ptr++) * conj(*b_ptr++);
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
diff --git a/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
index 27a081b7cf..56b3d5c230 100644
--- a/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
+++ b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h
@@ -92,6 +92,36 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t*
#endif /*LV_HAVE_SSE3*/
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+// Computes the squared Euclidean distance |src0[0] - points[i]|^2 for each
+// of the num_points complex entries in points, writing one float per entry
+// to target. Note only src0[0] is used as the reference point.
+static inline void volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
+  const unsigned int quarter_points = num_points / 4;
+  unsigned int number;
+
+  float32x4x2_t a_vec, b_vec;
+  float32x4x2_t diff_vec;
+  float32x4_t tmp, tmp1, dist_sq;
+  // broadcast the single reference point into real/imag lanes
+  a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
+  a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
+  for(number=0; number < quarter_points; ++number) {
+    // deinterleaved load: val[0] = 4 real parts, val[1] = 4 imag parts
+    b_vec = vld2q_f32((float*)points);
+    diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
+    diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
+    // (dr*dr) + (di*di) per lane
+    tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
+    tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
+
+    dist_sq = vaddq_f32(tmp, tmp1);
+    vst1q_f32(target, dist_sq);
+    points += 4;
+    target += 4;
+  }
+  // scalar tail for the leftover (num_points % 4) points
+  for(number=quarter_points*4; number < num_points; ++number) {
+    lv_32fc_t diff = src0[0] - *points++;
+    *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
diff --git a/volk/kernels/volk/volk_8i_convert_16i.h b/volk/kernels/volk/volk_8i_convert_16i.h
index 3e5c92723f..5b27900cf1 100644
--- a/volk/kernels/volk/volk_8i_convert_16i.h
+++ b/volk/kernels/volk/volk_8i_convert_16i.h
@@ -138,6 +138,44 @@ static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const in
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+ /*!
+ \brief Converts the input 8 bit integer data into 16 bit integer data
+ \param inputVector The 8 bit input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param num_points The number of data values to be converted
+ \note Input and output buffers do NOT need to be properly aligned
+ */
+static inline void volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number;
+  const unsigned int eighth_points = num_points / 8;
+  // NOTE(review): scale_factor is unused — the *256 scaling is done with the
+  // left-shift-by-8 below; this local can be removed.
+  float scale_factor = 256;
+
+  int8x8_t input_vec ;
+  int16x8_t converted_vec;
+
+  // NEON doesn't have a concept of 8 bit registers, so we are really
+  // dealing with the low half of 16-bit registers. Since this requires
+  // a move instruction we likely do better with ASM here.
+  for(number = 0; number < eighth_points; ++number) {
+    input_vec = vld1_s8(inputVectorPtr);
+    // sign-extend 8 lanes of s8 to s16
+    converted_vec = vmovl_s8(input_vec);
+    //converted_vec = vmulq_s16(converted_vec, scale_factor);
+    // shift left by 8 == multiply by 256, matching the scalar tail below
+    converted_vec = vshlq_n_s16(converted_vec, 8);
+    vst1q_s16( outputVectorPtr, converted_vec);
+
+    inputVectorPtr += 8;
+    outputVectorPtr += 8;
+  }
+
+  // scalar tail for the leftover (num_points % 8) values
+  for(number = eighth_points * 8; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
#ifdef LV_HAVE_ORC
/*!
\brief Converts the input 8 bit integer data into 16 bit integer data
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
index c8ff18e67b..427c9abf55 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -61,6 +61,34 @@ static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const
}
#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Deinterleaves the complex 8 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number;
+  unsigned int sixteenth_points = num_points / 16;
+
+  int8x16x2_t input_vector;
+  for(number=0; number < sixteenth_points; ++number) {
+    // vld2q_s8 deinterleaves 16 complex values: val[0] = I lanes, val[1] = Q lanes
+    input_vector = vld2q_s8((int8_t*) complexVector );
+    // keep only the real (I) lane
+    vst1q_s8(iBuffer, input_vector.val[0]);
+    iBuffer += 16;
+    complexVector += 16;
+  }
+
+  // scalar tail: both pointers were already advanced by the vector loop,
+  // so these start at the first unprocessed element
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  for(number = sixteenth_points*16; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    // skip the imaginary byte
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_NEON */
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index d72eb726e4..86b2c6a239 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -382,6 +382,29 @@ include_directories(
)
########################################################################
+# Handle ASM (for ARM) support
+# on by default, but let users turn it off
+########################################################################
+# NOTE(review): $ENV{ASM} is read at configure time only; reruns of cmake
+# without ASM in the environment will reset ASM-ATT. Also note file(GLOB)
+# below will not pick up newly added .s files until a manual re-configure.
+if( NOT DEFINED ENABLE_ARM_ASM OR ENABLE_ARM_ASM )
+  message("---- Adding ARM ASM files")
+  set(ASM-ATT $ENV{ASM})
+  #set(_CMAKE_TOOLCHAIN_PREFIX $ENV{TARGET_PREFIX}) # Gah - wtf, this shouldn't be needed
+  enable_language(ASM-ATT)
+  # what would make this OK, appending?
+  # NOTE(review): this overwrites (not appends to) any user-provided
+  # CMAKE_ASM-ATT_FLAGS; unquoted expansion also splits on spaces.
+  set(ASM-ATT_FLAGS "-mfpu=neon -g") # Horrid horrid hack to assemble for ARM neon
+  set(CMAKE_ASM-ATT_FLAGS ${ASM-ATT_FLAGS})
+  message("DEBUG: looking for ASM files in ${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon")
+  include_directories(${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon)
+  file(GLOB asm_files ${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon/*.s)
+  foreach(asm_file ${asm_files})
+    list(APPEND volk_sources ${asm_file})
+    message(STATUS "Adding source file: ${asm_file}")
+  endforeach(asm_file)
+else()
+  message("---- NOT Adding ARM ASM files")
+endif()
+
+########################################################################
# Handle orc support
########################################################################
if(ORC_FOUND)
@@ -436,7 +459,8 @@ list(APPEND volk_sources ${CMAKE_CURRENT_BINARY_DIR}/constants.c)
# Setup the volk sources list and library
########################################################################
if(NOT WIN32)
-    add_definitions(-fvisibility=hidden)
+    # NOTE(review): appending to the global CMAKE_<LANG>_FLAGS affects every
+    # target configured after this point; per-target target_compile_options()
+    # (or the CXX_VISIBILITY_PRESET property) would be more contained.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
 endif()
list(APPEND volk_sources