diff options
author | Nathan West <nathan.west@okstate.edu> | 2014-06-18 13:10:02 -0500 |
---|---|---|
committer | Nathan West <nathan.west@okstate.edu> | 2014-07-18 20:41:28 -0400 |
commit | 6e17772f423dc260051e37ceb25f9384ca8151ed (patch) | |
tree | 1c74d276ccaba2db6f35a1942cd4193b0943648e | |
parent | 93db96faa81b260367908e977f15c0d7a45358db (diff) |
volk: add NEON protokernels
42 files changed, 2738 insertions, 3 deletions
diff --git a/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s b/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s new file mode 100644 index 0000000000..2099355e7b --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s @@ -0,0 +1,52 @@ +@ static inline void volk_16i_max_star_horizontal_16i_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_16i_max_star_horizontal_16i_neonasm +volk_16i_max_star_horizontal_16i_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - num_points: number of items to process + +volk_16i_max_star_horizontal_16i_neonasm: + pld [r1:128] + push {r4, r5, r6} @ preserve register states + lsrs r5, r2, #4 @ 1/16th points = num_points/16 + vmov.i32 q12, #0 @ q12 = [0,0,0,0] + beq .smallvector @ less than 16 elements in vector + mov r4, r1 @ r4 = aVector + mov r12, r0 @ gcc calls this ip + mov r3, #0 @ number = 0 + +.loop1: + vld2.16 {d16-d19}, [r4]! @ aVector, interleaved load + pld [r4:128] + add r3, r3, #1 @ number += 1 + cmp r3, r5 @ number < 1/16th points + vsub.i16 q10, q8, q9 @ subtraction + vcge.s16 q11, q10, #0 @ result > 0? + vcgt.s16 q10, q12, q10 @ result < 0? + vand.i16 q11, q8, q11 @ multiply by comparisons + vand.i16 q10, q9, q10 @ multiply by other comparison + vadd.i16 q10, q11, q10 @ add results to get max + vst1.16 {d20-d21}, [r12]! 
@ store the results + bne .loop1 @ at least 16 items left + add r1, r1, r3, lsl #5 + add r0, r0, r3, lsl #4 +.smallvector: + ands r2, r2, #15 + beq .end + mov r3, #0 +.loop3: + ldrh r4, [r1] + bic r5, r3, #1 + ldrh ip, [r1, #2] + add r3, r3, #2 + add r1, r1, #4 + rsb r6, ip, r4 + sxth r6, r6 + cmp r6, #0 + movgt ip, r4 + cmp r3, r2 + strh ip, [r0, r5] + bcc .loop3 +.end: + pop {r4, r5, r6} + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s new file mode 100644 index 0000000000..8262e4cd29 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s @@ -0,0 +1,57 @@ +@ static inline void volk_32f_s32f_multiply_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_s32f_multiply_32f_neonasm +volk_32f_s32f_multiply_32f_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + + stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, sl} @ prologue - save register states + + + @ quarter_points = num_points / 4 + movs r11, r3, lsr #2 + beq .loop2 @ if zero into quarterPoints + + @ number = quarter_points + mov r10, r3 + @ copy address of input vector + mov r4, r1 + @ copy address of output vector + mov r5, r0 + + @ load the scalar to a quad register + @ vmov.32 d2[0], r2 + @ The scalar might be in s0, not totally sure + vdup.32 q2, d0[0] + + @ this is giving fits. Current theory is hf has something to do with it + .loop1: + @ vld1.32 {q1}, [r4:128]! @ aVal + @ vmul.f32 q3, q1, q2 + @ vst1.32 {q3}, [r5:128]! @ cVal + @ + @ subs r10, r10, #1 + @ bne .loop1 @ first loop + + @ number = quarter_points * 4 + mov r10, r11, asl #2 + + .loop2: + @ cmp num_points, number + @ bls .done + @ + @ vld1.32 {d0[0]}, [aVector]! + @ vmul.f32 s2, s0, s4 + @ vst1.32 {d1[0]}, [cVector]! 
+ @ add number, number, #1 + @ b .loop2 + +.done: + ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, sl} @ epilogue - restore register states + bx lr + + + + + diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s new file mode 100644 index 0000000000..09e8638423 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s @@ -0,0 +1,54 @@ +@ static inline void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_add_32f_a_neonasm +volk_32f_x2_add_32f_a_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + @ AAPCS Section 5.1.1 + @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, #0 @ number, 0 +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + vld1.32 {d0-d1}, [aVector:128]! @ aVal + add number, number, #1 + vld1.32 {d2-d3}, [bVector:128]! @ bVal + vadd.f32 cVal, bVal, aVal + cmp number, quarterPoints + vst1.32 {d4-d5}, [cVector:128]! @ cVal + + ble .loop1 @ first loop + + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d0[0]}, [aVector]! + vld1.32 {d0[1]}, [bVector]! + vadd.f32 s2, s1, s0 + vst1.32 {d1[0]}, [cVector]! 
+ add number, number, #1 + b .loop2 + +.done: + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s new file mode 100644 index 0000000000..4c8af8b51c --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s @@ -0,0 +1,68 @@ +@ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_add_32f_a_neonpipeline +volk_32f_x2_add_32f_a_neonpipeline: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + + @ Optimizing for pipeline + vld1.32 {d0-d1}, [aVector:128]! @ aVal + vld1.32 {d2-d3}, [bVector:128]! @ bVal + subs number, number, #1 + +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + vadd.f32 cVal, bVal, aVal + vld1.32 {d0-d1}, [aVector:128]! @ aVal + vld1.32 {d2-d3}, [bVector:128]! @ bVal + vst1.32 {d4-d5}, [cVector:128]! @ cVal + + subs number, number, #1 + bne .loop1 @ first loop + + @ One more time + vadd.f32 cVal, bVal, aVal + vst1.32 {d4-d5}, [cVector:128]! 
@ cVal + + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d0[0]}, [aVector]! + vld1.32 {d0[1]}, [bVector]! + vadd.f32 s2, s1, s0 + vst1.32 {d1[0]}, [cVector]! + add number, number, #1 + b .loop2 + +.done: + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr + + + + + diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s new file mode 100644 index 0000000000..64579579e5 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s @@ -0,0 +1,58 @@ +@ static inline void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_dot_prod_32f_neonasm +volk_32f_x2_dot_prod_32f_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + @ AAPCS Section 5.1.1 + @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + veor.32 q0, q0, q0 + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, #0 @ number, 0 +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + vld1.32 {q1}, [aVector:128]! @ aVal + vld1.32 {q2}, [bVector:128]! 
@ bVal + vmla.f32 q0, q1, q2 + + add number, number, #1 + cmp number, quarterPoints + ble .loop1 @ first loop + + @ strange order comes from trying to schedule instructions + vadd.f32 s0, s0, s1 + vadd.f32 s2, s2, s3 + mov number, quarterPoints, asl #2 + vadd.f32 s0, s0, s2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d1[0]}, [aVector]! + vld1.32 {d1[1]}, [bVector]! + vmla.f32 s0, s2, s3 + add number, number, #1 + b .loop2 + +.done: + vstr s0, [cVector] + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s new file mode 100644 index 0000000000..3093edc121 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s @@ -0,0 +1,116 @@ +@ static inline void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + @ r0 = cVector + @ r1 = aVector + @ r2 = bVector + @ r3 = num_points + .global volk_32f_x2_dot_prod_32f_neonasm_opts +volk_32f_x2_dot_prod_32f_neonasm_opts: + push {r4, r5, r6, r7, r8, r9, r10, r11} + @ sixteenth_points = num_points / 16 + lsrs r8, r3, #4 + sub r13, r13, #16 @ subtracting 16 from stack pointer?, wat? + @ 0 out neon accumulators + veor q0, q3, q3 + veor q1, q3, q3 + veor q2, q3, q3 + veor q3, q3, q3 + beq .smallvector @ if less than 16 points skip main loop + mov r7, r2 @ copy input ptrs + mov r6, r1 @ copy input ptrs + mov r5, #0 @ loop counter +.mainloop: + vld4.32 {d16,d18,d20,d22}, [r6]! + add r5, r5, #1 @ inc loop counter + cmp r5, r8 @ loop counter < sixteenth_points? + vld4.32 {d24,d26,d28,d30}, [r7]! + vld4.32 {d17,d19,d21,d23}, [r6]! + vld4.32 {d25,d27,d29,d31}, [r7]! 
+ vmla.f32 q3, q8, q12 + vmla.f32 q0, q13, q9 + vmla.f32 q1, q14, q10 + vmla.f32 q2, q15, q11 + bne .mainloop + lsl r12, r8, #6 @ r12=r8/64 + add r1, r1, r12 + add r2, r2, r12 +.smallvector: @ actually this can be skipped for small vectors + vadd.f32 q3, q3, q0 + lsl r8, r8, #4 @ sixteenth_points * 16 + cmp r3, r8 @ num_points < sixteenth_points*16? + vadd.f32 q2, q1, q2 + vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3 + vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float + vadd.f32 s15, s15, s13 + vadd.f32 s15, s15, s14 + bls .done @ if vector is multiple of 16 then finish + sbfx r11, r1, #2, #1 @ check alignment + rsb r9, r8, r3 + and r11, r11, #3 + mov r6, r1 + cmp r11, r9 + movcs r11, r9 + cmp r9, #3 + movls r11, r9 + cmp r11, #0 + beq .nothingtodo + mov r5, r2 + mov r12, r8 +.dlabel5: + add r12, r12, #1 + vldmia r6!, {s14} + rsb r4, r8, r12 + vldmia r5!, {s13} + cmp r4, r11 + vmla.f32 s15, s13, s14 + mov r7, r6 + mov r4, r5 + bcc .dlabel5 + cmp r9, r11 + beq .done +.dlabel8: + rsb r9, r11, r9 + lsr r8, r9, #2 + lsls r10, r8, #2 + beq .dlabel6 + lsl r6, r11, #2 + veor q8, q8, q8 + add r1, r1, r6 + add r6, r2, r6 + mov r5, #0 +.dlabel9: + add r5, r5, #1 + vld1.32 {d20-d21}, [r6]! + cmp r5, r8 + vld1.64 {d18-d19}, [r1 :64]! 
+ vmla.f32 q8, q10, q9 + bcc .dlabel9 + vadd.f32 d16, d16, d17 + lsl r2, r10, #2 + veor q9, q9, q9 + add r7, r7, r2 + vpadd.f32 d6, d16, d16 + add r4, r4, r2 + cmp r9, r10 + add r12, r12, r10 + vadd.f32 s15, s15, s12 + beq .done +.dlabel6: + mov r2, r7 +.dlabel7: + add r12, r12, #1 + vldmia r2!, {s13} + cmp r3, r12 + vldmia r4!, {s14} + vmla.f32 s15, s13, s14 + bhi .dlabel7 +.done: + vstr s15, [r0] + add r13, r13, #16 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr @ lr is the return address +.nothingtodo: + mov r12, r8 + mov r4, r2 + mov r7, r1 + b .dlabel8 + diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s new file mode 100644 index 0000000000..481cadee2c --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s @@ -0,0 +1,79 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + .global volk_32fc_32f_dot_prod_32fc_a_neonasm + volk_32fc_32f_dot_prod_32fc_a_neonasm: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be preserved. 
+ realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + +.loop1: + @ do work here + @pld [taps, #128] @ pre-load hint - this is implementation specific! + @pld [input, #128] @ pre-load hint - this is implementation specific! + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + subs number, number, #1 + bne .loop1 @ first loop + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (realAccQ) + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! @ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmul.f32 s5, s8, s10 + vmul.f32 s6, s8, s12 + vadd.f32 realAccS, realAccS, s5 + vadd.f32 compAccS, compAccS, s6 + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! 
@ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s new file mode 100644 index 0000000000..aaf70e2cbc --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s @@ -0,0 +1,86 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + .global volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline +volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be preserved. + realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! 
+ + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + @ Optimizing for pipeline + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal + subs number, number, #1 + +.loop1: + @ do work here + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal + + subs number, number, #1 + bne .loop1 @ first loop + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (realAccQ) + mov number, quarterPoints, asl #2 +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! @ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmul.f32 s5, s8, s12 + vmul.f32 s6, s8, s10 + vadd.f32 realAccS, realAccS, s5 + vadd.f32 compAccS, compAccS, s6 + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! 
@ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s new file mode 100644 index 0000000000..cb50e4bced --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s @@ -0,0 +1,74 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) + .global volk_32fc_32f_dot_prod_32fc_a_neonasmvmla +volk_32fc_32f_dot_prod_32fc_a_neonasmvmla: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be preserved. + realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + +.loop1: + @ do work here + pld [taps, #128] @ pre-load hint - this is implementation specific! 
+ pld [input, #128] @ pre-load hint - this is implementation specific! + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal + vmla.f32 realAccQ, tapsVal, inRealVal + vmla.f32 compAccQ, tapsVal, inCompVal + subs number, number, #1 + bne .loop1 @ first loop + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (compAccS) + mov number, quarterPoints, asl #2 +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! @ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmla.f32 realAccS, s8, s10 @ d0[0] + vmla.f32 compAccS, s8, s12 @ d2[0] + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! @ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s new file mode 100644 index 0000000000..7185ab9d17 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s @@ -0,0 +1,146 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_unrollasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) +.global volk_32fc_32f_dot_prod_32fc_unrollasm +volk_32fc_32f_dot_prod_32fc_unrollasm: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + push {r4, r5, r6, r7, r8, r9} + vpush {q4-q7} + sub r13, r13, #56 @ 0x38 + add r12, r13, #8 + lsrs r8, r3, #3 + veor.32 q2, q5, q5 + veor.32 q3, q5, q5 + veor.32 q4, q5, q5 + veor.32 q5, q5, q5 + beq 
.smallvector + vld2.32 {d20-d23}, [r1]! + vld1.32 {d24-d25}, [r2]! + mov r5, #1 + + + +.mainloop: + vld2.32 {d14-d17}, [r1]! @ q7,q8 + vld1.32 {d18-d19}, [r2]! @ q9 + + vmul.f32 q0, q12, q10 @ real mult + vmul.f32 q1, q12, q11 @ imag mult + + add r5, r5, #1 + cmp r5, r8 + + vadd.f32 q4, q4, q0@ q4 accumulates real + vadd.f32 q5, q5, q1@ q5 accumulates imag + + vld2.32 {d20-d23}, [r1]! @ q10-q11 + vld1.32 {d24-d25}, [r2]! @ q12 + + vmul.f32 q13, q9, q7 + vmul.f32 q14, q9, q8 + vadd.f32 q2, q2, q13 @ q2 accumulates real + vadd.f32 q3, q3, q14 @ q3 accumulates imag + + + + bne .mainloop + + vmul.f32 q0, q12, q10 @ real mult + vmul.f32 q1, q12, q11 @ imag mult + + vadd.f32 q4, q4, q0@ q4 accumulates real + vadd.f32 q5, q5, q1@ q5 accumulates imag + + +.smallvector: + vadd.f32 q0, q2, q4 + add r12, r13, #24 + lsl r8, r8, #3 + vadd.f32 q1, q3, q5 + cmp r3, r8 + + vadd.f32 d0, d0, d1 + vadd.f32 d1, d2, d3 + vadd.f32 s14, s0, s1 + vadd.f32 s15, s2, s3 + + vstr s14, [r13] + vstr s15, [r13, #4] + bls .D1 + rsb r12, r8, r3 + lsr r4, r12, #2 + cmp r4, #0 + cmpne r12, #3 + lsl r5, r4, #2 + movhi r6, #0 + movls r6, #1 + bls .L1 + vmov.i32 q10, #0 @ 0x00000000 + mov r9, r1 + mov r7, r2 + vorr q11, q10, q10 + +.smallloop: + add r6, r6, #1 + vld2.32 {d16-d19}, [r9]! + cmp r4, r6 + vld1.32 {d24-d25}, [r7]! 
+ vmla.f32 q11, q12, q8 + vmla.f32 q10, q12, q9 + bhi .smallloop + vmov.i32 q9, #0 @ 0x00000000 + cmp r12, r5 + vadd.f32 d20, d20, d21 + add r8, r8, r5 + vorr q8, q9, q9 + add r1, r1, r5, lsl #3 + vadd.f32 d22, d22, d23 + add r2, r2, r5, lsl #2 + vpadd.f32 d18, d20, d20 + vpadd.f32 d16, d22, d22 + vmov.32 r4, d18[0] + vmov.32 r12, d16[0] + vmov s13, r4 + vadd.f32 s15, s13, s15 + vmov s13, r12 + vadd.f32 s14, s13, s14 + beq .finishreduction + .L1: + add r12, r8, #1 + vldr s13, [r2] + cmp r3, r12 + vldr s11, [r1] + vldr s12, [r1, #4] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + bls .finishreduction + add r8, r8, #2 + vldr s13, [r2, #4] + cmp r3, r8 + vldr s11, [r1, #8] + vldr s12, [r1, #12] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + bls .finishreduction + vldr s13, [r2, #8] + vldr s11, [r1, #16] + vldr s12, [r1, #20] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + +.finishreduction: + vstr s14, [r13] + vstr s15, [r13, #4] + .D1: + ldr r3, [r13] + str r3, [r0] + ldr r3, [r13, #4] + str r3, [r0, #4] + add r13, r13, #56 @ 0x38 + vpop {q4-q7} + pop {r4, r5, r6, r7, r8, r9} + bx r14 + + diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s new file mode 100644 index 0000000000..a1c5b7f184 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s @@ -0,0 +1,98 @@ +@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32fc_x2_dot_prod_32fc_neonasm +volk_32fc_x2_dot_prod_32fc_neonasm: + push {r4, r5, r6, r7, r8, lr} + vpush {q0-q7} + vpush {q8-q15} + mov r8, r3 @ hold on to num_points (r8) + @ zero out accumulators -- leave 1 reg in alu + veor q8, q15, q15 + mov r7, r0 @ (r7) is cVec + veor q9, q15, q15 + mov r5, r1 @ (r5) is aVec + veor q10, q15, q15 + mov r6, r2 @ (r6) is bVec + veor q11, q15, q15 + lsrs r3, r3, #3 @ eighth_points (r3) = 
num_points/8 + veor q12, q15, q15 + mov r12, r2 @ (r12) is bVec + veor q13, q15, q15 + mov r4, r1 @ (r4) is aVec + veor q14, q15, q15 + veor q15, q15, q15 + beq .smallvector @ nathan optimized this file based on an objdump + @ but I don't understand this jump. Seems like it should go to loop2 + @ and smallvector (really vector reduction) shouldn't need to be a label + mov r2, #0 @ 0 out r2 (now number) +.loop1: + add r2, r2, #1 @ increment number + vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3 + cmp r2, r3 @ is number < eighth_points + @pld [r12, #64] + vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7 + @pld [r4, #64] + vmla.f32 q12, q4, q0 @ real (re*re) + vmla.f32 q14, q4, q1 @ imag (re*im) + vmls.f32 q15, q5, q1 @ real (im*im) + vmla.f32 q13, q5, q0 @ imag (im*re) + + vmla.f32 q8, q2, q6 @ real (re*re) + vmla.f32 q9, q2, q7 @ imag (re*im) + vmls.f32 q10, q3, q7 @ real (im*im) + vmla.f32 q11, q3, q6 @ imag (im*re) + bne .loop1 + lsl r2, r3, #3 @ r2 = eighth_points * 8 + add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!? + add r5, r5, r2 @ aVec = aVec + eighth_points + @ q12-q13 were original real accumulators + @ q14-q15 were original imag accumulators + @ reduce 8 accumulators down to 2 (1 real, 1 imag) + vadd.f32 q8, q10, q8 @ real + real + vadd.f32 q11, q11, q9 @ imag + imag + vadd.f32 q12, q12, q15 @ real + real + vadd.f32 q14, q14, q13 @ imag + imag + vadd.f32 q8, q8, q12 + vadd.f32 q9, q9, q14 +.smallvector: + lsl r4, r3, #3 + cmp r8, r4 + vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary! 
+ vldr s15, [sp, #8] + vldr s17, [sp] + vldr s16, [sp, #4] + vadd.f32 s17, s17, s15 + vldr s11, [sp, #12] + vldr s12, [sp, #24] + vldr s13, [sp, #28] + vldr s14, [sp, #16] + vldr s15, [sp, #20] + vadd.f32 s16, s16, s11 + vadd.f32 s17, s17, s12 + vadd.f32 s16, s16, s13 + vadd.f32 s17, s17, s14 + vadd.f32 s16, s16, s15 + vstr s17, [r7] + vstr s16, [r7, #4] + bls .done +.loop2: + mov r3, r6 + add r6, r6, #8 + vldr s0, [r3] + vldr s1, [r6, #-4] + mov r3, r5 + add r5, r5, #8 + vldr s2, [r3] + vldr s3, [r5, #-4] + bl __mulsc3 @ GCC/Clang built-in. Portability? + add r4, r4, #1 + cmp r4, r8 + vadd.f32 s17, s17, s0 + vadd.f32 s16, s16, s1 + vstr s17, [r7] + vstr s16, [r7, #4] + bne .loop2 +.done: + vpop {q8-q15} + vpop {q0-q7} + pop {r4, r5, r6, r7, r8, pc} + diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s new file mode 100644 index 0000000000..77f026e1b0 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s @@ -0,0 +1,96 @@ +@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@ +.global volk_32fc_x2_dot_prod_32fc_neonasm_opttests +volk_32fc_x2_dot_prod_32fc_neonasm_opttests: + push {r4, r5, r6, r7, r8, r9, sl, fp, lr} + vpush {d8-d15} + lsrs fp, r3, #3 + sub sp, sp, #52 @ 0x34 + mov r9, r3 + mov sl, r0 + mov r7, r1 + mov r8, r2 + vorr q0, q7, q7 + vorr q1, q7, q7 + vorr q2, q7, q7 + vorr q3, q7, q7 + vorr q4, q7, q7 + vorr q5, q7, q7 + veor q6, q7, q7 + vorr q7, q7, q7 + beq .smallvector + mov r4, r1 + mov ip, r2 + mov r3, #0 +.mainloop: + @mov r6, ip + @mov r5, r4 + vld4.32 {d24,d26,d28,d30}, [r6]! + @add ip, ip, #64 @ 0x40 + @add r4, r4, #64 @ 0x40 + vld4.32 {d16,d18,d20,d22}, [r5]! + add r3, r3, #1 + vld4.32 {d25,d27,d29,d31}, [r6]! + vld4.32 {d17,d19,d21,d23}, [r5]! 
+ vmla.f32 q6, q8, q12 + vmla.f32 q0, q9, q12 + cmp r3, fp + vmls.f32 q5, q13, q9 + vmla.f32 q2, q13, q8 + vmla.f32 q7, q10, q14 + vmla.f32 q1, q11, q14 + vmls.f32 q4, q15, q11 + vmla.f32 q3, q15, q10 + bne .mainloop + lsl r3, fp, #6 + add r8, r8, r3 + add r7, r7, r3 +.smallvector: + vadd.f32 q3, q2, q3 + add r3, sp, #16 + lsl r4, fp, #3 + vadd.f32 q4, q5, q4 + cmp r9, r4 + vadd.f32 q6, q6, q7 + vadd.f32 q1, q0, q1 + vadd.f32 q8, q6, q4 + vadd.f32 q9, q1, q3 + vst2.32 {d16-d19}, [r3 :64] + vldr s15, [sp, #24] + vldr s16, [sp, #16] + vldr s17, [sp, #20] + vadd.f32 s16, s16, s15 + vldr s11, [sp, #28] + vldr s12, [sp, #40] @ 0x28 + vldr s13, [sp, #44] @ 0x2c + vldr s14, [sp, #32] + vldr s15, [sp, #36] @ 0x24 + vadd.f32 s17, s17, s11 + vadd.f32 s16, s16, s12 + vadd.f32 s17, s17, s13 + vadd.f32 s16, s16, s14 + vadd.f32 s17, s17, s15 + vstr s16, [sl] + vstr s17, [sl, #4] + bls .epilog + add r5, sp, #8 +.tailcase: + ldr r3, [r7], #8 + mov r0, r5 + ldr r1, [r8], #8 + add r4, r4, #1 + ldr ip, [r7, #-4] + ldr r2, [r8, #-4] + str ip, [sp] + bl __mulsc3 + vldr s14, [sp, #8] + vldr s15, [sp, #12] + vadd.f32 s16, s16, s14 + cmp r4, r9 + vadd.f32 s17, s17, s15 + vstr s16, [sl] + vstr s17, [sl, #4] + bne .tailcase +.epilog: + add sp, sp, #52 @ 0x34 + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, sl, fp, pc} diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s new file mode 100644 index 0000000000..5d79b466ac --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s @@ -0,0 +1,45 @@ +@ static inline void volk_32fc_x2_multiply_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32fc_x2_multiply_32fc_neonasm +volk_32fc_x2_multiply_32fc_neonasm: + push {r4, r5, r6, r7, r8, r9, r14} + lsrs r7, r3, #2 + @ r0 is c vector + @ r1 is a vector + @ r2 is b vector + @ r3 is num_points + @ r7 is quarter_points + beq 
.smallvector + mov r5, #0 +.mainloop: + vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13 + add r5, r5, #1 + cmp r5, r7 + vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11 + pld [r1] + pld [r2] + vmul.f32 q0, q12, q10 @ q15 = ar*br + vmul.f32 q1, q13, q11 @ q11 = ai*bi + vmul.f32 q2, q12, q11 @ q14 = ar*bi + vmul.f32 q3, q13, q10 @ q12 = ai*br + vsub.f32 q8, q0, q1 @ real + vadd.f32 q9, q2, q3 @ imag + vst2.32 {d16-d19}, [r0]! + bne .mainloop + +.smallvector: + lsl r5, r7, #2 + cmp r3, r7 + bls .done +.tailcase: + add r5, r5, #1 + vld1.32 d1, [r1]! @ s2, s3 = ar, ai + vld1.32 d0, [r2]! @ s0, s1 = br, bi + vmul.f32 s4, s0, s2 @ s4 = ar*br + vmul.f32 s5, s0, s3 @ s5 = ar*bi + vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi + vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br + vst1.32 d2, [r0]! + cmp r3, r5 + bne .tailcase +.done: + pop {r4, r5, r6, r7, r8, r9, r15} diff --git a/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s b/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s new file mode 100644 index 0000000000..758e7436cd --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s @@ -0,0 +1,92 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + .global volk_32fc_32f_dot_prod_32fc_a_neonpipeline +volk_32fc_32f_dot_prod_32fc_a_neonpipeline: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be 
preserved. + realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! + + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + @ Optimizing for pipeline + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal + subs number, number, #1 + +.loop1: + @ do work here + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal + + subs number, number, #1 + bne .loop1 @ first loop + + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (realAccQ) + mov number, quarterPoints, asl #2 + sub number, number, #5 +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! 
@ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmul.f32 s5, s8, s10 + vmul.f32 s6, s8, s12 + vadd.f32 realAccS, realAccS, s5 + vadd.f32 compAccS, compAccS, s6 + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! @ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/volk_16i_max_star_16i.h b/volk/kernels/volk/volk_16i_max_star_16i.h index c67351c5fa..5366a2e325 100644 --- a/volk/kernels/volk/volk_16i_max_star_16i.h +++ b/volk/kernels/volk/volk_16i_max_star_16i.h @@ -85,6 +85,44 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un #endif /*LV_HAVE_SSSE3*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) { + const unsigned int eighth_points = num_points / 8; + unsigned number; + int16x8_t input_vec; + int16x8_t diff, max_vec, zeros; + uint16x8_t comp1, comp2; + zeros = veorq_s16(zeros, zeros); + + int16x8x2_t tmpvec; + + int16x8_t candidate_vec = vld1q_dup_s16(src0 ); + short candidate; + ++src0; + + for(number=0; number < eighth_points; ++number) { + input_vec = vld1q_s16(src0); + __builtin_prefetch(src0+16); + diff = vsubq_s16(candidate_vec, input_vec); + comp1 = vcgeq_s16(diff, zeros); + comp2 = vcltq_s16(diff, zeros); + + tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); + tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); + + candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); + src0 += 8; + } + vst1q_s16(&candidate, candidate_vec); + + for(number=0; number < num_points%8; number++) { + candidate = ((int16_t)(candidate - src0[number]) > 0) ? 
candidate : src0[number]; + } + target[0] = candidate; +} +#endif /*LV_HAVE_NEON*/ + #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h index ef88ec094f..1915522947 100644 --- a/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h +++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -110,6 +110,40 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in #endif /*LV_HAVE_SSSE3*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) { + const unsigned int eighth_points = num_points / 16; + unsigned number; + int16x8x2_t input_vec; + int16x8_t diff, max_vec, zeros; + uint16x8_t comp1, comp2; + zeros = veorq_s16(zeros, zeros); + for(number=0; number < eighth_points; ++number) { + input_vec = vld2q_s16(src0); + //__builtin_prefetch(src0+16); + diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); + comp1 = vcgeq_s16(diff, zeros); + comp2 = vcltq_s16(diff, zeros); + + input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); + input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); + + max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); + vst1q_s16(target, max_vec); + src0 += 16; + target += 8; + } + for(number=0; number < num_points%16; number+=2) { + target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? 
src0[number] : src0[number+1]; + } + +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON +extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target, int16_t* src0, unsigned int num_points); +#endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h index 56b2cc07ab..8e84b6ea17 100644 --- a/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -165,6 +165,66 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s #endif /*LV_HAVE_SSE2*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { + const unsigned int eighth_points = num_points / 8; + unsigned i; + + int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; + int16x8_t diff12, diff34; + int16x8_t comp0, comp1, comp2, comp3; + int16x8_t result1_vec, result2_vec; + int16x8_t zeros; + zeros = veorq_s16(zeros, zeros); + for(i=0; i < eighth_points; ++i) { + src0_vec = vld1q_s16(src0); + src1_vec = vld1q_s16(src1); + src2_vec = vld1q_s16(src2); + src3_vec = vld1q_s16(src3); + diff12 = vsubq_s16(src0_vec, src1_vec); + diff34 = vsubq_s16(src2_vec, src3_vec); + comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); + comp1 = (int16x8_t)vcltq_s16(diff12, zeros); + comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); + comp3 = (int16x8_t)vcltq_s16(diff34, zeros); + comp0 = vandq_s16(src0_vec, comp0); + comp1 = vandq_s16(src1_vec, comp1); + comp2 = vandq_s16(src2_vec, comp2); + comp3 = vandq_s16(src3_vec, comp3); + + result1_vec = vaddq_s16(comp0, comp1); + result2_vec = vaddq_s16(comp2, comp3); + + diff12 = vsubq_s16(result1_vec, result2_vec); + comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); + 
comp1 = (int16x8_t)vcltq_s16(diff12, zeros); + comp0 = vandq_s16(result1_vec, comp0); + comp1 = vandq_s16(result2_vec, comp1); + result1_vec = vaddq_s16(comp0, comp1); + vst1q_s16(target, result1_vec); + src0 += 8; + src1 += 8; + src2 += 8; + src3 += 8; + target += 8; + } + + + short temp0 = 0; + short temp1 = 0; + for(i=eighth_points*8; i < num_points; ++i) { + temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; + temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; + *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + src0++; + src1++; + src2++; + src3++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h index 9b6d19fd66..28575b6282 100644 --- a/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h +++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h @@ -112,6 +112,52 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta } #endif /*LV_HAVE_SSE2*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) { + + const unsigned int eighth_points = num_points / 8; + int number = 0; + + int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; + int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; + for(number = 0; number < eighth_points; ++number) { + src0_vec = vld1q_s16(src0); + src1_vec = vld1q_s16(src1); + src2_vec = vld1q_s16(src2); + src3_vec = vld1q_s16(src3); + src4_vec = vld1q_s16(src4); + + target0_vec = vaddq_s16(src0_vec , src1_vec); + target1_vec = vaddq_s16(src0_vec , src2_vec); + target2_vec = vaddq_s16(src0_vec , src3_vec); + target3_vec = 
vaddq_s16(src0_vec , src4_vec); + + vst1q_s16(target0, target0_vec); + vst1q_s16(target1, target1_vec); + vst1q_s16(target2, target2_vec); + vst1q_s16(target3, target3_vec); + src0 += 8; + src1 += 8; + src2 += 8; + src3 += 8; + src4 += 8; + target0 += 8; + target1 += 8; + target2 += 8; + target3 += 8; + } + + for(number = eighth_points * 8; number < num_points; ++number) { + *target0++ = *src0 + *src1++; + *target1++ = *src0 + *src2++; + *target2++ = *src0 + *src3++; + *target3++ = *src0++ + *src4++; + } +} + +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC diff --git a/volk/kernels/volk/volk_16u_byteswap.h b/volk/kernels/volk/volk_16u_byteswap.h index 57f2008991..436caf0474 100644 --- a/volk/kernels/volk/volk_16u_byteswap.h +++ b/volk/kernels/volk/volk_16u_byteswap.h @@ -106,6 +106,36 @@ static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int n } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number; + unsigned int eighth_points = num_points / 8; + uint16x8_t input, output; + uint16_t* inputPtr = intsToSwap; + + for(number = 0; number < eighth_points; number++) { + input = vld1q_u16(inputPtr); + output = vsriq_n_u16(output, input, 8); + output = vsliq_n_u16(output, input, 8); + vst1q_u16(inputPtr, output); + inputPtr += 8; + } + + for(number = eighth_points * 8; number < num_points; number++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int16_t's. 
diff --git a/volk/kernels/volk/volk_32f_invsqrt_32f.h b/volk/kernels/volk/volk_32f_invsqrt_32f.h index 055370661a..8ea12a73c4 100644 --- a/volk/kernels/volk/volk_32f_invsqrt_32f.h +++ b/volk/kernels/volk/volk_32f_invsqrt_32f.h @@ -90,6 +90,37 @@ static inline void volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVect } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! +\brief Sqrts the two input vectors and store their results in the third vector +\param cVector The vector where the results will be stored +\param aVector One of the vectors to be invsqrted +\param num_points The number of values in aVector and bVector to be invsqrted together and stored into cVector +*/ +static inline void volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number; + const unsigned int quarter_points = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + float32x4_t a_val, c_val; + for (number = 0; number < quarter_points; ++number) + { + a_val = vld1q_f32(aPtr); + c_val = vrsqrteq_f32(a_val); + vst1q_f32(cPtr, c_val); + aPtr += 4; + cPtr += 4; + } + + for(number=quarter_points * 4;number < num_points; number++) + *cPtr++ = Q_rsqrt(*aPtr++); + +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_GENERIC /*! \brief Sqrts the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_s32f_multiply_32f.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h index 2dd86a17c2..8665d4e90e 100644 --- a/volk/kernels/volk/volk_32f_s32f_multiply_32f.h +++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h @@ -180,6 +180,35 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + const unsigned int quarterPoints = num_points / 4; + + float32x4_t aVal, cVal; + + for(number = 0; number < quarterPoints; number++){ + aVal = vld1q_f32(inputPtr); // Load into NEON regs + cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply + vst1q_f32(outputPtr, cVal); // Store results back to output + inputPtr += 4; + outputPtr += 4; + } + for(number = quarterPoints * 4; number < num_points; number++){ + *outputPtr++ = (*inputPtr++) * scalar; + } +} +#endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC /*! @@ -216,6 +245,4 @@ static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */ diff --git a/volk/kernels/volk/volk_32f_sqrt_32f.h b/volk/kernels/volk/volk_32f_sqrt_32f.h index ab9fffd7dc..2523abf0da 100644 --- a/volk/kernels/volk/volk_32f_sqrt_32f.h +++ b/volk/kernels/volk/volk_32f_sqrt_32f.h @@ -40,6 +40,35 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + float32x4_t in_vec, out_vec; + + for(number = 0; number < quarter_points; number++){ + in_vec = vld1q_f32(aPtr); + // note that armv8 has vsqrt_f32 which will be much better + out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) ); + vst1q_f32(cPtr, out_vec); + aPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Sqrts the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_x2_add_32f.h b/volk/kernels/volk/volk_32f_x2_add_32f.h index 42278f6068..a9a1d4fbf0 100644 --- a/volk/kernels/volk/volk_32f_x2_add_32f.h +++ b/volk/kernels/volk/volk_32f_x2_add_32f.h @@ -109,6 +109,49 @@ static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/* + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) { + unsigned int number = 0; + const unsigned int 
quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + float32x4_t aVal, bVal, cVal; + for(number=0; number < quarterPoints; number++){ + // Load in to NEON registers + aVal = vld1q_f32(aPtr); + bVal = vld1q_f32(bPtr); + __builtin_prefetch(aPtr+4); + __builtin_prefetch(bPtr+4); + + // vector add + cVal = vaddq_f32(aVal, bVal); + // Store the results back into the C container + vst1q_f32(cPtr,cVal); + + aPtr += 4; // q uses quadwords, 4 floats per vadd + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; // should be = num_points + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } + +} + +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Adds the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h index b91252e36f..ed16d9a49e 100644 --- a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h +++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h @@ -577,4 +577,92 @@ static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_NEON +static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 16; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + float32x4x4_t a_val, b_val, accumulator0, accumulator1; + accumulator0.val[0] = vdupq_n_f32(0); + accumulator0.val[1] = vdupq_n_f32(0); + accumulator0.val[2] = vdupq_n_f32(0); + accumulator0.val[3] = vdupq_n_f32(0); + // factor of 4 loop unroll with independent accumulators + // uses 12 out of 16 neon q registers + for( number = 0; number < quarter_points; ++number) { + a_val = vld4q_f32(aPtr); + b_val = vld4q_f32(bPtr); + accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], 
a_val.val[0], b_val.val[0]); + accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]); + accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]); + accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]); + aPtr += 16; + bPtr += 16; + } + accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]); + accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]); + accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]); + __VOLK_ATTR_ALIGNED(32) float accumulator[4]; + vst1q_f32(accumulator, accumulator0.val[0]); + dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; + + for(number = quarter_points*16; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif + + + + +#ifdef LV_HAVE_NEON +static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 8; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + float32x4x2_t a_val, b_val, accumulator_val; + accumulator_val.val[0] = vdupq_n_f32(0); + accumulator_val.val[1] = vdupq_n_f32(0); + // factor of 2 loop unroll with independent accumulators + for( number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32(aPtr); + b_val = vld2q_f32(bPtr); + accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); + accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); + aPtr += 8; + bPtr += 8; + } + accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]); + __VOLK_ATTR_ALIGNED(32) float accumulator[4]; + vst1q_f32(accumulator, accumulator_val.val[0]); + dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; + + for(number = 
quarter_points*8; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON +extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON +extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +#endif /* LV_HAVE_NEON */ + #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/kernels/volk/volk_32f_x2_interleave_32fc.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h index 0935cb32bd..3591b24d69 100644 --- a/volk/kernels/volk/volk_32f_x2_interleave_32fc.h +++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h @@ -48,6 +48,39 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Interleaves the I & Q vector data into the complex vector. 
+ \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int quarter_points = num_points / 4; + unsigned int number; + float* complexVectorPtr = (float*) complexVector; + + float32x4x2_t complex_vec; + for(number=0; number < quarter_points; ++number) { + complex_vec.val[0] = vld1q_f32(iBuffer); + complex_vec.val[1] = vld1q_f32(qBuffer); + vst2q_f32(complexVectorPtr, complex_vec); + iBuffer += 4; + qBuffer += 4; + complexVectorPtr += 8; + } + + for(number=quarter_points * 4; number < num_points; ++number) { + *complexVectorPtr++ = *iBuffer++; + *complexVectorPtr++ = *qBuffer++; + } + +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_GENERIC /*! \brief Interleaves the I & Q vector data into the complex vector. diff --git a/volk/kernels/volk/volk_32f_x2_max_32f.h b/volk/kernels/volk/volk_32f_x2_max_32f.h index 27633acae8..a1403fba18 100644 --- a/volk/kernels/volk/volk_32f_x2_max_32f.h +++ b/volk/kernels/volk/volk_32f_x2_max_32f.h @@ -45,6 +45,42 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! 
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int quarter_points = num_points / 4; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + float32x4_t a_vec, b_vec, c_vec; + for(number = 0; number < quarter_points; number++){ + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + c_vec = vmaxq_f32(a_vec, b_vec); + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } + +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector diff --git a/volk/kernels/volk/volk_32f_x2_min_32f.h b/volk/kernels/volk/volk_32f_x2_min_32f.h index 4773d13211..eef5e5da2e 100644 --- a/volk/kernels/volk/volk_32f_x2_min_32f.h +++ b/volk/kernels/volk/volk_32f_x2_min_32f.h @@ -45,6 +45,42 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4_t a_vec, b_vec, c_vec; + for(number = 0; number < quarter_points; number++){ + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + + c_vec = vminq_f32(a_vec, b_vec); + + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector diff --git a/volk/kernels/volk/volk_32f_x2_multiply_32f.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h index 9fdbec0a2c..8bbd81c8a6 100644 --- a/volk/kernels/volk/volk_32f_x2_multiply_32f.h +++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h @@ -188,6 +188,33 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* a } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + const unsigned int quarter_points = num_points / 4; + unsigned int number; + float32x4_t avec, bvec, cvec; + for(number=0; number < quarter_points; ++number) { + avec = vld1q_f32(aVector); + bvec = vld1q_f32(bVector); + cvec = vmulq_f32(avec, bvec); + vst1q_f32(cVector, cvec); + aVector += 4; + bVector += 4; + cVector += 4; + } + for(number=quarter_points*4; number < num_points; ++number) { + *cVector++ = *aVector++ * *bVector++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Multiplys the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_x2_subtract_32f.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h index 8ea491f988..6831d89a10 100644 --- a/volk/kernels/volk/volk_32f_x2_subtract_32f.h +++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h @@ -63,6 +63,40 @@ static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4_t a_vec, b_vec, c_vec; + + for(number = 0; number < quarter_points; number++){ + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + c_vec = vsubq_f32(a_vec, b_vec); + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_ORC /*! 
\brief Subtracts bVector form aVector and store their results in the cVector diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index fdef68209e..c555bbb696 100644 --- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -293,7 +293,144 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, } #endif // LV_HAVE_AVX +#ifdef LV_HAVE_NEON + +static inline void volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) { + + + int i; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; + + float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; + float32x2_t cutoff_vector; + float32x2x2_t x_low, x_high; + float32x4_t x_qvector, c_qvector, cpa_qvector; + float accumulator, final_result; + float res_accumulators[4]; + + float dbg_cpa[4], dbg_x[4], dbg_c[4]; + float dbg_max[4]; + float dbg_x_to_1[2], dbg_x_to_2[2], dbg_x_to_3[2], dbg_x_to_4[2]; + float dbg_x_high[2], dbg_x_low[2]; + float dbg_foo; + + c_qvector = vld1q_f32( zero ); + // load the cutoff in to a vector + cutoff_vector = vdup_n_f32( *cutoff ); + // ... center point array + cpa_qvector = vld1q_f32( center_point_array ); + + for(i=0; i < num_points; ++i) { + // load x (src0) + x_to_1 = vdup_n_f32( *src0++ ); + + // Get a vector of max(src0, cutoff) + x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1 + x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 + x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 + x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 + // zip up doubles to interleave + x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] + x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] + // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 + x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); + // now we finally have [x^4 | x^3 | x^2 | x] ! 
+ + c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); + } + // there should be better vector reduction techniques + vst1q_f32(res_accumulators, c_qvector ); + accumulator = res_accumulators[0] + res_accumulators[1] + + res_accumulators[2] + res_accumulators[3]; + + *target = accumulator + center_point_array[4] * (float)num_points; +} + +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + +static inline void volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) { + + + int i; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; + + float accumulator, final_result; + + + float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; + accumulator1_vec = vld1q_f32(zero); + accumulator2_vec = vld1q_f32(zero); + accumulator3_vec = vld1q_f32(zero); + accumulator4_vec = vld1q_f32(zero); + float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; + float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; + + // load the cutoff in to a vector + cutoff_vector = vdupq_n_f32( *cutoff ); + // ... 
center point array + cpa_0 = vdupq_n_f32(center_point_array[0]); + cpa_1 = vdupq_n_f32(center_point_array[1]); + cpa_2 = vdupq_n_f32(center_point_array[2]); + cpa_3 = vdupq_n_f32(center_point_array[3]); + + + // nathan is not sure why this is slower *and* wrong compared to neonvertfma + for(i=0; i < num_points/4; ++i) { + // load x + x_to_1 = vld1q_f32( src0 ); + + // Get a vector of max(src0, cutoff) + x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1 + x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 + x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 + x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 + x_to_1 = vmulq_f32(x_to_1, cpa_0); + x_to_2 = vmulq_f32(x_to_2, cpa_1); + x_to_3 = vmulq_f32(x_to_3, cpa_2); + x_to_4 = vmulq_f32(x_to_4, cpa_3); + accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); + accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); + accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); + accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); + + src0 += 4; + } + accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); + accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); + accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); + + __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; + vst1q_f32(res_accumulators, accumulator1_vec ); + accumulator = res_accumulators[0] + res_accumulators[1] + + res_accumulators[2] + res_accumulators[3]; + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + + for(i = 4*num_points/4; i < num_points; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + accumulator += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + } + + *target = accumulator + center_point_array[4] * (float)num_points; +} +#endif /* LV_HAVE_NEON */ #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/ diff --git 
a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index 44535da6d8..cf67c134fd 100644 --- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -283,6 +283,166 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const l } #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + +static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { + + unsigned int number; + const unsigned int quarterPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* inputPtr = (float*)input; + const float* tapsPtr = taps; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; + float* real_accum; + float current_accum = 0.0f ; + float accVector_real[4]; + float accVector_imag[4]; + + float32x4x2_t inputVector0, inputVector1; + float32x4_t tapsVector0, tapsVector1; + float32x4_t tmp_real0, tmp_imag0; + float32x4_t tmp_real1, tmp_imag1; + float32x4_t real_accumulator0, imag_accumulator0; + float32x4_t real_accumulator1, imag_accumulator1; + + + // zero out accumulators + // take a *float, return float32x4_t + real_accumulator0 = vld1q_f32( zero ); + imag_accumulator0 = vld1q_f32( zero ); + real_accumulator1 = vld1q_f32( zero ); + imag_accumulator1 = vld1q_f32( zero ); + float dbgVec[8]; + + for(number=0 ;number < quarterPoints; number++){ + // load doublewords and duplicate in to second lane + tapsVector0 = vld1q_f32(tapsPtr ); + tapsVector1 = vld1q_f32(tapsPtr+4 ); + + // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag + inputVector0 = vld2q_f32(inputPtr ); + inputVector1 = vld2q_f32(inputPtr+8 ); + // inputVector is now a struct of two vectors, 0th is real, 1st is imag + + tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); + tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); + + tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); + tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); + + real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); + imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); + + real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); + imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); + + tapsPtr += 8; + inputPtr += 16; + } + + real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1); + imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1); + // void vst1q_f32( float32_t * ptr, float32x4_t val); + // store results back to a complex (array of 2 floats) + vst1q_f32(accVector_real, real_accumulator0); + vst1q_f32(accVector_imag, imag_accumulator0); + *realpt = accVector_real[0] + accVector_real[1] + + accVector_real[2] + accVector_real[3] ; + + *imagpt = accVector_imag[0] + accVector_imag[1] + + accVector_imag[2] + accVector_imag[3] ; + + // clean up the remainder + for(number=quarterPoints*8; number < num_points; number++){ + *realpt += ((*inputPtr++) * (*tapsPtr)); + *imagpt += ((*inputPtr++) * (*tapsPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + +static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { + + unsigned int number; + const unsigned int quarterPoints = num_points / 4; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* inputPtr = (float*)input; + const float* tapsPtr = taps; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; 
+ float* real_accum; + float current_accum = 0.0f ; + float accVector_real[4]; + float accVector_imag[4]; + + float32x4x2_t inputVector; + float32x4_t tapsVector; + float32x4_t tmp_real, tmp_imag; + float32x4_t real_accumulator, imag_accumulator; + + + // zero out accumulators + // take a *float, return float32x4_t + real_accumulator = vld1q_f32( zero ); + imag_accumulator = vld1q_f32( zero ); + + for(number=0 ;number < quarterPoints; number++){ + // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) + // load doublewords and duplicate in to second lane + tapsVector = vld1q_f32(tapsPtr ); + + // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag + inputVector = vld2q_f32(inputPtr ); + + tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); + tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); + + real_accumulator = vaddq_f32(real_accumulator, tmp_real); + imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); + + + tapsPtr += 4; + inputPtr += 8; + + } + + // void vst1q_f32( float32_t * ptr, float32x4_t val); + // store results back to a complex (array of 2 floats) + vst1q_f32(accVector_real, real_accumulator); + vst1q_f32(accVector_imag, imag_accumulator); + *realpt = accVector_real[0] + accVector_real[1] + + accVector_real[2] + accVector_real[3] ; + + *imagpt = accVector_imag[0] + accVector_imag[1] + + accVector_imag[2] + accVector_imag[3] ; + + // clean up the remainder + for(number=quarterPoints*4; number < num_points; number++){ + *realpt += ((*inputPtr++) * (*tapsPtr)); + *imagpt += ((*inputPtr++) * (*tapsPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned 
int num_points); +#endif /*LV_HAVE_NEON*/ #ifdef LV_HAVE_SSE diff --git a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h index a12d078c68..21b71998c2 100644 --- a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h +++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -135,6 +135,43 @@ static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t inputVector, outputVector; + float32x4_t tapsVector; + for(number = 0; number < quarter_points; number++){ + inputVector = vld2q_f32((float*)aPtr); + tapsVector = vld1q_f32(bPtr); + + outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); + outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); + + vst2q_f32((float*)cPtr, outputVector); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_ORC /*! 
\brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector diff --git a/volk/kernels/volk/volk_32fc_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h index dce897ff57..480fa36994 100644 --- a/volk/kernels/volk/volk_32fc_conjugate_32fc.h +++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h @@ -106,6 +106,46 @@ static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_ } #endif /* LV_HAVE_SSE3 */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number; + const unsigned int quarterPoints = num_points / 4; + + float32x4x2_t x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + float conj[4] = {-0.f, -0.f, -0.f, -0.f}; + //uint32x4_t conjugator; + + //conjugator = vld1q_u32( (uint32_t *)conj ); + + for(number=0; number < quarterPoints; number++){ + __builtin_prefetch(a+4); + x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di + + // xor the imaginary lane + x.val[1] = vnegq_f32( x.val[1]); + + vst2q_f32((float*)c,x); // Store the results back into the C container + + a += 4; + c += 4; + } + + for(number=quarterPoints*4; number < num_points; number++){ + *c++ = lv_conj(*a++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Takes the conjugate of a complex vector. 
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h index 64e99cc1be..cf3e8490eb 100644 --- a/volk/kernels/volk/volk_32fc_magnitude_32f.h +++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h @@ -233,6 +233,112 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, con } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + float32x4x2_t complex_vec; + float32x4_t magnitude_vec; + for(number = 0; number < quarter_points; number++){ + complex_vec = vld2q_f32(complexVectorPtr); + complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); + magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); + magnitude_vec = vrsqrteq_f32(magnitude_vec); + magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt + vst1q_f32(magnitudeVectorPtr, magnitude_vec); + + complexVectorPtr += 8; + magnitudeVectorPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + + This is an approximation from "Streamlining Digital Signal Processing" by + Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%. + The basic idea is to do a weighted sum of the abs. value of imag and real parts + where weight A is always assigned to max(imag, real) and B is always min(imag,real). + There are two pairs of cofficients chosen based on whether min <= 0.4142 max. + This method is called equiripple-error magnitude estimation proposed by Filip in '73 + + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float threshold = 0.4142135; + + float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; + a_high = vdupq_n_f32( 0.84 ); + b_high = vdupq_n_f32( 0.561); + a_low = vdupq_n_f32( 0.99 ); + b_low = vdupq_n_f32( 0.197); + + uint32x4_t comp0, comp1; + + float32x4x2_t complex_vec; + float32x4_t min_vec, max_vec, magnitude_vec; + float32x4_t real_abs, imag_abs; + for(number = 0; number < quarter_points; number++){ + complex_vec = vld2q_f32(complexVectorPtr); + + real_abs = vabsq_f32(complex_vec.val[0]); + imag_abs = vabsq_f32(complex_vec.val[1]); + + min_vec = vminq_f32(real_abs, imag_abs); + max_vec = vmaxq_f32(real_abs, imag_abs); + + // effective branch to choose coefficient pair. 
+ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); + comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); + + // and 0s or 1s with coefficients from previous effective branch + a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); + b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); + + // coefficients chosen, do the weighted sum + min_vec = vmulq_f32(min_vec, b_vec); + max_vec = vmulq_f32(max_vec, a_vec); + + magnitude_vec = vaddq_f32(min_vec, max_vec); + vst1q_f32(magnitudeVectorPtr, magnitude_vec); + + complexVectorPtr += 8; + magnitudeVectorPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_ORC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h index 0af81401a8..878794ba79 100644 --- a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h +++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -206,6 +206,48 @@ static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +// + + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + float32x4x2_t cmplx_val; + float32x4_t result; + for(;number < quarterPoints; number++){ + cmplx_val = vld2q_f32(complexVectorPtr); + complexVectorPtr += 8; + + cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values + cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values + + result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values + + vst1q_f32(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_GENERIC /*! 
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index 657fa3158f..fb79d6613c 100644 --- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -760,4 +760,220 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_NEON + +static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 4; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x2_t a_val, b_val, c_val, accumulator; + float32x4x2_t tmp_real, tmp_imag; + accumulator.val[0] = vdupq_n_f32(0); + accumulator.val[1] = vdupq_n_f32(0); + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + + accumulator.val[0] = 
vaddq_f32(accumulator.val[0], c_val.val[0]); + accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]); + + a_ptr += 4; + b_ptr += 4; + } + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, accumulator); + *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*4; number < num_points; ++number) { + *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON + +static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 4; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x2_t a_val, b_val, c_val, accumulator; + float32x4x2_t tmp_real, tmp_imag; + accumulator.val[0] = vdupq_n_f32(0); + accumulator.val[1] = vdupq_n_f32(0); + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // do the first multiply + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + + // use multiply accumulate/subtract to get result + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); + tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); + + accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]); + accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]); + + // increment pointers + a_ptr += 4; + b_ptr += 4; + } + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, accumulator); + *result = accum_result[0] + accum_result[1] 
+ accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*4; number < num_points; ++number) { + *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 4; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x2_t a_val, b_val, accumulator1, accumulator2; + float32x4x2_t tmp_real, tmp_imag; + accumulator1.val[0] = vdupq_n_f32(0); + accumulator1.val[1] = vdupq_n_f32(0); + accumulator2.val[0] = vdupq_n_f32(0); + accumulator2.val[1] = vdupq_n_f32(0); + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // use 2 accumulators to remove inter-instruction data dependencies + accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); + accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); + accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); + accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); + // increment pointers + a_ptr += 4; + b_ptr += 4; + } + accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); + accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, accumulator1); + *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*4; number < num_points; ++number) { 
+ *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +// NOTE: GCC does a poor job with this kernel, but the euivalent ASM code is very fast + + unsigned int quarter_points = num_points / 8; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x4_t a_val, b_val, accumulator1, accumulator2; + float32x4x2_t reduced_accumulator; + accumulator1.val[0] = vdupq_n_f32(0); + accumulator1.val[1] = vdupq_n_f32(0); + accumulator1.val[2] = vdupq_n_f32(0); + accumulator1.val[3] = vdupq_n_f32(0); + accumulator2.val[0] = vdupq_n_f32(0); + accumulator2.val[1] = vdupq_n_f32(0); + accumulator2.val[2] = vdupq_n_f32(0); + accumulator2.val[3] = vdupq_n_f32(0); + + // 8 input regs, 8 accumulators -> 16/16 neon regs are used + for(number = 0; number < quarter_points; ++number) { + a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // use 2 accumulators to remove inter-instruction data dependencies + accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); + accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); + + accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]); + accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]); + + accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); + accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); + + accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], 
b_val.val[3]); + accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]); + // increment pointers + a_ptr += 8; + b_ptr += 8; + } + // reduce 8 accumulator lanes down to 2 (1 real and 1 imag) + accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]); + accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]); + accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]); + accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]); + reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); + reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); + // now reduce accumulators to scalars + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, reduced_accumulator); + *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*8; number < num_points; ++number) { + *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + + #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h index 7db68c1bd8..0993a16ceb 100644 --- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -149,6 +149,117 @@ static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + + lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; + lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+4); + __builtin_prefetch(b_ptr+4); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + // store the results + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + vst2q_f32((float*)cVector, c_val); + + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *cVector++ = (*a_ptr++) * (*b_ptr++); + } + +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + + lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; + lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + // TODO: I suspect the compiler is doing a poor job scheduling this. This seems + // highly optimal, but is barely better than generic + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+4); + __builtin_prefetch(b_ptr+4); + + // do the first multiply + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + + // use multiply accumulate/subtract to get result + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); + tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); + + // store + vst2q_f32((float*)cVector, tmp_imag); + // increment pointers + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *cVector++ = (*a_ptr++) * (*b_ptr++); + } + +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_ORC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h index cfd6c007f1..dbc123ff25 100644 --- a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -138,6 +138,59 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + + lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; + lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + b_val.val[1] = vnegq_f32(b_val.val[1]); + __builtin_prefetch(a_ptr+4); + __builtin_prefetch(b_ptr+4); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + // store the results + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + vst2q_f32((float*)cVector, c_val); + + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *cVector++ = (*a_ptr++) * conj(*b_ptr++); + } +} +#endif /* LV_HAVE_NEON */ 
+ #ifdef LV_HAVE_GENERIC /*! \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector diff --git a/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h index 27a081b7cf..56b3d5c230 100644 --- a/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h +++ b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -92,6 +92,36 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* #endif /*LV_HAVE_SSE3*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { + const unsigned int quarter_points = num_points / 4; + unsigned int number; + + float32x4x2_t a_vec, b_vec; + float32x4x2_t diff_vec; + float32x4_t tmp, tmp1, dist_sq; + a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) ); + a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) ); + for(number=0; number < quarter_points; ++number) { + b_vec = vld2q_f32((float*)points); + diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); + diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); + tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); + tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); + + dist_sq = vaddq_f32(tmp, tmp1); + vst1q_f32(target, dist_sq); + points += 4; + target += 4; + } + for(number=quarter_points*4; number < num_points; ++number) { + lv_32fc_t diff = src0[0] - *points++; + *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_8i_convert_16i.h b/volk/kernels/volk/volk_8i_convert_16i.h index 3e5c92723f..5b27900cf1 100644 --- a/volk/kernels/volk/volk_8i_convert_16i.h +++ b/volk/kernels/volk/volk_8i_convert_16i.h @@ -138,6 +138,44 @@ 
static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const in
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEON
/*!
  \brief Converts the input 8 bit integer data into 16 bit integer data
  \param inputVector The 8 bit input data buffer
  \param outputVector The 16 bit output data buffer
  \param num_points The number of data values to be converted
  \note Input and output buffers do NOT need to be properly aligned
*/
static inline void volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
    int16_t* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number;
    const unsigned int eighth_points = num_points / 8;

    int8x8_t input_vec;
    int16x8_t converted_vec;

    // NEON doesn't have a concept of 8 bit registers, so we are really
    // dealing with the low half of 16-bit registers. Since this requires
    // a move instruction we likely do better with ASM here.
    for(number = 0; number < eighth_points; ++number) {
        input_vec = vld1_s8(inputVectorPtr);
        // widen to 16 bits, then shift left by 8 (equivalent to multiplying by 256)
        converted_vec = vmovl_s8(input_vec);
        converted_vec = vshlq_n_s16(converted_vec, 8);
        vst1q_s16( outputVectorPtr, converted_vec);

        inputVectorPtr += 8;
        outputVectorPtr += 8;
    }

    // scalar tail for the remaining num_points % 8 items
    for(number = eighth_points * 8; number < num_points; number++){
        *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_ORC
/*!
\brief Converts the input 8 bit integer data into 16 bit integer data
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
index c8ff18e67b..427c9abf55 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -61,6 +61,34 @@ static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const
 }
 #endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
/*!
  \brief Deinterleaves the complex 8 bit vector into I vector data
  \param complexVector The complex input vector
  \param iBuffer The I buffer output data
  \param num_points The number of complex data values to be deinterleaved
*/
static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
    unsigned int n;
    unsigned int sixteenth_points = num_points / 16;

    // de-interleaving load: val[0] collects 16 I samples, val[1] the 16 Q samples
    int8x16x2_t iq_vec;
    for(n = 0; n < sixteenth_points; ++n) {
        iq_vec = vld2q_s8((int8_t*) complexVector );
        vst1q_s8(iBuffer, iq_vec.val[0]);
        iBuffer += 16;
        complexVector += 16;
    }

    // scalar tail: copy I, skip Q, for the remaining num_points % 16 samples
    const int8_t* complexVectorPtr = (int8_t*)complexVector;
    int8_t* iBufferPtr = iBuffer;
    for(n = sixteenth_points * 16; n < num_points; ++n) {
        *iBufferPtr++ = *complexVectorPtr++;
        complexVectorPtr++;
    }
}
#endif /* LV_HAVE_NEON */
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index d72eb726e4..86b2c6a239 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -382,6 +382,29 @@ include_directories(
 )

########################################################################
# Handle ASM (for ARM) support
# on by default, but let users turn it off
########################################################################
if( NOT DEFINED ENABLE_ARM_ASM OR ENABLE_ARM_ASM )
  message("---- Adding ARM ASM files")
  set(ASM-ATT $ENV{ASM})
#set(_CMAKE_TOOLCHAIN_PREFIX $ENV{TARGET_PREFIX}) # Gah - wtf, this shouldn't be needed + enable_language(ASM-ATT) + # what would make this OK, appending? + set(ASM-ATT_FLAGS "-mfpu=neon -g") # Horrid horrid hack to assemble for ARM neon + set(CMAKE_ASM-ATT_FLAGS ${ASM-ATT_FLAGS}) + message("DEBUG: looking for ASM files in ${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon") + include_directories(${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon) + file(GLOB asm_files ${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon/*.s) + foreach(asm_file ${asm_files}) + list(APPEND volk_sources ${asm_file}) + message(STATUS "Adding source file: ${asm_file}") + endforeach(asm_file) +else() + message("---- NOT Adding ARM ASM files") +endif() + +######################################################################## # Handle orc support ######################################################################## if(ORC_FOUND) @@ -436,7 +459,8 @@ list(APPEND volk_sources ${CMAKE_CURRENT_BINARY_DIR}/constants.c) # Setup the volk sources list and library ######################################################################## if(NOT WIN32) - add_definitions(-fvisibility=hidden) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") endif() list(APPEND volk_sources |