diff options
-rw-r--r-- | volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s | 17 |
1 file changed, 6 insertions, 11 deletions
diff --git a/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s
index 8262e4cd29..235d37538f 100644
--- a/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s
+++ b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s
@@ -6,7 +6,7 @@ volk_32f_s32f_multiply_32f_neonasm:
 @ r2 - bVector: pointer to input array 2
 @ r3 - num_points: number of items to process
- stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, sl} @ prologue - save register states
+ stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ prologue - save register states
 @ quarter_points = num_points / 4
@@ -21,7 +21,7 @@ volk_32f_s32f_multiply_32f_neonasm:
 mov r5, r0
 @ load the scalar to a quad register
- @ vmov.32 d2[0], r2
+ @ vmov.32 d2[0], r2 @ The scalar might be in s0, not totally sure
 vdup.32 q2, d0[0]
@@ -30,17 +30,17 @@ volk_32f_s32f_multiply_32f_neonasm:
 @ vld1.32 {q1}, [r4:128]! @ aVal
 @ vmul.f32 q3, q1, q2
 @ vst1.32 {q3}, [r5:128]! @ cVal
- @
+ @
 @ subs r10, r10, #1
 @ bne .loop1 @ first loop
-
+
 @ number = quarter_points * 4
 mov r10, r11, asl #2
 .loop2:
 @ cmp num_points, number
 @ bls .done
- @
+ @
 @ vld1.32 {d0[0]}, [aVector]!
 @ vmul.f32 s2, s0, s4
 @ vst1.32 {d1[0]}, [cVector]!
@@ -48,10 +48,5 @@ volk_32f_s32f_multiply_32f_neonasm:
 @ b .loop2
 .done:
- ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, sl} @ epilogue - restore register states
+ ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ epilogue - restore register states
 bx lr
-
-
-
-
-
|