diff options
author | Nathan West <nathan.west@okstate.edu> | 2014-06-18 13:10:02 -0500 |
---|---|---|
committer | Nathan West <nathan.west@okstate.edu> | 2014-07-18 20:41:28 -0400 |
commit | 6e17772f423dc260051e37ceb25f9384ca8151ed (patch) | |
tree | 1c74d276ccaba2db6f35a1942cd4193b0943648e | |
parent | 93db96faa81b260367908e977f15c0d7a45358db (diff) |
volk: add NEON protokernels
42 files changed, 2738 insertions, 3 deletions
diff --git a/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s b/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s new file mode 100644 index 0000000000..2099355e7b --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s @@ -0,0 +1,52 @@ +@ static inline void volk_16i_max_star_horizontal_16i_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_16i_max_star_horizontal_16i_neonasm +volk_16i_max_star_horizontal_16i_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - num_points: number of items to process + +volk_16i_max_star_horizontal_16i_neonasm: + pld [r1:128] + push {r4, r5, r6} @ preserve register states + lsrs r5, r2, #4 @ 1/16th points = num_points/16 + vmov.i32 q12, #0 @ q12 = [0,0,0,0] + beq .smallvector @ less than 16 elements in vector + mov r4, r1 @ r4 = aVector + mov r12, r0 @ gcc calls this ip + mov r3, #0 @ number = 0 + +.loop1: + vld2.16 {d16-d19}, [r4]! @ aVector, interleaved load + pld [r4:128] + add r3, r3, #1 @ number += 1 + cmp r3, r5 @ number < 1/16th points + vsub.i16 q10, q8, q9 @ subtraction + vcge.s16 q11, q10, #0 @ result > 0? + vcgt.s16 q10, q12, q10 @ result < 0? + vand.i16 q11, q8, q11 @ multiply by comparisons + vand.i16 q10, q9, q10 @ multiply by other comparison + vadd.i16 q10, q11, q10 @ add results to get max + vst1.16 {d20-d21}, [r12]! 
@ store the results + bne .loop1 @ at least 16 items left + add r1, r1, r3, lsl #5 + add r0, r0, r3, lsl #4 +.smallvector: + ands r2, r2, #15 + beq .end + mov r3, #0 +.loop3: + ldrh r4, [r1] + bic r5, r3, #1 + ldrh ip, [r1, #2] + add r3, r3, #2 + add r1, r1, #4 + rsb r6, ip, r4 + sxth r6, r6 + cmp r6, #0 + movgt ip, r4 + cmp r3, r2 + strh ip, [r0, r5] + bcc .loop3 +.end: + pop {r4, r5, r6} + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s new file mode 100644 index 0000000000..8262e4cd29 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_neonasm.s @@ -0,0 +1,57 @@ +@ static inline void volk_32f_s32f_multiply_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_s32f_multiply_32f_neonasm +volk_32f_s32f_multiply_32f_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + + stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, sl} @ prologue - save register states + + + @ quarter_points = num_points / 4 + movs r11, r3, lsr #2 + beq .loop2 @ if zero into quarterPoints + + @ number = quarter_points + mov r10, r3 + @ copy address of input vector + mov r4, r1 + @ copy address of output vector + mov r5, r0 + + @ load the scalar to a quad register + @ vmov.32 d2[0], r2 + @ The scalar might be in s0, not totally sure + vdup.32 q2, d0[0] + + @ this is giving fits. Current theory is hf has something to do with it + .loop1: + @ vld1.32 {q1}, [r4:128]! @ aVal + @ vmul.f32 q3, q1, q2 + @ vst1.32 {q3}, [r5:128]! @ cVal + @ + @ subs r10, r10, #1 + @ bne .loop1 @ first loop + + @ number = quarter_points * 4 + mov r10, r11, asl #2 + + .loop2: + @ cmp num_points, number + @ bls .done + @ + @ vld1.32 {d0[0]}, [aVector]! + @ vmul.f32 s2, s0, s4 + @ vst1.32 {d1[0]}, [cVector]! 
+ @ add number, number, #1 + @ b .loop2 + +.done: + ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, sl} @ epilogue - restore register states + bx lr + + + + + diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s new file mode 100644 index 0000000000..09e8638423 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s @@ -0,0 +1,54 @@ +@ static inline void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_add_32f_a_neonasm +volk_32f_x2_add_32f_a_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + @ AAPCS Section 5.1.1 + @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, #0 @ number, 0 +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + vld1.32 {d0-d1}, [aVector:128]! @ aVal + add number, number, #1 + vld1.32 {d2-d3}, [bVector:128]! @ bVal + vadd.f32 cVal, bVal, aVal + cmp number, quarterPoints + vst1.32 {d4-d5}, [cVector:128]! @ cVal + + ble .loop1 @ first loop + + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d0[0]}, [aVector]! + vld1.32 {d0[1]}, [bVector]! + vadd.f32 s2, s1, s0 + vst1.32 {d1[0]}, [cVector]! 
+ add number, number, #1 + b .loop2 + +.done: + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s new file mode 100644 index 0000000000..4c8af8b51c --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s @@ -0,0 +1,68 @@ +@ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_add_32f_a_neonpipeline +volk_32f_x2_add_32f_a_neonpipeline: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + + @ Optimizing for pipeline + vld1.32 {d0-d1}, [aVector:128]! @ aVal + vld1.32 {d2-d3}, [bVector:128]! @ bVal + subs number, number, #1 + +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + vadd.f32 cVal, bVal, aVal + vld1.32 {d0-d1}, [aVector:128]! @ aVal + vld1.32 {d2-d3}, [bVector:128]! @ bVal + vst1.32 {d4-d5}, [cVector:128]! @ cVal + + subs number, number, #1 + bne .loop1 @ first loop + + @ One more time + vadd.f32 cVal, bVal, aVal + vst1.32 {d4-d5}, [cVector:128]! 
@ cVal + + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d0[0]}, [aVector]! + vld1.32 {d0[1]}, [bVector]! + vadd.f32 s2, s1, s0 + vst1.32 {d1[0]}, [cVector]! + add number, number, #1 + b .loop2 + +.done: + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr + + + + + diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s new file mode 100644 index 0000000000..64579579e5 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm.s @@ -0,0 +1,58 @@ +@ static inline void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32f_x2_dot_prod_32f_neonasm +volk_32f_x2_dot_prod_32f_neonasm: + @ r0 - cVector: pointer to output array + @ r1 - aVector: pointer to input array 1 + @ r2 - bVector: pointer to input array 2 + @ r3 - num_points: number of items to process + cVector .req r0 + aVector .req r1 + bVector .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + aVal .req q0 @ d0-d1 + bVal .req q1 @ d2-d3 + cVal .req q2 @ d4-d5 + + @ AAPCS Section 5.1.1 + @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + veor.32 q0, q0, q0 + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, #0 @ number, 0 +.loop1: + pld [aVector, #128] @ pre-load hint - this is implementation specific! + pld [bVector, #128] @ pre-load hint - this is implementation specific! + + vld1.32 {q1}, [aVector:128]! @ aVal + vld1.32 {q2}, [bVector:128]! 
@ bVal + vmla.f32 q0, q1, q2 + + add number, number, #1 + cmp number, quarterPoints + ble .loop1 @ first loop + + @ strange order comes from trying to schedule instructions + vadd.f32 s0, s0, s1 + vadd.f32 s2, s2, s3 + mov number, quarterPoints, asl #2 + vadd.f32 s0, s0, s2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d1[0]}, [aVector]! + vld1.32 {d1[1]}, [bVector]! + vmla.f32 s0, s2, s3 + add number, number, #1 + b .loop2 + +.done: + vstr s0, [cVector] + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s new file mode 100644 index 0000000000..3093edc121 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_neonasm_opts.s @@ -0,0 +1,116 @@ +@ static inline void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + @ r0 = cVector + @ r1 = aVector + @ r2 = bVector + @ r3 = num_points + .global volk_32f_x2_dot_prod_32f_neonasm_opts +volk_32f_x2_dot_prod_32f_neonasm_opts: + push {r4, r5, r6, r7, r8, r9, r10, r11} + @ sixteenth_points = num_points / 16 + lsrs r8, r3, #4 + sub r13, r13, #16 @ subtracting 16 from stack pointer?, wat? + @ 0 out neon accumulators + veor q0, q3, q3 + veor q1, q3, q3 + veor q2, q3, q3 + veor q3, q3, q3 + beq .smallvector @ if less than 16 points skip main loop + mov r7, r2 @ copy input ptrs + mov r6, r1 @ copy input ptrs + mov r5, #0 @ loop counter +.mainloop: + vld4.32 {d16,d18,d20,d22}, [r6]! + add r5, r5, #1 @ inc loop counter + cmp r5, r8 @ loop counter < sixteenth_points? + vld4.32 {d24,d26,d28,d30}, [r7]! + vld4.32 {d17,d19,d21,d23}, [r6]! + vld4.32 {d25,d27,d29,d31}, [r7]! 
+ vmla.f32 q3, q8, q12 + vmla.f32 q0, q13, q9 + vmla.f32 q1, q14, q10 + vmla.f32 q2, q15, q11 + bne .mainloop + lsl r12, r8, #6 @ r12=r8/64 + add r1, r1, r12 + add r2, r2, r12 +.smallvector: @ actually this can be skipped for small vectors + vadd.f32 q3, q3, q0 + lsl r8, r8, #4 @ sixteenth_points * 16 + cmp r3, r8 @ num_points < sixteenth_points*16? + vadd.f32 q2, q1, q2 + vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3 + vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float + vadd.f32 s15, s15, s13 + vadd.f32 s15, s15, s14 + bls .done @ if vector is multiple of 16 then finish + sbfx r11, r1, #2, #1 @ check alignment + rsb r9, r8, r3 + and r11, r11, #3 + mov r6, r1 + cmp r11, r9 + movcs r11, r9 + cmp r9, #3 + movls r11, r9 + cmp r11, #0 + beq .nothingtodo + mov r5, r2 + mov r12, r8 +.dlabel5: + add r12, r12, #1 + vldmia r6!, {s14} + rsb r4, r8, r12 + vldmia r5!, {s13} + cmp r4, r11 + vmla.f32 s15, s13, s14 + mov r7, r6 + mov r4, r5 + bcc .dlabel5 + cmp r9, r11 + beq .done +.dlabel8: + rsb r9, r11, r9 + lsr r8, r9, #2 + lsls r10, r8, #2 + beq .dlabel6 + lsl r6, r11, #2 + veor q8, q8, q8 + add r1, r1, r6 + add r6, r2, r6 + mov r5, #0 +.dlabel9: + add r5, r5, #1 + vld1.32 {d20-d21}, [r6]! + cmp r5, r8 + vld1.64 {d18-d19}, [r1 :64]! 
+ vmla.f32 q8, q10, q9 + bcc .dlabel9 + vadd.f32 d16, d16, d17 + lsl r2, r10, #2 + veor q9, q9, q9 + add r7, r7, r2 + vpadd.f32 d6, d16, d16 + add r4, r4, r2 + cmp r9, r10 + add r12, r12, r10 + vadd.f32 s15, s15, s12 + beq .done +.dlabel6: + mov r2, r7 +.dlabel7: + add r12, r12, #1 + vldmia r2!, {s13} + cmp r3, r12 + vldmia r4!, {s14} + vmla.f32 s15, s13, s14 + bhi .dlabel7 +.done: + vstr s15, [r0] + add r13, r13, #16 + pop {r4, r5, r6, r7, r8, r9, r10, r11} + bx lr @ lr is the return address +.nothingtodo: + mov r12, r8 + mov r4, r2 + mov r7, r1 + b .dlabel8 + diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s new file mode 100644 index 0000000000..481cadee2c --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s @@ -0,0 +1,79 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + .global volk_32fc_32f_dot_prod_32fc_a_neonasm + volk_32fc_32f_dot_prod_32fc_a_neonasm: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be preserved. 
+ realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + +.loop1: + @ do work here + @pld [taps, #128] @ pre-load hint - this is implementation specific! + @pld [input, #128] @ pre-load hint - this is implementation specific! + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + subs number, number, #1 + bne .loop1 @ first loop + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (realAccQ) + mov number, quarterPoints, asl #2 + +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! @ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmul.f32 s5, s8, s10 + vmul.f32 s6, s8, s12 + vadd.f32 realAccS, realAccS, s5 + vadd.f32 compAccS, compAccS, s6 + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! 
@ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s new file mode 100644 index 0000000000..aaf70e2cbc --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline.s @@ -0,0 +1,86 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + .global volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline +volk_32fc_32f_dot_prod_32fc_a_neonasmpipeline: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be preserved. + realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! 
+ + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + @ Optimizing for pipeline + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal + subs number, number, #1 + +.loop1: + @ do work here + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal + + subs number, number, #1 + bne .loop1 @ first loop + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (realAccQ) + mov number, quarterPoints, asl #2 +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! @ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmul.f32 s5, s8, s12 + vmul.f32 s6, s8, s10 + vadd.f32 realAccS, realAccS, s5 + vadd.f32 compAccS, compAccS, s6 + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! 
@ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s new file mode 100644 index 0000000000..cb50e4bced --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s @@ -0,0 +1,74 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) + .global volk_32fc_32f_dot_prod_32fc_a_neonasmvmla +volk_32fc_32f_dot_prod_32fc_a_neonasmvmla: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be preserved. + realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + +.loop1: + @ do work here + pld [taps, #128] @ pre-load hint - this is implementation specific! 
+ pld [input, #128] @ pre-load hint - this is implementation specific! + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d18-d21}, [input:128]! @ inRealVal, inCompVal + vmla.f32 realAccQ, tapsVal, inRealVal + vmla.f32 compAccQ, tapsVal, inCompVal + subs number, number, #1 + bne .loop1 @ first loop + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (compAccS) + mov number, quarterPoints, asl #2 +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! @ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmla.f32 realAccS, s8, s10 @ d0[0] + vmla.f32 compAccS, s8, s12 @ d2[0] + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! @ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s new file mode 100644 index 0000000000..7185ab9d17 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_unrollasm.s @@ -0,0 +1,146 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_unrollasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) +.global volk_32fc_32f_dot_prod_32fc_unrollasm +volk_32fc_32f_dot_prod_32fc_unrollasm: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + push {r4, r5, r6, r7, r8, r9} + vpush {q4-q7} + sub r13, r13, #56 @ 0x38 + add r12, r13, #8 + lsrs r8, r3, #3 + veor.32 q2, q5, q5 + veor.32 q3, q5, q5 + veor.32 q4, q5, q5 + veor.32 q5, q5, q5 + beq 
.smallvector + vld2.32 {d20-d23}, [r1]! + vld1.32 {d24-d25}, [r2]! + mov r5, #1 + + + +.mainloop: + vld2.32 {d14-d17}, [r1]! @ q7,q8 + vld1.32 {d18-d19}, [r2]! @ q9 + + vmul.f32 q0, q12, q10 @ real mult + vmul.f32 q1, q12, q11 @ imag mult + + add r5, r5, #1 + cmp r5, r8 + + vadd.f32 q4, q4, q0@ q4 accumulates real + vadd.f32 q5, q5, q1@ q5 accumulates imag + + vld2.32 {d20-d23}, [r1]! @ q10-q11 + vld1.32 {d24-d25}, [r2]! @ q12 + + vmul.f32 q13, q9, q7 + vmul.f32 q14, q9, q8 + vadd.f32 q2, q2, q13 @ q2 accumulates real + vadd.f32 q3, q3, q14 @ q3 accumulates imag + + + + bne .mainloop + + vmul.f32 q0, q12, q10 @ real mult + vmul.f32 q1, q12, q11 @ imag mult + + vadd.f32 q4, q4, q0@ q4 accumulates real + vadd.f32 q5, q5, q1@ q5 accumulates imag + + +.smallvector: + vadd.f32 q0, q2, q4 + add r12, r13, #24 + lsl r8, r8, #3 + vadd.f32 q1, q3, q5 + cmp r3, r8 + + vadd.f32 d0, d0, d1 + vadd.f32 d1, d2, d3 + vadd.f32 s14, s0, s1 + vadd.f32 s15, s2, s3 + + vstr s14, [r13] + vstr s15, [r13, #4] + bls .D1 + rsb r12, r8, r3 + lsr r4, r12, #2 + cmp r4, #0 + cmpne r12, #3 + lsl r5, r4, #2 + movhi r6, #0 + movls r6, #1 + bls .L1 + vmov.i32 q10, #0 @ 0x00000000 + mov r9, r1 + mov r7, r2 + vorr q11, q10, q10 + +.smallloop: + add r6, r6, #1 + vld2.32 {d16-d19}, [r9]! + cmp r4, r6 + vld1.32 {d24-d25}, [r7]! 
+ vmla.f32 q11, q12, q8 + vmla.f32 q10, q12, q9 + bhi .smallloop + vmov.i32 q9, #0 @ 0x00000000 + cmp r12, r5 + vadd.f32 d20, d20, d21 + add r8, r8, r5 + vorr q8, q9, q9 + add r1, r1, r5, lsl #3 + vadd.f32 d22, d22, d23 + add r2, r2, r5, lsl #2 + vpadd.f32 d18, d20, d20 + vpadd.f32 d16, d22, d22 + vmov.32 r4, d18[0] + vmov.32 r12, d16[0] + vmov s13, r4 + vadd.f32 s15, s13, s15 + vmov s13, r12 + vadd.f32 s14, s13, s14 + beq .finishreduction + .L1: + add r12, r8, #1 + vldr s13, [r2] + cmp r3, r12 + vldr s11, [r1] + vldr s12, [r1, #4] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + bls .finishreduction + add r8, r8, #2 + vldr s13, [r2, #4] + cmp r3, r8 + vldr s11, [r1, #8] + vldr s12, [r1, #12] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + bls .finishreduction + vldr s13, [r2, #8] + vldr s11, [r1, #16] + vldr s12, [r1, #20] + vmla.f32 s14, s13, s11 + vmla.f32 s15, s13, s12 + +.finishreduction: + vstr s14, [r13] + vstr s15, [r13, #4] + .D1: + ldr r3, [r13] + str r3, [r0] + ldr r3, [r13, #4] + str r3, [r0, #4] + add r13, r13, #56 @ 0x38 + vpop {q4-q7} + pop {r4, r5, r6, r7, r8, r9} + bx r14 + + diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s new file mode 100644 index 0000000000..a1c5b7f184 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm.s @@ -0,0 +1,98 @@ +@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32fc_x2_dot_prod_32fc_neonasm +volk_32fc_x2_dot_prod_32fc_neonasm: + push {r4, r5, r6, r7, r8, lr} + vpush {q0-q7} + vpush {q8-q15} + mov r8, r3 @ hold on to num_points (r8) + @ zero out accumulators -- leave 1 reg in alu + veor q8, q15, q15 + mov r7, r0 @ (r7) is cVec + veor q9, q15, q15 + mov r5, r1 @ (r5) is aVec + veor q10, q15, q15 + mov r6, r2 @ (r6) is bVec + veor q11, q15, q15 + lsrs r3, r3, #3 @ eighth_points (r3) = 
num_points/8 + veor q12, q15, q15 + mov r12, r2 @ (r12) is bVec + veor q13, q15, q15 + mov r4, r1 @ (r4) is aVec + veor q14, q15, q15 + veor q15, q15, q15 + beq .smallvector @ nathan optimized this file based on an objdump + @ but I don't understand this jump. Seems like it should go to loop2 + @ and smallvector (really vector reduction) shouldn't need to be a label + mov r2, #0 @ 0 out r2 (now number) +.loop1: + add r2, r2, #1 @ increment number + vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3 + cmp r2, r3 @ is number < eighth_points + @pld [r12, #64] + vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7 + @pld [r4, #64] + vmla.f32 q12, q4, q0 @ real (re*re) + vmla.f32 q14, q4, q1 @ imag (re*im) + vmls.f32 q15, q5, q1 @ real (im*im) + vmla.f32 q13, q5, q0 @ imag (im*re) + + vmla.f32 q8, q2, q6 @ real (re*re) + vmla.f32 q9, q2, q7 @ imag (re*im) + vmls.f32 q10, q3, q7 @ real (im*im) + vmla.f32 q11, q3, q6 @ imag (im*re) + bne .loop1 + lsl r2, r3, #3 @ r2 = eighth_points * 8 + add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!? + add r5, r5, r2 @ aVec = aVec + eighth_points + @ q12-q13 were original real accumulators + @ q14-q15 were original imag accumulators + @ reduce 8 accumulators down to 2 (1 real, 1 imag) + vadd.f32 q8, q10, q8 @ real + real + vadd.f32 q11, q11, q9 @ imag + imag + vadd.f32 q12, q12, q15 @ real + real + vadd.f32 q14, q14, q13 @ imag + imag + vadd.f32 q8, q8, q12 + vadd.f32 q9, q9, q14 +.smallvector: + lsl r4, r3, #3 + cmp r8, r4 + vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary! 
+ vldr s15, [sp, #8] + vldr s17, [sp] + vldr s16, [sp, #4] + vadd.f32 s17, s17, s15 + vldr s11, [sp, #12] + vldr s12, [sp, #24] + vldr s13, [sp, #28] + vldr s14, [sp, #16] + vldr s15, [sp, #20] + vadd.f32 s16, s16, s11 + vadd.f32 s17, s17, s12 + vadd.f32 s16, s16, s13 + vadd.f32 s17, s17, s14 + vadd.f32 s16, s16, s15 + vstr s17, [r7] + vstr s16, [r7, #4] + bls .done +.loop2: + mov r3, r6 + add r6, r6, #8 + vldr s0, [r3] + vldr s1, [r6, #-4] + mov r3, r5 + add r5, r5, #8 + vldr s2, [r3] + vldr s3, [r5, #-4] + bl __mulsc3 @ GCC/Clang built-in. Portability? + add r4, r4, #1 + cmp r4, r8 + vadd.f32 s17, s17, s0 + vadd.f32 s16, s16, s1 + vstr s17, [r7] + vstr s16, [r7, #4] + bne .loop2 +.done: + vpop {q8-q15} + vpop {q0-q7} + pop {r4, r5, r6, r7, r8, pc} + diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s new file mode 100644 index 0000000000..77f026e1b0 --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_neonasm_opttests.s @@ -0,0 +1,96 @@ +@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@ +.global volk_32fc_x2_dot_prod_32fc_neonasm_opttests +volk_32fc_x2_dot_prod_32fc_neonasm_opttests: + push {r4, r5, r6, r7, r8, r9, sl, fp, lr} + vpush {d8-d15} + lsrs fp, r3, #3 + sub sp, sp, #52 @ 0x34 + mov r9, r3 + mov sl, r0 + mov r7, r1 + mov r8, r2 + vorr q0, q7, q7 + vorr q1, q7, q7 + vorr q2, q7, q7 + vorr q3, q7, q7 + vorr q4, q7, q7 + vorr q5, q7, q7 + veor q6, q7, q7 + vorr q7, q7, q7 + beq .smallvector + mov r4, r1 + mov ip, r2 + mov r3, #0 +.mainloop: + @mov r6, ip + @mov r5, r4 + vld4.32 {d24,d26,d28,d30}, [r6]! + @add ip, ip, #64 @ 0x40 + @add r4, r4, #64 @ 0x40 + vld4.32 {d16,d18,d20,d22}, [r5]! + add r3, r3, #1 + vld4.32 {d25,d27,d29,d31}, [r6]! + vld4.32 {d17,d19,d21,d23}, [r5]! 
+ vmla.f32 q6, q8, q12 + vmla.f32 q0, q9, q12 + cmp r3, fp + vmls.f32 q5, q13, q9 + vmla.f32 q2, q13, q8 + vmla.f32 q7, q10, q14 + vmla.f32 q1, q11, q14 + vmls.f32 q4, q15, q11 + vmla.f32 q3, q15, q10 + bne .mainloop + lsl r3, fp, #6 + add r8, r8, r3 + add r7, r7, r3 +.smallvector: + vadd.f32 q3, q2, q3 + add r3, sp, #16 + lsl r4, fp, #3 + vadd.f32 q4, q5, q4 + cmp r9, r4 + vadd.f32 q6, q6, q7 + vadd.f32 q1, q0, q1 + vadd.f32 q8, q6, q4 + vadd.f32 q9, q1, q3 + vst2.32 {d16-d19}, [r3 :64] + vldr s15, [sp, #24] + vldr s16, [sp, #16] + vldr s17, [sp, #20] + vadd.f32 s16, s16, s15 + vldr s11, [sp, #28] + vldr s12, [sp, #40] @ 0x28 + vldr s13, [sp, #44] @ 0x2c + vldr s14, [sp, #32] + vldr s15, [sp, #36] @ 0x24 + vadd.f32 s17, s17, s11 + vadd.f32 s16, s16, s12 + vadd.f32 s17, s17, s13 + vadd.f32 s16, s16, s14 + vadd.f32 s17, s17, s15 + vstr s16, [sl] + vstr s17, [sl, #4] + bls .epilog + add r5, sp, #8 +.tailcase: + ldr r3, [r7], #8 + mov r0, r5 + ldr r1, [r8], #8 + add r4, r4, #1 + ldr ip, [r7, #-4] + ldr r2, [r8, #-4] + str ip, [sp] + bl __mulsc3 + vldr s14, [sp, #8] + vldr s15, [sp, #12] + vadd.f32 s16, s16, s14 + cmp r4, r9 + vadd.f32 s17, s17, s15 + vstr s16, [sl] + vstr s17, [sl, #4] + bne .tailcase +.epilog: + add sp, sp, #52 @ 0x34 + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, sl, fp, pc} diff --git a/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s b/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s new file mode 100644 index 0000000000..5d79b466ac --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_neonasm.s @@ -0,0 +1,45 @@ +@ static inline void volk_32fc_x2_multiply_32fc_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); + .global volk_32fc_x2_multiply_32fc_neonasm +volk_32fc_x2_multiply_32fc_neonasm: + push {r4, r5, r6, r7, r8, r9, r14} + lsrs r7, r3, #2 + @ r0 is c vector + @ r1 is a vector + @ r2 is b vector + @ r3 is num_points + @ r7 is quarter_points + beq 
.smallvector + mov r5, #0 +.mainloop: + vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13 + add r5, r5, #1 + cmp r5, r7 + vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11 + pld [r1] + pld [r2] + vmul.f32 q0, q12, q10 @ q15 = ar*br + vmul.f32 q1, q13, q11 @ q11 = ai*bi + vmul.f32 q2, q12, q11 @ q14 = ar*bi + vmul.f32 q3, q13, q10 @ q12 = ai*br + vsub.f32 q8, q0, q1 @ real + vadd.f32 q9, q2, q3 @ imag + vst2.32 {d16-d19}, [r0]! + bne .mainloop + +.smallvector: + lsl r5, r7, #2 + cmp r3, r7 + bls .done +.tailcase: + add r5, r5, #1 + vld1.32 d1, [r1]! @ s2, s3 = ar, ai + vld1.32 d0, [r2]! @ s0, s1 = br, bi + vmul.f32 s4, s0, s2 @ s4 = ar*br + vmul.f32 s5, s0, s3 @ s5 = ar*bi + vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi + vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br + vst1.32 d2, [r0]! + cmp r3, r5 + bne .tailcase +.done: + pop {r4, r5, r6, r7, r8, r9, r15} diff --git a/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s b/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s new file mode 100644 index 0000000000..758e7436cd --- /dev/null +++ b/volk/kernels/volk/asm/neon/volk_arm_32fc_32f_dot_prod_32fc_a_neonpipeline.s @@ -0,0 +1,92 @@ +@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + .global volk_32fc_32f_dot_prod_32fc_a_neonpipeline +volk_32fc_32f_dot_prod_32fc_a_neonpipeline: + @ r0 - result: pointer to output array (32fc) + @ r1 - input: pointer to input array 1 (32fc) + @ r2 - taps: pointer to input array 2 (32f) + @ r3 - num_points: number of items to process + + result .req r0 + input .req r1 + taps .req r2 + num_points .req r3 + quarterPoints .req r7 + number .req r8 + @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: + @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; + @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved + @ registers d16-d31 (q8-q15), if present, do not need to be 
preserved. + realAccQ .req q0 @ d0-d1/s0-s3 + compAccQ .req q1 @ d2-d3/s4-s7 + realAccS .req s0 @ d0[0] + compAccS .req s4 @ d2[0] + tapsVal .req q2 @ d4-d5 + outVal .req q3 @ d6-d7 + realMul .req q8 @ d8-d9 + compMul .req q9 @ d16-d17 + inRealVal .req q10 @ d18-d19 + inCompVal .req q11 @ d20-d21 + + stmfd sp!, {r7, r8, sl} @ prologue - save register states + + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! + + veor realAccQ, realAccQ @ zero out accumulators + veor compAccQ, compAccQ @ zero out accumulators + movs quarterPoints, num_points, lsr #2 + beq .loop2 @ if zero into quarterPoints + + mov number, quarterPoints + @ Optimizing for pipeline + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal + subs number, number, #1 + +.loop1: + @ do work here + pld [taps, #128] @ pre-load hint - this is implementation specific! + pld [input, #128] @ pre-load hint - this is implementation specific! + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + vld1.32 {d4-d5}, [taps:128]! @ tapsVal + vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal + + subs number, number, #1 + bne .loop1 @ first loop + + vmul.f32 realMul, tapsVal, inRealVal + vmul.f32 compMul, tapsVal, inCompVal + vadd.f32 realAccQ, realAccQ, realMul + vadd.f32 compAccQ, compAccQ, compMul + + @ Sum up across realAccQ and compAccQ + vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 + vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 + vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) + vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) + @ critical values are now in s0 (realAccS), s4 (realAccQ) + mov number, quarterPoints, asl #2 + sub number, number, #5 +.loop2: + cmp num_points, number + bls .done + + vld1.32 {d4[0]}, [taps]! 
@ s8 + vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 + vmul.f32 s5, s8, s10 + vmul.f32 s6, s8, s12 + vadd.f32 realAccS, realAccS, s5 + vadd.f32 compAccS, compAccS, s6 + + add number, number, #1 + b .loop2 + +.done: + vst1.32 {d0[0]}, [result]! @ realAccS + vst1.32 {d2[0]}, [result] @ compAccS + + ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states + bx lr diff --git a/volk/kernels/volk/volk_16i_max_star_16i.h b/volk/kernels/volk/volk_16i_max_star_16i.h index c67351c5fa..5366a2e325 100644 --- a/volk/kernels/volk/volk_16i_max_star_16i.h +++ b/volk/kernels/volk/volk_16i_max_star_16i.h @@ -85,6 +85,44 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, un #endif /*LV_HAVE_SSSE3*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points) { + const unsigned int eighth_points = num_points / 8; + unsigned number; + int16x8_t input_vec; + int16x8_t diff, max_vec, zeros; + uint16x8_t comp1, comp2; + zeros = veorq_s16(zeros, zeros); + + int16x8x2_t tmpvec; + + int16x8_t candidate_vec = vld1q_dup_s16(src0 ); + short candidate; + ++src0; + + for(number=0; number < eighth_points; ++number) { + input_vec = vld1q_s16(src0); + __builtin_prefetch(src0+16); + diff = vsubq_s16(candidate_vec, input_vec); + comp1 = vcgeq_s16(diff, zeros); + comp2 = vcltq_s16(diff, zeros); + + tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1); + tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2); + + candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]); + src0 += 8; + } + vst1q_s16(&candidate, candidate_vec); + + for(number=0; number < num_points%8; number++) { + candidate = ((int16_t)(candidate - src0[number]) > 0) ? 
candidate : src0[number]; + } + target[0] = candidate; +} +#endif /*LV_HAVE_NEON*/ + #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h index ef88ec094f..1915522947 100644 --- a/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h +++ b/volk/kernels/volk/volk_16i_max_star_horizontal_16i.h @@ -110,6 +110,40 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in #endif /*LV_HAVE_SSSE3*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target, int16_t* src0, unsigned int num_points) { + const unsigned int eighth_points = num_points / 16; + unsigned number; + int16x8x2_t input_vec; + int16x8_t diff, max_vec, zeros; + uint16x8_t comp1, comp2; + zeros = veorq_s16(zeros, zeros); + for(number=0; number < eighth_points; ++number) { + input_vec = vld2q_s16(src0); + //__builtin_prefetch(src0+16); + diff = vsubq_s16(input_vec.val[0], input_vec.val[1]); + comp1 = vcgeq_s16(diff, zeros); + comp2 = vcltq_s16(diff, zeros); + + input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1); + input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2); + + max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]); + vst1q_s16(target, max_vec); + src0 += 16; + target += 8; + } + for(number=0; number < num_points%16; number+=2) { + target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0) ? 
src0[number] : src0[number+1]; + } + +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON +extern void volk_16i_max_star_horizontal_16i_neonasm(int16_t* target, int16_t* src0, unsigned int num_points); +#endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h index 56b2cc07ab..8e84b6ea17 100644 --- a/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h +++ b/volk/kernels/volk/volk_16i_x4_quad_max_star_16i.h @@ -165,6 +165,66 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s #endif /*LV_HAVE_SSE2*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_x4_quad_max_star_16i_neon(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { + const unsigned int eighth_points = num_points / 8; + unsigned i; + + int16x8_t src0_vec, src1_vec, src2_vec, src3_vec; + int16x8_t diff12, diff34; + int16x8_t comp0, comp1, comp2, comp3; + int16x8_t result1_vec, result2_vec; + int16x8_t zeros; + zeros = veorq_s16(zeros, zeros); + for(i=0; i < eighth_points; ++i) { + src0_vec = vld1q_s16(src0); + src1_vec = vld1q_s16(src1); + src2_vec = vld1q_s16(src2); + src3_vec = vld1q_s16(src3); + diff12 = vsubq_s16(src0_vec, src1_vec); + diff34 = vsubq_s16(src2_vec, src3_vec); + comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); + comp1 = (int16x8_t)vcltq_s16(diff12, zeros); + comp2 = (int16x8_t)vcgeq_s16(diff34, zeros); + comp3 = (int16x8_t)vcltq_s16(diff34, zeros); + comp0 = vandq_s16(src0_vec, comp0); + comp1 = vandq_s16(src1_vec, comp1); + comp2 = vandq_s16(src2_vec, comp2); + comp3 = vandq_s16(src3_vec, comp3); + + result1_vec = vaddq_s16(comp0, comp1); + result2_vec = vaddq_s16(comp2, comp3); + + diff12 = vsubq_s16(result1_vec, result2_vec); + comp0 = (int16x8_t)vcgeq_s16(diff12, zeros); + 
comp1 = (int16x8_t)vcltq_s16(diff12, zeros); + comp0 = vandq_s16(result1_vec, comp0); + comp1 = vandq_s16(result2_vec, comp1); + result1_vec = vaddq_s16(comp0, comp1); + vst1q_s16(target, result1_vec); + src0 += 8; + src1 += 8; + src2 += 8; + src3 += 8; + target += 8; + } + + + short temp0 = 0; + short temp1 = 0; + for(i=eighth_points*8; i < num_points; ++i) { + temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1; + temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3; + *target++ = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + src0++; + src1++; + src2++; + src3++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h index 9b6d19fd66..28575b6282 100644 --- a/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h +++ b/volk/kernels/volk/volk_16i_x5_add_quad_16i_x4.h @@ -112,6 +112,52 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta } #endif /*LV_HAVE_SSE2*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) { + + const unsigned int eighth_points = num_points / 8; + int number = 0; + + int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec; + int16x8_t target0_vec, target1_vec, target2_vec, target3_vec; + for(number = 0; number < eighth_points; ++number) { + src0_vec = vld1q_s16(src0); + src1_vec = vld1q_s16(src1); + src2_vec = vld1q_s16(src2); + src3_vec = vld1q_s16(src3); + src4_vec = vld1q_s16(src4); + + target0_vec = vaddq_s16(src0_vec , src1_vec); + target1_vec = vaddq_s16(src0_vec , src2_vec); + target2_vec = vaddq_s16(src0_vec , src3_vec); + target3_vec = 
vaddq_s16(src0_vec , src4_vec); + + vst1q_s16(target0, target0_vec); + vst1q_s16(target1, target1_vec); + vst1q_s16(target2, target2_vec); + vst1q_s16(target3, target3_vec); + src0 += 8; + src1 += 8; + src2 += 8; + src3 += 8; + src4 += 8; + target0 += 8; + target1 += 8; + target2 += 8; + target3 += 8; + } + + for(number = eighth_points * 8; number < num_points; ++number) { + *target0++ = *src0 + *src1++; + *target1++ = *src0 + *src2++; + *target2++ = *src0 + *src3++; + *target3++ = *src0++ + *src4++; + } +} + +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC diff --git a/volk/kernels/volk/volk_16u_byteswap.h b/volk/kernels/volk/volk_16u_byteswap.h index 57f2008991..436caf0474 100644 --- a/volk/kernels/volk/volk_16u_byteswap.h +++ b/volk/kernels/volk/volk_16u_byteswap.h @@ -106,6 +106,36 @@ static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int n } #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number; + unsigned int eighth_points = num_points / 8; + uint16x8_t input, output; + uint16_t* inputPtr = intsToSwap; + + for(number = 0; number < eighth_points; number++) { + input = vld1q_u16(inputPtr); + output = vsriq_n_u16(output, input, 8); + output = vsliq_n_u16(output, input, 8); + vst1q_u16(inputPtr, output); + inputPtr += 8; + } + + for(number = eighth_points * 8; number < num_points; number++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Byteswaps (in-place) an aligned vector of int16_t's. 
diff --git a/volk/kernels/volk/volk_32f_invsqrt_32f.h b/volk/kernels/volk/volk_32f_invsqrt_32f.h index 055370661a..8ea12a73c4 100644 --- a/volk/kernels/volk/volk_32f_invsqrt_32f.h +++ b/volk/kernels/volk/volk_32f_invsqrt_32f.h @@ -90,6 +90,37 @@ static inline void volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVect } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! +\brief Sqrts the two input vectors and store their results in the third vector +\param cVector The vector where the results will be stored +\param aVector One of the vectors to be invsqrted +\param num_points The number of values in aVector and bVector to be invsqrted together and stored into cVector +*/ +static inline void volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number; + const unsigned int quarter_points = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + float32x4_t a_val, c_val; + for (number = 0; number < quarter_points; ++number) + { + a_val = vld1q_f32(aPtr); + c_val = vrsqrteq_f32(a_val); + vst1q_f32(cPtr, c_val); + aPtr += 4; + cPtr += 4; + } + + for(number=quarter_points * 4;number < num_points; number++) + *cPtr++ = Q_rsqrt(*aPtr++); + +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_GENERIC /*! \brief Sqrts the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_s32f_multiply_32f.h b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h index 2dd86a17c2..8665d4e90e 100644 --- a/volk/kernels/volk/volk_32f_s32f_multiply_32f.h +++ b/volk/kernels/volk/volk_32f_s32f_multiply_32f.h @@ -180,6 +180,35 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + const unsigned int quarterPoints = num_points / 4; + + float32x4_t aVal, cVal; + + for(number = 0; number < quarterPoints; number++){ + aVal = vld1q_f32(inputPtr); // Load into NEON regs + cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply + vst1q_f32(outputPtr, cVal); // Store results back to output + inputPtr += 4; + outputPtr += 4; + } + for(number = quarterPoints * 4; number < num_points; number++){ + *outputPtr++ = (*inputPtr++) * scalar; + } +} +#endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_GENERIC /*! @@ -216,6 +245,4 @@ static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector, const float* #endif /* LV_HAVE_GENERIC */ - - #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */ diff --git a/volk/kernels/volk/volk_32f_sqrt_32f.h b/volk/kernels/volk/volk_32f_sqrt_32f.h index ab9fffd7dc..2523abf0da 100644 --- a/volk/kernels/volk/volk_32f_sqrt_32f.h +++ b/volk/kernels/volk/volk_32f_sqrt_32f.h @@ -40,6 +40,35 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + float32x4_t in_vec, out_vec; + + for(number = 0; number < quarter_points; number++){ + in_vec = vld1q_f32(aPtr); + // note that armv8 has vsqrt_f32 which will be much better + out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) ); + vst1q_f32(cPtr, out_vec); + aPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Sqrts the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_x2_add_32f.h b/volk/kernels/volk/volk_32f_x2_add_32f.h index 42278f6068..a9a1d4fbf0 100644 --- a/volk/kernels/volk/volk_32f_x2_add_32f.h +++ b/volk/kernels/volk/volk_32f_x2_add_32f.h @@ -109,6 +109,49 @@ static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/* + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) { + unsigned int number = 0; + const unsigned int 
quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + float32x4_t aVal, bVal, cVal; + for(number=0; number < quarterPoints; number++){ + // Load in to NEON registers + aVal = vld1q_f32(aPtr); + bVal = vld1q_f32(bPtr); + __builtin_prefetch(aPtr+4); + __builtin_prefetch(bPtr+4); + + // vector add + cVal = vaddq_f32(aVal, bVal); + // Store the results back into the C container + vst1q_f32(cPtr,cVal); + + aPtr += 4; // q uses quadwords, 4 floats per vadd + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; // should be = num_points + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } + +} + +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Adds the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h index b91252e36f..ed16d9a49e 100644 --- a/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h +++ b/volk/kernels/volk/volk_32f_x2_dot_prod_32f.h @@ -577,4 +577,92 @@ static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_NEON +static inline void volk_32f_x2_dot_prod_32f_neonopts(float * result, const float * input, const float * taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 16; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + float32x4x4_t a_val, b_val, accumulator0, accumulator1; + accumulator0.val[0] = vdupq_n_f32(0); + accumulator0.val[1] = vdupq_n_f32(0); + accumulator0.val[2] = vdupq_n_f32(0); + accumulator0.val[3] = vdupq_n_f32(0); + // factor of 4 loop unroll with independent accumulators + // uses 12 out of 16 neon q registers + for( number = 0; number < quarter_points; ++number) { + a_val = vld4q_f32(aPtr); + b_val = vld4q_f32(bPtr); + accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], 
a_val.val[0], b_val.val[0]); + accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]); + accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]); + accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]); + aPtr += 16; + bPtr += 16; + } + accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]); + accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]); + accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]); + __VOLK_ATTR_ALIGNED(32) float accumulator[4]; + vst1q_f32(accumulator, accumulator0.val[0]); + dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; + + for(number = quarter_points*16; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif + + + + +#ifdef LV_HAVE_NEON +static inline void volk_32f_x2_dot_prod_32f_neon(float * result, const float * input, const float * taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 8; + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + float32x4x2_t a_val, b_val, accumulator_val; + accumulator_val.val[0] = vdupq_n_f32(0); + accumulator_val.val[1] = vdupq_n_f32(0); + // factor of 2 loop unroll with independent accumulators + for( number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32(aPtr); + b_val = vld2q_f32(bPtr); + accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]); + accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]); + aPtr += 8; + bPtr += 8; + } + accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]); + __VOLK_ATTR_ALIGNED(32) float accumulator[4]; + vst1q_f32(accumulator, accumulator_val.val[0]); + dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3]; + + for(number = 
quarter_points*8; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON +extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON +extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +#endif /* LV_HAVE_NEON */ + #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/kernels/volk/volk_32f_x2_interleave_32fc.h b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h index 0935cb32bd..3591b24d69 100644 --- a/volk/kernels/volk/volk_32f_x2_interleave_32fc.h +++ b/volk/kernels/volk/volk_32f_x2_interleave_32fc.h @@ -48,6 +48,39 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! + \brief Interleaves the I & Q vector data into the complex vector. 
+ \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int quarter_points = num_points / 4; + unsigned int number; + float* complexVectorPtr = (float*) complexVector; + + float32x4x2_t complex_vec; + for(number=0; number < quarter_points; ++number) { + complex_vec.val[0] = vld1q_f32(iBuffer); + complex_vec.val[1] = vld1q_f32(qBuffer); + vst2q_f32(complexVectorPtr, complex_vec); + iBuffer += 4; + qBuffer += 4; + complexVectorPtr += 8; + } + + for(number=quarter_points * 4; number < num_points; ++number) { + *complexVectorPtr++ = *iBuffer++; + *complexVectorPtr++ = *qBuffer++; + } + +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_GENERIC /*! \brief Interleaves the I & Q vector data into the complex vector. diff --git a/volk/kernels/volk/volk_32f_x2_max_32f.h b/volk/kernels/volk/volk_32f_x2_max_32f.h index 27633acae8..a1403fba18 100644 --- a/volk/kernels/volk/volk_32f_x2_max_32f.h +++ b/volk/kernels/volk/volk_32f_x2_max_32f.h @@ -45,6 +45,42 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +/*! 
+ \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int quarter_points = num_points / 4; + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + float32x4_t a_vec, b_vec, c_vec; + for(number = 0; number < quarter_points; number++){ + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + c_vec = vmaxq_f32(a_vec, b_vec); + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } + +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector diff --git a/volk/kernels/volk/volk_32f_x2_min_32f.h b/volk/kernels/volk/volk_32f_x2_min_32f.h index 4773d13211..eef5e5da2e 100644 --- a/volk/kernels/volk/volk_32f_x2_min_32f.h +++ b/volk/kernels/volk/volk_32f_x2_min_32f.h @@ -45,6 +45,42 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4_t a_vec, b_vec, c_vec; + for(number = 0; number < quarter_points; number++){ + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + + c_vec = vminq_f32(a_vec, b_vec); + + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector diff --git a/volk/kernels/volk/volk_32f_x2_multiply_32f.h b/volk/kernels/volk/volk_32f_x2_multiply_32f.h index 9fdbec0a2c..8bbd81c8a6 100644 --- a/volk/kernels/volk/volk_32f_x2_multiply_32f.h +++ b/volk/kernels/volk/volk_32f_x2_multiply_32f.h @@ -188,6 +188,33 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* a } #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + const unsigned int quarter_points = num_points / 4; + unsigned int number; + float32x4_t avec, bvec, cvec; + for(number=0; number < quarter_points; ++number) { + avec = vld1q_f32(aVector); + bvec = vld1q_f32(bVector); + cvec = vmulq_f32(avec, bvec); + vst1q_f32(cVector, cvec); + aVector += 4; + bVector += 4; + cVector += 4; + } + for(number=quarter_points*4; number < num_points; ++number) { + *cVector++ = *aVector++ * *bVector++; + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Multiplys the two input vectors and store their results in the third vector diff --git a/volk/kernels/volk/volk_32f_x2_subtract_32f.h b/volk/kernels/volk/volk_32f_x2_subtract_32f.h index 8ea491f988..6831d89a10 100644 --- a/volk/kernels/volk/volk_32f_x2_subtract_32f.h +++ b/volk/kernels/volk/volk_32f_x2_subtract_32f.h @@ -63,6 +63,40 @@ static inline void volk_32f_x2_subtract_32f_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON +/*! 
+ \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_neon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4_t a_vec, b_vec, c_vec; + + for(number = 0; number < quarter_points; number++){ + a_vec = vld1q_f32(aPtr); + b_vec = vld1q_f32(bPtr); + c_vec = vsubq_f32(a_vec, b_vec); + vst1q_f32(cPtr, c_vec); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_ORC /*! 
\brief Subtracts bVector form aVector and store their results in the cVector diff --git a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h index fdef68209e..c555bbb696 100644 --- a/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h +++ b/volk/kernels/volk/volk_32f_x3_sum_of_poly_32f.h @@ -293,7 +293,144 @@ static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target, float* src0, } #endif // LV_HAVE_AVX +#ifdef LV_HAVE_NEON + +static inline void volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) { + + + int i; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; + + float32x2_t x_to_1, x_to_2, x_to_3, x_to_4; + float32x2_t cutoff_vector; + float32x2x2_t x_low, x_high; + float32x4_t x_qvector, c_qvector, cpa_qvector; + float accumulator, final_result; + float res_accumulators[4]; + + float dbg_cpa[4], dbg_x[4], dbg_c[4]; + float dbg_max[4]; + float dbg_x_to_1[2], dbg_x_to_2[2], dbg_x_to_3[2], dbg_x_to_4[2]; + float dbg_x_high[2], dbg_x_low[2]; + float dbg_foo; + + c_qvector = vld1q_f32( zero ); + // load the cutoff in to a vector + cutoff_vector = vdup_n_f32( *cutoff ); + // ... center point array + cpa_qvector = vld1q_f32( center_point_array ); + + for(i=0; i < num_points; ++i) { + // load x (src0) + x_to_1 = vdup_n_f32( *src0++ ); + + // Get a vector of max(src0, cutoff) + x_to_1 = vmax_f32(x_to_1, cutoff_vector ); // x^1 + x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2 + x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3 + x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4 + // zip up doubles to interleave + x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1] + x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3] + // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 + x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]); + // now we finally have [x^4 | x^3 | x^2 | x] ! 
+ + c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector); + } + // there should be better vector reduction techniques + vst1q_f32(res_accumulators, c_qvector ); + accumulator = res_accumulators[0] + res_accumulators[1] + + res_accumulators[2] + res_accumulators[3]; + + *target = accumulator + center_point_array[4] * (float)num_points; +} + +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + +static inline void volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target, float* __restrict src0, float* __restrict center_point_array, float* __restrict cutoff, unsigned int num_points) { + + + int i; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; + + float accumulator, final_result; + + + float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec; + accumulator1_vec = vld1q_f32(zero); + accumulator2_vec = vld1q_f32(zero); + accumulator3_vec = vld1q_f32(zero); + accumulator4_vec = vld1q_f32(zero); + float32x4_t x_to_1, x_to_2, x_to_3, x_to_4; + float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3; + + // load the cutoff in to a vector + cutoff_vector = vdupq_n_f32( *cutoff ); + // ... 
center point array + cpa_0 = vdupq_n_f32(center_point_array[0]); + cpa_1 = vdupq_n_f32(center_point_array[1]); + cpa_2 = vdupq_n_f32(center_point_array[2]); + cpa_3 = vdupq_n_f32(center_point_array[3]); + + + // nathan is not sure why this is slower *and* wrong compared to neonvertfma + for(i=0; i < num_points/4; ++i) { + // load x + x_to_1 = vld1q_f32( src0 ); + + // Get a vector of max(src0, cutoff) + x_to_1 = vmaxq_f32(x_to_1, cutoff_vector ); // x^1 + x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2 + x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3 + x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4 + x_to_1 = vmulq_f32(x_to_1, cpa_0); + x_to_2 = vmulq_f32(x_to_2, cpa_1); + x_to_3 = vmulq_f32(x_to_3, cpa_2); + x_to_4 = vmulq_f32(x_to_4, cpa_3); + accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1); + accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2); + accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3); + accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4); + + src0 += 4; + } + accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec); + accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec); + accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec); + + __VOLK_ATTR_ALIGNED(32) float res_accumulators[4]; + vst1q_f32(res_accumulators, accumulator1_vec ); + accumulator = res_accumulators[0] + res_accumulators[1] + + res_accumulators[2] + res_accumulators[3]; + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + + for(i = 4*num_points/4; i < num_points; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + accumulator += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + } + + *target = accumulator + center_point_array[4] * (float)num_points; +} +#endif /* LV_HAVE_NEON */ #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/ diff --git 
a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h index 44535da6d8..cf67c134fd 100644 --- a/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_32fc_32f_dot_prod_32fc.h @@ -283,6 +283,166 @@ static inline void volk_32fc_32f_dot_prod_32fc_u_avx( lv_32fc_t* result, const l } #endif /*LV_HAVE_AVX*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + +static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { + + unsigned int number; + const unsigned int quarterPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* inputPtr = (float*)input; + const float* tapsPtr = taps; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; + float* real_accum; + float current_accum = 0.0f ; + float accVector_real[4]; + float accVector_imag[4]; + + float32x4x2_t inputVector0, inputVector1; + float32x4_t tapsVector0, tapsVector1; + float32x4_t tmp_real0, tmp_imag0; + float32x4_t tmp_real1, tmp_imag1; + float32x4_t real_accumulator0, imag_accumulator0; + float32x4_t real_accumulator1, imag_accumulator1; + + + // zero out accumulators + // take a *float, return float32x4_t + real_accumulator0 = vld1q_f32( zero ); + imag_accumulator0 = vld1q_f32( zero ); + real_accumulator1 = vld1q_f32( zero ); + imag_accumulator1 = vld1q_f32( zero ); + float dbgVec[8]; + + for(number=0 ;number < quarterPoints; number++){ + // load doublewords and duplicate in to second lane + tapsVector0 = vld1q_f32(tapsPtr ); + tapsVector1 = vld1q_f32(tapsPtr+4 ); + + // load quadword of complex numbers in to 2 lanes. 
1st lane is real, 2dn imag + inputVector0 = vld2q_f32(inputPtr ); + inputVector1 = vld2q_f32(inputPtr+8 ); + // inputVector is now a struct of two vectors, 0th is real, 1st is imag + + tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]); + tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]); + + tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]); + tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]); + + real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0); + imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0); + + real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1); + imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1); + + tapsPtr += 8; + inputPtr += 16; + } + + real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1); + imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1); + // void vst1q_f32( float32_t * ptr, float32x4_t val); + // store results back to a complex (array of 2 floats) + vst1q_f32(accVector_real, real_accumulator0); + vst1q_f32(accVector_imag, imag_accumulator0); + *realpt = accVector_real[0] + accVector_real[1] + + accVector_real[2] + accVector_real[3] ; + + *imagpt = accVector_imag[0] + accVector_imag[1] + + accVector_imag[2] + accVector_imag[3] ; + + // clean up the remainder + for(number=quarterPoints*8; number < num_points; number++){ + *realpt += ((*inputPtr++) * (*tapsPtr)); + *imagpt += ((*inputPtr++) * (*tapsPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + +static inline void volk_32fc_32f_dot_prod_32fc_a_neon ( lv_32fc_t* __restrict result, const lv_32fc_t* __restrict input, const float* __restrict taps, unsigned int num_points) { + + unsigned int number; + const unsigned int quarterPoints = num_points / 4; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* inputPtr = (float*)input; + const float* tapsPtr = taps; + float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f }; 
+ float* real_accum; + float current_accum = 0.0f ; + float accVector_real[4]; + float accVector_imag[4]; + + float32x4x2_t inputVector; + float32x4_t tapsVector; + float32x4_t tmp_real, tmp_imag; + float32x4_t real_accumulator, imag_accumulator; + + + // zero out accumulators + // take a *float, return float32x4_t + real_accumulator = vld1q_f32( zero ); + imag_accumulator = vld1q_f32( zero ); + + for(number=0 ;number < quarterPoints; number++){ + // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) ) + // load doublewords and duplicate in to second lane + tapsVector = vld1q_f32(tapsPtr ); + + // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag + inputVector = vld2q_f32(inputPtr ); + + tmp_real = vmulq_f32(tapsVector, inputVector.val[0]); + tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]); + + real_accumulator = vaddq_f32(real_accumulator, tmp_real); + imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag); + + + tapsPtr += 4; + inputPtr += 8; + + } + + // void vst1q_f32( float32_t * ptr, float32x4_t val); + // store results back to a complex (array of 2 floats) + vst1q_f32(accVector_real, real_accumulator); + vst1q_f32(accVector_imag, imag_accumulator); + *realpt = accVector_real[0] + accVector_real[1] + + accVector_real[2] + accVector_real[3] ; + + *imagpt = accVector_imag[0] + accVector_imag[1] + + accVector_imag[2] + accVector_imag[3] ; + + // clean up the remainder + for(number=quarterPoints*4; number < num_points; number++){ + *realpt += ((*inputPtr++) * (*tapsPtr)); + *imagpt += ((*inputPtr++) * (*tapsPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +extern void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points); +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned 
int num_points); +#endif /*LV_HAVE_NEON*/ #ifdef LV_HAVE_SSE diff --git a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h index a12d078c68..21b71998c2 100644 --- a/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h +++ b/volk/kernels/volk/volk_32fc_32f_multiply_32fc.h @@ -135,6 +135,43 @@ static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector, const } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t inputVector, outputVector; + float32x4_t tapsVector; + for(number = 0; number < quarter_points; number++){ + inputVector = vld2q_f32((float*)aPtr); + tapsVector = vld1q_f32(bPtr); + + outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector); + outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector); + + vst2q_f32((float*)cPtr, outputVector); + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for(number = quarter_points * 4; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_ORC /*! 
\brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector diff --git a/volk/kernels/volk/volk_32fc_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_conjugate_32fc.h index dce897ff57..480fa36994 100644 --- a/volk/kernels/volk/volk_32fc_conjugate_32fc.h +++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h @@ -106,6 +106,46 @@ static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_ } #endif /* LV_HAVE_SSE3 */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number; + const unsigned int quarterPoints = num_points / 4; + + float32x4x2_t x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + float conj[4] = {-0.f, -0.f, -0.f, -0.f}; + //uint32x4_t conjugator; + + //conjugator = vld1q_u32( (uint32_t *)conj ); + + for(number=0; number < quarterPoints; number++){ + __builtin_prefetch(a+4); + x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di + + // xor the imaginary lane + x.val[1] = vnegq_f32( x.val[1]); + + vst2q_f32((float*)c,x); // Store the results back into the C container + + a += 4; + c += 4; + } + + for(number=quarterPoints*4; number < num_points; number++){ + *c++ = lv_conj(*a++); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC /*! \brief Takes the conjugate of a complex vector. 
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h b/volk/kernels/volk/volk_32fc_magnitude_32f.h index 64e99cc1be..cf3e8490eb 100644 --- a/volk/kernels/volk/volk_32fc_magnitude_32f.h +++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h @@ -233,6 +233,112 @@ static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector, con } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + float32x4x2_t complex_vec; + float32x4_t magnitude_vec; + for(number = 0; number < quarter_points; number++){ + complex_vec = vld2q_f32(complexVectorPtr); + complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]); + magnitude_vec = vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]); + magnitude_vec = vrsqrteq_f32(magnitude_vec); + magnitude_vec = vrecpeq_f32( magnitude_vec ); // no plain ol' sqrt + vst1q_f32(magnitudeVectorPtr, magnitude_vec); + + complexVectorPtr += 8; + magnitudeVectorPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + + This is an approximation from "Streamlining Digital Signal Processing" by + Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%. + The basic idea is to do a weighted sum of the abs. value of imag and real parts + where weight A is always assigned to max(imag, real) and B is always min(imag,real). + There are two pairs of cofficients chosen based on whether min <= 0.4142 max. + This method is called equiripple-error magnitude estimation proposed by Filip in '73 + + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number; + unsigned int quarter_points = num_points / 4; + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float threshold = 0.4142135; + + float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low; + a_high = vdupq_n_f32( 0.84 ); + b_high = vdupq_n_f32( 0.561); + a_low = vdupq_n_f32( 0.99 ); + b_low = vdupq_n_f32( 0.197); + + uint32x4_t comp0, comp1; + + float32x4x2_t complex_vec; + float32x4_t min_vec, max_vec, magnitude_vec; + float32x4_t real_abs, imag_abs; + for(number = 0; number < quarter_points; number++){ + complex_vec = vld2q_f32(complexVectorPtr); + + real_abs = vabsq_f32(complex_vec.val[0]); + imag_abs = vabsq_f32(complex_vec.val[1]); + + min_vec = vminq_f32(real_abs, imag_abs); + max_vec = vmaxq_f32(real_abs, imag_abs); + + // effective branch to choose coefficient pair. 
+ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); + comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold)); + + // and 0s or 1s with coefficients from previous effective branch + a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high), vandq_s32((int32x4_t)comp1, (int32x4_t)a_low)); + b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high), vandq_s32((int32x4_t)comp1, (int32x4_t)b_low)); + + // coefficients chosen, do the weighted sum + min_vec = vmulq_f32(min_vec, b_vec); + max_vec = vmulq_f32(max_vec, a_vec); + + magnitude_vec = vaddq_f32(min_vec, max_vec); + vst1q_f32(magnitudeVectorPtr, magnitude_vec); + + complexVectorPtr += 8; + magnitudeVectorPtr += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_ORC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h index 0af81401a8..878794ba79 100644 --- a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h +++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h @@ -206,6 +206,48 @@ static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +// + + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + float32x4x2_t cmplx_val; + float32x4_t result; + for(;number < quarterPoints; number++){ + cmplx_val = vld2q_f32(complexVectorPtr); + complexVectorPtr += 8; + + cmplx_val.val[0] = vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values + cmplx_val.val[1] = vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values + + result = vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values + + vst1q_f32(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_NEON */ + + #ifdef LV_HAVE_GENERIC /*! 
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h index 657fa3158f..fb79d6613c 100644 --- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h @@ -760,4 +760,220 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_NEON + +static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 4; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x2_t a_val, b_val, c_val, accumulator; + float32x4x2_t tmp_real, tmp_imag; + accumulator.val[0] = vdupq_n_f32(0); + accumulator.val[1] = vdupq_n_f32(0); + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + + accumulator.val[0] = 
vaddq_f32(accumulator.val[0], c_val.val[0]); + accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]); + + a_ptr += 4; + b_ptr += 4; + } + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, accumulator); + *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*4; number < num_points; ++number) { + *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON + +static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 4; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x2_t a_val, b_val, c_val, accumulator; + float32x4x2_t tmp_real, tmp_imag; + accumulator.val[0] = vdupq_n_f32(0); + accumulator.val[1] = vdupq_n_f32(0); + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // do the first multiply + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + + // use multiply accumulate/subtract to get result + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); + tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); + + accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]); + accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]); + + // increment pointers + a_ptr += 4; + b_ptr += 4; + } + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, accumulator); + *result = accum_result[0] + accum_result[1] 
+ accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*4; number < num_points; ++number) { + *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int quarter_points = num_points / 4; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x2_t a_val, b_val, accumulator1, accumulator2; + float32x4x2_t tmp_real, tmp_imag; + accumulator1.val[0] = vdupq_n_f32(0); + accumulator1.val[1] = vdupq_n_f32(0); + accumulator2.val[0] = vdupq_n_f32(0); + accumulator2.val[1] = vdupq_n_f32(0); + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // use 2 accumulators to remove inter-instruction data dependencies + accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); + accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); + accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); + accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); + // increment pointers + a_ptr += 4; + b_ptr += 4; + } + accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); + accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, accumulator1); + *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*4; number < num_points; ++number) { 
+ *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + +#ifdef LV_HAVE_NEON +static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +// NOTE: GCC does a poor job with this kernel, but the euivalent ASM code is very fast + + unsigned int quarter_points = num_points / 8; + unsigned int number; + + lv_32fc_t* a_ptr = (lv_32fc_t*) taps; + lv_32fc_t* b_ptr = (lv_32fc_t*) input; + // for 2-lane vectors, 1st lane holds the real part, + // 2nd lane holds the imaginary part + float32x4x4_t a_val, b_val, accumulator1, accumulator2; + float32x4x2_t reduced_accumulator; + accumulator1.val[0] = vdupq_n_f32(0); + accumulator1.val[1] = vdupq_n_f32(0); + accumulator1.val[2] = vdupq_n_f32(0); + accumulator1.val[3] = vdupq_n_f32(0); + accumulator2.val[0] = vdupq_n_f32(0); + accumulator2.val[1] = vdupq_n_f32(0); + accumulator2.val[2] = vdupq_n_f32(0); + accumulator2.val[3] = vdupq_n_f32(0); + + // 8 input regs, 8 accumulators -> 16/16 neon regs are used + for(number = 0; number < quarter_points; ++number) { + a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+8); + __builtin_prefetch(b_ptr+8); + + // use 2 accumulators to remove inter-instruction data dependencies + accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]); + accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]); + + accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]); + accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]); + + accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]); + accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]); + + accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], 
b_val.val[3]); + accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]); + // increment pointers + a_ptr += 8; + b_ptr += 8; + } + // reduce 8 accumulator lanes down to 2 (1 real and 1 imag) + accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]); + accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]); + accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]); + accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]); + reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]); + reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]); + // now reduce accumulators to scalars + lv_32fc_t accum_result[4]; + vst2q_f32((float*)accum_result, reduced_accumulator); + *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; + + // tail case + for(number = quarter_points*8; number < num_points; ++number) { + *result += (*a_ptr++) * (*b_ptr++); + } + +} +#endif /*LV_HAVE_NEON*/ + + #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h index 7db68c1bd8..0993a16ceb 100644 --- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h @@ -149,6 +149,117 @@ static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, cons } #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_NEON + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + + lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; + lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+4); + __builtin_prefetch(b_ptr+4); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + // store the results + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + vst2q_f32((float*)cVector, c_val); + + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *cVector++ = (*a_ptr++) * (*b_ptr++); + } + +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + + lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; + lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + // TODO: I suspect the compiler is doing a poor job scheduling this. This seems + // highly optimal, but is barely better than generic + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + __builtin_prefetch(a_ptr+4); + __builtin_prefetch(b_ptr+4); + + // do the first multiply + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + + // use multiply accumulate/subtract to get result + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); + tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); + + // store + vst2q_f32((float*)cVector, tmp_imag); + // increment pointers + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *cVector++ = (*a_ptr++) * (*b_ptr++); + } + +} +#endif /* LV_HAVE_NEON */ + +#ifdef LV_HAVE_NEON + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_ORC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h index cfd6c007f1..dbc123ff25 100644 --- a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h +++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h @@ -138,6 +138,59 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + + lv_32fc_t *a_ptr = (lv_32fc_t*) aVector; + lv_32fc_t *b_ptr = (lv_32fc_t*) bVector; + unsigned int quarter_points = num_points / 4; + float32x4x2_t a_val, b_val, c_val; + float32x4x2_t tmp_real, tmp_imag; + unsigned int number = 0; + + for(number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + b_val.val[1] = vnegq_f32(b_val.val[1]); + __builtin_prefetch(a_ptr+4); + __builtin_prefetch(b_ptr+4); + + // multiply the real*real and imag*imag to get real result + // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); + // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); + + // Multiply cross terms to get the imaginary result + // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); + // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); + + // store the results + c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); + c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); + vst2q_f32((float*)cVector, c_val); + + a_ptr += 4; + b_ptr += 4; + cVector += 4; + } + + for(number = quarter_points*4; number < num_points; number++){ + *cVector++ = (*a_ptr++) * conj(*b_ptr++); + } +} +#endif /* LV_HAVE_NEON */ 
+ #ifdef LV_HAVE_GENERIC /*! \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector diff --git a/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h index 27a081b7cf..56b3d5c230 100644 --- a/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h +++ b/volk/kernels/volk/volk_32fc_x2_square_dist_32f.h @@ -92,6 +92,36 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* #endif /*LV_HAVE_SSE3*/ +#ifdef LV_HAVE_NEON +#include <arm_neon.h> +static inline void volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { + const unsigned int quarter_points = num_points / 4; + unsigned int number; + + float32x4x2_t a_vec, b_vec; + float32x4x2_t diff_vec; + float32x4_t tmp, tmp1, dist_sq; + a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) ); + a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) ); + for(number=0; number < quarter_points; ++number) { + b_vec = vld2q_f32((float*)points); + diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]); + diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]); + tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]); + tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]); + + dist_sq = vaddq_f32(tmp, tmp1); + vst1q_f32(target, dist_sq); + points += 4; + target += 4; + } + for(number=quarter_points*4; number < num_points; ++number) { + lv_32fc_t diff = src0[0] - *points++; + *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + } +} +#endif /* LV_HAVE_NEON */ + #ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { diff --git a/volk/kernels/volk/volk_8i_convert_16i.h b/volk/kernels/volk/volk_8i_convert_16i.h index 3e5c92723f..5b27900cf1 100644 --- a/volk/kernels/volk/volk_8i_convert_16i.h +++ b/volk/kernels/volk/volk_8i_convert_16i.h @@ -138,6 +138,44 @@ 
static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector, const in
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEON
/*!
  \brief Converts the input 8 bit integer data into 16 bit integer data
  \param inputVector The 8 bit input data buffer
  \param outputVector The 16 bit output data buffer
  \param num_points The number of data values to be converted
  \note Input and output buffers do NOT need to be properly aligned
*/
static inline void volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
    int16_t* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number;
    const unsigned int eighth_points = num_points / 8;

    int8x8_t input_vec;
    int16x8_t converted_vec;

    // NEON doesn't have a concept of 8 bit registers, so we are really
    // dealing with the low half of 16-bit registers. Since this requires
    // a move instruction we likely do better with ASM here.
    for(number = 0; number < eighth_points; ++number) {
        input_vec = vld1_s8(inputVectorPtr);
        // widen to 16 bits, then shift left by 8 (equivalent to multiplying by 256)
        converted_vec = vmovl_s8(input_vec);
        converted_vec = vshlq_n_s16(converted_vec, 8);
        vst1q_s16( outputVectorPtr, converted_vec);

        inputVectorPtr += 8;
        outputVectorPtr += 8;
    }

    // scalar tail for the remaining num_points % 8 items
    for(number = eighth_points * 8; number < num_points; number++){
        *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_ORC
/*!
\brief Converts the input 8 bit integer data into 16 bit integer data
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
index c8ff18e67b..427c9abf55 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -61,6 +61,34 @@ static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer, const
 }
 #endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
/*!
  \brief Deinterleaves the complex 8 bit vector into I vector data
  \param complexVector The complex input vector
  \param iBuffer The I buffer output data
  \param num_points The number of complex data values to be deinterleaved
*/
static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
    unsigned int n;
    unsigned int sixteenth_points = num_points / 16;

    // de-interleaving load: val[0] collects 16 I samples, val[1] the 16 Q samples
    int8x16x2_t iq_vec;
    for(n = 0; n < sixteenth_points; ++n) {
        iq_vec = vld2q_s8((int8_t*) complexVector );
        vst1q_s8(iBuffer, iq_vec.val[0]);
        iBuffer += 16;
        complexVector += 16;
    }

    // scalar tail: copy I, skip Q, for the remaining num_points % 16 samples
    const int8_t* complexVectorPtr = (int8_t*)complexVector;
    int8_t* iBufferPtr = iBuffer;
    for(n = sixteenth_points * 16; n < num_points; ++n) {
        *iBufferPtr++ = *complexVectorPtr++;
        complexVectorPtr++;
    }
}
#endif /* LV_HAVE_NEON */
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index d72eb726e4..86b2c6a239 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -382,6 +382,29 @@ include_directories(
 )

########################################################################
# Handle ASM (for ARM) support
# on by default, but let users turn it off
########################################################################
if( NOT DEFINED ENABLE_ARM_ASM OR ENABLE_ARM_ASM )
  message("---- Adding ARM ASM files")
  set(ASM-ATT $ENV{ASM})
#set(_CMAKE_TOOLCHAIN_PREFIX $ENV{TARGET_PREFIX}) # Gah - wtf, this shouldn't be needed + enable_language(ASM-ATT) + # what would make this OK, appending? + set(ASM-ATT_FLAGS "-mfpu=neon -g") # Horrid horrid hack to assemble for ARM neon + set(CMAKE_ASM-ATT_FLAGS ${ASM-ATT_FLAGS}) + message("DEBUG: looking for ASM files in ${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon") + include_directories(${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon) + file(GLOB asm_files ${CMAKE_SOURCE_DIR}/kernels/volk/asm/neon/*.s) + foreach(asm_file ${asm_files}) + list(APPEND volk_sources ${asm_file}) + message(STATUS "Adding source file: ${asm_file}") + endforeach(asm_file) +else() + message("---- NOT Adding ARM ASM files") +endif() + +######################################################################## # Handle orc support ######################################################################## if(ORC_FOUND) @@ -436,7 +459,8 @@ list(APPEND volk_sources ${CMAKE_CURRENT_BINARY_DIR}/constants.c) # Setup the volk sources list and library ######################################################################## if(NOT WIN32) - add_definitions(-fvisibility=hidden) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") endif() list(APPEND volk_sources |