diff options
author | Tom Rondeau <trondeau@vt.edu> | 2012-04-13 18:36:53 -0400 |
---|---|---|
committer | Tom Rondeau <trondeau@vt.edu> | 2012-04-13 18:36:53 -0400 |
commit | f919f9dcbb54a08e6e26d6c229ce92fb784fa1b2 (patch) | |
tree | 7e846386b9eb1676f9a93fc4a1e55916b9accc97 /volk | |
parent | 6a1e9783fec6ed827f49db27c171591d30f32933 (diff) |
Removed whitespace and added dtools/bin/remove-whitespace as a tool to do this in the future.
The sed script was provided by Moritz Fischer.
Diffstat (limited to 'volk')
140 files changed, 1385 insertions, 1385 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index bd36d6dc7d..76b9f40317 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -13,7 +13,7 @@ extern "C" { int main(int argc, char *argv[]) { std::vector<std::string> results; - + //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000, &results); //VOLK_PROFILE(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000, &results); VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000, &results); @@ -143,7 +143,7 @@ int main(int argc, char *argv[]) { #this file is generated by volk_profile.\n\ #the function name is followed by the preferred architecture.\n\ "; - + BOOST_FOREACH(std::string result, results) { config << result << std::endl; } diff --git a/volk/cmake/FindORC.cmake b/volk/cmake/FindORC.cmake index 1c4c417add..a5f35c465e 100644 --- a/volk/cmake/FindORC.cmake +++ b/volk/cmake/FindORC.cmake @@ -8,16 +8,16 @@ FIND_PROGRAM(ORCC_EXECUTABLE orcc HINTS ${PC_ORC_TOOLSDIR} PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin) -FIND_PATH(ORC_INCLUDE_DIR NAMES orc/orc.h +FIND_PATH(ORC_INCLUDE_DIR NAMES orc/orc.h HINTS ${PC_ORC_INCLUDEDIR} PATHS ${ORC_ROOT}/include/orc-0.4 ${CMAKE_INSTALL_PREFIX}/include/orc-0.4) -FIND_PATH(ORC_LIBRARY_DIR NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX} +FIND_PATH(ORC_LIBRARY_DIR NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX} HINTS ${PC_ORC_LIBDIR} PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}) -FIND_LIBRARY(ORC_LIB orc-0.4 +FIND_LIBRARY(ORC_LIB orc-0.4 HINTS ${PC_ORC_LIBRARY_DIRS} PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}) diff --git a/volk/gen/archs.xml b/volk/gen/archs.xml index effd90d331..ca21184759 100644 --- a/volk/gen/archs.xml +++ b/volk/gen/archs.xml @@ -19,7 +19,7 @@ <flag>m32</flag> <overrule>MD_SUBCPU</overrule> <overrule_val>x86_64</overrule_val> -</arch> +</arch> <arch name="64" type="x86"> <op>0x80000001</op> diff --git a/volk/gen/make_c.py b/volk/gen/make_c.py index 0f9bcde347..233cb85c27 100644 --- a/volk/gen/make_c.py +++ b/volk/gen/make_c.py @@ -48,7 +48,7 @@ struct volk_machine *get_machine(void) { extern struct volk_machine *volk_machines[]; extern unsigned int n_volk_machines; static struct volk_machine *machine = NULL; - + if(machine != NULL) return machine; else { unsigned int max_score = 0; @@ -71,7 +71,7 @@ unsigned int volk_get_alignment(void) { } """ - + for i in range(len(functions)): tempstring += "void get_" + functions[i] + replace_arch.sub("", arched_arglist[i]) + "\n" tempstring += " %s = get_machine()->%s_archs[volk_rank_archs(get_machine()->%s_indices, get_machine()->%s_arch_defs, get_machine()->%s_n_archs, get_machine()->%s_name, volk_get_lvarch())];\n" % (functions[i], functions[i], functions[i], functions[i], functions[i], functions[i]) diff --git a/volk/gen/make_config_fixed.py b/volk/gen/make_config_fixed.py index 3fd1bdf0aa..96f20b6b25 100644 --- a/volk/gen/make_config_fixed.py +++ b/volk/gen/make_config_fixed.py @@ -18,4 +18,4 @@ def make_config_fixed(dom) : return tempstring; - + diff --git a/volk/gen/make_cpuid_c.py b/volk/gen/make_cpuid_c.py index 2be1123a80..e30d643cb8 100644 --- a/volk/gen/make_cpuid_c.py +++ b/volk/gen/make_cpuid_c.py @@ -1,24 +1,24 @@ #!/usr/bin/env python # # Copyright 2011 Free Software Foundation, Inc. -# +# # This file is part of GNU Radio -# +# # GNU Radio is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. -# +# # GNU Radio is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with GNU Radio; see the file COPYING. If not, write to # the Free Software Foundation, Inc., 51 Franklin Street, # Boston, MA 02110-1301, USA. -# +# from xml.dom import minidom @@ -79,7 +79,7 @@ static inline unsigned int cpuid_edx(unsigned int op) { def make_cpuid_c(dom) : tempstring = HEADER_TEMPL; - + for domarch in dom: if str(domarch.attributes["type"].value) == "x86": if "no_test" in domarch.attributes.keys(): @@ -103,7 +103,7 @@ def make_cpuid_c(dom) : val = domarch.getElementsByTagName("val"); if val: val = str(val[0].firstChild.data); - + if no_test: tempstring = tempstring + """\ int i_can_has_%s () { @@ -113,9 +113,9 @@ int i_can_has_%s () { return 0; #endif } - + """ % (arch) - + elif op == "1": tempstring = tempstring + """\ int i_can_has_%s () { @@ -144,7 +144,7 @@ int i_can_has_%s () { } """ % (arch, val, reg, op, shift, val) - + elif str(domarch.attributes["type"].value) == "powerpc": arch = str(domarch.attributes["name"].value); tempstring = tempstring + """\ @@ -178,7 +178,7 @@ int i_can_has_%s () { unsigned int found_neon = 0; auxvec_f = fopen("/proc/self/auxv", "rb"); if(!auxvec_f) return 0; - + //so auxv is basically 32b of ID and 32b of value //so it goes like this while(!found_neon && auxvec_f) { @@ -186,7 +186,7 @@ int i_can_has_%s () { if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) found_neon = 1; } - + fclose(auxvec_f); return found_neon; @@ -196,7 +196,7 @@ int i_can_has_%s () { } """ % (arch) - + elif str(domarch.attributes["type"].value) == "all": arch = str(domarch.attributes["name"].value); tempstring = tempstring + """\ @@ -213,7 +213,7 @@ int i_can_has_%s () { } """ % (arch) - + tempstring = tempstring + "void volk_cpu_init() {\n"; for domarch in dom: arch = str(domarch.attributes["name"].value); @@ -231,9 +231,9 @@ int i_can_has_%s () { return tempstring; - - - - - + + + + + diff --git a/volk/gen/make_cpuid_h.py b/volk/gen/make_cpuid_h.py index 4fe5c4e07b..1aa7837815 100644 --- a/volk/gen/make_cpuid_h.py +++ b/volk/gen/make_cpuid_h.py @@ -1,24 +1,24 @@ #!/usr/bin/env python # # Copyright 2011 Free Software Foundation, Inc. -# +# # This file is part of GNU Radio -# +# # GNU Radio is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. -# +# # GNU Radio is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with GNU Radio; see the file COPYING. If not, write to # the Free Software Foundation, Inc., 51 Franklin Street, # Boston, MA 02110-1301, USA. -# +# from xml.dom import minidom @@ -44,5 +44,5 @@ def make_cpuid_h(dom) : tempstring = tempstring + "\n"; tempstring = tempstring + "__VOLK_DECL_END\n"; tempstring = tempstring + "#endif /*INCLUDED_VOLK_CPU_H*/\n" - + return tempstring; diff --git a/volk/gen/make_each_machine_c.py b/volk/gen/make_each_machine_c.py index a3f6203bab..535578859a 100644 --- a/volk/gen/make_each_machine_c.py +++ b/volk/gen/make_each_machine_c.py @@ -59,7 +59,7 @@ def make_each_machine_c(machine_name, archs, functions, fcountlist, taglist, ali """ for arch in archs: tempstring += "#define LV_HAVE_" + arch.swapcase() + " 1\n" - + tempstring += """ #include <volk/volk_common.h> #include "volk_machines.h" diff --git a/volk/gen/make_environment_init_c.py b/volk/gen/make_environment_init_c.py index 263d5bcd13..6186162ee5 100644 --- a/volk/gen/make_environment_init_c.py +++ b/volk/gen/make_environment_init_c.py @@ -14,7 +14,7 @@ def make_environment_init_c(dom) : tempstring = tempstring + "#endif\n" tempstring = tempstring + '\n\n'; tempstring = tempstring + "void volk_environment_init(){\n" - + for domarch in dom: arch = str(domarch.attributes["name"].value); envs = domarch.getElementsByTagName("environment"); @@ -22,11 +22,11 @@ def make_environment_init_c(dom) : cmd = str(env.firstChild.data); tempstring = tempstring + "#ifdef LV_HAVE_" + arch.swapcase() + "\n"; tempstring = tempstring + " " + cmd + "\n"; - tempstring = tempstring + "#endif\n" - + tempstring = tempstring + "#endif\n" + tempstring = tempstring + "}\n"; return tempstring; - - + + diff --git a/volk/gen/make_environment_init_h.py b/volk/gen/make_environment_init_h.py index 655d73f54c..0b0f35d618 100644 --- a/volk/gen/make_environment_init_h.py +++ b/volk/gen/make_environment_init_h.py @@ -12,7 +12,7 @@ def make_environment_init_h() : tempstring = tempstring + "__VOLK_DECL_END\n"; tempstring = tempstring + "#endif\n" return tempstring; - - + + diff --git a/volk/gen/make_h.py b/volk/gen/make_h.py index 354e572588..cf8987c398 100644 --- a/volk/gen/make_h.py +++ b/volk/gen/make_h.py @@ -35,4 +35,4 @@ VOLK_API unsigned int volk_get_alignment(void); tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; return tempstring; - + diff --git a/volk/gen/make_machines_c.py b/volk/gen/make_machines_c.py index a7ab63d6e5..f19da4ae44 100644 --- a/volk/gen/make_machines_c.py +++ b/volk/gen/make_machines_c.py @@ -33,7 +33,7 @@ struct volk_machine *volk_machines[] = { tempstring += "&volk_machine_" + machine tempstring += "," tempstring += "\n#endif\n" - + tempstring += r""" }; unsigned int n_volk_machines = sizeof(volk_machines)/sizeof(*volk_machines); diff --git a/volk/gen/make_machines_h.py b/volk/gen/make_machines_h.py index a48caa89c0..d2374120b4 100644 --- a/volk/gen/make_machines_h.py +++ b/volk/gen/make_machines_h.py @@ -41,15 +41,15 @@ struct volk_machine { tempstring += " const int %s_arch_defs[%d];\n"%(function, len(archs)) tempstring += " const %s %s_archs[%d];\n"%(replace_volk.sub("p", function), function, len(archs)) tempstring += " const int %s_n_archs;\n"%function - + tempstring += r"""}; - + """ for machine in machines: tempstring += """#if LV_MACHINE_""" + machine.swapcase() + "\n" tempstring += "extern struct volk_machine volk_machine_" + machine + ";\n" tempstring += """#endif\n""" - + tempstring += r""" __VOLK_DECL_END diff --git a/volk/gen/make_makefile_am.py b/volk/gen/make_makefile_am.py index 0dc088a80b..880ce40945 100644 --- a/volk/gen/make_makefile_am.py +++ b/volk/gen/make_makefile_am.py @@ -21,7 +21,7 @@ def make_makefile_am(dom, machines, archflags_dict): tempstring = r""" # This file is automatically generated by make_makefile_am.py. # Do not edit this file. - + include $(top_srcdir)/Makefile.common AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ @@ -55,17 +55,17 @@ volk_orc_CFLAGS = -DLV_HAVE_ORC=1 volk_orc_LDFLAGS = $(ORC_LDFLAGS) -lorc-0.4 volk_orc_LIBADD = ../orc/libvolk_orc.la else -volk_orc_CFLAGS = +volk_orc_CFLAGS = volk_orc_LDFLAGS = -volk_orc_LIBADD = +volk_orc_LIBADD = endif libvolk_la_CPPFLAGS = $(AM_CPPFLAGS) $(volk_orc_CFLAGS) libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) libvolk_la_LIBADD = $(volk_orc_LIBADD) -noinst_LTLIBRARIES = - +noinst_LTLIBRARIES = + """ #here be dragons @@ -76,7 +76,7 @@ noinst_LTLIBRARIES = for arch in machines[machine_name]: if archflags_dict[arch] != "none": tempstring += "-" + archflags_dict[arch] + " " - + tempstring += "\nnoinst_LTLIBRARIES += libvolk_" + machine_name + ".la " tempstring += "\nlibvolk_la_LIBADD += libvolk_" + machine_name + ".la\n" tempstring += "libvolk_la_CPPFLAGS += -DLV_MACHINE_" + machine_name.swapcase() + " \n" diff --git a/volk/gen/make_proccpu_sim.py b/volk/gen/make_proccpu_sim.py index 029dacfcc0..e2c8123e5b 100644 --- a/volk/gen/make_proccpu_sim.py +++ b/volk/gen/make_proccpu_sim.py @@ -1,24 +1,24 @@ #!/usr/bin/env python # # Copyright 2011 Free Software Foundation, Inc. -# +# # This file is part of GNU Radio -# +# # GNU Radio is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. -# +# # GNU Radio is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with GNU Radio; see the file COPYING. If not, write to # the Free Software Foundation, Inc., 51 Franklin Street, # Boston, MA 02110-1301, USA. -# +# from xml.dom import minidom @@ -35,7 +35,7 @@ def make_proccpu_sim(dom) : tempstring = tempstring + " }\n"; tempstring = tempstring + "}\n"; tempstring = tempstring + "\n\n"; - + tempstring = tempstring + "int main() {\n"; tempstring = tempstring + " volk_cpu_init();\n"; tempstring = tempstring + " char buf[2048];\n"; diff --git a/volk/gen/make_set_simd.py b/volk/gen/make_set_simd.py index 5a848e59ea..8334487d78 100644 --- a/volk/gen/make_set_simd.py +++ b/volk/gen/make_set_simd.py @@ -43,17 +43,17 @@ def make_set_simd(dom, machines) : tempstring = tempstring + " indLV_ARCH=no\n"; tempstring = tempstring + " AC_ARG_WITH(lv_arch,\n"; tempstring = tempstring + " AC_HELP_STRING([--with-lv_arch=ARCH],[set volk hardware speedups as space separated string with elements from the following list("; - + for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + arch + ", " tempstring = tempstring[0:len(tempstring) - 2]; - + tempstring = tempstring + ")]),\n"; tempstring = tempstring + " [cf_with_lv_arch=\"$withval\"],\n"; tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n"; if str(domarch.attributes["type"].value) == "all": - arch = str(domarch.attributes["name"].value); + arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AC_DEFINE(LV_MAKE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; @@ -67,7 +67,7 @@ def make_set_simd(dom, machines) : tempstring = tempstring[0:-1] + "\"\n"; tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; tempstring = tempstring + " fi\n"; - + tempstring = tempstring +'\ndnl init LV_MAKE_XXX and then try to add archs\n'; for domarch in dom: if str(domarch.attributes["type"].value) != "all": @@ -118,9 +118,9 @@ def make_set_simd(dom, machines) : tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " indLV_ARCH=no\n" tempstring = tempstring + " fi\n" - - tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" - + + tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + #tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; @@ -138,17 +138,17 @@ def make_set_simd(dom, machines) : tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " indLV_ARCH=no\n" tempstring = tempstring + " fi\n" - tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " LV_MAKE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indLV_ARCH=no\n" - + for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AM_CONDITIONAL(LV_MAKE_" + arch.swapcase() + ", test \"$LV_MAKE_" + arch.swapcase() + "\" == \"yes\")\n"; - + tempstring += "\n" #now we can define the machines we're compiling for machine_name in machines: @@ -156,11 +156,11 @@ def make_set_simd(dom, machines) : marchlist = machines[machine_name] for march in marchlist: tempstring += "test \"$LV_MAKE_" + march.swapcase() + "\" == \"yes\" && " - + tempstring += "test true)\n" #just so we don't have to detect the last one in the group, i know tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" tempstring = tempstring + "])\n" - + return tempstring; - + diff --git a/volk/gen/make_typedefs.py b/volk/gen/make_typedefs.py index 8f9f2b55e0..09221d2ef1 100644 --- a/volk/gen/make_typedefs.py +++ b/volk/gen/make_typedefs.py @@ -15,7 +15,7 @@ def make_typedefs(funclist, retlist, my_argtypelist) : tempstring = tempstring + '\n'; - for i in range(len(funclist)): + for i in range(len(funclist)): tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n"; tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n"; diff --git a/volk/gen/volk_register.py b/volk/gen/volk_register.py index 0774ece291..3a237c5ca0 100644 --- a/volk/gen/volk_register.py +++ b/volk/gen/volk_register.py @@ -62,7 +62,7 @@ for line in hdr_files: subdtype = re.search("[0-9]+[A-z]+", dtype); if subdtype: datatypes.append(subdtype.group(0)); - + datatypes = set(datatypes); @@ -71,7 +71,7 @@ for line in hdr_files: if dt in line: subline = re.search("(volk_" + dt +"_.*(a|u).*\.h)", line); if subline: - + subsubline = re.search(".+(?=\.h)", subline.group(0)); functions.append(subsubline.group(0)); @@ -80,13 +80,13 @@ afile = minidom.parse(os.path.join(srcdir, "gen/archs.xml")) filearchs = afile.getElementsByTagName("arch"); for filearch in filearchs: archs.append(str(filearch.attributes["name"].value)); - + for arch in archs: a_var = re.search("^\$", arch); if a_var: archs.remove(arch); - -#strip out mutex archs + +#strip out mutex archs archflags_dict = {} for filearch in filearchs: @@ -177,22 +177,22 @@ for func in functions: tag = re.search("\w+", tag.group(0)); if tag: tags.append(tag.group(0)); - - + + if begun_name == 0: retline = re.search(".+(?=" + func + ")", line); if retline: ret = retline.group(0); - - - - + + + + subline = re.search(func + ".*", line); if subline: subsubline = re.search("\(.*?\)", subline.group(0)); if subsubline: args = subsubline.group(0); - + else: begun_name = 1; subsubline = re.search("\(.*", subline.group(0)); @@ -214,25 +214,25 @@ for func in functions: if subline: args = subline.group(0); begun_name = 0; - else: + else: subline = re.search("\(.*", line); if subline: args = subline.group(0); begun_paren = 1; - + replace = re.compile("static "); ret = replace.sub("", ret); replace = re.compile("inline "); ret = replace.sub("", ret); replace = re.compile("\)"); arched_args = replace.sub(", const char* arch) {", args); - + remove = re.compile('\)|\(|{'); rargs = remove.sub("", args); sargs = rargs.split(','); - - - + + + margs = []; atypes = []; for arg in sargs: @@ -241,7 +241,7 @@ for func in functions: replace = re.compile(" " + temp[-1]); atypes.append(replace.sub("", arg)); - + my_args = "" arg_types = "" for arg in range(0, len(margs) - 1): @@ -255,7 +255,7 @@ for func in functions: this_type = leading_space_remove.sub("", atypes[-1]); arg_types = arg_types + this_type; my_argtypelist.append(arg_types); - + if(ret[-1] != ' '): ret = ret + ' '; @@ -263,7 +263,7 @@ for func in functions: my_arglist.append(my_args) #!!!!!!!!!!!!!!!!! retlist.append(ret); fcountlist.append(fcount); - taglist.append(tags); + taglist.append(tags); outfile_cpu_h.write(make_cpuid_h(filearchs)); diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a.h b/volk/include/volk/volk_16i_branch_4_state_8_a.h index 0424e66e99..6338fbdd17 100644 --- a/volk/include/volk/volk_16i_branch_4_state_8_a.h +++ b/volk/include/volk/volk_16i_branch_4_state_8_a.h @@ -3,7 +3,7 @@ #include<inttypes.h> -#include<stdio.h> +#include<stdio.h> @@ -15,32 +15,32 @@ #include<tmmintrin.h> static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - + + p_target = (__m128i*)target; p_src0 = (__m128i*)src0; p_cntl2 = (__m128i*)cntl2; p_cntl3 = (__m128i*)cntl3; p_scalars = (__m128i*)scalars; - + int i = 0; - + int bound = 1; - - + + xmm0 = _mm_load_si128(p_scalars); - + xmm1 = _mm_shufflelo_epi16(xmm0, 0); xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); xmm2 = _mm_shuffle_epi32(xmm2, 0x00); xmm3 = _mm_shuffle_epi32(xmm3, 0x00); @@ -52,40 +52,40 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src xmm10 = _mm_load_si128((__m128i*)permuters[3]); for(; i < bound; ++i) { - + xmm5 = _mm_load_si128(p_src0); - - - - - - - + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); xmm6 = _mm_shuffle_epi8(xmm5, xmm6); xmm8 = _mm_shuffle_epi8(xmm5, xmm8); xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - + p_src0 += 4; - - + + xmm5 = _mm_add_epi16(xmm1, xmm2); - + xmm6 = _mm_add_epi16(xmm2, xmm6); xmm8 = _mm_add_epi16(xmm1, xmm8); - - + + xmm7 = _mm_load_si128(p_cntl2); xmm9 = _mm_load_si128(p_cntl3); - + xmm0 = _mm_add_epi16(xmm5, xmm0); - - + + xmm7 = _mm_and_si128(xmm7, xmm3); xmm9 = _mm_and_si128(xmm9, xmm4); - + xmm5 = _mm_load_si128(&p_cntl2[1]); xmm11 = _mm_load_si128(&p_cntl3[1]); @@ -95,96 +95,96 @@ static inline void volk_16i_branch_4_state_8_a_ssse3(short* target, short* src xmm11 = _mm_and_si128(xmm11, xmm4); xmm0 = _mm_add_epi16(xmm0, xmm7); - - - + + + xmm7 = _mm_load_si128(&p_cntl2[2]); xmm9 = _mm_load_si128(&p_cntl3[2]); - + xmm5 = _mm_add_epi16(xmm5, xmm11); - + xmm7 = _mm_and_si128(xmm7, xmm3); xmm9 = _mm_and_si128(xmm9, xmm4); - + xmm6 = _mm_add_epi16(xmm6, xmm5); - - + + xmm5 = _mm_load_si128(&p_cntl2[3]); xmm11 = _mm_load_si128(&p_cntl3[3]); - + xmm7 = _mm_add_epi16(xmm7, xmm9); - + xmm5 = _mm_and_si128(xmm5, xmm3); xmm11 = _mm_and_si128(xmm11, xmm4); - + xmm8 = _mm_add_epi16(xmm8, xmm7); - + xmm5 = _mm_add_epi16(xmm5, xmm11); - + _mm_store_si128(p_target, xmm0); _mm_store_si128(&p_target[1], xmm6); xmm10 = _mm_add_epi16(xmm5, xmm10); - + _mm_store_si128(&p_target[2], xmm8); - + _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; + + p_target += 3; } } - - + + #endif /*LV_HAVE_SSEs*/ #ifdef LV_HAVE_GENERIC static inline void volk_16i_branch_4_state_8_a_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { int i = 0; - + int bound = 4; - + for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] + target[i* 8] = src0[((char)permuters[i][0])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 1] & scalars[2]) + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 2] & scalars[2]) + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 3] & scalars[2]) + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 4] & scalars[2]) + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 5] & scalars[2]) + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 6] & scalars[2]) + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + ((i + 1)%2 * scalars[0]) + (((i >> 1)^1) * scalars[1]) + (cntl2[i * 8 + 7] & scalars[2]) + (cntl3[i * 8 + 7] & scalars[3]); - + } } diff --git a/volk/include/volk/volk_16i_convert_8i_a.h b/volk/include/volk/volk_16i_convert_8i_a.h index 8046035c74..84548c8c50 100644 --- a/volk/include/volk/volk_16i_convert_8i_a.h +++ b/volk/include/volk/volk_16i_convert_8i_a.h @@ -15,7 +15,7 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + int8_t* outputVectorPtr = outputVector; int16_t* inputPtr = (int16_t*)inputVector; __m128i inputVal1; @@ -30,7 +30,7 @@ static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_ inputVal1 = _mm_srai_epi16(inputVal1, 8); inputVal2 = _mm_srai_epi16(inputVal2, 8); - + ret = _mm_packs_epi16(inputVal1, inputVal2); _mm_store_si128((__m128i*)outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h index df1084fe0c..80608a1412 100644 --- a/volk/include/volk/volk_16i_convert_8i_u.h +++ b/volk/include/volk/volk_16i_convert_8i_u.h @@ -16,7 +16,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + int8_t* outputVectorPtr = outputVector; int16_t* inputPtr = (int16_t*)inputVector; __m128i inputVal1; @@ -31,7 +31,7 @@ static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_ inputVal1 = _mm_srai_epi16(inputVal1, 8); inputVal2 = _mm_srai_epi16(inputVal2, 8); - + ret = _mm_packs_epi16(inputVal1, inputVal2); _mm_storeu_si128((__m128i*)outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_max_star_16i_a.h b/volk/include/volk/volk_16i_max_star_16i_a.h index 28197ddef7..edfff8a82b 100644 --- a/volk/include/volk/volk_16i_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_16i_a.h @@ -3,7 +3,7 @@ #include<inttypes.h> -#include<stdio.h> +#include<stdio.h> #ifdef LV_HAVE_SSSE3 @@ -15,82 +15,82 @@ static inline void volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_bytes) { - + short candidate = src0[0]; short cands[8]; __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6; - + __m128i *p_src0; - + p_src0 = (__m128i*)src0; int bound = num_bytes >> 4; int leftovers = (num_bytes >> 1) & 7; - + int i = 0; - - + + xmm1 = _mm_setzero_si128(); xmm0 = _mm_setzero_si128(); //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + for(i = 0; i < bound; ++i) { xmm1 = _mm_load_si128(p_src0); p_src0 += 1; //xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); xmm6 = _mm_xor_si128(xmm4, xmm5); - + xmm3 = _mm_and_si128(xmm3, xmm0); xmm4 = _mm_and_si128(xmm6, xmm1); - + xmm0 = _mm_add_epi16(xmm3, xmm4); - - + + } - + _mm_store_si128((__m128i*)cands, xmm0); - + for(i = 0; i < 8; ++i) { candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; } - - - + + + for(i = 0; i < leftovers; ++i) { - + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; } target[0] = candidate; - - - - -} - + + + + +} + #endif /*LV_HAVE_SSSE3*/ #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; short candidate = src0[0]; @@ -98,7 +98,7 @@ static inline void volk_16i_max_star_16i_a_generic(short* target, short* src0, u candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; } target[0] = candidate; - + } diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h index a10a62350e..c1c9084256 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -4,7 +4,7 @@ #include <volk/volk_common.h> #include<inttypes.h> -#include<stdio.h> +#include<stdio.h> #ifdef LV_HAVE_SSSE3 @@ -20,107 +20,107 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i xmm5, xmm6, xmm7, xmm8; - + xmm4 = _mm_load_si128((__m128i*)shufmask0); xmm5 = _mm_load_si128((__m128i*)shufmask1); xmm6 = _mm_load_si128((__m128i*)andmask0); xmm7 = _mm_load_si128((__m128i*)andmask1); - + __m128i *p_target, *p_src0; - + p_target = (__m128i*)target; p_src0 = (__m128i*)src0; int bound = num_bytes >> 5; int intermediate = (num_bytes >> 4) & 1; int leftovers = (num_bytes >> 1) & 7; - + int i = 0; - - + + for(i = 0; i < bound; ++i) { - + xmm0 = _mm_load_si128(p_src0); xmm1 = _mm_load_si128(&p_src0[1]); - - + + xmm2 = _mm_xor_si128(xmm2, xmm2); p_src0 += 2; - + xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); xmm8 = _mm_and_si128(xmm2, xmm6); xmm3 = _mm_and_si128(xmm2, xmm7); - + xmm8 = _mm_add_epi8(xmm8, xmm4); xmm3 = _mm_add_epi8(xmm3, xmm5); xmm0 = _mm_shuffle_epi8(xmm0, xmm8); xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - + + xmm3 = _mm_add_epi16(xmm0, xmm1); - + _mm_store_si128(p_target, xmm3); - + p_target += 1; - + } for(i = 0; i < intermediate; ++i) { - + xmm0 = _mm_load_si128(p_src0); - - + + xmm2 = _mm_xor_si128(xmm2, xmm2); p_src0 += 1; - + xmm3 = _mm_hsub_epi16(xmm0, xmm1); xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); xmm8 = _mm_and_si128(xmm2, xmm6); - + xmm3 = _mm_add_epi8(xmm8, xmm4); - + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - + _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); - + p_target = (__m128i*)((int8_t*)p_target + 8); } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; } - -} - + +} + #endif /*LV_HAVE_SSSE3*/ #ifdef LV_HAVE_GENERIC static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; - + for(i = 0; i < bound; i += 2) { target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; } - + } diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h index de36cee804..47e3cbf9cb 100644 --- a/volk/include/volk/volk_16i_permute_and_scalar_add_a.h +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a.h @@ -3,7 +3,7 @@ #include<inttypes.h> -#include<stdio.h> +#include<stdio.h> @@ -14,33 +14,33 @@ #include<emmintrin.h> static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; short* p_permute_indexes = permute_indexes; - + p_target = (__m128i*)target; p_cntl0 = (__m128i*)cntl0; p_cntl1 = (__m128i*)cntl1; p_cntl2 = (__m128i*)cntl2; p_cntl3 = (__m128i*)cntl3; p_scalars = (__m128i*)scalars; - + int i = 0; - + int bound = (num_bytes >> 4); int leftovers = (num_bytes >> 1) & 7; - + xmm0 = _mm_load_si128(p_scalars); - + xmm1 = _mm_shufflelo_epi16(xmm0, 0); xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); xmm2 = _mm_shuffle_epi32(xmm2, 0x00); xmm3 = _mm_shuffle_epi32(xmm3, 0x00); @@ -64,49 +64,49 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short xmm0 = _mm_add_epi16(xmm0, xmm5); xmm6 = _mm_add_epi16(xmm6, xmm7); - + p_permute_indexes += 8; - + xmm0 = _mm_add_epi16(xmm0, xmm6); - + xmm5 = _mm_load_si128(p_cntl0); xmm6 = _mm_load_si128(p_cntl1); xmm7 = _mm_load_si128(p_cntl2); - + xmm5 = _mm_and_si128(xmm5, xmm1); xmm6 = _mm_and_si128(xmm6, xmm2); xmm7 = _mm_and_si128(xmm7, xmm3); - + xmm0 = _mm_add_epi16(xmm0, xmm5); - + xmm5 = _mm_load_si128(p_cntl3); - + xmm6 = _mm_add_epi16(xmm6, xmm7); p_cntl0 += 1; - + xmm5 = _mm_and_si128(xmm5, xmm4); - + xmm0 = _mm_add_epi16(xmm0, xmm6); - + p_cntl1 += 1; p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - + + xmm0 = _mm_add_epi16(xmm0, xmm5); + p_cntl3 += 1; _mm_store_si128(p_target, xmm0); - + p_target += 1; } - - - - + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] + target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) @@ -118,18 +118,18 @@ static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short #ifdef LV_HAVE_GENERIC static inline void volk_16i_permute_and_scalar_add_a_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] + target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) + (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) + (cntl3[i] & scalars[3]); - + } } diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a.h b/volk/include/volk/volk_16i_s32f_convert_32f_a.h index 0555fdf002..7108ff6590 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a.h @@ -17,7 +17,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -36,7 +36,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const // Convert the lower 4 values into 32 bit words inputVal = _mm_cvtepi16_epi32(inputVal); inputVal2 = _mm_cvtepi16_epi32(inputVal2); - + ret = _mm_cvtepi32_ps(inputVal); ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); @@ -71,7 +71,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -79,7 +79,7 @@ static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector, const in for(;number < quarterPoints; number++){ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - + ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h index d34acc0913..4ce8e8f35b 100644 --- a/volk/include/volk/volk_16i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h @@ -18,7 +18,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -37,7 +37,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const // Convert the lower 4 values into 32 bit words inputVal = _mm_cvtepi16_epi32(inputVal); inputVal2 = _mm_cvtepi16_epi32(inputVal2); - + ret = _mm_cvtepi32_ps(inputVal); ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); @@ -73,7 +73,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); int16_t* inputPtr = (int16_t*)inputVector; @@ -81,7 +81,7 @@ static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const in for(;number < quarterPoints; number++){ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - + ret = _mm_mul_ps(ret, invScalar); _mm_storeu_ps(outputVectorPtr, ret); diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h index 2688aff04e..0d84985530 100644 --- a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a.h @@ -3,7 +3,7 @@ #include<inttypes.h> -#include<stdio.h> +#include<stdio.h> @@ -14,7 +14,7 @@ #include<emmintrin.h> static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - + @@ -23,41 +23,41 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s int bound = (num_bytes >> 4); int bound_copy = bound; int leftovers = (num_bytes >> 1) & 7; - + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; p_target = (__m128i*) target; p_src0 = (__m128i*)src0; p_src1 = (__m128i*)src1; p_src2 = (__m128i*)src2; p_src3 = (__m128i*)src3; - - + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; while(bound_copy > 0) { - + xmm1 = _mm_load_si128(p_src0); xmm2 = _mm_load_si128(p_src1); xmm3 = _mm_load_si128(p_src2); xmm4 = _mm_load_si128(p_src3); - + xmm5 = _mm_setzero_si128(); xmm6 = _mm_setzero_si128(); xmm7 = xmm1; xmm8 = xmm3; - - + + xmm1 = _mm_sub_epi16(xmm2, xmm1); - + xmm3 = _mm_sub_epi16(xmm4, xmm3); xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - + xmm2 = _mm_and_si128(xmm5, xmm2); xmm4 = _mm_and_si128(xmm6, xmm4); @@ -67,7 +67,7 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s xmm5 = _mm_add_epi16(xmm2, xmm5); xmm6 = _mm_add_epi16(xmm4, xmm6); - + xmm1 = _mm_xor_si128(xmm1, xmm1); xmm2 = xmm5; xmm5 = _mm_sub_epi16(xmm6, xmm5); @@ -76,23 +76,23 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); p_src1 += 1; - + xmm6 = _mm_and_si128(xmm1, xmm6); - + xmm1 = _mm_andnot_si128(xmm1, xmm2); p_src2 += 1; - + xmm1 = _mm_add_epi16(xmm6, xmm1); p_src3 += 1; - + _mm_store_si128(p_target, xmm1); p_target += 1; - + } - + /*asm volatile ( @@ -111,25 +111,25 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s "movaps %%xmm3, %%xmm8\n\t" "psubw %%xmm2, %%xmm1\n\t" "psubw %%xmm4, %%xmm3\n\t" - + "pcmpgtw %%xmm1, %%xmm5\n\t" "pcmpgtw %%xmm3, %%xmm6\n\t" - + "pand %%xmm5, %%xmm2\n\t" "pand %%xmm6, %%xmm4\n\t" "pandn %%xmm7, %%xmm5\n\t" "pandn %%xmm8, %%xmm6\n\t" - + "paddw %%xmm2, %%xmm5\n\t" "paddw %%xmm4, %%xmm6\n\t" "pxor %%xmm1, %%xmm1\n\t" "movaps %%xmm5, %%xmm2\n\t" - + "psubw %%xmm6, %%xmm5\n\t" "add $16, %[src0]\n\t" "add $-1, %[bound]\n\t" - + "pcmpgtw %%xmm5, %%xmm1\n\t" "add $16, %[src1]\n\t" @@ -144,13 +144,13 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s "movaps %%xmm1, (%[target])\n\t" "addw $16, %[target]\n\t" "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t" - + "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t" : :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) : ); - */ + */ short temp0 = 0; short temp1 = 0; @@ -169,11 +169,11 @@ static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* s #ifdef LV_HAVE_GENERIC static inline void volk_16i_x4_quad_max_star_16i_a_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; - + short temp0 = 0; short temp1 = 0; for(i = 0; i < bound; ++i) { diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h index e4c9f17ed3..5560b92d92 100644 --- a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a.h @@ -3,7 +3,7 @@ #include<inttypes.h> -#include<stdio.h> +#include<stdio.h> @@ -14,7 +14,7 @@ #include<emmintrin.h> static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; p_target0 = (__m128i*)target0; @@ -39,16 +39,16 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta xmm2 = _mm_load_si128(p_src2); xmm3 = _mm_load_si128(p_src3); xmm4 = _mm_load_si128(p_src4); - + p_src0 += 1; p_src1 += 1; - + xmm1 = _mm_add_epi16(xmm0, xmm1); xmm2 = _mm_add_epi16(xmm0, xmm2); xmm3 = _mm_add_epi16(xmm0, xmm3); xmm4 = _mm_add_epi16(xmm0, xmm4); - - + + p_src2 += 1; p_src3 += 1; p_src4 += 1; @@ -57,7 +57,7 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta _mm_store_si128(p_target1, xmm2); _mm_store_si128(p_target2, xmm3); _mm_store_si128(p_target3, xmm4); - + p_target0 += 1; p_target1 += 1; p_target2 += 1; @@ -97,9 +97,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); - + */ - + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { target0[i] = src0[i] + src1[i]; @@ -114,9 +114,9 @@ static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* ta #ifdef LV_HAVE_GENERIC static inline void volk_16i_x5_add_quad_16i_x4_a_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - + int i = 0; - + int bound = num_bytes >> 1; for(i = 0; i < bound; ++i) { diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h index cdd60235e6..f8aa30874f 100644 --- a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a.h @@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer, int16_ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); unsigned int eighthPoints = num_points / 8; - + for(number = 0; number < eighthPoints; number++){ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h index 2b99e068e9..bac1f2e4b0 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a.h @@ -64,7 +64,7 @@ static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer, cons __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); unsigned int eighthPoints = num_points / 8; - + for(number = 0; number < eighthPoints; number++){ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a.h b/volk/include/volk/volk_16ic_magnitude_16i_a.h index a6951e9679..317075e85e 100644 --- a/volk/include/volk/volk_16ic_magnitude_16i_a.h +++ b/volk/include/volk/volk_16ic_magnitude_16i_a.h @@ -17,7 +17,7 @@ static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; @@ -35,7 +35,7 @@ static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, cons inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); inputFloatBuffer[5] = (float)(complexVectorPtr[5]); inputFloatBuffer[6] = (float)(complexVectorPtr[6]); @@ -106,7 +106,7 @@ static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + cplxValue1 = _mm_load_ps(inputFloatBuffer); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h index e73d405e02..1300395ff0 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a.h @@ -20,7 +20,7 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa float* qBufferPtr = qBuffer; uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; + const uint64_t quarterPoints = num_points / 4; __m128 cplxValue1, cplxValue2, iValue, qValue; __m128 invScalar = _mm_set_ps1(1.0/scalar); @@ -29,12 +29,12 @@ static inline void volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, floa __VOLK_ATTR_ALIGNED(16) float floatBuffer[8]; for(;number < quarterPoints; number++){ - + floatBuffer[0] = (float)(complexVectorPtr[0]); floatBuffer[1] = (float)(complexVectorPtr[1]); floatBuffer[2] = (float)(complexVectorPtr[2]); floatBuffer[3] = (float)(complexVectorPtr[3]); - + floatBuffer[4] = (float)(complexVectorPtr[4]); floatBuffer[5] = (float)(complexVectorPtr[5]); floatBuffer[6] = (float)(complexVectorPtr[6]); diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h index 1630cb0ed2..5e2d82b947 100644 --- a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 iFloatValue; @@ -49,7 +49,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; sixteenTComplexVectorPtr++; } - + } #endif /* LV_HAVE_SSE4_1 */ @@ -66,7 +66,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 iValue; const float iScalar = 1.0/scalar; @@ -77,7 +77,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co for(;number < quarterPoints; number++){ floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; @@ -96,7 +96,7 @@ static inline void volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, co *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; complexVectorPtr++; } - + } #endif /* LV_HAVE_SSE */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h index 35406e2cbb..d20eea1a79 100644 --- a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const int16_t* complexVectorPtr = (const int16_t*)complexVector; float* magnitudeVectorPtr = magnitudeVector; @@ -34,7 +34,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, c inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); inputFloatBuffer[5] = (float)(complexVectorPtr[5]); inputFloatBuffer[6] = (float)(complexVectorPtr[6]); @@ -56,7 +56,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, c result = _mm_sqrt_ps(result); // Square root the values _mm_store_ps(magnitudeVectorPtr, result); - + magnitudeVectorPtr += 4; } @@ -99,7 +99,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co inputFloatBuffer[1] = (float)(complexVectorPtr[1]); inputFloatBuffer[2] = (float)(complexVectorPtr[2]); inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); inputFloatBuffer[5] = (float)(complexVectorPtr[5]); inputFloatBuffer[6] = (float)(complexVectorPtr[6]); @@ -107,7 +107,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); @@ -124,7 +124,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co result = _mm_sqrt_ps(result); // Square root the values _mm_store_ps(magnitudeVectorPtr, result); - + magnitudeVectorPtr += 4; } @@ -138,7 +138,7 @@ static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, co } } - + #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC diff --git a/volk/include/volk/volk_16u_byteswap_a.h b/volk/include/volk/volk_16u_byteswap_a.h index 75c7ef0f3c..fc3eb5fa7a 100644 --- a/volk/include/volk/volk_16u_byteswap_a.h +++ b/volk/include/volk/volk_16u_byteswap_a.h @@ -31,9 +31,9 @@ static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int n inputPtr += 8; } - + // Byteswap any remaining points: - number = eighthPoints*8; + number = eighthPoints*8; for(; number < num_points; number++){ uint16_t outputVal = *inputPtr; outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a.h b/volk/include/volk/volk_32f_accumulator_s32f_a.h index 7ce0d1c80d..78364d0a01 100644 --- a/volk/include/volk/volk_32f_accumulator_s32f_a.h +++ b/volk/include/volk/volk_32f_accumulator_s32f_a.h @@ -20,13 +20,13 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i const float* aPtr = inputBuffer; __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; - + __m128 accumulator = _mm_setzero_ps(); __m128 aVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); + accumulator = _mm_add_ps(accumulator, aVal); aPtr += 4; } _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container @@ -34,7 +34,7 @@ static inline void volk_32f_accumulator_s32f_a_sse(float* result, const float* i returnValue += tempBuffer[1]; returnValue += tempBuffer[2]; returnValue += tempBuffer[3]; - + number = quarterPoints * 4; for(;number < num_points; number++){ returnValue += (*aPtr++); diff --git a/volk/include/volk/volk_32f_convert_64f_a.h b/volk/include/volk/volk_32f_convert_64f_a.h index dda6464096..2c469ac421 100644 --- a/volk/include/volk/volk_32f_convert_64f_a.h +++ b/volk/include/volk/volk_32f_convert_64f_a.h @@ -16,7 +16,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; double* outputVectorPtr = outputVector; __m128d ret; @@ -24,7 +24,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float for(;number < quarterPoints; number++){ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - + ret = _mm_cvtps_pd(inputVal); _mm_store_pd(outputVectorPtr, ret); @@ -38,7 +38,7 @@ static inline void volk_32f_convert_64f_a_sse2(double* outputVector, const float outputVectorPtr += 2; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (double)(inputVector[number]); } diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h index 387baa3b95..10d8a4f6c0 100644 --- a/volk/include/volk/volk_32f_convert_64f_u.h +++ b/volk/include/volk/volk_32f_convert_64f_u.h @@ -16,7 +16,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; double* outputVectorPtr = outputVector; __m128d ret; @@ -24,7 +24,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float for(;number < quarterPoints; number++){ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - + ret = _mm_cvtps_pd(inputVal); _mm_storeu_pd(outputVectorPtr, ret); @@ -38,7 +38,7 @@ static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float outputVectorPtr += 2; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (double)(inputVector[number]); } diff --git a/volk/include/volk/volk_32f_index_max_16u_a.h b/volk/include/volk/volk_32f_index_max_16u_a.h index 0c43a50815..b9ca1dd3e7 100644 --- a/volk/include/volk/volk_32f_index_max_16u_a.h +++ b/volk/include/volk/volk_32f_index_max_16u_a.h @@ -52,7 +52,7 @@ static inline void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const f } number = quarterPoints * 4; - for(;number < num_points; number++){ + for(;number < num_points; number++){ if(src0[number] > max){ index = number; max = src0[number]; @@ -111,7 +111,7 @@ static inline void volk_32f_index_max_16u_a_sse(unsigned int* target, const floa } number = quarterPoints * 4; - for(;number < num_points; number++){ + for(;number < num_points; number++){ if(src0[number] > max){ index = number; max = src0[number]; @@ -128,11 +128,11 @@ static inline void volk_32f_index_max_16u_a_generic(unsigned int* target, const if(num_points > 0){ float max = src0[0]; unsigned int index = 0; - - unsigned int i = 1; - + + unsigned int i = 1; + for(; i < num_points; ++i) { - + if(src0[i] > max){ index = i; max = src0[i]; diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h index b25df75a17..43713f8b5a 100644 --- a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a.h @@ -46,7 +46,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co inPtr++; outPtr++; } - + for (; number < quarterPoints; number++) { // Load data next3old1 = _mm_loadu_ps((float*) (inPtr-1)); @@ -65,7 +65,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co _mm_store_ps(outPtr,next3old1); // Store the results back into the output outPtr += 4; } - + for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { *outPtr = *(inPtr) - *(inPtr-1); if (*outPtr > bound) *outPtr -= 2*bound; @@ -73,7 +73,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector, co inPtr++; outPtr++; } - + *saveValue = inputVector[num_points-1]; } #endif /* LV_HAVE_SSE */ @@ -94,14 +94,14 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector unsigned int number = 0; float* outPtr = outputVector; const float* inPtr = inputVector; - + // Do the first 1 by hand since we're going in from the saveValue: *outPtr = *inPtr - *saveValue; if (*outPtr > bound) *outPtr -= 2*bound; if (*outPtr < -bound) *outPtr += 2*bound; inPtr++; outPtr++; - + for (number = 1; number < num_points; number++) { *outPtr = *(inPtr) - *(inPtr-1); if (*outPtr > bound) *outPtr -= 2*bound; @@ -109,7 +109,7 @@ static inline void volk_32f_s32f_32f_fm_detect_32f_a_generic(float* outputVector inPtr++; outPtr++; } - + *saveValue = inputVector[num_points-1]; } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h index b1902a8c00..db61e359d6 100644 --- a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a.h @@ -23,7 +23,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois const float* dataPointsPtr = realDataPoints; __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4]; - + __m128 dataPointsVal; __m128 avgPointsVal = _mm_setzero_ps(); // Calculate the sum (for mean) for all points @@ -73,11 +73,11 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - + // Count the number of bins which do not exceed the mean amplitude vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); } - + // Calculate the mean from the remaining data points _mm_store_ps(avgPointsVector, avgPointsVal); @@ -104,7 +104,7 @@ static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* nois validBinCount += 1.0; } } - + float localNoiseFloorAmplitude = 0; if(validBinCount > 0.0){ localNoiseFloorAmplitude = sumMean / validBinCount; diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index a249596780..9df4946f24 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -19,7 +19,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const outputVectorPtr += 8; } - number = eighthPoints * 8; + number = eighthPoints * 8; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -76,7 +76,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -105,7 +105,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index f58158041d..56e42c9bd5 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -19,7 +19,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -51,7 +51,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const outputVectorPtr += 8; } - number = eighthPoints * 8; + number = eighthPoints * 8; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -77,7 +77,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; @@ -106,7 +106,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index 15fa282fba..38e6b2e745 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const unsigned int number = 0; const unsigned int eighthPoints = num_points / 8; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -42,7 +42,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const outputVectorPtr += 8; } - number = eighthPoints * 8; + number = eighthPoints * 8; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -67,7 +67,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -91,7 +91,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -116,7 +116,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -144,7 +144,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index d203546c62..ee15edb464 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -42,7 +42,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -68,7 +68,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; @@ -96,7 +96,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index 05172171c2..800017d5da 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; @@ -47,7 +47,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f intInputVal2 = _mm_cvtps_epi32(inputVal2); intInputVal3 = _mm_cvtps_epi32(inputVal3); intInputVal4 = _mm_cvtps_epi32(inputVal4); - + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -57,7 +57,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f outputVectorPtr += 16; } - number = sixteenthPoints * 16; + number = sixteenthPoints * 16; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -82,7 +82,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; float min_val = -128; @@ -110,7 +110,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 12991e9c17..870e9419bb 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -18,7 +18,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; @@ -47,7 +47,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f intInputVal2 = _mm_cvtps_epi32(inputVal2); intInputVal3 = _mm_cvtps_epi32(inputVal3); intInputVal4 = _mm_cvtps_epi32(inputVal4); - + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -57,7 +57,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f outputVectorPtr += 16; } - number = sixteenthPoints * 16; + number = sixteenthPoints * 16; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) @@ -83,7 +83,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; @@ -111,7 +111,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ r = inputVector[number] * scalar; if(r > max_val) diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h index d1c6f3f652..99b8e68c5b 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -23,11 +23,11 @@ static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* __m128 aVal, bVal, cVal; bVal = _mm_set_ps1(scalar); for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -60,11 +60,11 @@ static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* __m256 aVal, bVal, cVal; bVal = _mm256_set1_ps(scalar); for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h index 0e700060f7..b3fae9b053 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h @@ -23,11 +23,11 @@ static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* __m128 aVal, bVal, cVal; bVal = _mm_set_ps1(scalar); for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -60,11 +60,11 @@ static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* __m256 aVal, bVal, cVal; bVal = _mm256_set1_ps(scalar); for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a.h b/volk/include/volk/volk_32f_s32f_power_32f_a.h index 09c9059619..633ad14b09 100644 --- a/volk/include/volk/volk_32f_s32f_power_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_power_32f_a.h @@ -21,7 +21,7 @@ */ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - + float* cPtr = cVector; const float* aPtr = aVector; @@ -33,22 +33,22 @@ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* __m128 negatedValues; __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); __m128 onesMask = _mm_set_ps1(1); - + __m128 aVal, cVal; for(;number < quarterPoints; number++){ - + aVal = _mm_load_ps(aPtr); signMask = _mm_cmplt_ps(aVal, zeroValue); negatedValues = _mm_sub_ps(zeroValue, aVal); aVal = _mm_blendv_ps(aVal, negatedValues, signMask); - + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after cVal = powf4(aVal, vPower); // Takes each input value to the specified power cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); _mm_store_ps(cPtr,cVal); // Store the results back into the C container - + aPtr += 4; cPtr += 4; } @@ -78,7 +78,7 @@ static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* */ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - + float* cPtr = cVector; const float* aPtr = aVector; @@ -90,22 +90,22 @@ static inline void volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aV __m128 negatedValues; __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); __m128 onesMask = _mm_set_ps1(1); - + __m128 aVal, cVal; for(;number < quarterPoints; number++){ - + aVal = _mm_load_ps(aPtr); signMask = _mm_cmplt_ps(aVal, zeroValue); negatedValues = _mm_sub_ps(zeroValue, aVal); aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); - + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after cVal = powf4(aVal, vPower); // Takes each input value to the specified power cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); _mm_store_ps(cPtr,cVal); // Store the results back into the C container - + aPtr += 4; cPtr += 4; } diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h index 75fe0cb2ed..98401b2d42 100644 --- a/volk/include/volk/volk_32f_s32f_stddev_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a.h @@ -29,7 +29,7 @@ static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* __m128 aVal1, aVal2, aVal3, aVal4; __m128 cVal1, cVal2, cVal3, cVal4; for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal1 = _mm_load_ps(aPtr); aPtr += 4; cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); aVal2 = _mm_load_ps(aPtr); aPtr += 4; @@ -47,12 +47,12 @@ static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev, const float* squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container returnValue = squareBuffer[0]; returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = sixteenthPoints * 16; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -93,12 +93,12 @@ static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev, const float* in squareAccumulator = _mm_add_ps(squareAccumulator, aVal); aPtr += 4; } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container returnValue = squareBuffer[0]; returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = quarterPoints * 4; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -125,7 +125,7 @@ static inline void volk_32f_s32f_stddev_32f_a_generic(float* stddev, const float if(num_points > 0){ const float* aPtr = inputBuffer; unsigned int number = 0; - + for(number = 0; number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); aPtr++; diff --git a/volk/include/volk/volk_32f_sqrt_32f_a.h b/volk/include/volk/volk_32f_sqrt_32f_a.h index e44c73cfd7..d9b16fc0fb 100644 --- a/volk/include/volk/volk_32f_sqrt_32f_a.h +++ b/volk/include/volk/volk_32f_sqrt_32f_a.h @@ -22,11 +22,11 @@ static inline void volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, __m128 aVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_sqrt_ps(aVal); - + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h index 20ff676d89..7de32f7b18 100644 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a.h @@ -31,7 +31,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float __m128 aVal1, aVal2, aVal3, aVal4; __m128 cVal1, cVal2, cVal3, cVal4; for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal1 = _mm_load_ps(aPtr); aPtr += 4; cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x @@ -54,7 +54,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 } _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container newMean = meanBuffer[0]; newMean += meanBuffer[1]; newMean += meanBuffer[2]; @@ -63,7 +63,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(float* stddev, float returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = sixteenthPoints * 16; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -110,7 +110,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m aPtr += 4; } _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container newMean = meanBuffer[0]; newMean += meanBuffer[1]; newMean += meanBuffer[2]; @@ -119,7 +119,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev, float* m returnValue += squareBuffer[1]; returnValue += squareBuffer[2]; returnValue += squareBuffer[3]; - + number = quarterPoints * 4; for(;number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); @@ -149,7 +149,7 @@ static inline void volk_32f_stddev_and_mean_32f_x2_a_generic(float* stddev, floa if(num_points > 0){ const float* aPtr = inputBuffer; unsigned int number = 0; - + for(number = 0; number < num_points; number++){ returnValue += (*aPtr) * (*aPtr); newMean += *aPtr++; diff --git a/volk/include/volk/volk_32f_x2_add_32f_a.h b/volk/include/volk/volk_32f_x2_add_32f_a.h index 3bc83653b0..51e63e54d2 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_a.h +++ b/volk/include/volk/volk_32f_x2_add_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - + + cVal = _mm_add_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h index e360a79580..52e8286bc2 100644 --- a/volk/include/volk/volk_32f_x2_add_32f_u.h +++ b/volk/include/volk/volk_32f_x2_add_32f_u.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - + + cVal = _mm_add_ps(aVal, bVal); + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a.h b/volk/include/volk/volk_32f_x2_divide_32f_a.h index 52ddfae875..7b60fb22ef 100644 --- a/volk/include/volk/volk_32f_x2_divide_32f_a.h +++ b/volk/include/volk/volk_32f_x2_divide_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVe __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - + + cVal = _mm_div_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 0c58f2ecfb..448b2fdc01 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa for(number = 0; number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } - + *result = dotProduct; } @@ -29,7 +29,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - + unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -42,11 +42,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_add_ps(cVal, dotProdVal); @@ -69,10 +69,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* } *result = dotProduct; - + } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_SSE3 @@ -91,11 +91,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_hadd_ps(dotProdVal, cVal); @@ -117,7 +117,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE3*/ @@ -140,7 +140,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float __m128 dotProdVal = _mm_setzero_ps(); - for(;number < sixteenthPoints; number++){ + for(;number < sixteenthPoints; number++){ aVal1 = _mm_load_ps(aPtr); aPtr += 4; aVal2 = _mm_load_ps(aPtr); aPtr += 4; @@ -151,7 +151,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float bVal2 = _mm_load_ps(bPtr); bPtr += 4; bVal3 = _mm_load_ps(bPtr); bPtr += 4; bVal4 = _mm_load_ps(bPtr); bPtr += 4; - + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); @@ -178,7 +178,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 7f47122fff..3b7284b575 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -17,7 +17,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa for(number = 0; number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } - + *result = dotProduct; } @@ -28,7 +28,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - + unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; @@ -41,11 +41,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_add_ps(cVal, dotProdVal); @@ -68,10 +68,10 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* } *result = dotProduct; - + } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_SSE3 @@ -90,11 +90,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * __m128 dotProdVal = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); + + cVal = _mm_mul_ps(aVal, bVal); dotProdVal = _mm_hadd_ps(dotProdVal, cVal); @@ -116,7 +116,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE3*/ @@ -140,7 +140,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float __m128 dotProdVal = _mm_setzero_ps(); for(;number < sixteenthPoints; number++){ - + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; @@ -150,7 +150,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); @@ -177,7 +177,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h index 1d4d2dbbd5..52d80b6bb3 100644 --- a/volk/include/volk/volk_32f_x2_interleave_32fc_a.h +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a.h @@ -20,7 +20,7 @@ static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector, c const float* qBufferPtr = qBuffer; const uint64_t quarterPoints = num_points / 4; - + __m128 iValue, qValue, cplxValue; for(;number < quarterPoints; number++){ iValue = _mm_load_ps(iBufferPtr); diff --git a/volk/include/volk/volk_32f_x2_max_32f_a.h b/volk/include/volk/volk_32f_x2_max_32f_a.h index 7948c458df..79f2d04b56 100644 --- a/volk/include/volk/volk_32f_x2_max_32f_a.h +++ b/volk/include/volk/volk_32f_x2_max_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - + + cVal = _mm_max_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_min_32f_a.h b/volk/include/volk/volk_32f_x2_min_32f_a.h index d771348681..42cac08339 100644 --- a/volk/include/volk/volk_32f_x2_min_32f_a.h +++ b/volk/include/volk/volk_32f_x2_min_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_min_32f_a_sse(float* cVector, const float* aVecto __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - + + cVal = _mm_min_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a.h b/volk/include/volk/volk_32f_x2_multiply_32f_a.h index fae9a652f1..340e051657 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* a __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + cVal = _mm_mul_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -62,12 +62,12 @@ static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* a __m256 aVal, bVal, cVal; for(;number < eighthPoints; number++){ - - aVal = _mm256_load_ps(aPtr); + + aVal = _mm256_load_ps(aPtr); bVal = _mm256_load_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h index 6c3ce5d83f..bfb896d602 100644 --- a/volk/include/volk/volk_32f_x2_multiply_32f_u.h +++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* a __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); + + aVal = _mm_loadu_ps(aPtr); bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - + + cVal = _mm_mul_ps(aVal, bVal); + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; @@ -62,12 +62,12 @@ static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* a __m256 aVal, bVal, cVal; for(;number < eighthPoints; number++){ - - aVal = _mm256_loadu_ps(aPtr); + + aVal = _mm256_loadu_ps(aPtr); bVal = _mm256_loadu_ps(bPtr); - - cVal = _mm256_mul_ps(aVal, bVal); - + + cVal = _mm256_mul_ps(aVal, bVal); + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container aPtr += 8; diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h index cc02c3678d..10fc267dcd 100644 --- a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a.h @@ -23,7 +23,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVec __m128 vScalar = _mm_set_ps1(scalar); const unsigned int quarterPoints = num_points / 4; - + __m128 iValue, qValue, cplxValue1, cplxValue2; __m128i intValue1, intValue2; @@ -59,7 +59,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVec *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); } - + } #endif /* LV_HAVE_SSE2 */ @@ -81,7 +81,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect __m128 vScalar = _mm_set_ps1(scalar); const unsigned int quarterPoints = num_points / 4; - + __m128 iValue, qValue, cplxValue; int16_t* complexVectorPtr = (int16_t*)complexVector; @@ -106,9 +106,9 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect // Interleaves the upper two values in the i and q variables into one buffer cplxValue = _mm_unpackhi_ps(iValue, qValue); cplxValue = _mm_mul_ps(cplxValue, vScalar); - + _mm_store_ps(floatBuffer, cplxValue); - + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); *complexVectorPtr++ = (int16_t)(floatBuffer[1]); *complexVectorPtr++ = (int16_t)(floatBuffer[2]); @@ -124,7 +124,7 @@ static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVect *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); } - + } #endif /* LV_HAVE_SSE */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a.h b/volk/include/volk/volk_32f_x2_subtract_32f_a.h index 16cad008aa..e2b8be797f 100644 --- a/volk/include/volk/volk_32f_x2_subtract_32f_a.h +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a.h @@ -23,12 +23,12 @@ static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector, const float* a __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - + + cVal = _mm_sub_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h index 153bb3a255..3c530628c8 100644 --- a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a.h @@ -14,33 +14,33 @@ #include<pmmintrin.h> static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - + + float result = 0.0; float fst = 0.0; float sq = 0.0; float thrd = 0.0; float frth = 0.0; //float fith = 0.0; - - - + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; xmm9 = _mm_setzero_ps(); xmm1 = _mm_setzero_ps(); - + xmm0 = _mm_load1_ps(¢er_point_array[0]); xmm6 = _mm_load1_ps(¢er_point_array[1]); xmm7 = _mm_load1_ps(¢er_point_array[2]); xmm8 = _mm_load1_ps(¢er_point_array[3]); //xmm11 = _mm_load1_ps(¢er_point_array[4]); xmm10 = _mm_load1_ps(cutoff); - + int bound = num_bytes >> 4; int leftovers = (num_bytes >> 2) & 3; int i = 0; - + for(; i < bound; ++i) { xmm2 = _mm_load_ps(src0); xmm2 = _mm_max_ps(xmm10, xmm2); @@ -57,23 +57,23 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 xmm2 = _mm_add_ps(xmm2, xmm3); xmm3 = _mm_add_ps(xmm4, xmm5); - + src0 += 4; - + xmm9 = _mm_add_ps(xmm2, xmm9); - + xmm1 = _mm_add_ps(xmm3, xmm1); //xmm9 = _mm_add_ps(xmm12, xmm9); } - + xmm2 = _mm_hadd_ps(xmm9, xmm1); xmm3 = _mm_hadd_ps(xmm2, xmm2); xmm4 = _mm_hadd_ps(xmm3, xmm3); _mm_store_ss(&result, xmm4); - - + + for(i = 0; i < leftovers; ++i) { fst = src0[i]; @@ -82,11 +82,11 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 thrd = fst * sq; frth = sq * sq; //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + //center_point_array[4] * fith); } @@ -94,7 +94,7 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 target[0] = result; } - + #endif /*LV_HAVE_SSE3*/ @@ -103,45 +103,45 @@ static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0 static inline void volk_32f_x3_sum_of_poly_32f_a_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - + float result = 0.0; float fst = 0.0; float sq = 0.0; float thrd = 0.0; float frth = 0.0; //float fith = 0.0; - - unsigned int i = 0; - + + unsigned int i = 0; + for(; i < num_bytes >> 2; ++i) { fst = src0[i]; fst = MAX(fst, *cutoff); - + sq = fst * fst; thrd = fst * sq; frth = sq * sq; //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth); //+ //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + center_point_array[3] * frth) + - //center_point_array[4] * fith) + + //center_point_array[4] * fith) + (center_point_array[4]), i); */ } result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - + + *target = result; } diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h index b7350b9fa7..28d584bf2c 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a.h @@ -23,11 +23,11 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; for(;number < quarterPoints; number++){ - + aVal1 = _mm_load_ps((const float*)aPtr); aPtr += 2; - - aVal2 = _mm_load_ps((const float*)aPtr); + + aVal2 = _mm_load_ps((const float*)aPtr); aPtr += 2; bVal = _mm_load_ps(bPtr); @@ -36,13 +36,13 @@ static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector, const l bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); - cVal = _mm_mul_ps(aVal1, bVal1); - + cVal = _mm_mul_ps(aVal1, bVal1); + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container cPtr += 2; - cVal = _mm_mul_ps(aVal2, bVal2); - + cVal = _mm_mul_ps(aVal2, bVal2); + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container cPtr += 2; @@ -69,7 +69,7 @@ static inline void volk_32fc_32f_multiply_32fc_a_generic(lv_32fc_t* cVector, con const lv_32fc_t* aPtr = aVector; const float* bPtr= bVector; unsigned int number = 0; - + for(number = 0; number < num_points; number++){ *cPtr++ = (*aPtr++) * (*bPtr++); } diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h index 1518af9be9..919280d510 100644 --- a/volk/include/volk/volk_32fc_conjugate_32fc_a.h +++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h @@ -25,11 +25,11 @@ static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi x = _mm_xor_ps(x, conjugator); // conjugate register - + _mm_store_ps((float*)c,x); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h index b26fe0789a..e0d79ea7bc 100644 --- a/volk/include/volk/volk_32fc_conjugate_32fc_u.h +++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h @@ -21,13 +21,13 @@ static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_ __m128 x; lv_32fc_t* c = cVector; const lv_32fc_t* a = aVector; - + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi - + x = _mm_xor_ps(x, conjugator); // conjugate register _mm_storeu_ps((float*)c,x); // Store the results back into the C container diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h index 9de036ef48..4106f38513 100644 --- a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a.h @@ -19,10 +19,10 @@ static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer, float* qB float* qBufferPtr = qBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 cplxValue1, cplxValue2, iValue, qValue; for(;number < quarterPoints; number++){ - + cplxValue1 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h index 29c369d9ae..77566e671d 100644 --- a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a.h @@ -20,23 +20,23 @@ static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer, double* double* iBufferPtr = iBuffer; double* qBufferPtr = qBuffer; - const unsigned int halfPoints = num_points / 2; + const unsigned int halfPoints = num_points / 2; __m128 cplxValue, fVal; __m128d dVal; for(;number < halfPoints; number++){ - + cplxValue = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Arrange in i1i2i1i2 format fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); + dVal = _mm_cvtps_pd(fVal); _mm_store_pd(iBufferPtr, dVal); // Arrange in q1q2q1q2 format fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); + dVal = _mm_cvtps_pd(fVal); _mm_store_pd(qBufferPtr, dVal); iBufferPtr += 2; diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h index adc4112b95..c88809bebd 100644 --- a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h @@ -21,7 +21,7 @@ static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const l __m128 cplxValue1, cplxValue2, iValue; for(;number < quarterPoints; number++){ - + cplxValue1 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h index a1d0fd5d10..0d6c6b7af4 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a.h @@ -21,7 +21,7 @@ static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer, const l __m128 cplxValue1, cplxValue2, iValue; for(;number < quarterPoints; number++){ - + cplxValue1 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h index 70a3b19718..1e346bacaf 100644 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a.h @@ -18,17 +18,17 @@ static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer, const const float* complexVectorPtr = (float*)complexVector; double* iBufferPtr = iBuffer; - const unsigned int halfPoints = num_points / 2; + const unsigned int halfPoints = num_points / 2; __m128 cplxValue, fVal; __m128d dVal; for(;number < halfPoints; number++){ - + cplxValue = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Arrange in i1i2i1i2 format fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); + dVal = _mm_cvtps_pd(fVal); _mm_store_pd(iBufferPtr, dVal); iBufferPtr += 2; diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h index 125a345827..842a6a0420 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -12,16 +12,16 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - - - + + + union bit128 holderf; union bit128 holderi; float sq_dist = 0.0; - + union bit128 xmm5, xmm4; __m128 xmm1, xmm2, xmm3; __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; @@ -30,63 +30,63 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ xmm4.int_vec = xmmfour = _mm_setzero_si128(); holderf.int_vec = holder0 = _mm_setzero_si128(); holderi.int_vec = holder1 = _mm_setzero_si128(); - - + + int bound = num_bytes >> 5; int leftovers0 = (num_bytes >> 4) & 1; int leftovers1 = (num_bytes >> 3) & 1; int i = 0; - - + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! xmm9 = xmm8 = _mm_setzero_si128(); xmm10 = _mm_set_epi32(4, 4, 4, 4); xmm3 = _mm_setzero_ps(); ; - + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - + for(; i < bound; ++i) { - + xmm1 = _mm_load_ps((float*)src0); xmm2 = _mm_load_ps((float*)&src0[2]); - + src0 += 4; - - + + xmm1 = _mm_mul_ps(xmm1, xmm1); xmm2 = _mm_mul_ps(xmm2, xmm2); - - + + xmm1 = _mm_hadd_ps(xmm1, xmm2); xmm3 = _mm_max_ps(xmm1, xmm3); - + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - + xmm9 = _mm_add_epi32(xmm11, xmm12); - xmm8 = _mm_add_epi32(xmm8, xmm10); + xmm8 = _mm_add_epi32(xmm8, xmm10); + - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); } - - + + for(i = 0; i < leftovers0; ++i) { xmm2 = _mm_load_ps((float*)src0); - + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); xmm8 = bit128_p(&xmm1)->int_vec; @@ -99,63 +99,63 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ xmm3 = _mm_max_ps(xmm1, xmm3); xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); - - + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - + xmm9 = _mm_add_epi32(xmm11, xmm12); - xmm8 = _mm_add_epi32(xmm8, xmm10); + xmm8 = _mm_add_epi32(xmm8, xmm10); //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); } - - - + + + for(i = 0; i < leftovers1; ++i) { //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - + xmm2 = _mm_load1_ps(&sq_dist); xmm1 = xmm3; - + xmm3 = _mm_max_ss(xmm3, xmm2); - - + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - + xmm9 = _mm_add_epi32(xmm11, xmm12); } - + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); _mm_store_ps((float*)&(holderf.f), xmm3); _mm_store_si128(&(holderi.int_vec), xmm9); - + target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; + sq_dist = holderf.f[0]; target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; @@ -163,27 +163,27 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - - + + /* float placeholder = 0.0; - uint32_t temp0, temp1; + uint32_t temp0, temp1; unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); unsigned int l0 = g0 ^ 1; unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); unsigned int l1 = g1 ^ 1; - + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; - + g0 = (sq_dist > placeholder); l0 = g0 ^ 1; target[0] = g0 * temp0 + l0 * temp1; */ - + } #endif /*LV_HAVE_SSE3*/ @@ -193,20 +193,20 @@ static inline void volk_32fc_index_max_16u_a_generic(unsigned int* target, lv_32 float sq_dist = 0.0; float max = 0.0; unsigned int index = 0; - - unsigned int i = 0; - + + unsigned int i = 0; + for(; i < num_bytes >> 3; ++i) { sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - + index = sq_dist > max ? i : index; max = sq_dist > max ? sq_dist : max; - - + + } target[0] = index; - + } #endif /*LV_HAVE_GENERIC*/ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a.h b/volk/include/volk/volk_32fc_magnitude_32f_a.h index f18e9bc0b0..efb84a904b 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_a.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_a.h @@ -59,7 +59,7 @@ static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector, const static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h index ed1cedef99..c8b3f0a088 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_u.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h @@ -59,7 +59,7 @@ static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h index 00bdefbb55..d3ac9717a8 100644 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h @@ -57,7 +57,7 @@ static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h index 6eb4a523ad..53a4e68eb4 100644 --- a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h @@ -57,7 +57,7 @@ static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const float* complexVectorPtr = (float*)complexVector; float* magnitudeVectorPtr = magnitudeVector; diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h index 7bd001aa08..d86bd63c1c 100644 --- a/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a.h @@ -27,14 +27,14 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); __m128 phase; __m128 complex1, complex2, iValue, qValue; __m128 keepMask; - + for (; number < quarterPoints; number++) { // Load IQ data: complex1 = _mm_load_ps(complexVectorPtr); @@ -42,15 +42,15 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector, const complex2 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); // Arctan to get phase: phase = atan2f4(qValue, iValue); // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. // Compare to 2pi: keepMask = _mm_cmpneq_ps(phase,testVector); phase = _mm_blendv_ps(correctVector, phase, keepMask); - // done with above correction. + // done with above correction. phase = _mm_mul_ps(phase, vNormalizeFactor); _mm_store_ps((float*)outPtr, phase); outPtr += 4; @@ -89,7 +89,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv const float invNormalizeFactor = 1.0 / normalizeFactor; #ifdef LV_HAVE_LIB_SIMDMATH - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 testVector = _mm_set_ps1(2*M_PI); __m128 correctVector = _mm_set_ps1(M_PI); __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); @@ -97,7 +97,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv __m128 complex1, complex2, iValue, qValue; __m128 mask; __m128 keepMask; - + for (; number < quarterPoints; number++) { // Load IQ data: complex1 = _mm_load_ps(complexVectorPtr); @@ -105,8 +105,8 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv complex2 = _mm_load_ps(complexVectorPtr); complexVectorPtr += 4; // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); // Arctan to get phase: phase = atan2f4(qValue, iValue); // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. @@ -115,7 +115,7 @@ static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector, const lv phase = _mm_and_ps(phase, keepMask); mask = _mm_andnot_ps(keepMask, correctVector); phase = _mm_or_ps(phase, mask); - // done with above correction. + // done with above correction. phase = _mm_mul_ps(phase, vNormalizeFactor); _mm_store_ps((float*)outPtr, phase); outPtr += 4; diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h index 588b532b4e..3106edbefd 100644 --- a/volk/include/volk/volk_32fc_s32f_power_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a.h @@ -28,55 +28,55 @@ static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, con */ static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ unsigned int number = 0; - + lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; #ifdef LV_HAVE_LIB_SIMDMATH const unsigned int quarterPoints = num_points / 4; __m128 vPower = _mm_set_ps1(power); - + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); + + cplxValue1 = _mm_load_ps((float*)aPtr); aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); + + cplxValue2 = _mm_load_ps((float*)aPtr); aPtr += 2; - + // Convert to polar coordinates - + // Arrange in i1i2i3i4 format iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); // Arrange in q1q2q3q4 format qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - + phase = atan2f4(qValue, iValue); // Calculate the Phase - + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - + // Now calculate the power of the polar coordinate data magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - + // Convert back to cartesian coordinates iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - + cPtr += 2; - + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - + cPtr += 2; } - + number = quarterPoints * 4; #endif /* LV_HAVE_LIB_SIMDMATH */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h index 8d1959dae3..30a77dbc18 100644 --- a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a.h @@ -34,7 +34,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu __m128 input1, input2; const uint64_t quarterPoints = num_points / 4; for(;number < quarterPoints; number++){ - // Load the complex values + // Load the complex values input1 =_mm_load_ps(inputPtr); inputPtr += 4; input2 =_mm_load_ps(inputPtr); @@ -43,30 +43,30 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu // Apply the normalization factor input1 = _mm_mul_ps(input1, invNormalizationFactor); input2 = _mm_mul_ps(input2, invNormalizationFactor); - + // Multiply each value by itself // (r1*r1), (i1*i1), (r2*r2), (i2*i2) input1 = _mm_mul_ps(input1, input1); // (r3*r3), (i3*i3), (r4*r4), (i4*i4) input2 = _mm_mul_ps(input2, input2); - + // Horizontal add, to add (r*r) + (i*i) for each complex value // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) power = _mm_hadd_ps(input1, input2); - + // Calculate the natural log power power = logf4(power); - + // Convert to log10 and multiply by 10.0 power = _mm_mul_ps(power, magScalar); - + // Store the floating point results _mm_store_ps(destPtr, power); - + destPtr += 4; } - - number = quarterPoints*4; + + number = quarterPoints*4; #endif /* LV_HAVE_LIB_SIMDMATH */ // Calculate the FFT for any remaining points @@ -81,10 +81,10 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutpu const float imag = *inputPtr++ * iNormalizationFactor; *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - + destPtr++; } - + } #endif /* LV_HAVE_SSE3 */ @@ -114,7 +114,7 @@ static inline void volk_32fc_s32f_power_spectrum_32f_a_generic(float* logPowerOu *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - + realFFTDataPointsPtr++; } } diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h index fc635f1719..27f755351d 100644 --- a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a.h @@ -32,19 +32,19 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo magScalar = _mm_div_ps(magScalar, logf4(magScalar)); __m128 invRBW = _mm_set_ps1(iRBW); - + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); __m128 power; __m128 input1, input2; const uint64_t quarterPoints = num_points / 4; for(;number < quarterPoints; number++){ - // Load the complex values + // Load the complex values input1 =_mm_load_ps(inputPtr); inputPtr += 4; input2 =_mm_load_ps(inputPtr); inputPtr += 4; - + // Apply the normalization factor input1 = _mm_mul_ps(input1, invNormalizationFactor); input2 = _mm_mul_ps(input2, invNormalizationFactor); @@ -54,7 +54,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo input1 = _mm_mul_ps(input1, input1); // (r3*r3), (i3*i3), (r4*r4), (i4*i4) input2 = _mm_mul_ps(input2, input2); - + // Horizontal add, to add (r*r) + (i*i) for each complex value // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) power = _mm_hadd_ps(input1, input2); @@ -64,17 +64,17 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo // Calculate the natural log power power = logf4(power); - + // Convert to log10 and multiply by 10.0 power = _mm_mul_ps(power, magScalar); - + // Store the floating point results _mm_store_ps(destPtr, power); - + destPtr += 4; } - - number = quarterPoints*4; + + number = quarterPoints*4; #endif /* LV_HAVE_LIB_SIMDMATH */ // Calculate the FFT for any remaining points for(; number < num_points; number++){ @@ -83,14 +83,14 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* lo // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) // 75 ohm load assumption // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - + const float real = *inputPtr++ * iNormalizationFactor; const float imag = *inputPtr++ * iNormalizationFactor; *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); destPtr++; } - + } #endif /* LV_HAVE_SSE3 */ @@ -122,7 +122,7 @@ static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_generic(float* const float imag = *inputPtr++ * iNormalizationFactor; *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - + realFFTDataPointsPtr++; } } diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index 534dc2a25f..f206c5e874 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -28,17 +28,17 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, cons yh = _mm_set_ps1(lv_cimag(scalar)); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_store_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index 218c450f82..5c7d15b02f 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -28,17 +28,17 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons yh = _mm_set_ps1(lv_cimag(scalar)); for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_storeu_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h index 655075528e..e3dedf2fcd 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a.h @@ -10,40 +10,40 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + float * res = (float*) result; float * in = (float*) input; float * tp = (float*) taps; unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - - + + + float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - + for(i = 0; i < n_2_ccomplex_blocks; ++i) { - + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; sum1[0] += in[2] * tp[2] + in[3] * tp[3]; sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - + + in += 4; tp += 4; } - - + + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - - + + + for(i = 0; i < isodd; ++i) { @@ -64,13 +64,13 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* res static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - asm volatile + + asm volatile ( "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" "# const float *taps, unsigned num_bytes)\n\t" @@ -187,32 +187,32 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) :"rax", "r8", "r9", "r10" ); - - + + int getem = num_bytes % 16; - - + + for(; getem > 0; getem -= 8) { - - + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - + } return; -} +} #endif #if LV_HAVE_SSE && LV_HAVE_32 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; int bound = num_bytes >> 4; int leftovers = num_bytes % 16; - - asm volatile + + asm volatile ( " #pushl %%ebp\n\t" " #movl %%esp, %%ebp\n\t" @@ -226,7 +226,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* resu " movaps 0(%[edx]), %%xmm2\n\t" " movl %[ecx], (%[out])\n\t" " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - + " xorps %%xmm1, %%xmm2\n\t" " jmp .%=L1_test\n\t" " # 4 taps / loop\n\t" @@ -317,28 +317,28 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* resu : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) ); - - - + + + printf("%d, %d\n", leftovers, bound); - + for(; leftovers > 0; leftovers -= 8) { - - + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - + } - + return; - - - - - + + + + + } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index 3ae7208a83..e7493413f7 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -9,39 +9,39 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + float * res = (float*) result; float * in = (float*) input; float * tp = (float*) taps; unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - - + + + float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - + for(i = 0; i < n_2_ccomplex_blocks; ++i) { - + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; sum1[0] += in[2] * tp[2] + in[3] * tp[3]; sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - + + in += 4; tp += 4; } - - + + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - - + + + for(i = 0; i < isodd; ++i) { @@ -73,7 +73,7 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result uint32_t intRep[4]; __m128 vec; } halfMask; - + union NegMask { int intRep[4]; __m128 vec; @@ -85,13 +85,13 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; __m128 zv = {0,0,0,0}; - + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; negMask.intRep[0] = negMask.intRep[2] = 0x80000000; negMask.intRep[1] = negMask.intRep[3] = 0; - + // main loop while(num_bytes >= 4*sizeof(float)){ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index cde9240cc7..cb2ac4c67e 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -7,44 +7,44 @@ #include <string.h> -#ifdef LV_HAVE_GENERIC +#ifdef LV_HAVE_GENERIC static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + float * res = (float*) result; float * in = (float*) input; float * tp = (float*) taps; unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - - + + + float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - + for(i = 0; i < n_2_ccomplex_blocks; ++i) { - + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; sum0[1] += in[0] * tp[1] + in[1] * tp[0]; sum1[0] += in[2] * tp[2] - in[3] * tp[3]; sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - + + in += 4; tp += 4; } - + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - - + + + for(i = 0; i < isodd; ++i) { @@ -61,9 +61,9 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm + + asm ( "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" "# const float *taps, unsigned num_bytes)\n\t" @@ -175,20 +175,20 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) :"rax", "r8", "r9", "r10" ); - - + + int getem = num_bytes % 16; - - + + for(; getem > 0; getem -= 8) { - - + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - + } return; - + } #endif @@ -200,7 +200,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_bytes); #if 0 - asm volatile + asm volatile ( " #pushl %%ebp\n\t" " #movl %%esp, %%ebp\n\t" @@ -299,28 +299,28 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const : "eax", "ecx", "edx" ); - + int getem = num_bytes % 16; - + for(; getem > 0; getem -= 8) { - - + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - + } - + return; -#endif +#endif } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE*/ #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - + lv_32fc_t dotProduct; memset(&dotProduct, 0x0, 2*sizeof(float)); @@ -336,19 +336,19 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv dotProdVal = _mm_setzero_ps(); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together @@ -368,7 +368,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv } *result = dotProduct; -} +} #endif /*LV_HAVE_SSE3*/ @@ -379,7 +379,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes); // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* + /* __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; float *p_input, *p_taps; __m64 *p_result; @@ -391,7 +391,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const static const __m128i neg = {0x000000000000000080000000}; int i = 0; - + int bound = (num_bytes >> 5); int leftovers = (num_bytes & 24) >> 3; @@ -399,27 +399,27 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const real1 = _mm_sub_ps(real1, real1); im0 = _mm_sub_ps(im0, im0); im1 = _mm_sub_ps(im1, im1); - + for(; i < bound; ++i) { - - + + xmm0 = _mm_load_ps(p_input); xmm1 = _mm_load_ps(p_taps); - + p_input += 4; p_taps += 4; - + xmm2 = _mm_load_ps(p_input); xmm3 = _mm_load_ps(p_taps); - + p_input += 4; p_taps += 4; - + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); xmm5 = _mm_unpackhi_ps(xmm1, xmm3); xmm0 = _mm_unpacklo_ps(xmm0, xmm2); xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - + //imaginary vector from input xmm1 = _mm_unpackhi_ps(xmm0, xmm4); //real vector from input @@ -428,39 +428,39 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const xmm0 = _mm_unpackhi_ps(xmm2, xmm5); //real vector from taps xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - + real0 = _mm_add_ps(xmm4, real0); real1 = _mm_add_ps(xmm5, real1); im0 = _mm_add_ps(xmm6, im0); im1 = _mm_add_ps(xmm7, im1); - + } - - + + real1 = _mm_xor_ps(real1, (__m128)neg); - - + + im0 = _mm_add_ps(im0, im1); real0 = _mm_add_ps(real0, real1); - + im0 = _mm_add_ps(im0, real0); - + _mm_storel_pi(p_result, im0); - + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - + *result += input[i] * taps[i]; } */ -} +} #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index aec8bd7166..f79ddb59bf 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -24,21 +24,21 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_store_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h index 729c1a4ad7..a998d6184e 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h @@ -25,21 +25,21 @@ static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const l const lv_32fc_t* b = bVector; for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_storeu_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h index 2a1bcbce0e..2755192e96 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -27,23 +27,23 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_xor_ps(y, conjugator); // conjugate y - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_store_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h index 92f6a051ee..09dcd635b9 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -27,23 +27,23 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVecto __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ - + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_xor_ps(y, conjugator); // conjugate y - + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - + _mm_storeu_ps((float*)c,z); // Store the results back into the C container a += 2; diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h index 2d5f36b279..75eb9173d5 100644 --- a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a.h @@ -11,7 +11,7 @@ #include<pmmintrin.h> static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; @@ -23,31 +23,31 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t int leftovers0 = (num_bytes >> 4) & 1; int leftovers1 = (num_bytes >> 3) & 1; int i = 0; - - - + + + xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); xmm2 = _mm_load_ps((float*)&points[0]); xmm8 = _mm_load1_ps(&scalar); xmm1 = _mm_movelh_ps(xmm1, xmm1); xmm3 = _mm_load_ps((float*)&points[2]); - - + + for(; i < bound - 1; ++i) { - + xmm4 = _mm_sub_ps(xmm1, xmm2); xmm5 = _mm_sub_ps(xmm1, xmm3); points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); xmm3 = _mm_load_ps((float*)&points[2]); - + xmm4 = _mm_mul_ps(xmm4, xmm8); _mm_store_ps(target, xmm4); @@ -55,46 +55,46 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t target += 4; } - + xmm4 = _mm_sub_ps(xmm1, xmm2); xmm5 = _mm_sub_ps(xmm1, xmm3); - - + + points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); - + xmm4 = _mm_mul_ps(xmm4, xmm8); - + _mm_store_ps(target, xmm4); - + target += 4; - + for(i = 0; i < leftovers0; ++i) { - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_sub_ps(xmm1, xmm2); - + points += 2; - + xmm6 = _mm_mul_ps(xmm4, xmm4); xmm4 = _mm_hadd_ps(xmm6, xmm6); xmm4 = _mm_mul_ps(xmm4, xmm8); - + _mm_storeh_pi((__m64*)target, xmm4); target += 2; } for(i = 0; i < leftovers1; ++i) { - + diff = src0[0] - points[0]; sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); @@ -109,13 +109,13 @@ static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* t static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; - unsigned int i = 0; - + unsigned int i = 0; + for(; i < num_bytes >> 3; ++i) { diff = src0[0] - points[i]; sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - + target[i] = sq_dist; } } diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h index 6a4a08ca54..b819eaffd4 100644 --- a/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a.h @@ -10,7 +10,7 @@ #include<pmmintrin.h> static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; @@ -22,11 +22,11 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* int i = 0; xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); xmm2 = _mm_load_ps((float*)&points[0]); xmm1 = _mm_movelh_ps(xmm1, xmm1); xmm3 = _mm_load_ps((float*)&points[2]); - + for(; i < bound - 1; ++i) { xmm4 = _mm_sub_ps(xmm1, xmm2); @@ -34,9 +34,9 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); xmm3 = _mm_load_ps((float*)&points[2]); @@ -46,41 +46,41 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* target += 4; } - + xmm4 = _mm_sub_ps(xmm1, xmm2); xmm5 = _mm_sub_ps(xmm1, xmm3); - - + + points += 4; xmm6 = _mm_mul_ps(xmm4, xmm4); xmm7 = _mm_mul_ps(xmm5, xmm5); - + xmm4 = _mm_hadd_ps(xmm6, xmm7); - + _mm_store_ps(target, xmm4); - + target += 4; for(i = 0; i < leftovers0; ++i) { - + xmm2 = _mm_load_ps((float*)&points[0]); - + xmm4 = _mm_sub_ps(xmm1, xmm2); - + points += 2; - + xmm6 = _mm_mul_ps(xmm4, xmm4); xmm4 = _mm_hadd_ps(xmm6, xmm6); - + _mm_storeh_pi((__m64*)target, xmm4); target += 2; } for(i = 0; i < leftovers1; ++i) { - + diff = src0[0] - points[0]; sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); @@ -95,13 +95,13 @@ static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* static inline void volk_32fc_x2_square_dist_32f_a_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { lv_32fc_t diff; float sq_dist; - unsigned int i = 0; - + unsigned int i = 0; + for(; i < num_bytes >> 3; ++i) { diff = src0[0] - points[i]; sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - + target[i] = sq_dist; } } diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a.h b/volk/include/volk/volk_32i_s32f_convert_32f_a.h index 5581428693..8f4123d719 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a.h @@ -17,7 +17,7 @@ static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1(iScalar); diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h index d8afd218cf..b3a8ab2015 100644 --- a/volk/include/volk/volk_32i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h @@ -18,7 +18,7 @@ static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1(iScalar); diff --git a/volk/include/volk/volk_32i_x2_and_32i_a.h b/volk/include/volk/volk_32i_x2_and_32i_a.h index dcd63d98eb..e5330847b3 100644 --- a/volk/include/volk/volk_32i_x2_and_32i_a.h +++ b/volk/include/volk/volk_32i_x2_and_32i_a.h @@ -23,12 +23,12 @@ static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector, const int32_t* aV __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - + + cVal = _mm_and_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32i_x2_or_32i_a.h b/volk/include/volk/volk_32i_x2_or_32i_a.h index 243e8178cf..24045894c6 100644 --- a/volk/include/volk/volk_32i_x2_or_32i_a.h +++ b/volk/include/volk/volk_32i_x2_or_32i_a.h @@ -23,12 +23,12 @@ static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVe __m128 aVal, bVal, cVal; for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); + + aVal = _mm_load_ps(aPtr); bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - + + cVal = _mm_or_ps(aVal, bVal); + _mm_store_ps(cPtr,cVal); // Store the results back into the C container aPtr += 4; diff --git a/volk/include/volk/volk_32u_byteswap_a.h b/volk/include/volk/volk_32u_byteswap_a.h index b88848096f..71ae027d37 100644 --- a/volk/include/volk/volk_32u_byteswap_a.h +++ b/volk/include/volk/volk_32u_byteswap_a.h @@ -39,9 +39,9 @@ static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int n _mm_store_si128((__m128i*)inputPtr, output); inputPtr += 4; } - + // Byteswap any remaining points: - number = quarterPoints*4; + number = quarterPoints*4; for(; number < num_points; number++){ uint32_t outputVal = *inputPtr; outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); @@ -64,7 +64,7 @@ static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap, unsigned in for(point = 0; point < num_points; point++){ uint32_t output = *inputPtr; output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - + *inputPtr = output; inputPtr++; } diff --git a/volk/include/volk/volk_64f_convert_32f_a.h b/volk/include/volk/volk_64f_convert_32f_a.h index 2126e4f956..11d51702bc 100644 --- a/volk/include/volk/volk_64f_convert_32f_a.h +++ b/volk/include/volk/volk_64f_convert_32f_a.h @@ -16,7 +16,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const double* inputVectorPtr = (const double*)inputVector; float* outputVectorPtr = outputVector; __m128 ret, ret2; @@ -25,7 +25,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double for(;number < quarterPoints; number++){ inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - + ret = _mm_cvtpd_ps(inputVal1); ret2 = _mm_cvtpd_ps(inputVal2); @@ -35,7 +35,7 @@ static inline void volk_64f_convert_32f_a_sse2(float* outputVector, const double outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (float)(inputVector[number]); } diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h index 5c323230af..31dc5b5fe9 100644 --- a/volk/include/volk/volk_64f_convert_32f_u.h +++ b/volk/include/volk/volk_64f_convert_32f_u.h @@ -16,7 +16,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double unsigned int number = 0; const unsigned int quarterPoints = num_points / 4; - + const double* inputVectorPtr = (const double*)inputVector; float* outputVectorPtr = outputVector; __m128 ret, ret2; @@ -25,7 +25,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double for(;number < quarterPoints; number++){ inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - + ret = _mm_cvtpd_ps(inputVal1); ret2 = _mm_cvtpd_ps(inputVal2); @@ -35,7 +35,7 @@ static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double outputVectorPtr += 4; } - number = quarterPoints * 4; + number = quarterPoints * 4; for(; number < num_points; number++){ outputVector[number] = (float)(inputVector[number]); } diff --git a/volk/include/volk/volk_64f_x2_max_64f_a.h b/volk/include/volk/volk_64f_x2_max_64f_a.h index 61a704c528..33aae6d102 100644 --- a/volk/include/volk/volk_64f_x2_max_64f_a.h +++ b/volk/include/volk/volk_64f_x2_max_64f_a.h @@ -23,12 +23,12 @@ static inline void volk_64f_x2_max_64f_a_sse2(double* cVector, const double* aVe __m128d aVal, bVal, cVal; for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); + + aVal = _mm_load_pd(aPtr); bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - + + cVal = _mm_max_pd(aVal, bVal); + _mm_store_pd(cPtr,cVal); // Store the results back into the C container aPtr += 2; diff --git a/volk/include/volk/volk_64f_x2_min_64f_a.h b/volk/include/volk/volk_64f_x2_min_64f_a.h index 148b72c595..25d8b4c982 100644 --- a/volk/include/volk/volk_64f_x2_min_64f_a.h +++ b/volk/include/volk/volk_64f_x2_min_64f_a.h @@ -23,12 +23,12 @@ static inline void volk_64f_x2_min_64f_a_sse2(double* cVector, const double* aVe __m128d aVal, bVal, cVal; for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); + + aVal = _mm_load_pd(aPtr); bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - + + cVal = _mm_min_pd(aVal, bVal); + _mm_store_pd(cPtr,cVal); // Store the results back into the C container aPtr += 2; diff --git a/volk/include/volk/volk_64u_byteswap_a.h b/volk/include/volk/volk_64u_byteswap_a.h index d4fc74a6e4..3d1d87623e 100644 --- a/volk/include/volk/volk_64u_byteswap_a.h +++ b/volk/include/volk/volk_64u_byteswap_a.h @@ -34,7 +34,7 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n output = _mm_or_si128(output, byte2); byte3 = _mm_and_si128(byte3, byte3mask); output = _mm_or_si128(output, byte3); - + // Reorder the two words output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); @@ -42,17 +42,17 @@ static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int n _mm_store_si128((__m128i*)inputPtr, output); inputPtr += 4; } - + // Byteswap any remaining points: - number = halfPoints*2; + number = halfPoints*2; for(; number < num_points; number++){ uint32_t output1 = *inputPtr; uint32_t output2 = inputPtr[1]; - + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - + *inputPtr++ = output2; *inputPtr++ = output1; } @@ -71,11 +71,11 @@ static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned in for(point = 0; point < num_points; point++){ uint32_t output1 = *inputPtr; uint32_t output2 = inputPtr[1]; - + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - + *inputPtr++ = output2; *inputPtr++ = output1; } diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index 4683f1e387..7d7359ccf6 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -11,7 +11,7 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { //const uint32_t* valueVector = (const uint32_t*)&value; - + // This is faster than a lookup table //uint32_t retVal = valueVector[0]; uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a.h b/volk/include/volk/volk_8i_s32f_convert_32f_a.h index 7f2623ac6e..02a7f356e0 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a.h @@ -17,7 +17,7 @@ static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1(iScalar); diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h index 3cd6bb67ce..8bb2c0d1a4 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_u.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h @@ -18,7 +18,7 @@ static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int sixteenthPoints = num_points / 16; - + float* outputVectorPtr = outputVector; const float iScalar = 1.0 / scalar; __m128 invScalar = _mm_set_ps1( iScalar ); diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h index b723c6f8ba..d82da59fb1 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a.h @@ -20,7 +20,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, fl float* qBufferPtr = qBuffer; unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; __m128 iFloatValue, qFloatValue; const float iScalar= 1.0 / scalar; @@ -71,7 +71,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer, fl *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; } - + } #endif /* LV_HAVE_SSE4_1 */ @@ -90,7 +90,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float float* qBufferPtr = qBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 cplxValue1, cplxValue2, iValue, qValue; __m128 invScalar = _mm_set_ps1(1.0/scalar); @@ -103,7 +103,7 @@ static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer, float floatBuffer[1] = (float)(complexVectorPtr[1]); floatBuffer[2] = (float)(complexVectorPtr[2]); floatBuffer[3] = (float)(complexVectorPtr[3]); - + floatBuffer[4] = (float)(complexVectorPtr[4]); floatBuffer[5] = (float)(complexVectorPtr[5]); floatBuffer[6] = (float)(complexVectorPtr[6]); diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h index 74073f5a6d..b2c15d3a30 100644 --- a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a.h @@ -18,7 +18,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; + const unsigned int eighthPoints = num_points / 8; __m128 iFloatValue; const float iScalar= 1.0 / scalar; @@ -57,7 +57,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer, *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; complexVectorPtr++; } - + } #endif /* LV_HAVE_SSE4_1 */ @@ -75,7 +75,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con float* iBufferPtr = iBuffer; unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_points / 4; __m128 iValue; const float iScalar= 1.0 / scalar; @@ -88,7 +88,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; iValue = _mm_load_ps(floatBuffer); @@ -104,7 +104,7 @@ static inline void volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer, con *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; complexVectorPtr++; } - + } #endif /* LV_HAVE_SSE */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h index 0c280eb6e9..f85fdb9995 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -23,15 +23,15 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - + for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); - + // Calculate the ar*cr - ai*(-ci) portions realz = _mm_madd_epi16(x,y); - + // Calculate the complex conjugate of the cr + ci j values y = _mm_sign_epi16(y, conjugateSign); @@ -47,7 +47,7 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect b += 4; c += 4; } - + number = quarterPoints * 4; int16_t* c16Ptr = (int16_t*)&cVector[number]; int8_t* a8Ptr = (int8_t*)&aVector[number]; diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h index a2c2b04f63..4b16171cec 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -80,7 +80,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* float bImag = (float)*b8Ptr++; lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; - + *cFloatPtr++ = lv_creal(temp) / scalar; *cFloatPtr++ = lv_cimag(temp) / scalar; } @@ -109,7 +109,7 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_generic(lv_32fc_t* float bImag = (float)*b8Ptr++; lv_32fc_t bVal = lv_cmake( bReal, -bImag ); lv_32fc_t temp = aVal * bVal; - + *cPtr++ = (lv_creal(temp) * invScalar); *cPtr++ = (lv_cimag(temp) * invScalar); } diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h index 2d0916fb36..98eeb33a31 100644 --- a/volk/lib/gcc_x86_cpuid.h +++ b/volk/lib/gcc_x86_cpuid.h @@ -5,16 +5,16 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 3, or (at your option) any * later version. - * + * * This file is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * + * * Under Section 7 of GPL version 3, you are granted additional * permissions described in the GCC Runtime Library Exception, version * 3.1, as published by the Free Software Foundation. - * + * * You should have received a copy of the GNU General Public License and * a copy of the GCC Runtime Library Exception along with this program; * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc index 5d5eb7e188..8da43b9724 100644 --- a/volk/lib/qa_16s_add_quad_aligned16.cc +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -16,7 +16,7 @@ void qa_16s_add_quad_aligned16::t1() { void qa_16s_add_quad_aligned16::t1() { - + volk_environment_init(); clock_t start, end; double total; @@ -27,7 +27,7 @@ void qa_16s_add_quad_aligned16::t1() { __VOLK_ATTR_ALIGNED(16) short input2[vlen]; __VOLK_ATTR_ALIGNED(16) short input3[vlen]; __VOLK_ATTR_ALIGNED(16) short input4[vlen]; - + __VOLK_ATTR_ALIGNED(16) short output0[vlen]; __VOLK_ATTR_ALIGNED(16) short output1[vlen]; __VOLK_ATTR_ALIGNED(16) short output2[vlen]; @@ -48,13 +48,13 @@ void qa_16s_add_quad_aligned16::t1() { short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; - + input0[i] = plus0 - minus0; input1[i] = plus1 - minus1; input2[i] = plus2 - minus2; input3[i] = plus3 - minus3; input4[i] = plus4 - minus4; - + } printf("16s_add_quad_aligned\n"); @@ -76,7 +76,7 @@ void qa_16s_add_quad_aligned16::t1() { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); } - + for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc index 2e6e6a1a0b..5a58569a1d 100644 --- a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -22,17 +22,17 @@ void qa_16s_branch_4_state_8_aligned16::t1() { static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; static char* permuters[4] = {permute0, permute1, permute2, permute3}; - + unsigned int num_bytes = vlen << 1; volk_environment_init(); clock_t start, end; double total; - + __VOLK_ATTR_ALIGNED(16) short target[vlen]; __VOLK_ATTR_ALIGNED(16) short target2[vlen]; __VOLK_ATTR_ALIGNED(16) short target3[vlen]; - + __VOLK_ATTR_ALIGNED(16) short src0[vlen]; __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = { 7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; @@ -45,29 +45,29 @@ void qa_16s_branch_4_state_8_aligned16::t1() { __VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = { 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4}; - - + + for(int i = 0; i < vlen; ++i) { src0[i] = i; - + } - + printf("16s_branch_4_state_8_aligned\n"); - - + + start = clock(); for(int i = 0; i < num_iters; ++i) { volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); } end = clock(); - + total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("permute_and_scalar_add_time: %f\n", total); - - + + start = clock(); for(int i = 0; i < num_iters; ++i) { @@ -78,25 +78,25 @@ void qa_16s_branch_4_state_8_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("branch_4_state_8_time, ssse3: %f\n", total); - + start = clock(); for(int i = 0; i < num_iters; ++i) { volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); } end = clock(); - + total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("permute_and_scalar_add_time, generic: %f\n", total); - - - + + + for(int i = 0; i < vlen; ++i) { printf("psa... %d, b4s8... %d\n", target[i], target3[i]); } - + for(int i = 0; i < vlen; ++i) { - + CPPUNIT_ASSERT(target[i] == target2[i]); CPPUNIT_ASSERT(target[i] == target3[i]); } diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc index 3cd4e906df..dadd2c5804 100644 --- a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -16,13 +16,13 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() { void qa_16s_permute_and_scalar_add_aligned16::t1() { const int vlen = 64; - + unsigned int num_bytes = vlen << 1; volk_environment_init(); clock_t start, end; double total; - + __VOLK_ATTR_ALIGNED(16) short target[vlen]; __VOLK_ATTR_ALIGNED(16) short target2[vlen]; __VOLK_ATTR_ALIGNED(16) short src0[vlen]; @@ -43,7 +43,7 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() { } printf("16s_permute_and_scalar_add_aligned\n"); - + start = clock(); for(int i = 0; i < 100000; ++i) { volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); @@ -53,24 +53,24 @@ void qa_16s_permute_and_scalar_add_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); - + start = clock(); for(int i = 0; i < 100000; ++i) { volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); } end = clock(); - + total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse2_time: %f\n", total); - - + + for(int i = 0; i < vlen; ++i) { //printf("generic... %d, sse2... %d\n", target[i], target2[i]); } - + for(int i = 0; i < vlen; ++i) { - + CPPUNIT_ASSERT(target[i] == target2[i]); } } diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc index 192a69e350..2a5dec44ab 100644 --- a/volk/lib/qa_16s_quad_max_star_aligned16.cc +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -16,7 +16,7 @@ void qa_16s_quad_max_star_aligned16::t1() { void qa_16s_quad_max_star_aligned16::t1() { const int vlen = 34; - + __VOLK_ATTR_ALIGNED(16) short input0[vlen]; __VOLK_ATTR_ALIGNED(16) short input1[vlen]; __VOLK_ATTR_ALIGNED(16) short input2[vlen]; @@ -50,9 +50,9 @@ void qa_16s_quad_max_star_aligned16::t1() { for(int i = 0; i < vlen; ++i) { printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); } - + for(int i = 0; i < vlen; ++i) { - + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); } } diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc index a2e7a85be3..4e792ec6cb 100644 --- a/volk/lib/qa_32f_fm_detect_aligned16.cc +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -15,18 +15,18 @@ void qa_32f_fm_detect_aligned16::t1() { #else void qa_32f_fm_detect_aligned16::t1() { - + volk_environment_init(); clock_t start, end; double total; const int vlen = 3201; const int ITERS = 10000; __VOLK_ATTR_ALIGNED(16) float input0[vlen]; - + __VOLK_ATTR_ALIGNED(16) float output0[vlen]; __VOLK_ATTR_ALIGNED(16) float output01[vlen]; - for(int i = 0; i < vlen; ++i) { + for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); } printf("32f_fm_detect_aligned\n"); @@ -51,7 +51,7 @@ void qa_32f_fm_detect_aligned16::t1() { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); } - + for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc index a1c3d4cd14..2df206726d 100644 --- a/volk/lib/qa_32f_index_max_aligned16.cc +++ b/volk/lib/qa_32f_index_max_aligned16.cc @@ -34,12 +34,12 @@ void qa_32f_index_max_aligned16::t1(){ void qa_32f_index_max_aligned16::t1(){ - + const int vlen = VEC_LEN; - + volk_runtime_init(); - + volk_environment_init(); int ret; @@ -47,8 +47,8 @@ void qa_32f_index_max_aligned16::t1(){ unsigned int* target_sse; unsigned int* target_generic; float* src0 ; - - + + unsigned int i_target_sse4_1; target_sse4_1 = &i_target_sse4_1; unsigned int i_target_sse; @@ -57,20 +57,20 @@ void qa_32f_index_max_aligned16::t1(){ target_generic = &i_target_generic; ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); - + random_floats((float*)src0, vlen); - + printf("32f_index_max_aligned16\n"); clock_t start, end; double total; - - + + start = clock(); for(int k = 0; k < NUM_ITERS; ++k) { volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); } - end = clock(); + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic time: %f\n", total); @@ -78,25 +78,25 @@ void qa_32f_index_max_aligned16::t1(){ for(int k = 0; k < NUM_ITERS; ++k) { volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); } - - end = clock(); + + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse time: %f\n", total); - + start = clock(); for(int k = 0; k < NUM_ITERS; ++k) { get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen); } - - end = clock(); + + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse4.1 time: %f\n", total); - - + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); - + free(src0); } diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc index 4d83f16395..3859bcb522 100644 --- a/volk/lib/qa_32fc_index_max_aligned16.cc +++ b/volk/lib/qa_32fc_index_max_aligned16.cc @@ -33,36 +33,36 @@ void qa_32fc_index_max_aligned16::t1(){ void qa_32fc_index_max_aligned16::t1(){ - + const int vlen = VEC_LEN; - + volk_environment_init(); int ret; - + unsigned int* target; unsigned int* target_generic; std::complex<float>* src0 ; - - + + unsigned int i_target; target = &i_target; unsigned int i_target_generic; target_generic = &i_target_generic; ret = posix_memalign((void**)&src0, 16, vlen << 3); - + random_floats((float*)src0, vlen * 2); - + printf("32fc_index_max_aligned16\n"); clock_t start, end; double total; - - + + start = clock(); for(int k = 0; k < NUM_ITERS; ++k) { volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); } - end = clock(); + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic time: %f\n", total); @@ -70,19 +70,19 @@ void qa_32fc_index_max_aligned16::t1(){ for(int k = 0; k < NUM_ITERS; ++k) { volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); } - - end = clock(); + + end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse3 time: %f\n", total); - - - + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); - - + + free(src0); } diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc index 981bb19e69..daca31d9ce 100644 --- a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -15,14 +15,14 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() { #else void qa_32fc_power_spectral_density_32f_aligned16::t1() { - + volk_environment_init(); clock_t start, end; double total; const int vlen = 3201; const int ITERS = 10000; __VOLK_ATTR_ALIGNED(16) std::complex<float> input0[vlen]; - + __VOLK_ATTR_ALIGNED(16) float output_generic[vlen]; __VOLK_ATTR_ALIGNED(16) float output_sse3[vlen]; @@ -30,7 +30,7 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() { const float rbw = 1.7; float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { + for(int i = 0; i < 2*vlen; ++i) { inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))); } printf("32fc_power_spectral_density_32f_aligned\n"); @@ -54,7 +54,7 @@ void qa_32fc_power_spectral_density_32f_aligned16::t1() { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); } - + for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc index fefdf06eeb..b825c20e4e 100644 --- a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc @@ -7,7 +7,7 @@ #define assertcomplexEqual(expected, actual, delta) \ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); #define ERR_DELTA (1e-4) @@ -35,7 +35,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { std::complex<float>* input; std::complex<float>* taps; - + std::complex<float>* result_generic; std::complex<float>* result; @@ -43,19 +43,19 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { ret = posix_memalign((void**)&taps, 16, vlen << 3); ret = posix_memalign((void**)&result_generic, 16, 8); ret = posix_memalign((void**)&result, 16, 8); - + result_generic[0] = std::complex<float>(0,0); result[0] = std::complex<float>(0,0); random_floats((float*)input, vlen * 2); random_floats((float*)taps, vlen * 2); - - + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); - + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse"); printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); @@ -67,7 +67,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { free(taps); free(result_generic); free(result); - + } @@ -87,13 +87,13 @@ random_floats (float *buf, unsigned n) void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { const int vlen = 789743; - + volk_environment_init(); int ret; std::complex<float>* input; std::complex<float>* taps; - + std::complex<float>* result_generic; std::complex<float>* result; @@ -101,19 +101,19 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { ret = posix_memalign((void**)&taps, 16, vlen << 3); ret = posix_memalign((void**)&result_generic, 16, 8); ret = posix_memalign((void**)&result, 16, 8); - + result_generic[0] = std::complex<float>(0,0); result[0] = std::complex<float>(0,0); random_floats((float*)input, vlen * 2); random_floats((float*)taps, vlen * 2); - - + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); - + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse_32"); printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); @@ -125,7 +125,7 @@ void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { free(taps); free(result_generic); free(result); - + } diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc index c880260f2a..5559d933df 100644 --- a/volk/lib/qa_32u_popcnt_aligned16.cc +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -16,8 +16,8 @@ void qa_32u_popcnt_aligned16::t1() { #else void qa_32u_popcnt_aligned16::t1() { - - + + volk_runtime_init(); volk_environment_init(); @@ -26,7 +26,7 @@ void qa_32u_popcnt_aligned16::t1() { const int ITERS = 10000000; __VOLK_ATTR_ALIGNED(16) uint32_t input0; - + __VOLK_ATTR_ALIGNED(16) uint32_t output0; __VOLK_ATTR_ALIGNED(16) uint32_t output01; @@ -55,7 +55,7 @@ void qa_32u_popcnt_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse4.2_time: %f\n", total); - + CPPUNIT_ASSERT_EQUAL(output0, output01); } diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc index 6be4e50ea0..391601f223 100644 --- a/volk/lib/qa_64u_popcnt_aligned16.cc +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -16,8 +16,8 @@ void qa_64u_popcnt_aligned16::t1() { #else void qa_64u_popcnt_aligned16::t1() { - - + + volk_runtime_init(); volk_environment_init(); @@ -26,7 +26,7 @@ void qa_64u_popcnt_aligned16::t1() { const int ITERS = 10000000; __VOLK_ATTR_ALIGNED(16) uint64_t input0; - + __VOLK_ATTR_ALIGNED(16) uint64_t output0; __VOLK_ATTR_ALIGNED(16) uint64_t output01; @@ -55,7 +55,7 @@ void qa_64u_popcnt_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse4.2_time: %f\n", total); - + CPPUNIT_ASSERT_EQUAL(output0, output01); } diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index bb37801c94..c15979b3f2 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -46,7 +46,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) { case 4: if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; - break; + break; case 2: if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand; else ((uint16_t *)data)[i] = (uint16_t) scaled_rand; @@ -69,7 +69,7 @@ static std::vector<std::string> get_arch_list(struct volk_func_desc desc) { //if(!(archs[i+1] & volk_get_lvarch())) continue; //this arch isn't available on this pc archlist.push_back(std::string(desc.indices[i])); } - + return archlist; } @@ -81,15 +81,15 @@ volk_type_t volk_type_from_string(std::string name) { type.is_signed = false; type.size = 0; type.str = name; - + if(name.size() < 2) throw std::string("name too short to be a datatype"); - + //is it a scalar? - if(name[0] == 's') { + if(name[0] == 's') { type.is_scalar = true; name = name.substr(1, name.size()-1); } - + //get the data size size_t last_size_pos = name.find_last_of("0123456789"); if(last_size_pos < 0) throw std::string("no size spec in type ").append(name); @@ -98,7 +98,7 @@ volk_type_t volk_type_from_string(std::string name) { assert(((size % 8) == 0) && (size <= 64) && (size != 0)); type.size = size/8; //in bytes - + for(size_t i=last_size_pos+1; i < name.size(); i++) { switch (name[i]) { case 'f': @@ -117,19 +117,19 @@ volk_type_t volk_type_from_string(std::string name) { throw; } } - + return type; } -static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, - std::vector<volk_type_t> &outputsig, +static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, + std::vector<volk_type_t> &outputsig, std::string name) { boost::char_separator<char> sep("_"); boost::tokenizer<boost::char_separator<char> > tok(name, sep); std::vector<std::string> toked; tok.assign(name); toked.assign(tok.begin(), tok.end()); - + assert(toked[0] == "volk"); toked.erase(toked.begin()); @@ -143,7 +143,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, try { type = volk_type_from_string(token); if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... - + if(side == SIDE_INPUT) inputsig.push_back(type); else outputsig.push_back(type); } catch (...){ @@ -160,7 +160,7 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig, side = SIDE_NAME; fn_name.append("_"); fn_name.append(token); - } + } else if(side == SIDE_OUTPUT) { if(token != toked.back()) throw; //the last token in the name is the alignment } @@ -223,7 +223,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { } } } - + return fail; } @@ -239,7 +239,7 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { } } } - + return fail; } @@ -264,10 +264,10 @@ bool run_volk_tests(struct volk_func_desc desc, std::vector<std::string> *best_arch_vector = 0 ) { std::cout << "RUN_VOLK_TESTS: " << name << std::endl; - + //first let's get a list of available architectures for the test std::vector<std::string> arch_list = get_arch_list(desc); - + if(arch_list.size() < 2) { std::cout << "no architectures to test" << std::endl; return false; @@ -279,7 +279,7 @@ bool run_volk_tests(struct volk_func_desc desc, //now we have to get a function signature by parsing the name std::vector<volk_type_t> inputsig, outputsig; get_signatures_from_name(inputsig, outputsig, name); - + //pull the input scalars into their own vector std::vector<volk_type_t> inputsc; for(size_t i=0; i<inputsig.size(); i++) { @@ -299,7 +299,7 @@ bool run_volk_tests(struct volk_func_desc desc, for(size_t i=0; i<inbuffs.size(); i++) { load_random_data(inbuffs[i], inputsig[i], vlen); } - + //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch std::vector<std::vector<void *> > test_data; for(size_t i=0; i<arch_list.size(); i++) { @@ -312,7 +312,7 @@ bool run_volk_tests(struct volk_func_desc desc, } test_data.push_back(arch_buffs); } - + std::vector<volk_type_t> both_sigs; both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end()); @@ -326,7 +326,7 @@ bool run_volk_tests(struct volk_func_desc desc, switch(both_sigs.size()) { case 1: if(inputsc.size() == 0) { - run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { if(inputsc[0].is_complex) { run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); @@ -364,23 +364,23 @@ bool run_volk_tests(struct volk_func_desc desc, throw "no function handler for this signature"; break; } - + end = clock(); double arch_time = (double)(end-start)/(double)CLOCKS_PER_SEC; std::cout << arch_list[i] << " completed in " << arch_time << "s" << std::endl; profile_times.push_back(arch_time); } - + //and now compare each output to the generic output //first we have to know which output is the generic one, they aren't in order... size_t generic_offset=0; - for(size_t i=0; i<arch_list.size(); i++) + for(size_t i=0; i<arch_list.size(); i++) if(arch_list[i] == "generic") generic_offset=i; //now compare //if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know - + bool fail = false; bool fail_global = false; std::vector<bool> arch_results; @@ -438,7 +438,7 @@ bool run_volk_tests(struct volk_func_desc desc, } arch_results.push_back(!fail); } - + double best_time = std::numeric_limits<double>::max(); std::string best_arch = "generic"; for(size_t i=0; i < arch_list.size(); i++) { @@ -447,7 +447,7 @@ bool run_volk_tests(struct volk_func_desc desc, best_arch = arch_list[i]; } } - + std::cout << "Best arch: " << best_arch << std::endl; if(best_arch_vector) { best_arch_vector->push_back(name + std::string(" ") + best_arch); diff --git a/volk/lib/volk_prefs.c b/volk/lib/volk_prefs.c index 7e705bed46..5e5c9dfff7 100644 --- a/volk/lib/volk_prefs.c +++ b/volk/lib/volk_prefs.c @@ -26,7 +26,7 @@ int load_preferences(struct volk_arch_pref **prefs) { char path[512], line[512], function[128], arch[32]; int n_arch_prefs = 0; struct volk_arch_pref *t_pref; - + //get the config path get_config_path(path); if (path == NULL) return n_arch_prefs; //no prefs found diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c index 4baa078bca..865d60955c 100644 --- a/volk/lib/volk_rank_archs.c +++ b/volk/lib/volk_rank_archs.c @@ -26,14 +26,14 @@ unsigned int volk_rank_archs(const char *indices[], const int* arch_defs, unsign n_arch_prefs = load_preferences(&volk_arch_prefs); prefs_loaded = 1; } - + //now look for the function name in the prefs list for(i=0; i < n_arch_prefs; i++) { if(!strncmp(name, volk_arch_prefs[i].name, 128)) { //found it return get_index(indices, n_archs, volk_arch_prefs[i].arch); } } - + for(i=1; i < n_archs; ++i) { if((arch_defs[i]&(!arch)) == 0) { best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val; diff --git a/volk/libvector_replace.sh b/volk/libvector_replace.sh index 4c7e33e1b4..e1940c00f3 100755 --- a/volk/libvector_replace.sh +++ b/volk/libvector_replace.sh @@ -2,8 +2,8 @@ cd $1 files=`ls` -for file in $files -do +for file in $files +do sed 's/libvector/volk/g' < $file > tempfile sed 's/LIBVECTOR/VOLK/g' < tempfile > $file done diff --git a/volk/msvc/inttypes.h b/volk/msvc/inttypes.h index 1c2baa82eb..0a1b60fc16 100644 --- a/volk/msvc/inttypes.h +++ b/volk/msvc/inttypes.h @@ -1,32 +1,32 @@ // ISO C9x compliant inttypes.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// // Copyright (c) 2006 Alexander Chemeris -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: -// +// // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // 3. The name of the author may be used to endorse or promote products // derived from this software without specific prior written permission. -// +// // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// /////////////////////////////////////////////////////////////////////////////// #ifndef _MSC_VER // [ diff --git a/volk/msvc/stdint.h b/volk/msvc/stdint.h index ab6d37e111..108bc89826 100644 --- a/volk/msvc/stdint.h +++ b/volk/msvc/stdint.h @@ -1,32 +1,32 @@ // ISO C9x compliant stdint.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// // Copyright (c) 2006-2008 Alexander Chemeris -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: -// +// // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // 3. The name of the author may be used to endorse or promote products // derived from this software without specific prior written permission. -// +// // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// +// /////////////////////////////////////////////////////////////////////////////// #ifndef _MSC_VER // [ diff --git a/volk/python/__init__.py b/volk/python/__init__.py index 7239e7e238..7c9c4a0c64 100644 --- a/volk/python/__init__.py +++ b/volk/python/__init__.py @@ -1,18 +1,18 @@ # # Copyright 2010 Free Software Foundation, Inc. -# +# # This file is part of GNU Radio -# +# # GNU Radio is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. -# +# # GNU Radio is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. @@ -31,7 +31,7 @@ except ImportError: from DLFCN import RTLD_GLOBAL as _RTLD_GLOBAL except ImportError: pass - + if _RTLD_GLOBAL != 0: _dlopenflags = sys.getdlopenflags() sys.setdlopenflags(_dlopenflags|_RTLD_GLOBAL) diff --git a/volk/python/qa_square.py b/volk/python/qa_square.py index 53f0433ca9..c74bc25ef7 100755 --- a/volk/python/qa_square.py +++ b/volk/python/qa_square.py @@ -1,24 +1,24 @@ #!/usr/bin/env python # # Copyright 2010 Free Software Foundation, Inc. -# +# # This file is part of GNU Radio -# +# # GNU Radio is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. -# +# # GNU Radio is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with GNU Radio; see the file COPYING. If not, write to # the Free Software Foundation, Inc., 51 Franklin Street, # Boston, MA 02110-1301, USA. -# +# from gnuradio import gr, gr_unittest import volk_swig as volk @@ -42,6 +42,6 @@ class qa_volk(gr_unittest.TestCase): self.tb.run() result_data = dst.data() self.assertFloatTuplesAlmostEqual(expected_result, result_data, 6) - + if __name__ == '__main__': gr_unittest.main() diff --git a/volk/python/volk.i b/volk/python/volk.i index ea3a037ba0..d678a91209 100644 --- a/volk/python/volk.i +++ b/volk/python/volk.i @@ -1,19 +1,19 @@ /* -*- c++ -*- */ /* * Copyright 2010 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. diff --git a/volk/python/volk_square_ff.i b/volk/python/volk_square_ff.i index 1d485bd7e6..5456c81fd4 100644 --- a/volk/python/volk_square_ff.i +++ b/volk/python/volk_square_ff.i @@ -1,19 +1,19 @@ /* -*- c++ -*- */ /* * Copyright 2010 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h index 8e3e3f2a62..e86dce3f5e 100644 --- a/volk/spu_lib/gc_spu_macs.h +++ b/volk/spu_lib/gc_spu_macs.h @@ -1,19 +1,19 @@ /* -*- asm -*- */ /* * Copyright 2008 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. @@ -279,7 +279,7 @@ name: iluh _gc_t0, 4*(s)*0x0101 + 0x0001; \ iohl _gc_t0, 4*(s)*0x0101 + 0x0203; \ shufb rt, ra, ra, _gc_t0; - + // replicate double from slot s [0,1] #define VSPLTD(rt, ra, s) \ /* sp is always 16-byte aligned */ \ @@ -301,13 +301,13 @@ name: #define MIN_SELB(rt, ra, rb, rc) selb rt, ra, rb, rc; #define MAX_SELB(rt, ra, rb, rc) selb rt, rb, ra, rc; - + // words #define MIN(rt, ra, rb) \ cgt _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + #define MAX(rt, ra, rb) \ cgt _gc_t0, ra, rb; \ MAX_SELB(rt, ra, rb, _gc_t0) @@ -315,17 +315,17 @@ name: #define UMIN(rt, ra, rb) \ clgt _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + #define UMAX(rt, ra, rb) \ clgt _gc_t0, ra, rb; \ MAX_SELB(rt, ra, rb, _gc_t0) // bytes - + #define MINB(rt, ra, rb) \ cgtb _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + #define MAXB(rt, ra, rb) \ cgtb _gc_t0, ra, rb; \ MAX_SELB(rt, ra, rb, _gc_t0) @@ -333,17 +333,17 @@ name: #define UMINB(rt, ra, rb) \ clgtb _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + #define UMAXB(rt, ra, rb) \ clgtb _gc_t0, ra, rb; \ MAX_SELB(rt, ra, rb, _gc_t0) // halfwords - + #define MINH(rt, ra, rb) \ cgth _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + #define MAXH(rt, ra, rb) \ cgth _gc_t0, ra, rb; \ MAX_SELB(rt, ra, rb, _gc_t0) @@ -351,17 +351,17 @@ name: #define UMINH(rt, ra, rb) \ clgth _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + #define UMAXH(rt, ra, rb) \ clgth _gc_t0, ra, rb; \ MAX_SELB(rt, ra, rb, _gc_t0) // floats - + #define FMIN(rt, ra, rb) \ fcgt _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + #define FMAX(rt, ra, rb) \ fcgt _gc_t0, ra, rb; \ MAX_SELB(rt, ra, rb, _gc_t0) @@ -370,7 +370,7 @@ name: #define FMINMAG(rt, ra, rb) \ fcmgt _gc_t0, ra, rb; \ MIN_SELB(rt, ra, rb, _gc_t0) - + // Ignoring the sign, select the values with the maximum magnitude #define FMAXMAG(rt, ra, rb) \ fcmgt _gc_t0, ra, rb; \ diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c index 765cacd9a8..8811e68014 100644 --- a/volk/spu_lib/spu_16s_cmpgt_unaligned.c +++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c @@ -4,14 +4,14 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u //loop iterator i int i = 0; void* retval = target; - + //put the target and source addresses into qwords vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0}; - + //create shuffle masks - + //shuffle mask building blocks: //all from the first vector vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -19,9 +19,9 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u //all from the second vector vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; - - + + //gamma: second half of the second, first half of the first, break at (unsigned int)src%16 vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16)); vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); @@ -29,16 +29,16 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u vector unsigned char cmp_res = spu_or(gt_res, eq_res); vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16); - - + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); - + //alpha: first half of first, second half of second, break at (unsigned int)target%16 src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); gt_res = spu_cmpgt(oneup, src_cmp); @@ -47,13 +47,13 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - + //delta: first half of first, first half of second, break at (unsigned int)target%16 vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); //epsilon: second half of second, second half of first, break at (unsigned int)target%16 vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 - vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); //beta: first half of first, second half of second, break at num_bytes%16 src_cmp = spu_splats((unsigned char)(num_bytes%16)); @@ -63,17 +63,17 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - - - - + + + + qword src_past; qword src_present; qword tgt_past; qword tgt_present; - + qword in_temp; qword out_temp0; qword out_temp1; @@ -85,53 +85,53 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u vector unsigned short compare; vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1}; vector unsigned short after_and; - + for(i = 0; i < num_bytes/16; ++i) { - + src_present = si_lqd((qword)address_counter_src, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - + in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); compare = spu_cmpgt((vector signed short) in_temp, vec_val); after_and = spu_and(compare, ones); - - + + out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon); si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + tgt_past = out_temp1; src_past = src_present; address_counter_src = spu_add(address_counter_src, 16); address_counter_tgt = spu_add(address_counter_tgt, 16); - + } - + src_present = si_lqd((qword)address_counter_src, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - - + + in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); - + compare = spu_cmpgt((vector signed short) in_temp, vec_val); after_and = spu_and(compare, ones); - + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta); - - + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + return retval; } @@ -156,5 +156,5 @@ int main(){ } printf("\n"); } -*/ +*/ diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c index a3ce6c2fea..ea110c8d21 100644 --- a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c +++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c @@ -4,15 +4,15 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s //loop iterator i int i = 0; void* retval = target; - + //put the target and source addresses into qwords vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0}; vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0}; - + //create shuffle masks - + //shuffle mask building blocks: //all from the first vector vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -20,9 +20,9 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s //all from the second vector vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; - - + + //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16 vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16)); vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); @@ -30,7 +30,7 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s vector unsigned char cmp_res = spu_or(gt_res, eq_res); vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16); @@ -41,17 +41,17 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s cmp_res = spu_or(gt_res, eq_res); sixteen_uchar = spu_splats((unsigned char)16); phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16); - - - + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); - + //alpha: first half of first, second half of second, break at (unsigned int)target%16 src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); gt_res = spu_cmpgt(oneup, src_cmp); @@ -60,13 +60,13 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - + //delta: first half of first, first half of second, break at (unsigned int)target%16 vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); //epsilon: second half of second, second half of first, break at (unsigned int)target%16 vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 - vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); //beta: first half of first, second half of second, break at num_bytes%16 src_cmp = spu_splats((unsigned char)(num_bytes%16)); @@ -76,19 +76,19 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - - - - + + + + qword src0_past; qword src0_present; qword src1_past; qword src1_present; qword tgt_past; qword tgt_present; - + qword in_temp0; qword in_temp1; qword out_temp0; @@ -99,54 +99,54 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s src0_past = si_lqd((qword)address_counter_src0, 0); src1_past = si_lqd((qword)address_counter_src1, 0); tgt_past = si_lqd((qword)address_counter_tgt, 0); - + for(i = 0; i < num_bytes/16; ++i) { - + src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta); - + sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1); - + out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + tgt_past = out_temp1; src0_past = src0_present; src1_past = src1_present; address_counter_src0 = spu_add(address_counter_src0, 16); address_counter_src1 = spu_add(address_counter_src1, 16); address_counter_tgt = spu_add(address_counter_tgt, 16); - - + + } - + src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - - + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta); sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1); qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta); - - + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + return retval; } diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c index 5a1cb9aaf8..0097b4f56a 100644 --- a/volk/spu_lib/spu_16s_vector_sum_unaligned.c +++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c @@ -4,15 +4,15 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, //loop iterator i int i = 0; void* retval = target; - + //put the target and source addresses into qwords vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0}; vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0}; - + //create shuffle masks - + //shuffle mask building blocks: //all from the first vector vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -20,9 +20,9 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, //all from the second vector vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; - - + + //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16 vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16)); vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); @@ -30,7 +30,7 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, vector unsigned char cmp_res = spu_or(gt_res, eq_res); vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16); @@ -41,17 +41,17 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, cmp_res = spu_or(gt_res, eq_res); sixteen_uchar = spu_splats((unsigned char)16); phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16); - - - + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); - + //alpha: first half of first, second half of second, break at (unsigned int)target%16 src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); gt_res = spu_cmpgt(oneup, src_cmp); @@ -60,13 +60,13 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - + //delta: first half of first, first half of second, break at (unsigned int)target%16 vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); //epsilon: second half of second, second half of first, break at (unsigned int)target%16 vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 - vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); //beta: first half of first, second half of second, break at num_bytes%16 src_cmp = spu_splats((unsigned char)(num_bytes%16)); @@ -76,19 +76,19 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - - - - + + + + qword src0_past; qword src0_present; qword src1_past; qword src1_present; qword tgt_past; qword tgt_present; - + qword in_temp0; qword in_temp1; qword out_temp0; @@ -99,54 +99,54 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, src0_past = si_lqd((qword)address_counter_src0, 0); src1_past = si_lqd((qword)address_counter_src1, 0); tgt_past = si_lqd((qword)address_counter_tgt, 0); - + for(i = 0; i < num_bytes/16; ++i) { - + src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta); - + sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1); - + out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + tgt_past = out_temp1; src0_past = src0_present; src1_past = src1_present; address_counter_src0 = spu_add(address_counter_src0, 16); address_counter_src1 = spu_add(address_counter_src1, 16); address_counter_tgt = spu_add(address_counter_tgt, 16); - - + + } - + src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - - + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta); sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1); qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta); - - + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + return retval; } diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c index 58fd4aa0cf..d1c9604889 100644 --- a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c +++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c @@ -7,15 +7,15 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi //loop iterator i int i = 0; void* retval = target; - + //put the target and source addresses into qwords vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0}; vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0}; - + //create shuffle masks - + //shuffle mask building blocks: //all from the first vector vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -23,9 +23,9 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi //all from the second vector vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; - - + + //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16 vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16)); vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); @@ -33,7 +33,7 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi vector unsigned char cmp_res = spu_or(gt_res, eq_res); vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16); @@ -44,17 +44,17 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi cmp_res = spu_or(gt_res, eq_res); sixteen_uchar = spu_splats((unsigned char)16); phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16); - - - + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); - + //alpha: first half of first, second half of second, break at (unsigned int)target%16 src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); gt_res = spu_cmpgt(oneup, src_cmp); @@ -63,13 +63,13 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - + //delta: first half of first, first half of second, break at (unsigned int)target%16 vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); //epsilon: second half of second, second half of first, break at (unsigned int)target%16 vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 - vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); //beta: first half of first, second half of second, break at num_bytes%16 src_cmp = spu_splats((unsigned char)(num_bytes%16)); @@ -79,19 +79,19 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - - - - + + + + qword src0_past; qword src0_present; qword src1_past; qword src1_present; qword tgt_past; qword tgt_present; - + qword in_temp0; qword in_temp1; qword out_temp0; @@ -101,7 +101,7 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi src0_past = si_lqd((qword)address_counter_src0, 0); src1_past = si_lqd((qword)address_counter_src1, 0); tgt_past = si_lqd((qword)address_counter_tgt, 0); - + vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b}; vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, @@ -110,7 +110,7 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f}; vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00}; - + vector float prod0; qword shuf0; vector float prod1; @@ -118,54 +118,54 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi qword summand0; qword summand1; vector float sum; - + for(i = 0; i < num_bytes/16; ++i) { - + src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta); - + prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1); shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0); prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0); sign_change = spu_xor(prod0, (vector float)sign_changer); - + summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1); - + summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2); - + sum = spu_add((vector float)summand0, (vector float)summand1); - + out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + tgt_past = out_temp1; src0_past = src0_present; src1_past = src1_present; address_counter_src0 = spu_add(address_counter_src0, 16); address_counter_src1 = spu_add(address_counter_src1, 16); address_counter_tgt = spu_add(address_counter_tgt, 16); - - + + } - + src0_present = si_lqd((qword)address_counter_src0, 16); src1_present = si_lqd((qword)address_counter_src1, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - - + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma); in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta); - - + + prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1); shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0); prod1 = spu_mul(prod0, (vector float)shuf0); @@ -173,20 +173,20 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1); summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2); sum = spu_add((vector float)summand0, (vector float)summand1); - - - + + + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta); - - - + + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + return retval; } @@ -209,14 +209,14 @@ int main(){ vector_product_complex(res, pooh, bear, 48*sizeof(float)); - + for(i = 0; i < 48; ++i) { printf("%f, ", res[i]); } printf("\n"); - + } */ diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c index 2a0dabcd7b..0f15b5d807 100644 --- a/volk/spu_lib/spu_memcpy_unaligned.c +++ b/volk/spu_lib/spu_memcpy_unaligned.c @@ -5,14 +5,14 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes //loop iterator i int i = 0; void* retval = target; - + //put the target and source addresses into qwords vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0}; - + //create shuffle masks - + //shuffle mask building blocks: //all from the first vector vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -20,9 +20,9 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes //all from the second vector vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; - - + + //gamma: second half of the second, first half of the first, break at (unsigned int)src%16 vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16)); vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); @@ -30,16 +30,16 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes vector unsigned char cmp_res = spu_or(gt_res, eq_res); vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16); - - + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); - + //alpha: first half of first, second half of second, break at (unsigned int)target%16 src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); gt_res = spu_cmpgt(oneup, src_cmp); @@ -48,13 +48,13 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - + //delta: first half of first, first half of second, break at (unsigned int)target%16 vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); //epsilon: second half of second, second half of first, break at (unsigned int)target%16 vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 - vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); //beta: first half of first, second half of second, break at num_bytes%16 src_cmp = spu_splats((unsigned char)(num_bytes%16)); @@ -64,61 +64,61 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - - - - + + + + qword src_past; qword src_present; qword tgt_past; qword tgt_present; - + qword in_temp; qword out_temp0; qword out_temp1; src_past = si_lqd((qword)address_counter_src, 0); tgt_past = si_lqd((qword)address_counter_tgt, 0); - + for(i = 0; i < num_bytes/16; ++i) { - + src_present = si_lqd((qword)address_counter_src, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - + in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); - + out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon); si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + tgt_past = out_temp1; src_past = src_present; address_counter_src = spu_add(address_counter_src, 16); address_counter_tgt = spu_add(address_counter_tgt, 16); - + } - + src_present = si_lqd((qword)address_counter_src, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - - + + in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta); - - + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + return retval; } @@ -133,9 +133,9 @@ void* mcpy(void* target, void* src, size_t num_bytes){ //put the target and source addresses into qwords vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0}; - + //create shuffle masks - + //shuffle mask building blocks: //all from the first vector vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -143,9 +143,9 @@ void* mcpy(void* target, void* src, size_t num_bytes){ //all from the second vector vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; - - + + //gamma: second half of the second, first half of the first, break at src%16 vector unsigned char src_cmp = spu_splats((unsigned char)(src%16)); vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); @@ -153,16 +153,16 @@ void* mcpy(void* target, void* src, size_t num_bytes){ vector unsigned char cmp_res = spu_or(gt_res, eq_res); vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); - vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16); - - + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16)); vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16)); - + //alpha: first half of first, second half of second, break at target%16 src_cmp = spu_splats((unsigned char)(target%16)); gt_res = spu_cmpgt(oneup, src_cmp); @@ -171,13 +171,13 @@ void* mcpy(void* target, void* src, size_t num_bytes){ phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - + //delta: first half of first, first half of second, break at target%16 vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); //epsilon: second half of second, second half of first, break at target%16 vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); //zeta: second half of second, first half of first, break at 16 - target%16 - vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16); + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16); //beta: first half of first, second half of second, break at num_bytes%16 src_cmp = spu_splats((unsigned char)(num_bytes%16)); @@ -187,10 +187,10 @@ void* mcpy(void* target, void* src, size_t num_bytes){ phase_change = spu_and(sixteen_uchar, cmp_res); vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, (vector unsigned int)oneup); - - + + printf("num_bytesmod16 %d\n", num_bytes%16); - printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", + printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", spu_extract((vector unsigned char) shuffle_mask_beta, 0), spu_extract((vector unsigned char) shuffle_mask_beta, 1), spu_extract((vector unsigned char) shuffle_mask_beta, 2), @@ -207,64 +207,64 @@ void* mcpy(void* target, void* src, size_t num_bytes){ spu_extract((vector unsigned char) shuffle_mask_beta, 13), spu_extract((vector unsigned char) shuffle_mask_beta, 14), spu_extract((vector unsigned char) shuffle_mask_beta, 15)); - - - - + + + + qword src_past; qword src_present; qword tgt_past; qword tgt_present; - + qword in_temp; qword out_temp0; qword out_temp1; src_past = si_lqd((qword)address_counter_src, 0); tgt_past = si_lqd((qword)address_counter_tgt, 0); - + for(i = 0; i < num_bytes/16; ++i) { - + src_present = si_lqd((qword)address_counter_src, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - + in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); - + out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon); si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); - + tgt_past = out_temp1; src_past = src_present; address_counter_src = spu_add(address_counter_src, 16); address_counter_tgt = spu_add(address_counter_tgt, 16); - + } - + src_present = si_lqd((qword)address_counter_src, 16); tgt_present = si_lqd((qword)address_counter_tgt, 16); - - + + in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta); - - + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); - + si_stqd(out_temp0, (qword)address_counter_tgt, 0); si_stqd(out_temp1, (qword)address_counter_tgt, 16); return retval; - + } */ /* @@ -286,5 +286,5 @@ int main(){ } printf("\n"); } - + */ diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S index a655c4c52f..c260a125cd 100644 --- a/volk/spu_lib/spu_memset_unaligned.S +++ b/volk/spu_lib/spu_memset_unaligned.S @@ -1,19 +1,19 @@ /* -*- asm -*- */ /* * Copyright 2008 Free Software Foundation, Inc. - * + * * This file is part of GNU Radio - * + * * GNU Radio is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3, or (at your option) * any later version. - * + * * GNU Radio is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. @@ -33,11 +33,11 @@ * size_t i; * for (i = 0; i < n; i++) * p[i] = c; - * + * * return pv; * } */ - + #define p_arg arg1 // we're going to clobber arg1 w/ the return value #define c arg2 // the constant we're writing #define n arg3 // how many bytes to write @@ -50,16 +50,16 @@ #define an r18 // aligned n (n rounded down to mod 16 boundary) #define next_p r19 #define cond1 r20 -#define cond2 r21 +#define cond2 r21 #define m r22 #define r r23 - + PROC_ENTRY(libvector_memset_unaligned) - + // Hint the return from do_head, in case we go that way. // There's pretty much nothing to can do to hint the branch to it. hbrr do_head_br, head_complete - + MR(p, p_arg) // leaves p, the return value, in the correct reg (r3) BRZ_RETURN(n) @@ -69,11 +69,11 @@ head_complete: /* - * preconditions: + * preconditions: * p%16 == 0, n > 0 */ hbrr middle_loop_br, middle_loop - + ROUND_DOWN(an, n, 16) // an is "aligned n" MODULO(n, n, 16) // what's left over in the last quad brz an, do_tail // no whole quad words; skip to tail @@ -96,20 +96,20 @@ middle_loop: stqd c, 4*16(p) stqd c, 5*16(p) stqd c, 6*16(p) - + MR(p, next_p) stqd c, 7*16-128(next_p) or cond2, n, an middle_loop_br: brnz cond1, middle_loop - + /* - * if an and n are both zero, return now + * if an and n are both zero, return now */ BRZ_RETURN(cond2) /* - * otherwise handle last of full quad words + * otherwise handle last of full quad words * * 0 <= an < 128, p%16 == 0 */ @@ -119,18 +119,18 @@ middle2: */ brz an, do_tail hbrr middle2_loop_br, middle2_loop - + .p2align 3 -middle2_loop: +middle2_loop: ai next_p, p, 16 stqd c, 0(p) ai an, an, -16 LMR(p, next_p) middle2_loop_br: brnz an, middle2_loop - + /* We're done with the full quadwords. */ - + /* * Handle the final partial quadword. * We'll be modifying only the left hand portion of the quad. @@ -146,7 +146,7 @@ do_tail: shlqby mask, mask, t1 selb t0, old, c, mask stqd t0, 0(p) -do_tail_ret: +do_tail_ret: RETURN() /* @@ -176,7 +176,7 @@ do_head: MR(t1, p) sf t0, m, r // t0 = r - m a p, p, m // p += m - rotqby mask, mask, t0 // rotate 0's to the right place + rotqby mask, mask, t0 // rotate 0's to the right place sf n, m, n // n -= m selb t0, c, old, mask // merge stqd t0, 0(t1) |